diff --git a/.mergify.yml b/.mergify.yml
index 3347c6dc..6dae66d0 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -80,6 +80,12 @@ pull_request_rules:
actions:
label:
add: ["CLI"]
+ - name: "auto add label=Server"
+ conditions:
+ - files~=^paddlespeech/server
+ actions:
+ label:
+ add: ["Server"]
- name: "auto add label=Demo"
conditions:
- files~=^demos/
@@ -130,7 +136,7 @@ pull_request_rules:
add: ["Docker"]
- name: "auto add label=Deployment"
conditions:
- - files~=^speechnn/
+ - files~=^speechx/
actions:
label:
add: ["Deployment"]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2f80e46b..60f0b92f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,12 @@
+repos:
- repo: https://github.com/pre-commit/mirrors-yapf.git
- sha: v0.16.0
+ rev: v0.16.0
hooks:
- id: yapf
files: \.py$
exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: a11d9314b22d8f8c7556443875b731ef05965464
+ rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
@@ -31,7 +32,7 @@
- --jobs=1
exclude: (?=third_party).*(\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
- sha: v1.0.1
+ rev: v1.0.1
hooks:
- id: forbid-crlf
files: \.md$
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ffe8098..6e8315e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,46 @@
# Changelog
+Date: 2022-1-29, Author: yt605155624.
+Add features to: T2S:
+ - Update aishell3 vc0 with new Tacotron2.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419
+
+Date: 2022-1-29, Author: yt605155624.
+Add features to: T2S:
+ - Add ljspeech Tacotron2.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416
+
+Date: 2022-1-24, Author: yt605155624.
+Add features to: T2S:
+ - Add csmsc WaveRNN.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379
+
+Date: 2022-1-19, Author: yt605155624.
+Add features to: T2S:
+ - Add csmsc Tacotron2.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314
+
Date: 2022-1-10, Author: Jackwaterveg.
-Add features to: CLI:
- - Support English (librispeech/asr1/transformer).
+Add features to: CLI:
+ - Support English (librispeech/asr1/transformer).
- Support choosing `decode_method` for conformer and transformer models.
- Refactor the config, using the unified config.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
***
+
+Date: 2022-1-17, Author: Jackwaterveg.
+Add features to: CLI:
+ - Support deepspeech2 online/offline model(aishell).
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356
+
+***
+
+Date: 2022-1-24, Author: Jackwaterveg.
+Add features to: ctc_decoders:
+ - Support online ctc prefix-beam search decoder.
+ - Unified ctc online decoder and ctc offline decoder.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/821
+
+***
diff --git a/README.md b/README.md
index cca1cb53..9a2fe2aa 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,15 @@
-
+
+
+
+
@@ -143,6 +146,8 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample
+- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html)
+
### 🔥 Hot Activities
- 2021.12.21~12.24
@@ -236,7 +241,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Speech Recogination
+ Speech Recogination
Aishell
DeepSpeech2 RNN + Conv based Models
@@ -249,7 +254,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
u2.transformer.conformer-aishell
-
+
Librispeech
Transformer based Attention Models
@@ -257,6 +262,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
+
+ TIMIT
+ Unified Streaming & Non-streaming Two-pass
+
+ u2-timit
+
+
Alignment
THCHS30
@@ -266,20 +278,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Language Model
+ Language Model
Ngram Language Model
kenlm
-
- TIMIT
- Unified Streaming & Non-streaming Two-pass
-
- u2-timit
-
-
-
+
Speech Translation (English to Chinese)
TED En-Zh
Transformer + ASR MTL
@@ -317,14 +322,15 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Acoustic Model
- Tacotron2
- LJSpeech
+ Tacotron2
+ LJSpeech / CSMSC
- tacotron2-ljspeech
+ tacotron2-ljspeech / tacotron2-csmsc
Transformer TTS
+ LJSpeech
transformer-ljspeech
@@ -344,7 +350,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Vocoder
+ Vocoder
WaveFlow
LJSpeech
@@ -378,7 +384,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
HiFiGAN-csmsc
-
+
+
+ WaveRNN
+ CSMSC
+
+ WaveRNN-csmsc
+
+
Voice Cloning
GE2E
@@ -416,7 +429,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
-
Audio Classification
ESC-50
@@ -440,7 +452,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
-
Punctuation Restoration
IWLST2012_zh
@@ -463,7 +474,6 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht
- [Automatic Speech Recognition](./docs/source/asr/quick_start.md)
- [Introduction](./docs/source/asr/models_introduction.md)
- [Data Preparation](./docs/source/asr/data_preparation.md)
- - [Data Augmentation](./docs/source/asr/augmentation.md)
- [Ngram LM](./docs/source/asr/ngram_lm.md)
- [Text-to-Speech](./docs/source/tts/quick_start.md)
- [Introduction](./docs/source/tts/models_introduction.md)
@@ -489,7 +499,17 @@ author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}},
year={2021}
}
+
+@inproceedings{zheng2021fused,
+ title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation},
+ author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang},
+ booktitle={International Conference on Machine Learning},
+ pages={12736--12746},
+ year={2021},
+ organization={PMLR}
+}
```
+
## Contribute to PaddleSpeech
@@ -540,6 +560,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function.
- Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model.
+- Many thanks to [kslz](https://github.com/kslz) for supplementing the Chinese documentation.
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
diff --git a/README_cn.md b/README_cn.md
index ddf189c3..409b7a25 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -147,6 +147,8 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html)
+
### 🔥 热门活动
@@ -233,7 +235,7 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识
- 语音识别
+ 语音识别
Aishell
DeepSpeech2 RNN + Conv based Models
@@ -254,6 +256,13 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识
+
+ TIMIT
+ Unified Streaming & Non-streaming Two-pass
+
+ u2-timit
+
+
对齐
THCHS30
@@ -263,19 +272,12 @@ PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识
- 语言模型
+ 语言模型
Ngram 语言模型
kenlm
-
- TIMIT
- Unified Streaming & Non-streaming Two-pass
-
- u2-timit
-
-
语音翻译(英译中)
TED En-Zh
@@ -315,14 +317,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
声学模型
- Tacotron2
- LJSpeech
+ Tacotron2
+ LJSpeech / CSMSC
- tacotron2-ljspeech
+ tacotron2-ljspeech / tacotron2-csmsc
Transformer TTS
+ LJSpeech
transformer-ljspeech
@@ -342,7 +345,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- 声码器
+ 声码器
WaveFlow
LJSpeech
@@ -376,7 +379,14 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGAN-csmsc
-
+
+
+ WaveRNN
+ CSMSC
+
+ WaveRNN-csmsc
+
+
声音克隆
GE2E
@@ -415,8 +425,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-
-
声音分类
ESC-50
@@ -440,7 +448,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-
标点恢复
IWLST2012_zh
@@ -468,7 +475,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- [语音识别自定义训练](./docs/source/asr/quick_start.md)
- [简介](./docs/source/asr/models_introduction.md)
- [数据准备](./docs/source/asr/data_preparation.md)
- - [数据增强](./docs/source/asr/augmentation.md)
- [Ngram 语言模型](./docs/source/asr/ngram_lm.md)
- [语音合成自定义训练](./docs/source/tts/quick_start.md)
- [简介](./docs/source/tts/models_introduction.md)
@@ -549,6 +555,7 @@ year={2021}
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。
- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。
+- 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。
此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。
diff --git a/dataset/voxceleb/README.md b/dataset/voxceleb/README.md
new file mode 100644
index 00000000..3efb3519
--- /dev/null
+++ b/dataset/voxceleb/README.md
@@ -0,0 +1,10 @@
+# [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/)
+VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube.
+
+VoxCeleb contains speech from speakers spanning a wide range of different ethnicities, accents, professions and ages.
+All speaking face-tracks are captured "in the wild", with background chatter, laughter, overlapping speech, pose variation and different lighting conditions.
+VoxCeleb consists of both audio and video. Each segment is at least 3 seconds long.
+
+The dataset consists of two versions, VoxCeleb1 and VoxCeleb2. Each version has its own train/test split. For each we provide YouTube URLs, face detections and tracks, audio files, cropped face videos and speaker meta-data. There is no overlap between the two versions.
+
+For more details, please refer to http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
new file mode 100644
index 00000000..ce744751
--- /dev/null
+++ b/dataset/voxceleb/voxceleb1.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxCeleb1 dataset
+
+create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+
+researchers should download the voxceleb1 dataset themselves
+through google form to get the username & password and unpack the data
+"""
+import argparse
+import codecs
+import glob
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import check_md5sum
+from utils.utility import download
+from utils.utility import unzip
+
+# by default, all the data will be downloaded into the current data/voxceleb directory
+DATA_HOME = os.path.expanduser('.')
+
+# if you use the http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url
+# you need to get the username & password via the google form
+
+# if you use the https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url,
+# you need use --no-check-certificate to connect the target download url
+
+BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a"
+
+# dev data
+DEV_LIST = {
+ "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96",
+ "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020",
+ "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512",
+ "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19",
+}
+DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b"
+
+# test data
+TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
+TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"
+
+# kaldi trial
+# this trial file is organized by kaldi according the official file,
+# which is a little different from the official trial file veri_test2.txt
+KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
+TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
+TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--target_dir",
+ default=DATA_HOME + "/voxceleb1/",
+ type=str,
+ help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
+parser.add_argument(
+ "--manifest_prefix",
+ default="manifest",
+ type=str,
+ help="Filepath prefix for output manifests. (default: %(default)s)")
+
+args = parser.parse_args()
+
+def create_manifest(data_dir, manifest_path_prefix):
+ print("Creating manifest %s ..." % manifest_path_prefix)
+ json_lines = []
+ data_path = os.path.join(data_dir, "wav", "**", "*.wav")
+ total_sec = 0.0
+ total_text = 0.0
+ total_num = 0
+ speakers = set()
+ for audio_path in glob.glob(data_path, recursive=True):
+ audio_id = "-".join(audio_path.split("/")[-3:])
+ utt2spk = audio_path.split("/")[-3]
+ duration = soundfile.info(audio_path).duration
+ text = ""
+ json_lines.append(
+ json.dumps(
+ {
+ "utt": audio_id,
+ "utt2spk": str(utt2spk),
+ "feat": audio_path,
+ "feat_shape": (duration, ),
+ "text": text # compatible with asr data format
+ },
+ ensure_ascii=False))
+
+ total_sec += duration
+ total_text += len(text)
+ total_num += 1
+ speakers.add(utt2spk)
+
+    # data_dir_name refers to dev or test
+ # voxceleb1 is given explicit in the path
+ data_dir_name = Path(data_dir).name
+ manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+ with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
+ for line in json_lines:
+ f.write(line + "\n")
+
+ manifest_dir = os.path.dirname(manifest_path_prefix)
+ meta_path = os.path.join(manifest_dir, "voxceleb1." +
+ data_dir_name) + ".meta"
+ with codecs.open(meta_path, 'w', encoding='utf-8') as f:
+ print(f"{total_num} utts", file=f)
+ print(f"{len(speakers)} speakers", file=f)
+ print(f"{total_sec / (60 * 60)} h", file=f)
+ print(f"{total_text} text", file=f)
+ print(f"{total_text / total_sec} text/sec", file=f)
+ print(f"{total_sec / total_num} sec/utt", file=f)
+
+def prepare_dataset(base_url, data_list, target_dir, manifest_path,
+ target_data):
+ if not os.path.exists(target_dir):
+ os.mkdir(target_dir)
+
+    # if the wav directory already exists, nothing needs to be done
+ if not os.path.exists(os.path.join(target_dir, "wav")):
+ # download all dataset part
+ for zip_part in data_list.keys():
+ download_url = " --no-check-certificate " + base_url + "/" + zip_part
+ download(
+ url=download_url,
+ md5sum=data_list[zip_part],
+ target_dir=target_dir)
+
+        # concatenate all downloaded parts into the target zip file
+ all_target_part, target_name, target_md5sum = target_data.split()
+ target_name = os.path.join(target_dir, target_name)
+ if not os.path.exists(target_name):
+ pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
+ target_name)
+ subprocess.call(pack_part_cmd, shell=True)
+
+ # check the target zip file md5sum
+ if not check_md5sum(target_name, target_md5sum):
+            raise RuntimeError("{} MD5 checksum failed".format(target_name))
+ else:
+ print("Check {} md5sum successfully".format(target_name))
+
+        # unzip the packed zip file
+ if target_name.endswith(".zip"):
+ unzip(target_name, target_dir)
+
+ # create the manifest file
+ create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
+
+def main():
+ if args.target_dir.startswith('~'):
+ args.target_dir = os.path.expanduser(args.target_dir)
+
+ prepare_dataset(
+ base_url=BASE_URL,
+ data_list=DEV_LIST,
+ target_dir=os.path.join(args.target_dir, "dev"),
+ manifest_path=args.manifest_prefix,
+ target_data=DEV_TARGET_DATA)
+
+ prepare_dataset(
+ base_url=BASE_URL,
+ data_list=TEST_LIST,
+ target_dir=os.path.join(args.target_dir, "test"),
+ manifest_path=args.manifest_prefix,
+ target_data=TEST_TARGET_DATA)
+
+ print("Manifest prepare done!")
+
+if __name__ == '__main__':
+ main()
diff --git a/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png b/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png
new file mode 100644
index 00000000..16f8ddcc
Binary files /dev/null and b/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png differ
diff --git a/docs/images/arch/PaddleSpeech_Server_class_diagram.png b/docs/images/arch/PaddleSpeech_Server_class_diagram.png
new file mode 100644
index 00000000..0c3daddd
Binary files /dev/null and b/docs/images/arch/PaddleSpeech_Server_class_diagram.png differ
diff --git a/docs/images/arch/paddlespeech_high_layout.jpg b/docs/images/arch/paddlespeech_high_layout.jpg
new file mode 100644
index 00000000..f726aa2e
Binary files /dev/null and b/docs/images/arch/paddlespeech_high_layout.jpg differ
diff --git a/docs/source/asr/augmentation.md b/docs/source/asr/augmentation.md
deleted file mode 100644
index 8e65cb19..00000000
--- a/docs/source/asr/augmentation.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# Data Augmentation Pipeline
-
-Data augmentation has often been a highly effective technique to boost deep learning performance. We augment our speech data by synthesizing new audios with small random perturbation (label-invariant transformation) added upon raw audios. You don't have to do the syntheses on your own, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training.
-
-Six optional augmentation components are provided to be selected, configured, and inserted into the processing pipeline.
-
-* Audio
- - Volume Perturbation
- - Speed Perturbation
- - Shifting Perturbation
- - Online Bayesian normalization
- - Noise Perturbation (need background noise audio files)
- - Impulse Response (need impulse audio files)
-
-* Feature
- - SpecAugment
- - Adaptive SpecAugment
-
-To inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. For example:
-
-```
-[{
- "type": "speed",
- "params": {"min_speed_rate": 0.95,
- "max_speed_rate": 1.05},
- "prob": 0.6
-},
-{
- "type": "shift",
- "params": {"min_shift_ms": -5,
- "max_shift_ms": 5},
- "prob": 0.8
-}]
-```
-
-When the `augment_conf_file` argument is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with 60% of chance, it will first be speed perturbed with a uniformly random sampled speed-rate between 0.95 and 1.05, and then with 80% of chance it will be shifted in time with a randomly sampled offset between -5 ms and 5 ms. Finally, this newly synthesized audio clip will be fed into the feature extractor for further training.
-
-For other configuration examples, please refer to `examples/conf/augmentation.example.json`.
-
-Be careful when utilizing the data augmentation technique, as improper augmentation will harm the training, due to the enlarged train-test gap.
diff --git a/docs/source/asr/models_introduction.md b/docs/source/asr/models_introduction.md
index d82e12c0..56d58197 100644
--- a/docs/source/asr/models_introduction.md
+++ b/docs/source/asr/models_introduction.md
@@ -38,7 +38,7 @@ vi examples/librispeech/s0/data/vocab.txt
```
#### CMVN
-For CMVN, a subset of the full of the training set is selected and be used to compute the feature mean and std.
+For CMVN, a subset of or full of the training set is selected and be used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
diff --git a/docs/source/demo_video.rst b/docs/source/demo_video.rst
new file mode 100644
index 00000000..dc7e718a
--- /dev/null
+++ b/docs/source/demo_video.rst
@@ -0,0 +1,13 @@
+Demo Video
+==================
+
+.. raw:: html
+
+
+
+
+
+ Sorry, your browser doesn't support embedded videos.
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5bbc9319..7f9c87bd 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -27,7 +27,6 @@ Contents
asr/models_introduction
asr/data_preparation
- asr/augmentation
asr/feature_list
asr/ngram_lm
@@ -42,6 +41,7 @@ Contents
tts/gan_vocoder
tts/demo
tts/demo_2
+
.. toctree::
:maxdepth: 1
@@ -51,12 +51,14 @@ Contents
.. toctree::
:maxdepth: 1
- :caption: Acknowledgement
-
- asr/reference
-
-
+ :caption: Demos
+ demo_video
+ tts_demo_video
+.. toctree::
+ :maxdepth: 1
+ :caption: Acknowledgement
+ asr/reference
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 3310bfb2..8f855f7c 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,3 +1,4 @@
+
# Released Models
## Speech-to-Text Models
@@ -9,9 +10,10 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
-[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1)
-[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1)
-[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2)
+[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
+[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
+[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1)
+[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2)
### Language Model based on NGram
Language Model | Training Data | Token-based | Size | Descriptions
@@ -31,14 +33,15 @@ Language Model | Training Data | Token-based | Size | Descriptions
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
+Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
+Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
-FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static)
@@ -51,12 +54,14 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
+WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
+
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip)
-GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip)
+GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip)
GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
@@ -65,7 +70,7 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams)
-PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz)
+PANN | ESC-50 |[pann-esc50](../../examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz)
## Punctuation Restoration Models
Model Type | Dataset| Example Link | Pretrained Models
diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md
index 3de8901b..835db08e 100644
--- a/docs/source/tts/README.md
+++ b/docs/source/tts/README.md
@@ -71,7 +71,3 @@ Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
#### GE2E
1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
-
-## License
-
-Parakeet is provided under the [Apache-2.0 license](LICENSE).
diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md
index 3180d80a..bddee778 100644
--- a/docs/source/tts/quick_start.md
+++ b/docs/source/tts/quick_start.md
@@ -1,3 +1,4 @@
+([简体中文](./quick_start_cn.md)|English)
# Quick Start of Text-to-Speech
The examples in PaddleSpeech are mainly classified by datasets, the TTS datasets we mainly used are:
* CSMCS (Mandarin single speaker)
diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md
new file mode 100644
index 00000000..37246e84
--- /dev/null
+++ b/docs/source/tts/quick_start_cn.md
@@ -0,0 +1,205 @@
+(简体中文|[English](./quick_start.md))
+# 语音合成快速开始
+这些PaddleSpeech中的样例主要按数据集分类,我们主要使用的TTS数据集有:
+
+* CSMSC (普通话单发音人)
+* AISHELL3 (普通话多发音人)
+* LJSpeech (英文单发音人)
+* VCTK (英文多发音人)
+
+PaddleSpeech 的 TTS 模型具有以下映射关系:
+
+* tts0 - Tacotron2
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
+* voc4 - Style MelGAN
+* voc5 - HiFiGAN
+* vc0 - Tacotron2 Voice Clone with GE2E
+* vc1 - FastSpeech2 Voice Clone with GE2E
+
+## 快速开始
+
+让我们以 FastSpeech2 + Parallel WaveGAN 和 CSMSC 数据集 为例. [examples/csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc)
+
+### 用 CSMSC 数据集训练 Parallel WaveGAN
+
+- 进入目录
+ ```bash
+ cd examples/csmsc/voc1
+ ```
+- 设置环境变量
+ ```bash
+ source path.sh
+ ```
+ **在你开始做任何事情之前,必须先做这步**
+ 将 `MAIN_ROOT` 设置为项目目录. 使用 `parallelwave_gan` 模型作为 `MODEL`.
+
+- 运行
+ ```bash
+ bash run.sh
+ ```
+ 这只是一个演示,请确保源数据已经准备好,并且在下一个 `step` 之前每个 `step` 都运行正常.
+### 用CSMSC数据集训练FastSpeech2
+
+- 进入目录
+ ```bash
+ cd examples/csmsc/tts3
+ ```
+
+- 设置环境变量
+ ```bash
+ source path.sh
+ ```
+ **在你开始做任何事情之前,必须先做这步**
+ 将 `MAIN_ROOT` 设置为项目目录. 使用 `fastspeech2` 模型作为 `MODEL` 。
+
+- 运行
+ ```bash
+ bash run.sh
+ ```
+ 这只是一个演示,请确保源数据已经准备好,并且在下一个 `step` 之前每个 `step` 都运行正常。
+
+`run.sh` 中主要包括以下步骤:
+
+- 设置路径。
+- 预处理数据集,
+- 训练模型。
+- 从 `metadata.jsonl` 中合成波形
+- 从文本文件合成波形。(在声学模型中)
+- 使用静态模型进行推理。(可选)
+
+有关更多详细信息,请参见 examples 中的 `README.md`
+
+## TTS 流水线
+本节介绍如何使用 TTS 提供的预训练模型,并对其进行推理。
+
+TTS中的预训练模型在压缩包中提供。将其解压缩以获得如下文件夹:
+**Acoustic Models:**
+
+```text
+checkpoint_name
+├── default.yaml
+├── snapshot_iter_*.pdz
+├── speech_stats.npy
+├── phone_id_map.txt
+├── spk_id_map.txt (optional)
+└── tone_id_map.txt (optional)
+```
+**Vocoders:**
+```text
+checkpoint_name
+├── default.yaml
+├── snapshot_iter_*.pdz
+└── stats.npy
+```
+- `default.yaml` 存储用于训练模型的配置。
+- `snapshot_iter_*.pdz` 是检查点文件,其中`*`是它经过训练的步骤。
+- `*_stats.npy` 是特征的统计文件,如果它在训练前已被标准化。
+- `phone_id_map.txt` 是音素到音素 ID 的映射关系。
+- `tone_id_map.txt` 是在训练声学模型之前分割音调和拼音时,音调到音调 ID 的映射关系。(例如在 csmsc/speedyspeech 的示例中)
+- `spk_id_map.txt` 是多发音人声学模型中 "发音人" 到 "spk_ids" 的映射关系。
+
+下面的示例代码显示了如何使用模型进行预测。
+### Acoustic Models 声学模型(文本到频谱图)
+下面的代码显示了如何使用 `FastSpeech2` 模型。加载预训练模型后,使用它和 normalizer 对象构建预测对象,然后使用 `fastspeech2_inference(phone_ids)` 生成频谱图,频谱图可进一步用于使用声码器合成原始音频。
+
+```python
+from pathlib import Path
+import numpy as np
+import paddle
+import yaml
+from yacs.config import CfgNode
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
+from paddlespeech.t2s.modules.normalizer import ZScore
+# examples/fastspeech2/baker/frontend.py
+from frontend import Frontend
+
+# 加载预训练模型
+checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
+with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+vocab_size = len(phn_id)
+with open(checkpoint_dir / "default.yaml") as f:
+ fastspeech2_config = CfgNode(yaml.safe_load(f))
+odim = fastspeech2_config.n_mels
+model = FastSpeech2(
+ idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+model.set_state_dict(
+ paddle.load(args.fastspeech2_checkpoint)["main_params"])
+model.eval()
+
+# 加载特征文件
+stat = np.load(checkpoint_dir / "speech_stats.npy")
+mu, std = stat
+mu = paddle.to_tensor(mu)
+std = paddle.to_tensor(std)
+fastspeech2_normalizer = ZScore(mu, std)
+
+# 构建预测对象
+fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
+
+# load Chinese Frontend
+frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
+
+# 构建一个中文前端
+sentence = "你好吗?"
+input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
+phone_ids = input_ids["phone_ids"]
+flags = 0
+# 构建预测对象加载中文前端,对中文文本前端的输出进行分段
+for part_phone_ids in phone_ids:
+ with paddle.no_grad():
+ temp_mel = fastspeech2_inference(part_phone_ids)
+ if flags == 0:
+ mel = temp_mel
+ flags = 1
+ else:
+ mel = paddle.concat([mel, temp_mel])
+```
+
+### Vocoder 声码器(谱图到波形)
+下面的代码显示了如何使用 `Parallel WaveGAN` 模型。像上面的例子一样,加载预训练模型后,使用它和 normalizer 对象构建预测对象,然后使用 `pwg_inference(mel)` 生成原始音频( wav 格式)。
+
+```python
+from pathlib import Path
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from yacs.config import CfgNode
+from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
+from paddlespeech.t2s.models.parallel_wavegan import PWGInference
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+# 加载预训练模型
+checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
+with open(checkpoint_dir / "pwg_default.yaml") as f:
+ pwg_config = CfgNode(yaml.safe_load(f))
+vocoder = PWGGenerator(**pwg_config["generator_params"])
+vocoder.set_state_dict(paddle.load(args.pwg_params))
+vocoder.remove_weight_norm()
+vocoder.eval()
+
+# 加载特征文件
+stat = np.load(checkpoint_dir / "pwg_stats.npy")
+mu, std = stat
+mu = paddle.to_tensor(mu)
+std = paddle.to_tensor(std)
+pwg_normalizer = ZScore(mu, std)
+
+# 加载预训练模型构造预测对象
+pwg_inference = PWGInference(pwg_normalizer, vocoder)
+
+# 频谱图到波形
+wav = pwg_inference(mel)
+sf.write(
+ audio_path,
+ wav.numpy(),
+ samplerate=fastspeech2_config.fs)
+```
diff --git a/docs/source/tts/tts_datasets.md b/docs/source/tts/tts_datasets.md
new file mode 100644
index 00000000..a79981df
--- /dev/null
+++ b/docs/source/tts/tts_datasets.md
@@ -0,0 +1,75 @@
+# TTS Datasets
+
+## Mandarin
+- [CSMSC](https://www.data-baker.com/open_source.html): Chinese Standard Mandarin Speech Corpus
+ - Duration/h: 12
+ - Number of Sentences: 10,000
+ - Size: 2.14GB
+ - Speaker: 1 female, ages 20 ~30
+ - Sample Rate: 48 kHz、16bit
+ - Mean Words per Clip: 16
+- [AISHELL-3](http://www.aishelltech.com/aishell_3)
+ - Duration/h: 85
+ - Number of Sentences: 88,035
+ - Size: 17.75GB
+ - Speaker: 218
+ - Sample Rate: 44.1 kHz、16bit
+
+## English
+- [LJSpeech](https://keithito.com/LJ-Speech-Dataset/)
+ - Duration/h: 24
+ - Number of Sentences: 13,100
+ - Size: 2.56GB
+ - Speaker: 1, age 20 ~30
+ - Sample Rate: 22050 Hz、16bit
+ - Mean Words per Clip: 17.23
+- [VCTK](https://datashare.ed.ac.uk/handle/10283/3443)
+ - Number of Sentences: 44,583
+ - Size: 10.94GB
+ - Speaker: 110
+ - Sample Rate: 48 kHz、16bit
+ - Mean Words per Clip: 17.23
+
+## Japanese
+
+
+- [tri-jek](https://sites.google.com/site/shinnosuketakamichi/research-topics/tri-jek_corpus): Japanese-English-Korean tri-lingual corpus
+- [JSSS-misc](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss-misc_corpus): misc tasks of JSSS corpus
+- [JTubeSpeech](https://github.com/sarulab-speech/jtubespeech): Corpus of Japanese speech collected from YouTube
+- [J-MAC](https://sites.google.com/site/shinnosuketakamichi/research-topics/j-mac_corpus): Japanese multi-speaker audiobook corpus
+- [J-KAC](https://sites.google.com/site/shinnosuketakamichi/research-topics/j-kac_corpus): Japanese Kamishibai and audiobook corpus
+- [JMD](https://sites.google.com/site/shinnosuketakamichi/research-topics/jmd_corpus): Japanese multi-dialect corpus
+- [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese multi-style (summarization and simplification) corpus
+- [RWCP-SSD-Onomatopoeia](https://www.ksuke.net/dataset/rwcp-ssd-onomatopoeia): onomatopoeic word dataset for environmental sounds
+- [Life-m](https://sites.google.com/site/shinnosuketakamichi/research-topics/life-m_corpus): landmark image-themed music corpus
+- [PJS](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus): Phoneme-balanced Japanese singing voice corpus
+- [JVS-MuSiC](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music): Japanese multi-speaker singing-voice corpus
+- [JVS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus): Japanese multi-speaker voice corpus
+- [JSUT-book](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-book): audiobook corpus by a single Japanese speaker
+- [JSUT-vi](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-vi): vocal imitation corpus by a single Japanese speaker
+- [JSUT-song](https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song): singing voice corpus by a single Japanese singer
+- [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): a large-scaled corpus of reading-style Japanese speech by a single speaker
+
+## Emotions
+### English
+- [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D)
+- [Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset](https://kunzhou9646.github.io/controllable-evc/)
+ - paper : [Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset](https://arxiv.org/abs/2010.14794)
+### Mandarin
+- [EMOVIE Dataset](https://viem-ccy.github.io/EMOVIE/dataset_release )
+ - paper: [EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model](https://arxiv.org/abs/2106.09317)
+- MASC
+ - paper: [MASC: A Speech Corpus in Mandarin for Emotion Analysis and Affective Speaker Recognition](https://ieeexplore.ieee.org/document/4013501)
+### English && Mandarin
+- [Emotional Voice Conversion: Theory, Databases and ESD](https://github.com/HLTSingapore/Emotional-Speech-Data)
+ - paper: [Emotional Voice Conversion: Theory, Databases and ESD](https://arxiv.org/abs/2105.14762)
+
+## Music
+- [GiantMIDI-Piano](https://github.com/bytedance/GiantMIDI-Piano)
+- [MAESTRO Dataset](https://magenta.tensorflow.org/datasets/maestro)
+ - [tf code](https://www.tensorflow.org/tutorials/audio/music_generation)
+- [Opencpop](https://wenet.org.cn/opencpop/)
diff --git a/docs/source/tts_demo_video.rst b/docs/source/tts_demo_video.rst
new file mode 100644
index 00000000..4f807165
--- /dev/null
+++ b/docs/source/tts_demo_video.rst
@@ -0,0 +1,12 @@
+TTS Demo Video
+==================
+
+.. raw:: html
+
+
+
+
+ Sorry, your browser doesn't support embedded videos.
+
+
diff --git a/docs/tutorial/asr/tutorial_deepspeech2.ipynb b/docs/tutorial/asr/tutorial_deepspeech2.ipynb
index 86790473..34c0090a 100644
--- a/docs/tutorial/asr/tutorial_deepspeech2.ipynb
+++ b/docs/tutorial/asr/tutorial_deepspeech2.ipynb
@@ -265,7 +265,7 @@
},
"outputs": [],
"source": [
- "!pip install --upgrade pip && pip install paddlespeech"
+ "!pip install --upgrade pip && pip install paddlespeech==0.1.0"
]
},
{
diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb
index c9eb5ebb..dc303006 100644
--- a/docs/tutorial/asr/tutorial_transformer.ipynb
+++ b/docs/tutorial/asr/tutorial_transformer.ipynb
@@ -138,7 +138,7 @@
},
"outputs": [],
"source": [
- "!pip install --upgrade pip && pip install paddlespeech"
+ "!pip install --upgrade pip && pip install paddlespeech==0.1.0"
]
},
{
diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
index 9de06711..c07bc77e 100644
--- a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
+++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
@@ -1,10 +1,10 @@
-chunk_batch_size: 32
+decode_batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
-beam_size: 300
+beam_size: 500
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 2538e8f9..281ad836 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -257,6 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \
- --spk_id=0
+ --spk_id=0 \
+ --inference_dir=exp/default/inference
```
diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml
index 3a57e902..ac495674 100644
--- a/examples/aishell3/tts3/conf/default.yaml
+++ b/examples/aishell3/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
@@ -84,7 +84,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
-
###########################################################
# OPTIMIZER SETTING #
###########################################################
diff --git a/examples/aishell3/tts3/local/inference.sh b/examples/aishell3/tts3/local/inference.sh
new file mode 100755
index 00000000..3b03b53c
--- /dev/null
+++ b/examples/aishell3/tts3/local/inference.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_aishell3 \
+ --voc=pwgan_aishell3 \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --speaker_dict=dump/speaker_id_map.txt \
+ --spk_id=0
+fi
+
diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh
index d0d92585..60e1a5ce 100755
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
- --spk_id=0
+ --spk_id=0 \
+ --inference_dir=${train_output_path}/inference
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
index 91d32619..664ec1ac 100644
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@@ -1,94 +1,140 @@
# Tacotron2 + AISHELL-3 Voice Cloning
-This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
-1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2 because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
-2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 which will be concated with encoder outputs.
-3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
+1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `Tacotron2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
+2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `Tacotron2` which will be concated with encoder outputs.
+3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural Vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1).
+
+## Dataset
+### Download and Extract
+Download AISHELL-3.
+```bash
+wget https://www.openslr.org/resources/93/data_aishell3.tgz
+```
+Extract AISHELL-3.
+```bash
+mkdir data_aishell3
+tar zxvf data_aishell3.tgz -C data_aishell3
+```
+### Get MFA Result and Extract
+We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here.
+You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
+
+## Pretrained GE2E Model
+We use pretrained GE2E model to generate speaker embedding for each sentence.
+
+Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
-Assume the path to the MFA result of AISHELL-3 is `./alignment`.
-Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
+Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
+Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`.
+
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
-4. start a voice cloning inference.
+4. synthesize waveform from `metadata.jsonl`.
+5. start a voice cloning inference.
```bash
./run.sh
```
-You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, run the following command will only preprocess the dataset.
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path}
```
-#### Generate Speaker Embedding
- Use pretrained GE2E (speaker encoder) to generate speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`.
-
-```bash
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- python3 ${BIN_DIR}/../ge2e/inference.py \
- --input=${input} \
- --output=${preprocess_path}/embed \
- --ngpu=1 \
- --checkpoint_path=${ge2e_ckpt_path}
-fi
+When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── embed
+│ ├── SSB0005
+│ ├── SSB0009
+│ ├── ...
+│ └── ...
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── speech_stats.npy
```
+The `embed` contains the generated speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`.
The computing time of utterance embedding can be x hours.
-#### Process Wav
-There is silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence.
-We use Montreal Force Aligner 1.0. The label in aishell3 includes pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You should preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`.
+The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
-We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
-
-You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and id of each utterance.
+The preprocessing step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but there is one more `ge2e/inference` step here.
+### Model Training
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- echo "Process wav ..."
- python3 ${BIN_DIR}/process_wav.py \
- --input=${input}/wav \
- --output=${preprocess_path}/normalized_wav \
- --alignment=${alignment}
-fi
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
+The training step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`.
-#### Preprocess Transcription
-We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels.
-
+### Synthesizing
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- python3 ${BIN_DIR}/preprocess_transcription.py \
- --input=${input} \
- --output=${preprocess_path}
-fi
+unzip pwg_aishell3_ckpt_0.5.zip
```
-The default input is `~/datasets/data_aishell3/train`,which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading.
-#### Extract Mel
-```python
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- python3 ${BIN_DIR}/extract_mel.py \
- --input=${preprocess_path}/normalized_wav \
- --output=${preprocess_path}/mel
-fi
+Parallel WaveGAN checkpoint contains files listed below.
+```text
+pwg_aishell3_ckpt_0.5
+├── default.yaml # default config used to train parallel wavegan
+├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
-
-### Model Training
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
+The synthesizing step is very similar to that one of [tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/../synthesize.py`.
-Our model removes stop token prediction in Tacotron2, because of the problem of the extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition.
-
-In addition, to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster.
### Voice Cloning
+Assume there are some reference audios in `./ref_audio`
+```text
+ref_audio
+├── 001238.wav
+├── LJ015-0254.wav
+└── audio_self_test.mp3
+```
+`./local/voice_cloning.sh` calls `${BIN_DIR}/../voice_cloning.py`
+
```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```
+
## Pretrained Model
-[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip).
+[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip)
+
+
+Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 2(gpu) x 37596|0.58704|0.39623|0.15073|0.039|1.9981e-04|
+
+Tacotron2 checkpoint contains files listed below.
+(There is no need for `speaker_id_map.txt` here )
+
+```text
+tacotron2_aishell3_ckpt_vc0_0.2.0
+├── default.yaml # default config used to train tacotron2
+├── phone_id_map.txt # phone vocabulary file when training tacotron2
+├── snapshot_iter_37596.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training tacotron2
+```
+
+## More
+We strongly recommend that you use [FastSpeech2 + AISHELL-3 Voice Cloning](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1) which works better.
diff --git a/examples/aishell3/vc0/conf/default.yaml b/examples/aishell3/vc0/conf/default.yaml
new file mode 100644
index 00000000..26096eb2
--- /dev/null
+++ b/examples/aishell3/vc0/conf/default.yaml
@@ -0,0 +1,86 @@
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size (samples).
+n_shift: 300 # Hop size (samples). 12.5ms
+win_length: 1200 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 64
+num_workers: 2
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model: # keyword arguments for the selected model
+ embed_dim: 512 # char or phn embedding dimension
+ elayers: 1 # number of blstm layers in encoder
+ eunits: 512 # number of blstm units
+ econv_layers: 3 # number of convolutional layers in encoder
+ econv_chans: 512 # number of channels in convolutional layer
+ econv_filts: 5 # filter size of convolutional layer
+ atype: location # attention function type
+ adim: 512 # attention dimension
+ aconv_chans: 32 # number of channels in convolutional layer of attention
+ aconv_filts: 15 # filter size of convolutional layer of attention
+ cumulate_att_w: True # whether to cumulate attention weight
+ dlayers: 2 # number of lstm layers in decoder
+ dunits: 1024 # number of lstm units in decoder
+ prenet_layers: 2 # number of layers in prenet
+ prenet_units: 256 # number of units in prenet
+ postnet_layers: 5 # number of layers in postnet
+ postnet_chans: 512 # number of channels in postnet
+ postnet_filts: 5 # filter size of postnet layer
+ output_activation: null # activation function for the final output
+ use_batch_norm: True # whether to use batch normalization in encoder
+ use_concate: True # whether to concatenate encoder embedding with decoder outputs
+ use_residual: False # whether to use residual connection in encoder
+ dropout_rate: 0.5 # dropout rate
+ zoneout_rate: 0.1 # zoneout rate
+ reduction_factor: 1 # reduction factor
+ spk_embed_dim: 256 # speaker embedding dimension
+ spk_embed_integration_type: concat # how to integrate speaker embedding
+
+
+###########################################################
+# UPDATER SETTING #
+###########################################################
+updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+ bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
+ use_guided_attn_loss: True # whether to use guided attention loss
+ guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
+ guided_attn_loss_lambda: 1.0 # strength of guided attention loss
+
+
+##########################################################
+# OPTIMIZER SETTING #
+##########################################################
+optimizer:
+ optim: adam # optimizer type
+ learning_rate: 1.0e-03 # learning rate
+ epsilon: 1.0e-06 # epsilon
+ weight_decay: 0.0 # weight decay coefficient
+
+###########################################################
+# TRAINING SETTING #
+###########################################################
+max_epoch: 100
+num_snapshots: 5
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+seed: 42
\ No newline at end of file
diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh
index 5bf88066..069cf94c 100755
--- a/examples/aishell3/vc0/local/preprocess.sh
+++ b/examples/aishell3/vc0/local/preprocess.sh
@@ -1,36 +1,72 @@
#!/bin/bash
-stage=0
+stage=0
stop_stage=100
-input=$1
-preprocess_path=$2
-alignment=$3
-ge2e_ckpt_path=$4
+config_path=$1
+ge2e_ckpt_path=$2
+# gen speaker embedding
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
- --input=${input}/wav \
- --output=${preprocess_path}/embed \
+ --input=~/datasets/data_aishell3/train/wav/ \
+ --output=dump/embed \
--checkpoint_path=${ge2e_ckpt_path}
fi
+# copy from tts3/preprocess
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- echo "Process wav ..."
- python3 ${BIN_DIR}/process_wav.py \
- --input=${input}/wav \
- --output=${preprocess_path}/normalized_wav \
- --alignment=${alignment}
+ # get durations from MFA's result
+ echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+ --inputdir=./aishell3_alignment_tone \
+ --output durations.txt \
+ --config=${config_path}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- python3 ${BIN_DIR}/preprocess_transcription.py \
- --input=${input} \
- --output=${preprocess_path}
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=aishell3 \
+ --rootdir=~/datasets/data_aishell3/ \
+ --dumpdir=dump \
+ --dur-file=durations.txt \
+ --config=${config_path} \
+ --num-cpu=20 \
+ --cut-sil=True \
+ --spk_emb_dir=dump/embed
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- python3 ${BIN_DIR}/extract_mel.py \
- --input=${preprocess_path}/normalized_wav \
- --output=${preprocess_path}/mel
+ # get features' stats(mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="speech"
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # normalize and covert phone to id, dev and test should use train's stats
+ echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
fi
diff --git a/examples/aishell3/vc0/local/synthesize.sh b/examples/aishell3/vc0/local/synthesize.sh
new file mode 100755
index 00000000..98430280
--- /dev/null
+++ b/examples/aishell3/vc0/local/synthesize.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize.py \
+ --am=tacotron2_aishell3 \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_aishell3 \
+ --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+ --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+ --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --speaker_dict=dump/speaker_id_map.txt \
+ --voice-cloning=True
diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh
index f062cbbf..c775fcad 100755
--- a/examples/aishell3/vc0/local/train.sh
+++ b/examples/aishell3/vc0/local/train.sh
@@ -1,9 +1,13 @@
#!/bin/bash
-preprocess_path=$1
+config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
- --data=${preprocess_path} \
- --output=${train_output_path} \
- --ngpu=1
\ No newline at end of file
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=2 \
+ --phones-dict=dump/phone_id_map.txt \
+ --voice-cloning=True
\ No newline at end of file
diff --git a/examples/aishell3/vc0/local/voice_cloning.sh b/examples/aishell3/vc0/local/voice_cloning.sh
index 3fe3de76..79831f3f 100755
--- a/examples/aishell3/vc0/local/voice_cloning.sh
+++ b/examples/aishell3/vc0/local/voice_cloning.sh
@@ -1,14 +1,24 @@
#!/bin/bash
-ge2e_params_path=$1
-tacotron2_params_path=$2
-waveflow_params_path=$3
-vc_input=$4
-vc_output=$5
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+ge2e_params_path=$4
+ref_audio_dir=$5
-python3 ${BIN_DIR}/voice_cloning.py \
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../voice_cloning.py \
+ --am=tacotron2_aishell3 \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_aishell3 \
+ --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+ --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+ --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--ge2e_params_path=${ge2e_params_path} \
- --tacotron2_params_path=${tacotron2_params_path} \
- --waveflow_params_path=${waveflow_params_path} \
- --input-dir=${vc_input} \
- --output-dir=${vc_output}
\ No newline at end of file
+ --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
+ --input-dir=${ref_audio_dir} \
+ --output-dir=${train_output_path}/vc_syn \
+ --phones-dict=dump/phone_id_map.txt
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
index dfae49af..a37cd21e 100755
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
@@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-MODEL=voice_cloning/tacotron2_ge2e
+MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc0/run.sh b/examples/aishell3/vc0/run.sh
index 870360c1..64f4ee3b 100755
--- a/examples/aishell3/vc0/run.sh
+++ b/examples/aishell3/vc0/run.sh
@@ -3,25 +3,20 @@
set -e
source path.sh
-gpus=0
+gpus=0,1
stage=0
stop_stage=100
-input=~/datasets/data_aishell3/train
-preprocess_path=dump
-alignment=./alignment
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_482.pdz
+ref_audio_dir=ref_audio
# not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
-train_output_path=output
+
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
-tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
-# pretrained model
-# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
-waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
-vc_input=ref_audio
-vc_output=syn_audio
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
@@ -30,15 +25,20 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
- CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1
+ # synthesize, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
-
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} || exit -1
+fi
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
index d5745bc3..04b83a5f 100644
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@@ -1,4 +1,3 @@
-
# FastSpeech2 + AISHELL-3 Voice Cloning
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
@@ -114,7 +113,7 @@ ref_audio
├── LJ015-0254.wav
└── audio_self_test.mp3
```
-`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py`
+`./local/voice_cloning.sh` calls `${BIN_DIR}/../voice_cloning.py`
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml
index 557a5a0a..ac495674 100644
--- a/examples/aishell3/vc1/conf/default.yaml
+++ b/examples/aishell3/vc1/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
diff --git a/examples/aishell3/vc1/local/voice_cloning.sh b/examples/aishell3/vc1/local/voice_cloning.sh
index 6a50826e..2a8864ba 100755
--- a/examples/aishell3/vc1/local/voice_cloning.sh
+++ b/examples/aishell3/vc1/local/voice_cloning.sh
@@ -8,13 +8,15 @@ ref_audio_dir=$5
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/voice_cloning.py \
- --fastspeech2-config=${config_path} \
- --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
- --fastspeech2-stat=dump/train/speech_stats.npy \
- --pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
- --pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
- --pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+python3 ${BIN_DIR}/../voice_cloning.py \
+ --am=fastspeech2_aishell3 \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_aishell3 \
+ --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+ --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+ --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--ge2e_params_path=${ge2e_params_path} \
--text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
--input-dir=${ref_audio_dir} \
diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml
index 7fbffbdd..e2102d6e 100644
--- a/examples/aishell3/voc1/conf/default.yaml
+++ b/examples/aishell3/voc1/conf/default.yaml
@@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
- use_weight_norm: true # Whether to use weight norm.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Number of output channels.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of chnn layers.
- bias: true # Whether to use bias parameter in conv.
- use_weight_norm: true # Whether to use weight norm.
+ bias: True # Whether to use bias parameter in conv.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
diff --git a/examples/ami/README.md b/examples/ami/README.md
new file mode 100644
index 00000000..a038eaeb
--- /dev/null
+++ b/examples/ami/README.md
@@ -0,0 +1,3 @@
+# Speaker Diarization on AMI corpus
+
+* sd0 - speaker diarization by AHC/SC based on x-vectors
diff --git a/examples/ami/sd0/.gitignore b/examples/ami/sd0/.gitignore
new file mode 100644
index 00000000..872aa273
--- /dev/null
+++ b/examples/ami/sd0/.gitignore
@@ -0,0 +1 @@
+results
\ No newline at end of file
diff --git a/examples/ami/sd0/README.md b/examples/ami/sd0/README.md
new file mode 100644
index 00000000..ffe95741
--- /dev/null
+++ b/examples/ami/sd0/README.md
@@ -0,0 +1,13 @@
+# Speaker Diarization on AMI corpus
+
+## About the AMI corpus:
+"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See [ami overview](http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) for more details.
+
+## About the example
+The script performs diarization using x-vectors(TDNN,ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of different clustering methods: AHC, spectral.
+
+## How to Run
+Use the following command to run diarization on AMI corpus.
+`bash ./run.sh`
+
+## Results (DER) coming soon! :)
diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py
new file mode 100644
index 00000000..b7bb8e67
--- /dev/null
+++ b/examples/ami/sd0/local/ami_prepare.py
@@ -0,0 +1,572 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Data preparation.
+
+Download: http://groups.inf.ed.ac.uk/ami/download/
+
+Prepares metadata files (JSON) from manual annotations "segments/" using RTTM format (Oracle VAD).
+
+Authors
+ * qingenz123@126.com (Qingen ZHAO) 2022
+
+"""
+
+import os
+import logging
+import argparse
+import xml.etree.ElementTree as et
+import glob
+import json
+from ami_splits import get_AMI_split
+from distutils.util import strtobool
+
+from dataio import (
+ load_pkl,
+ save_pkl, )
+
+logger = logging.getLogger(__name__)
+SAMPLERATE = 16000
+
+
+def prepare_ami(
+ data_folder,
+ manual_annot_folder,
+ save_folder,
+ ref_rttm_dir,
+ meta_data_dir,
+ split_type="full_corpus_asr",
+ skip_TNO=True,
+ mic_type="Mix-Headset",
+ vad_type="oracle",
+ max_subseg_dur=3.0,
+ overlap=1.5, ):
+ """
+ Prepares reference RTTM and JSON files for the AMI dataset.
+
+ Arguments
+ ---------
+ data_folder : str
+ Path to the folder where the original amicorpus is stored.
+ manual_annot_folder : str
+ Directory where the manual annotations are stored.
+ save_folder : str
+ The save directory in results.
+ ref_rttm_dir : str
+ Directory to store reference RTTM files.
+ meta_data_dir : str
+ Directory to store the meta data (json) files.
+ split_type : str
+ Standard dataset split. See ami_splits.py for more information.
+ Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr"
+ skip_TNO: bool
+ Skips TNO meeting recordings if True.
+ mic_type : str
+ Type of microphone to be used.
+ vad_type : str
+ Type of VAD. Kept for future when VAD will be added.
+ max_subseg_dur : float
+ Duration in seconds of a subsegments to be prepared from larger segments.
+ overlap : float
+ Overlap duration in seconds between adjacent subsegments
+
+ Example
+ -------
+ >>> from dataset.ami.ami_prepare import prepare_ami
+ >>> data_folder = '/home/data/ami/amicorpus/'
+ >>> manual_annot_folder = '/home/data/ami/ami_public_manual/'
+    >>> save_folder = './results/'
+ >>> split_type = 'full_corpus_asr'
+ >>> mic_type = 'Mix-Headset'
+ >>> prepare_ami(data_folder, manual_annot_folder, save_folder, split_type, mic_type)
+ """
+
+ # Meta files
+ meta_files = [
+ os.path.join(meta_data_dir, "ami_train." + mic_type + ".subsegs.json"),
+ os.path.join(meta_data_dir, "ami_dev." + mic_type + ".subsegs.json"),
+ os.path.join(meta_data_dir, "ami_eval." + mic_type + ".subsegs.json"),
+ ]
+
+ # Create configuration for easily skipping data_preparation stage
+ conf = {
+ "data_folder": data_folder,
+ "save_folder": save_folder,
+ "ref_rttm_dir": ref_rttm_dir,
+ "meta_data_dir": meta_data_dir,
+ "split_type": split_type,
+ "skip_TNO": skip_TNO,
+ "mic_type": mic_type,
+ "vad": vad_type,
+ "max_subseg_dur": max_subseg_dur,
+ "overlap": overlap,
+ "meta_files": meta_files,
+ }
+
+ if not os.path.exists(save_folder):
+ os.makedirs(save_folder)
+
+ # Setting output option files.
+ opt_file = "opt_ami_prepare." + mic_type + ".pkl"
+
+ # Check if this phase is already done (if so, skip it)
+ if skip(save_folder, conf, meta_files, opt_file):
+ logger.info(
+ "Skipping data preparation, as it was completed in previous run.")
+ return
+
+ msg = "\tCreating meta-data file for the AMI Dataset.."
+ logger.debug(msg)
+
+ # Get the split
+ train_set, dev_set, eval_set = get_AMI_split(split_type)
+
+ # Prepare RTTM from XML(manual annot) and store are groundtruth
+ # Create ref_RTTM directory
+ if not os.path.exists(ref_rttm_dir):
+ os.makedirs(ref_rttm_dir)
+
+ # Create reference RTTM files
+ splits = ["train", "dev", "eval"]
+ for i in splits:
+ rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
+ if i == "train":
+ prepare_segs_for_RTTM(
+ train_set,
+ rttm_file,
+ data_folder,
+ manual_annot_folder,
+ i,
+ skip_TNO, )
+ if i == "dev":
+ prepare_segs_for_RTTM(
+ dev_set,
+ rttm_file,
+ data_folder,
+ manual_annot_folder,
+ i,
+ skip_TNO, )
+ if i == "eval":
+ prepare_segs_for_RTTM(
+ eval_set,
+ rttm_file,
+ data_folder,
+ manual_annot_folder,
+ i,
+ skip_TNO, )
+
+ # Create meta_files for splits
+ meta_data_dir = meta_data_dir
+ if not os.path.exists(meta_data_dir):
+ os.makedirs(meta_data_dir)
+
+ for i in splits:
+ rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
+ meta_filename_prefix = "ami_" + i
+ prepare_metadata(
+ rttm_file,
+ meta_data_dir,
+ data_folder,
+ meta_filename_prefix,
+ max_subseg_dur,
+ overlap,
+ mic_type, )
+
+ save_opt_file = os.path.join(save_folder, opt_file)
+ save_pkl(conf, save_opt_file)
+
+
+def get_RTTM_per_rec(segs, spkrs_list, rec_id):
+ """Prepares rttm for each recording
+ """
+
+ rttm = []
+
+ # Prepare header
+ for spkr_id in spkrs_list:
+        # e.g. SPKR-INFO ES2008c 0 <NA> <NA> <NA> unknown ES2008c.A_PM <NA> <NA>
+        line = ("SPKR-INFO " + rec_id + " 0 <NA> <NA> <NA> unknown " + spkr_id +
+                " <NA> <NA>")
+ rttm.append(line)
+
+ # Append remaining lines
+ for row in segs:
+        # e.g. SPEAKER ES2008c 0 37.880 0.590 <NA> <NA> ES2008c.A_PM <NA> <NA>
+
+ if float(row[1]) < float(row[0]):
+ msg1 = (
+ "Possibly Incorrect Annotation Found!! transcriber_start (%s) > transcriber_end (%s)"
+ % (row[0], row[1]))
+ msg2 = (
+ "Excluding this incorrect row from the RTTM : %s, %s, %s, %s" %
+ (rec_id, row[0], str(round(float(row[1]) - float(row[0]), 4)),
+ str(row[2]), ))
+ logger.info(msg1)
+ logger.info(msg2)
+ continue
+
+        line = ("SPEAKER " + rec_id + " 0 " + str(round(float(row[0]), 4)) + " "
+                + str(round(float(row[1]) - float(row[0]), 4)) + " <NA> <NA> " +
+                str(row[2]) + " <NA> <NA>")
+ rttm.append(line)
+
+ return rttm
+
+
+def prepare_segs_for_RTTM(list_ids, out_rttm_file, audio_dir, annot_dir,
+ split_type, skip_TNO):
+
+ RTTM = [] # Stores all RTTMs clubbed together for a given dataset split
+
+ for main_meet_id in list_ids:
+
+ # Skip TNO meetings from dev and eval sets
+ if (main_meet_id.startswith("TS") and split_type != "train" and
+ skip_TNO is True):
+ msg = ("Skipping TNO meeting in AMI " + str(split_type) + " set : "
+ + str(main_meet_id))
+ logger.info(msg)
+ continue
+
+ list_sessions = glob.glob(audio_dir + "/" + main_meet_id + "*")
+ list_sessions.sort()
+
+ for sess in list_sessions:
+ rec_id = os.path.basename(sess)
+ path = annot_dir + "/segments/" + rec_id
+ f = path + ".*.segments.xml"
+ list_spkr_xmls = glob.glob(f)
+ list_spkr_xmls.sort() # A, B, C, D, E etc (Speakers)
+ segs = []
+ spkrs_list = (
+ []) # Since non-scenario recordings contains 3-5 speakers
+
+ for spkr_xml_file in list_spkr_xmls:
+
+ # Speaker ID
+ spkr = os.path.basename(spkr_xml_file).split(".")[1]
+ spkr_ID = rec_id + "." + spkr
+ spkrs_list.append(spkr_ID)
+
+ # Parse xml tree
+ tree = et.parse(spkr_xml_file)
+ root = tree.getroot()
+
+ # Start, end and speaker_ID from xml file
+ segs = segs + [[
+ elem.attrib["transcriber_start"],
+ elem.attrib["transcriber_end"],
+ spkr_ID,
+ ] for elem in root.iter("segment")]
+
+ # Sort rows as per the start time (per recording)
+ segs.sort(key=lambda x: float(x[0]))
+
+ rttm_per_rec = get_RTTM_per_rec(segs, spkrs_list, rec_id)
+ RTTM = RTTM + rttm_per_rec
+
+ # Write one RTTM as groundtruth. For example, "fullref_eval.rttm"
+ with open(out_rttm_file, "w") as f:
+ for item in RTTM:
+ f.write("%s\n" % item)
+
+
+def is_overlapped(end1, start2):
+ """Returns True if the two segments overlap
+
+ Arguments
+ ---------
+ end1 : float
+ End time of the first segment.
+ start2 : float
+ Start time of the second segment.
+ """
+
+ if start2 > end1:
+ return False
+ else:
+ return True
+
+
+def merge_rttm_intervals(rttm_segs):
+ """Merges adjacent segments in rttm if they overlap.
+ """
+ # For one recording
+ # rec_id = rttm_segs[0][1]
+ rttm_segs.sort(key=lambda x: float(x[3]))
+
+ # first_seg = rttm_segs[0] # first interval.. as it is
+ merged_segs = [rttm_segs[0]]
+ strt = float(rttm_segs[0][3])
+ end = float(rttm_segs[0][3]) + float(rttm_segs[0][4])
+
+ for row in rttm_segs[1:]:
+ s = float(row[3])
+ e = float(row[3]) + float(row[4])
+
+ if is_overlapped(end, s):
+ # Update only end. The strt will be same as in last segment
+ # Just update last row in the merged_segs
+ end = max(end, e)
+ merged_segs[-1][3] = str(round(strt, 4))
+ merged_segs[-1][4] = str(round((end - strt), 4))
+ merged_segs[-1][7] = "overlap" # previous_row[7] + '-'+ row[7]
+ else:
+ # Add a new disjoint segment
+ strt = s
+ end = e
+ merged_segs.append(row) # this will have 1 spkr ID
+
+ return merged_segs
+
+
+def get_subsegments(merged_segs, max_subseg_dur=3.0, overlap=1.5):
+ """Divides bigger segments into smaller sub-segments
+ """
+
+ shift = max_subseg_dur - overlap
+ subsegments = []
+
+ # These rows are in RTTM format
+ for row in merged_segs:
+ seg_dur = float(row[4])
+ rec_id = row[1]
+
+ if seg_dur > max_subseg_dur:
+ num_subsegs = int(seg_dur / shift)
+ # Taking 0.01 sec as small step
+ seg_start = float(row[3])
+ seg_end = seg_start + seg_dur
+
+ # Now divide this segment (new_row) in smaller subsegments
+ for i in range(num_subsegs):
+ subseg_start = seg_start + i * shift
+ subseg_end = min(subseg_start + max_subseg_dur - 0.01, seg_end)
+ subseg_dur = subseg_end - subseg_start
+
+ new_row = [
+ "SPEAKER",
+ rec_id,
+ "0",
+ str(round(float(subseg_start), 4)),
+ str(round(float(subseg_dur), 4)),
+                    "<NA>",
+                    "<NA>",
+                    row[7],
+                    "<NA>",
+                    "<NA>",
+ ]
+
+ subsegments.append(new_row)
+
+ # Break if exceeding the boundary
+ if subseg_end >= seg_end:
+ break
+ else:
+ subsegments.append(row)
+
+ return subsegments
+
+
+def prepare_metadata(rttm_file, save_dir, data_dir, filename, max_subseg_dur,
+ overlap, mic_type):
+ # Read RTTM, get unique meeting_IDs (from RTTM headers)
+ # For each MeetingID. select that meetID -> merge -> subsegment -> json -> append
+
+ # Read RTTM
+ RTTM = []
+ with open(rttm_file, "r") as f:
+ for line in f:
+ entry = line[:-1]
+ RTTM.append(entry)
+
+ spkr_info = filter(lambda x: x.startswith("SPKR-INFO"), RTTM)
+ rec_ids = list(set([row.split(" ")[1] for row in spkr_info]))
+ rec_ids.sort() # sorting just to make JSON look in proper sequence
+
+ # For each recording merge segments and then perform subsegmentation
+ MERGED_SEGMENTS = []
+ SUBSEGMENTS = []
+ for rec_id in rec_ids:
+ segs_iter = filter(lambda x: x.startswith("SPEAKER " + str(rec_id)),
+ RTTM)
+ gt_rttm_segs = [row.split(" ") for row in segs_iter]
+
+ # Merge, subsegment and then convert to json format.
+ merged_segs = merge_rttm_intervals(
+ gt_rttm_segs) # We lose speaker_ID after merging
+ MERGED_SEGMENTS = MERGED_SEGMENTS + merged_segs
+
+ # Divide segments into smaller sub-segments
+ subsegs = get_subsegments(merged_segs, max_subseg_dur, overlap)
+ SUBSEGMENTS = SUBSEGMENTS + subsegs
+
+ # Write segment AND sub-segments (in RTTM format)
+ segs_file = save_dir + "/" + filename + ".segments.rttm"
+ subsegment_file = save_dir + "/" + filename + ".subsegments.rttm"
+
+ with open(segs_file, "w") as f:
+ for row in MERGED_SEGMENTS:
+ line_str = " ".join(row)
+ f.write("%s\n" % line_str)
+
+ with open(subsegment_file, "w") as f:
+ for row in SUBSEGMENTS:
+ line_str = " ".join(row)
+ f.write("%s\n" % line_str)
+
+ # Create JSON from subsegments
+ json_dict = {}
+ for row in SUBSEGMENTS:
+ rec_id = row[1]
+ strt = str(round(float(row[3]), 4))
+ end = str(round((float(row[3]) + float(row[4])), 4))
+ subsegment_ID = rec_id + "_" + strt + "_" + end
+ dur = row[4]
+ start_sample = int(float(strt) * SAMPLERATE)
+ end_sample = int(float(end) * SAMPLERATE)
+
+ # If multi-mic audio is selected
+ if mic_type == "Array1":
+ wav_file_base_path = (data_dir + "/" + rec_id + "/audio/" + rec_id +
+ "." + mic_type + "-")
+
+ f = [] # adding all 8 mics
+ for i in range(8):
+ f.append(wav_file_base_path + str(i + 1).zfill(2) + ".wav")
+ audio_files_path_list = f
+
+ # Note: key "files" with 's' is used for multi-mic
+ json_dict[subsegment_ID] = {
+ "wav": {
+ "files": audio_files_path_list,
+ "duration": float(dur),
+ "start": int(start_sample),
+ "stop": int(end_sample),
+ },
+ }
+ else:
+ # Single mic audio
+ wav_file_path = (data_dir + "/" + rec_id + "/audio/" + rec_id + "."
+ + mic_type + ".wav")
+
+ # Note: key "file" without 's' is used for single-mic
+ json_dict[subsegment_ID] = {
+ "wav": {
+ "file": wav_file_path,
+ "duration": float(dur),
+ "start": int(start_sample),
+ "stop": int(end_sample),
+ },
+ }
+
+ out_json_file = save_dir + "/" + filename + "." + mic_type + ".subsegs.json"
+ with open(out_json_file, mode="w") as json_f:
+ json.dump(json_dict, json_f, indent=2)
+
+ msg = "%s JSON prepared" % (out_json_file)
+ logger.debug(msg)
+
+
+def skip(save_folder, conf, meta_files, opt_file):
+ """
+ Detects if the AMI data_preparation has been already done.
+ If the preparation has been done, we can skip it.
+
+ Returns
+ -------
+ bool
+ if True, the preparation phase can be skipped.
+ if False, it must be done.
+ """
+ # Checking if meta (json) files are available
+ skip = True
+ for file_path in meta_files:
+ if not os.path.isfile(file_path):
+ skip = False
+
+ # Checking saved options
+ save_opt_file = os.path.join(save_folder, opt_file)
+ if skip is True:
+ if os.path.isfile(save_opt_file):
+ opts_old = load_pkl(save_opt_file)
+ if opts_old == conf:
+ skip = True
+ else:
+ skip = False
+ else:
+ skip = False
+
+ return skip
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(
+ prog='python ami_prepare.py --data_folder /home/data/ami/amicorpus \
+ --manual_annot_folder /home/data/ami/ami_public_manual_1.6.2 \
+ --save_folder ./results/ --ref_rttm_dir ./results/ref_rttms \
+ --meta_data_dir ./results/metadata',
+ description='AMI Data preparation')
+ parser.add_argument(
+ '--data_folder',
+ required=True,
+ help='Path to the folder where the original amicorpus is stored')
+ parser.add_argument(
+ '--manual_annot_folder',
+ required=True,
+ help='Directory where the manual annotations are stored')
+ parser.add_argument(
+ '--save_folder', required=True, help='The save directory in results')
+ parser.add_argument(
+ '--ref_rttm_dir',
+ required=True,
+ help='Directory to store reference RTTM files')
+ parser.add_argument(
+ '--meta_data_dir',
+ required=True,
+ help='Directory to store the meta data (json) files')
+ parser.add_argument(
+ '--split_type',
+ default="full_corpus_asr",
+ help='Standard dataset split. See ami_splits.py for more information')
+ parser.add_argument(
+ '--skip_TNO',
+ default=True,
+ type=strtobool,
+ help='Skips TNO meeting recordings if True')
+ parser.add_argument(
+ '--mic_type',
+ default="Mix-Headset",
+ help='Type of microphone to be used')
+ parser.add_argument(
+ '--vad_type',
+ default="oracle",
+ help='Type of VAD. Kept for future when VAD will be added')
+ parser.add_argument(
+ '--max_subseg_dur',
+ default=3.0,
+ type=float,
+ help='Duration in seconds of a subsegments to be prepared from larger segments'
+ )
+ parser.add_argument(
+ '--overlap',
+ default=1.5,
+ type=float,
+ help='Overlap duration in seconds between adjacent subsegments')
+
+ args = parser.parse_args()
+
+ prepare_ami(args.data_folder, args.manual_annot_folder, args.save_folder,
+ args.ref_rttm_dir, args.meta_data_dir)
diff --git a/examples/ami/sd0/local/ami_splits.py b/examples/ami/sd0/local/ami_splits.py
new file mode 100644
index 00000000..010638a3
--- /dev/null
+++ b/examples/ami/sd0/local/ami_splits.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The AMI corpus contains 100 hours of meeting recordings.
+This script returns the standard train, dev and eval split for AMI corpus.
+For more information on dataset please refer to http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml
+
+Authors
+ * qingenz123@126.com (Qingen ZHAO) 2022
+
+"""
+
+ALLOWED_OPTIONS = ["scenario_only", "full_corpus", "full_corpus_asr"]
+
+
+def get_AMI_split(split_option):
+ """
+ Prepares train, dev, and test sets for given split_option
+
+ Arguments
+ ---------
+ split_option: str
+ The standard split option.
+ Allowed options: "scenario_only", "full_corpus", "full_corpus_asr"
+
+ Returns
+ -------
+ Meeting IDs for train, dev, and test sets for given split_option
+ """
+
+ if split_option not in ALLOWED_OPTIONS:
+ print(
+ f'Invalid split "{split_option}" requested!\nValid split_options are: ',
+ ALLOWED_OPTIONS, )
+ return
+
+ if split_option == "scenario_only":
+
+ train_set = [
+ "ES2002",
+ "ES2005",
+ "ES2006",
+ "ES2007",
+ "ES2008",
+ "ES2009",
+ "ES2010",
+ "ES2012",
+ "ES2013",
+ "ES2015",
+ "ES2016",
+ "IS1000",
+ "IS1001",
+ "IS1002",
+ "IS1003",
+ "IS1004",
+ "IS1005",
+ "IS1006",
+ "IS1007",
+ "TS3005",
+ "TS3008",
+ "TS3009",
+ "TS3010",
+ "TS3011",
+ "TS3012",
+ ]
+
+ dev_set = [
+ "ES2003",
+ "ES2011",
+ "IS1008",
+ "TS3004",
+ "TS3006",
+ ]
+
+ test_set = [
+ "ES2004",
+ "ES2014",
+ "IS1009",
+ "TS3003",
+ "TS3007",
+ ]
+
+ if split_option == "full_corpus":
+ # List of train: SA (TRAINING PART OF SEEN DATA)
+ train_set = [
+ "ES2002",
+ "ES2005",
+ "ES2006",
+ "ES2007",
+ "ES2008",
+ "ES2009",
+ "ES2010",
+ "ES2012",
+ "ES2013",
+ "ES2015",
+ "ES2016",
+ "IS1000",
+ "IS1001",
+ "IS1002",
+ "IS1003",
+ "IS1004",
+ "IS1005",
+ "IS1006",
+ "IS1007",
+ "TS3005",
+ "TS3008",
+ "TS3009",
+ "TS3010",
+ "TS3011",
+ "TS3012",
+ "EN2001",
+ "EN2003",
+ "EN2004",
+ "EN2005",
+ "EN2006",
+ "EN2009",
+ "IN1001",
+ "IN1002",
+ "IN1005",
+ "IN1007",
+ "IN1008",
+ "IN1009",
+ "IN1012",
+ "IN1013",
+ "IN1014",
+ "IN1016",
+ ]
+
+ # List of dev: SB (DEV PART OF SEEN DATA)
+ dev_set = [
+ "ES2003",
+ "ES2011",
+ "IS1008",
+ "TS3004",
+ "TS3006",
+ "IB4001",
+ "IB4002",
+ "IB4003",
+ "IB4004",
+ "IB4010",
+ "IB4011",
+ ]
+
+ # List of test: SC (UNSEEN DATA FOR EVALUATION)
+ # Note that IB4005 does not appear because it has speakers in common with two sets of data.
+ test_set = [
+ "ES2004",
+ "ES2014",
+ "IS1009",
+ "TS3003",
+ "TS3007",
+ "EN2002",
+ ]
+
+ if split_option == "full_corpus_asr":
+ train_set = [
+ "ES2002",
+ "ES2003",
+ "ES2005",
+ "ES2006",
+ "ES2007",
+ "ES2008",
+ "ES2009",
+ "ES2010",
+ "ES2012",
+ "ES2013",
+ "ES2014",
+ "ES2015",
+ "ES2016",
+ "IS1000",
+ "IS1001",
+ "IS1002",
+ "IS1003",
+ "IS1004",
+ "IS1005",
+ "IS1006",
+ "IS1007",
+ "TS3005",
+ "TS3006",
+ "TS3007",
+ "TS3008",
+ "TS3009",
+ "TS3010",
+ "TS3011",
+ "TS3012",
+ "EN2001",
+ "EN2003",
+ "EN2004",
+ "EN2005",
+ "EN2006",
+ "EN2009",
+ "IN1001",
+ "IN1002",
+ "IN1005",
+ "IN1007",
+ "IN1008",
+ "IN1009",
+ "IN1012",
+ "IN1013",
+ "IN1014",
+ "IN1016",
+ ]
+
+ dev_set = [
+ "ES2011",
+ "IS1008",
+ "TS3004",
+ "IB4001",
+ "IB4002",
+ "IB4003",
+ "IB4004",
+ "IB4010",
+ "IB4011",
+ ]
+
+ test_set = [
+ "ES2004",
+ "IS1009",
+ "TS3003",
+ "EN2002",
+ ]
+
+ return train_set, dev_set, test_set
diff --git a/examples/ami/sd0/local/data.sh b/examples/ami/sd0/local/data.sh
new file mode 100755
index 00000000..478ec432
--- /dev/null
+++ b/examples/ami/sd0/local/data.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+stage=1
+
+TARGET_DIR=${MAIN_ROOT}/dataset/ami
+data_folder=${TARGET_DIR}/amicorpus #e.g., /path/to/amicorpus/
+manual_annot_folder=${TARGET_DIR}/ami_public_manual_1.6.2 #e.g., /path/to/ami_public_manual_1.6.2/
+
+save_folder=${MAIN_ROOT}/examples/ami/sd0/data
+ref_rttm_dir=${save_folder}/ref_rttms
+meta_data_dir=${save_folder}/metadata
+
+set=L
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+set -u
+set -o pipefail
+
+mkdir -p ${save_folder}
+
+if [ ${stage} -le 0 ]; then
+ # Download AMI corpus, You need around 10GB of free space to get whole data
+ # The signals are too large to package in this way,
+ # so you need to use the chooser to indicate which ones you wish to download
+ echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data."
+ echo "Annotations: AMI manual annotations v1.6.2 "
+ echo "Signals: "
+    echo "1) Select one or more AMI meetings: the IDs please follow ./ami_splits.py"
+ echo "2) Select media streams: Just select Headset mix"
+ exit 0;
+fi
+
+if [ ${stage} -le 1 ]; then
+ echo "AMI Data preparation"
+
+ python local/ami_prepare.py --data_folder ${data_folder} \
+ --manual_annot_folder ${manual_annot_folder} \
+ --save_folder ${save_folder} --ref_rttm_dir ${ref_rttm_dir} \
+ --meta_data_dir ${meta_data_dir}
+
+ if [ $? -ne 0 ]; then
+ echo "Prepare AMI failed. Please check log message."
+ exit 1
+ fi
+
+fi
+
+echo "AMI data preparation done."
+exit 0
diff --git a/examples/ami/sd0/local/dataio.py b/examples/ami/sd0/local/dataio.py
new file mode 100644
index 00000000..f7fe8815
--- /dev/null
+++ b/examples/ami/sd0/local/dataio.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Data reading and writing.
+
+Authors
+ * qingenz123@126.com (Qingen ZHAO) 2022
+
+"""
+import os
+import pickle
+import time
+
+
+def save_pkl(obj, file):
+    """Save an object in pkl format.
+
+    Arguments
+    ---------
+    obj : object
+        Object to save in pkl format
+    file : str
+        Path to the output file
+
+ Example
+ -------
+ >>> tmpfile = os.path.join(getfixture('tmpdir'), "example.pkl")
+ >>> save_pkl([1, 2, 3, 4, 5], tmpfile)
+ >>> load_pkl(tmpfile)
+ [1, 2, 3, 4, 5]
+ """
+ with open(file, "wb") as f:
+ pickle.dump(obj, f)
+
+
+def load_pickle(pickle_path):
+ """Utility function for loading .pkl pickle files.
+
+ Arguments
+ ---------
+ pickle_path : str
+ Path to pickle file.
+
+ Returns
+ -------
+ out : object
+ Python object loaded from pickle.
+ """
+ with open(pickle_path, "rb") as f:
+ out = pickle.load(f)
+ return out
+
+
+def load_pkl(file):
+ """Loads a pkl file.
+
+ For an example, see `save_pkl`.
+
+ Arguments
+ ---------
+ file : str
+ Path to the input pkl file.
+
+ Returns
+ -------
+ The loaded object.
+ """
+
+ # Deals with the situation where two processes are trying
+ # to access the same label dictionary by creating a lock
+ count = 100
+ while count > 0:
+ if os.path.isfile(file + ".lock"):
+ time.sleep(1)
+ count -= 1
+ else:
+ break
+
+ try:
+ open(file + ".lock", "w").close()
+ with open(file, "rb") as f:
+ return pickle.load(f)
+ finally:
+ if os.path.isfile(file + ".lock"):
+ os.remove(file + ".lock")
diff --git a/examples/ami/sd0/path.sh b/examples/ami/sd0/path.sh
new file mode 100644
index 00000000..60146113
--- /dev/null
+++ b/examples/ami/sd0/path.sh
@@ -0,0 +1,15 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+# model exp
+#MODEL=ECAPA_TDNN
+#export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}/bin
diff --git a/examples/ami/sd0/run.sh b/examples/ami/sd0/run.sh
new file mode 100644
index 00000000..91d4b706
--- /dev/null
+++ b/examples/ami/sd0/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+. path.sh || exit 1;
+set -e
+
+stage=1
+
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+if [ ${stage} -le 1 ]; then
+ # prepare data
+    bash ./local/data.sh || exit 1
+fi
\ No newline at end of file
diff --git a/examples/ami/sd0/utils b/examples/ami/sd0/utils
new file mode 120000
index 00000000..973afe67
--- /dev/null
+++ b/examples/ami/sd0/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/examples/callcenter/README.md b/examples/callcenter/README.md
new file mode 100644
index 00000000..1c715cb6
--- /dev/null
+++ b/examples/callcenter/README.md
@@ -0,0 +1,20 @@
+# Callcenter 8k sample rate
+
+Data distribution:
+
+```
+676048 utts
+491.4004722221223 h
+4357792.0 text
+2.4633630739178654 text/sec
+2.6167397877068495 sec/utt
+```
+
+train/dev/test partition:
+
+```
+ 33802 manifest.dev
+ 67606 manifest.test
+ 574640 manifest.train
+ 676048 total
+```
diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md
index a59a06ed..2aad609c 100644
--- a/examples/csmsc/README.md
+++ b/examples/csmsc/README.md
@@ -10,3 +10,5 @@
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* voc4 - Style MelGAN
+* voc5 - HiFiGAN
+* voc6 - WaveRNN
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
new file mode 100644
index 00000000..0129329a
--- /dev/null
+++ b/examples/csmsc/tts0/README.md
@@ -0,0 +1,250 @@
+# Tacotron2 with CSMSC
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here.
+You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and the id of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a Tacotron2 model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG tacotron2 config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
+### Synthesizing
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
+```bash
+unzip pwg_baker_ckpt_0.4.zip
+```
+Parallel WaveGAN checkpoint contains files listed below.
+```text
+pwg_baker_ckpt_0.4
+├── pwg_default.yaml # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
+└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
+ [--voice-cloning VOICE_CLONING]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+                        Config of acoustic model. Use default config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+                        Config of voc. Use default config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+                        Config of acoustic model. Use default config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+                        Config of voc. Use default config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--am` is acoustic model type with the format {model_name}_{dataset}
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+3. `--voc` is vocoder type with the format {model_name}_{dataset}
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+5. `--lang` is the model language, which can be `zh` or `en`.
+6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+7. `--text` is the text file, which contains sentences to synthesize.
+8. `--output_dir` is the directory to save synthesized audio files.
+9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+
+## Pretrained Model
+Pretrained Tacotron2 model with no silence at the edges of audios:
+- [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)
+
+The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip).
+
+
+Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 1(gpu) x 30600|0.57185|0.39614|0.14642|0.029|5.8e-05|
+
+Tacotron2 checkpoint contains files listed below.
+```text
+tacotron2_csmsc_ckpt_0.2.0
+├── default.yaml # default config used to train Tacotron2
+├── phone_id_map.txt # phone vocabulary file when training Tacotron2
+├── snapshot_iter_30600.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=tacotron2_csmsc_ckpt_0.2.0/default.yaml \
+ --am_ckpt=tacotron2_csmsc_ckpt_0.2.0/snapshot_iter_30600.pdz \
+ --am_stat=tacotron2_csmsc_ckpt_0.2.0/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=exp/default/test_e2e \
+ --inference_dir=exp/default/inference \
+ --phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
+```
diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml
new file mode 100644
index 00000000..42635c50
--- /dev/null
+++ b/examples/csmsc/tts0/conf/default.yaml
@@ -0,0 +1,91 @@
+# This configuration is for Paddle to train Tacotron 2. Compared to the
+# original paper, this configuration additionally use the guided attention
+# loss to accelerate the learning of the diagonal attention. It requires
+# only a single GPU with 12 GB memory and it takes ~1 day to finish the
+# training on Titan V.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size (samples).
+n_shift: 300 # Hop size (samples). 12.5ms
+win_length: 1200 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 64
+num_workers: 2
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model: # keyword arguments for the selected model
+ embed_dim: 512 # char or phn embedding dimension
+ elayers: 1 # number of blstm layers in encoder
+ eunits: 512 # number of blstm units
+ econv_layers: 3 # number of convolutional layers in encoder
+ econv_chans: 512 # number of channels in convolutional layer
+ econv_filts: 5 # filter size of convolutional layer
+ atype: location # attention function type
+ adim: 512 # attention dimension
+ aconv_chans: 32 # number of channels in convolutional layer of attention
+ aconv_filts: 15 # filter size of convolutional layer of attention
+ cumulate_att_w: True # whether to cumulate attention weight
+ dlayers: 2 # number of lstm layers in decoder
+ dunits: 1024 # number of lstm units in decoder
+ prenet_layers: 2 # number of layers in prenet
+ prenet_units: 256 # number of units in prenet
+ postnet_layers: 5 # number of layers in postnet
+ postnet_chans: 512 # number of channels in postnet
+ postnet_filts: 5 # filter size of postnet layer
+ output_activation: null # activation function for the final output
+ use_batch_norm: True # whether to use batch normalization in encoder
+ use_concate: True # whether to concatenate encoder embedding with decoder outputs
+ use_residual: False # whether to use residual connection in encoder
+ dropout_rate: 0.5 # dropout rate
+ zoneout_rate: 0.1 # zoneout rate
+ reduction_factor: 1 # reduction factor
+ spk_embed_dim: null # speaker embedding dimension
+
+
+###########################################################
+# UPDATER SETTING #
+###########################################################
+updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+ bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
+ use_guided_attn_loss: True # whether to use guided attention loss
+ guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
+ guided_attn_loss_lambda: 1.0 # strength of guided attention loss
+
+
+##########################################################
+# OPTIMIZER SETTING #
+##########################################################
+optimizer:
+ optim: adam # optimizer type
+ learning_rate: 1.0e-03 # learning rate
+ epsilon: 1.0e-06 # epsilon
+ weight_decay: 0.0 # weight decay coefficient
+
+###########################################################
+# TRAINING SETTING #
+###########################################################
+max_epoch: 200
+num_snapshots: 5
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+seed: 42
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/inference.sh b/examples/csmsc/tts0/local/inference.sh
new file mode 100755
index 00000000..e417d748
--- /dev/null
+++ b/examples/csmsc/tts0/local/inference.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=tacotron2_csmsc \
+ --voc=pwgan_csmsc \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=tacotron2_csmsc \
+ --voc=mb_melgan_csmsc \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready now
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=tacotron2_csmsc \
+ --voc=style_melgan_csmsc \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=tacotron2_csmsc \
+ --voc=hifigan_csmsc \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
+fi
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/preprocess.sh b/examples/csmsc/tts0/local/preprocess.sh
new file mode 100755
index 00000000..8a4b8dd9
--- /dev/null
+++ b/examples/csmsc/tts0/local/preprocess.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # get durations from MFA's result
+ echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+ --inputdir=./baker_alignment_tone \
+ --output=durations.txt \
+ --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=baker \
+ --rootdir=~/datasets/BZNSYP/ \
+ --dumpdir=dump \
+ --dur-file=durations.txt \
+ --config=${config_path} \
+ --num-cpu=20 \
+ --cut-sil=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # get features' stats(mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="speech"
+
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # normalize and convert phone to id, dev and test should use train's stats
+ echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+fi
diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh
new file mode 100755
index 00000000..4be06dd8
--- /dev/null
+++ b/examples/csmsc/tts0/local/synthesize.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt
diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh
new file mode 100755
index 00000000..79bb9f83
--- /dev/null
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference
+
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
+ --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
+ --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --inference_dir=${train_output_path}/inference \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready now
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --inference_dir=${train_output_path}/inference \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference
+fi
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/train.sh b/examples/csmsc/tts0/local/train.sh
new file mode 100755
index 00000000..f90db915
--- /dev/null
+++ b/examples/csmsc/tts0/local/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=1 \
+ --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh
new file mode 100755
index 00000000..a37cd21e
--- /dev/null
+++ b/examples/csmsc/tts0/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=tacotron2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh
new file mode 100755
index 00000000..8f06e933
--- /dev/null
+++ b/examples/csmsc/tts0/run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 0a4cf69b..35fcf251 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference
+fi
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 13d291b5..7b803526 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -1,3 +1,4 @@
+([简体中文](./README_cn.md)|English)
# FastSpeech2 with CSMSC
This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html).
@@ -242,6 +243,8 @@ fastspeech2_nosil_baker_ckpt_0.4
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+
+If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
```bash
source path.sh
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
new file mode 100644
index 00000000..25931ecb
--- /dev/null
+++ b/examples/csmsc/tts3/README_cn.md
@@ -0,0 +1,273 @@
+(简体中文|[English](./README.md))
+# 用 CSMSC 数据集训练 FastSpeech2 模型
+
+本用例包含用于训练 [Fastspeech2](https://arxiv.org/abs/2006.04558) 模型的代码,使用 [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html) 数据集。
+
+## 数据集
+### 下载并解压
+从 [官方网站](https://test.data-baker.com/data/index/source) 下载数据集
+
+### 获取MFA结果并解压
+我们使用 [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) 去获得 fastspeech2 的音素持续时间。
+你们可以从这里下载 [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), 或参考 [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) 训练你自己的模型。
+
+## 开始
+假设数据集的路径是 `~/datasets/BZNSYP`.
+假设CSMSC的MFA结果路径为 `./baker_alignment_tone`.
+运行下面的命令会进行如下操作:
+
+1. **设置原路径**。
+2. 对数据集进行预处理。
+3. 训练模型
+4. 合成波形
+ - 从 `metadata.jsonl` 合成波形。
+ - 从文本文件合成波形。
+5. 使用静态模型进行推理。
+```bash
+./run.sh
+```
+您可以选择要运行的一系列阶段,或者将 `stage` 设置为 `stop-stage` 以仅使用一个阶段,例如,运行以下命令只会预处理数据集。
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### 数据预处理
+```bash
+./local/preprocess.sh ${conf_path}
+```
+当它完成时。将在当前目录中创建 `dump` 文件夹。转储文件夹的结构如下所示。
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── energy_stats.npy
+ ├── norm
+ ├── pitch_stats.npy
+ ├── raw
+ └── speech_stats.npy
+```
+
+数据集分为三个部分,即 `train` 、 `dev` 和 `test` ,每个部分都包含一个 `norm` 和 `raw` 子文件夹。原始文件夹包含每个话语的语音、音调和能量特征,而 `norm` 文件夹包含规范化的特征。用于规范化特征的统计数据是从 `dump/train/*_stats.npy` 中的训练集计算出来的。
+
+此外,还有一个 `metadata.jsonl` 在每个子文件夹中。它是一个类似表格的文件,包含音素、文本长度、语音长度、持续时间、语音特征路径、音调特征路径、能量特征路径、说话人和每个话语的 id。
+
+### 模型训练
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` 调用 `${BIN_DIR}/train.py` 。
+以下是完整的帮助信息。
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+ [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
+
+Train a FastSpeech2 model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG fastspeech2 config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu=0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+ --speaker-dict SPEAKER_DICT
+ speaker id map file for multiple speaker model.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+```
+1. `--config` 是一个 yaml 格式的配置文件,用于覆盖默认配置,位于 `conf/default.yaml`.
+2. `--train-metadata` 和 `--dev-metadata` 应为 `dump` 文件夹中 `train` 和 `dev` 下的规范化元数据文件
+3. `--output-dir` 是保存结果的目录。 检查点保存在此目录中的 `checkpoints/` 目录下。
+4. `--ngpu` 要使用的 GPU 数,如果 ngpu==0,则使用 cpu 。
+5. `--phones-dict` 是音素词汇表文件的路径。
+
+### 合成
+我们使用 [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) 作为神经声码器(vocoder)。
+从 [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) 下载预训练的 parallel wavegan 模型并将其解压。
+
+```bash
+unzip pwg_baker_ckpt_0.4.zip
+```
+Parallel WaveGAN 检查点包含如下文件。
+```text
+pwg_baker_ckpt_0.4
+├── pwg_default.yaml # 用于训练 parallel wavegan 的默认配置
+├── pwg_snapshot_iter_400000.pdz # parallel wavegan 的模型参数
+└── pwg_stats.npy # 训练平行波形时用于规范化谱图的统计数据
+```
+`./local/synthesize.sh` 调用 `${BIN_DIR}/../synthesize.py` 即可从 `metadata.jsonl`中合成波形。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
+ [--voice-cloning VOICE_CLONING]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use deault config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use deault config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+`./local/synthesize_e2e.sh` 调用 `${BIN_DIR}/../synthesize_e2e.py`,即可从文本文件中合成波形。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use deault config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use deault config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--am` 声学模型格式是否符合 {model_name}_{dataset}
+2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。
+3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset}
+4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
+5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。
+6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件。
+7. `--text` 是文本文件,其中包含要合成的句子。
+8. `--output_dir` 是保存合成音频文件的目录。
+9. `--ngpu` 要使用的GPU数,如果 ngpu==0,则使用 cpu 。
+
+### 推理
+在合成之后,我们将在 `${train_output_path}/inference` 中得到 fastspeech2 和 pwgan 的静态模型
+`./local/inference.sh` 调用 `${BIN_DIR}/inference.py` 为 fastspeech2 + pwgan 综合提供了一个 paddle 静态模型推理示例。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
+```
+
+## 预训练模型
+预先训练的 FastSpeech2 模型,在音频边缘没有空白音频:
+- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
+- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)
+
+静态模型可以在这里下载 [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
+
+Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
+conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
+
+FastSpeech2检查点包含下列文件。
+```text
+fastspeech2_nosil_baker_ckpt_0.4
+├── default.yaml # 用于训练 fastspeech2 的默认配置
+├── phone_id_map.txt # 训练 fastspeech2 时的音素词汇文件
+├── snapshot_iter_76000.pdz # 模型参数和优化器状态
+└── speech_stats.npy # 训练 fastspeech2 时用于规范化频谱图的统计数据
+```
+您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+ --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+ --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=exp/default/test_e2e \
+ --inference_dir=exp/default/inference \
+ --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+```
diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml
index 252f634d..fcad8615 100644
--- a/examples/csmsc/tts3/conf/conformer.yaml
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -53,8 +53,8 @@ model:
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
- use_macaron_style_in_conformer: true # whether to use macaron style in conformer
- use_cnn_in_conformer: true # whether to use CNN in conformer
+ use_macaron_style_in_conformer: True # whether to use macaron style in conformer
+ use_cnn_in_conformer: True # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
@@ -70,14 +70,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml
index 1f723d67..2c2a1ea1 100644
--- a/examples/csmsc/tts3/conf/default.yaml
+++ b/examples/csmsc/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
@@ -82,7 +82,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
-
###########################################################
# OPTIMIZER SETTING #
###########################################################
diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh
index 7c58980c..9322cfd6 100755
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@@ -48,4 +48,15 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=wavernn_csmsc \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index d4744486..44356e4b 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -89,3 +89,25 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference
+fi
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index c1ddd3b9..e1a149b6 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
- bash ./local/preprocess.sh ${conf_path} || exit -1
+ ./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -40,3 +40,4 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
+
diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml
index 28d218ff..703be21b 100644
--- a/examples/csmsc/voc1/conf/default.yaml
+++ b/examples/csmsc/voc1/conf/default.yaml
@@ -34,10 +34,10 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
- bias: true # use bias in residual blocks
- use_weight_norm: true # Whether to use weight norm.
+ bias: True # use bias in residual blocks
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
- use_causal_conv: false # use causal conv in residual blocks and upsample layers
+ use_causal_conv: False # use causal conv in residual blocks and upsample layers
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size.
interpolate_mode: "nearest" # upsample net interpolate mode
freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis
@@ -53,8 +53,8 @@ discriminator_params:
kernel_size: 3 # Number of output channels.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of chnn layers.
- bias: true # Whether to use bias parameter in conv.
- use_weight_norm: true # Whether to use weight norm.
+ bias: True # Whether to use bias parameter in conv.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml
index 27e97664..fbff54f1 100644
--- a/examples/csmsc/voc3/conf/default.yaml
+++ b/examples/csmsc/voc3/conf/default.yaml
@@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
-use_stft_loss: true
+use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
-use_subband_stft_loss: true
+use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
@@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
-use_feat_match_loss: false # Whether to use feature matching loss.
+use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################
diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml
index a3b1d8b1..0a38c282 100644
--- a/examples/csmsc/voc3/conf/finetune.yaml
+++ b/examples/csmsc/voc3/conf/finetune.yaml
@@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
-use_stft_loss: true
+use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
-use_subband_stft_loss: true
+use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss.
@@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
-use_feat_match_loss: false # Whether to use feature matching loss.
+use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################
diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml
index c9abf78d..cd8f8e28 100644
--- a/examples/csmsc/voc4/conf/default.yaml
+++ b/examples/csmsc/voc4/conf/default.yaml
@@ -65,7 +65,7 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
-use_stft_loss: true
+use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
@@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
###########################################################
# DATA LOADER SETTING #
diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml
index f42fc385..38b94cf5 100644
--- a/examples/csmsc/voc5/conf/default.yaml
+++ b/examples/csmsc/voc5/conf/default.yaml
@@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
- use_additional_convs: true # Whether to use additional conv layer in residual blocks.
- bias: true # Whether to use bias parameter in conv.
+ use_additional_convs: True # Whether to use additional conv layer in residual blocks.
+ bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation paramters.
negative_slope: 0.1
- use_weight_norm: true # Whether to apply weight normalization.
+ use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
- bias: true
+ bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
- follow_official_norm: true # Whether to follow the official norm setting.
+ follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
- bias: true # Whether to use bias parameter in conv layer."
+ bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation paramters.
negative_slope: 0.1
- use_weight_norm: true # Whether to apply weight normalization.
- use_spectral_norm: false # Whether to apply spectral normalization.
+ use_weight_norm: True # Whether to apply weight normalization.
+ use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
-use_stft_loss: false # Whether to use multi-resolution STFT loss.
-use_mel_loss: true # Whether to use Mel-spectrogram loss.
+use_stft_loss: False # Whether to use multi-resolution STFT loss.
+use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
-use_feat_match_loss: true
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+use_feat_match_loss: True
feat_match_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
- average_by_layers: false # Whether to average loss by #layers in each discriminator.
- include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+ average_by_layers: False # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #
diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml
index 73420625..110ae052 100644
--- a/examples/csmsc/voc5/conf/finetune.yaml
+++ b/examples/csmsc/voc5/conf/finetune.yaml
@@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
- use_additional_convs: true # Whether to use additional conv layer in residual blocks.
- bias: true # Whether to use bias parameter in conv.
+ use_additional_convs: True # Whether to use additional conv layer in residual blocks.
+ bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation paramters.
negative_slope: 0.1
- use_weight_norm: true # Whether to apply weight normalization.
+ use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
- bias: true
+ bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
- follow_official_norm: true # Whether to follow the official norm setting.
+ follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
- bias: true # Whether to use bias parameter in conv layer."
+ bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation paramters.
negative_slope: 0.1
- use_weight_norm: true # Whether to apply weight normalization.
- use_spectral_norm: false # Whether to apply spectral normalization.
+ use_weight_norm: True # Whether to apply weight normalization.
+ use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
-use_stft_loss: false # Whether to use multi-resolution STFT loss.
-use_mel_loss: true # Whether to use Mel-spectrogram loss.
+use_stft_loss: False # Whether to use multi-resolution STFT loss.
+use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
-use_feat_match_loss: true
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+use_feat_match_loss: True
feat_match_loss_params:
- average_by_discriminators: false # Whether to average loss by #discriminators.
- average_by_layers: false # Whether to average loss by #layers in each discriminator.
- include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+ average_by_layers: False # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #
diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md
new file mode 100644
index 00000000..7763b355
--- /dev/null
+++ b/examples/csmsc/voc6/README.md
@@ -0,0 +1,127 @@
+# WaveRNN with CSMSC
+This example contains code used to train a [WaveRNN](https://arxiv.org/abs/1802.08435) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+## Dataset
+### Download and Extract
+Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
+You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── feats_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU]
+
+Train a WaveRNN model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG config file to overwrite default config.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+```
+
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+### Synthesizing
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
+ [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU]
+
+Synthesize with WaveRNN.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG Vocoder config file.
+ --checkpoint CHECKPOINT
+ snapshot to load.
+ --test-metadata TEST_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+```
+
+1. `--config` wavernn config file. You should use the same config with which the model is trained.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
+3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
+4. `--output-dir` is the directory to save the synthesized audio files.
+5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+## Pretrained Models
+The pretrained model can be downloaded here [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip).
+
+The static model can be downloaded here [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip).
+
+Model | Step | eval/loss
+:-------------:|:------------:| :------------:
+default| 1(gpu) x 400000|2.602768
+
+WaveRNN checkpoint contains files listed below.
+
+```text
+wavernn_csmsc_ckpt_0.2.0
+├── default.yaml # default config used to train wavernn
+├── feats_stats.npy # statistics used to normalize spectrogram when training wavernn
+└── snapshot_iter_400000.pdz # parameters of wavernn
+```
diff --git a/examples/csmsc/voc6/conf/default.yaml b/examples/csmsc/voc6/conf/default.yaml
new file mode 100644
index 00000000..e7696cf4
--- /dev/null
+++ b/examples/csmsc/voc6/conf/default.yaml
@@ -0,0 +1,67 @@
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 2048 # FFT size (samples).
+n_shift: 300 # Hop size (samples). 12.5ms
+win_length: 1200 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
+mu_law: True # Recommended to suppress noise if using raw bits.
+
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model:
+ rnn_dims: 512 # Hidden dims of RNN Layers.
+ fc_dims: 512
+ bits: 9 # Bit depth of signal
+ aux_context_window: 2 # Context window size for auxiliary feature.
+ # If set to 2, previous 2 and future 2 frames will be considered.
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
+ # Must be the same as num_mels.
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size, same with pwgan here
+ compute_dims: 128 # Dims of Conv1D in MelResNet.
+ res_out_dims: 128 # Dims of output in MelResNet.
+ res_blocks: 10 # Number of residual blocks.
+ mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)
+inference:
+ gen_batched: True # whether to generate samples in batch mode
+ target: 12000 # target number of samples to be generated in each batch entry
+ overlap: 600 # number of samples for crossfading between batches
+
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 64 # Batch size.
+batch_max_steps: 4500 # Length of each audio in batch. Make sure dividable by hop_size.
+num_workers: 2 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER SETTING #
+###########################################################
+grad_clip: 4.0
+learning_rate: 1.0e-4
+
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+
+train_max_steps: 400000 # Number of training steps.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples
+generate_num: 5 # number of samples to generate at each checkpoint
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_snapshots: 10 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh
new file mode 100755
index 00000000..2dcc39ac
--- /dev/null
+++ b/examples/csmsc/voc6/local/preprocess.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # get durations from MFA's result
+ echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+ --inputdir=./baker_alignment_tone \
+ --output=durations.txt \
+ --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
+ --rootdir=~/datasets/BZNSYP/ \
+ --dataset=baker \
+ --dumpdir=dump \
+ --dur-file=durations.txt \
+ --config=${config_path} \
+ --cut-sil=True \
+ --num-cpu=20
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # get features' stats(mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="feats"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize, dev and test should use train's stats
+ echo "Normalize ..."
+
+ python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --stats=dump/train/feats_stats.npy
+ python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --stats=dump/train/feats_stats.npy
+
+ python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --stats=dump/train/feats_stats.npy
+fi
diff --git a/examples/csmsc/voc6/local/synthesize.sh b/examples/csmsc/voc6/local/synthesize.sh
new file mode 100755
index 00000000..7f0cbe48
--- /dev/null
+++ b/examples/csmsc/voc6/local/synthesize.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+ --config=${config_path} \
+ --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+ --test-metadata=dump/test/norm/metadata.jsonl \
+ --output-dir=${train_output_path}/test
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
new file mode 100755
index 00000000..9695631e
--- /dev/null
+++ b/examples/csmsc/voc6/local/train.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+FLAGS_cudnn_exhaustive_search=true \
+FLAGS_conv_workspace_size_limit=4000 \
+python ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=1
diff --git a/examples/csmsc/voc6/path.sh b/examples/csmsc/voc6/path.sh
new file mode 100755
index 00000000..b0c98584
--- /dev/null
+++ b/examples/csmsc/voc6/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=wavernn
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/csmsc/voc6/run.sh b/examples/csmsc/voc6/run.sh
new file mode 100755
index 00000000..5f754fff
--- /dev/null
+++ b/examples/csmsc/voc6/run.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+test_input=dump/dump_gta_test
+ckpt_name=snapshot_iter_100000.pdz
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # prepare data
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index 2ce57ae0..911a72ad 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -122,3 +122,6 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 cpu ./export /audio/dog.wav
- `device`: 指定模型预测时使用的设备。
- `model_dir`: 导出静态图模型和参数文件的保存目录。
- `wav`: 指定预测的音频文件。
+
+## Reference
+* [PANNs(PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition)](https://arxiv.org/abs/1912.10211)
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index baaec818..ba7ad619 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -1,20 +1,25 @@
-# Tacotron2 with LJSpeech
-PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from the text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
+# Tacotron2 with LJSpeech-1.1
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/)
## Dataset
-We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
+### Download and Extract
+Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here.
+You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
-```bash
-wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-tar xjvf LJSpeech-1.1.tar.bz2
-```
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
-4. synthesize mels.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
```bash
./run.sh
```
@@ -26,64 +31,217 @@ You can choose a range of stages you want to run, or set `stage` equal to `stop-
```bash
./local/preprocess.sh ${conf_path}
```
+When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and the id of each utterance.
+
### Model Training
-`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
-usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
- [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...]
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a Tacotron2 model.
optional arguments:
-h, --help show this help message and exit
- --config FILE path of the config file to overwrite to default config
- with.
- --data DATA_DIR path to the dataset.
- --output OUTPUT_DIR path to save checkpoint and logs.
- --checkpoint_path CHECKPOINT_PATH
- path of the checkpoint to load
+ --config CONFIG tacotron2 config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
--ngpu NGPU if ngpu == 0, use cpu.
- --opts ... options to overwrite --config file and the default
- config, passing in KEY VALUE pairs
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
```
-
-If you want to train on CPU, just set `--ngpu=0`.
-If you want to train on multiple GPUs, just set `--ngpu` as the num of GPU.
-By default, training will be resumed from the latest checkpoint in `--output`, if you want to start a new training, please use a new `${OUTPUTPATH}` with no checkpoint.
-And if you want to resume from another existing model, you should set `checkpoint_path` to be the checkpoint path you want to load.
-**Note: The checkpoint path cannot contain the file extension.**
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
### Synthesizing
-`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesize **mels** from text_list here.
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
+Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
+```bash
+unzip pwg_ljspeech_ckpt_0.5.zip
+```
+Parallel WaveGAN checkpoint contains files listed below.
+```text
+pwg_ljspeech_ckpt_0.5
+├── pwg_default.yaml # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
+└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
-CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
-usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
- [--input INPUT] [--output OUTPUT] [--ngpu NGPU]
- [--opts ...] [-v]
+usage: synthesize.py [-h]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
+ [--voice-cloning VOICE_CLONING]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
-generate mel spectrogram with TransformerTTS.
+Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --config FILE extra config to overwrite the default config
- --checkpoint_path CHECKPOINT_PATH
- path of the checkpoint to load.
- --input INPUT path of the text sentences
- --output OUTPUT path to save outputs
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use deault config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use deault config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
--ngpu NGPU if ngpu == 0, use cpu.
- --opts ... options to overwrite --config file and the default
- config, passing in KEY VALUE pairs
- -v, --verbose print msg
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
```
-**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example)
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use default config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use default config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--am` is acoustic model type with the format {model_name}_{dataset}
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+3. `--voc` is vocoder type with the format {model_name}_{dataset}
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+5. `--lang` is the model language, which can be `zh` or `en`.
+6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+7. `--text` is the text file, which contains sentences to synthesize.
+8. `--output_dir` is the directory to save synthesized audio files.
+9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+
+## Pretrained Model
+Pretrained Tacotron2 model with no silence in the edge of audios:
+- [tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)
+
-## Pretrained Models
-Pretrained Models can be downloaded from the links below. We provide 2 models with different configurations.
+Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 1(gpu) x 60300|0.554092|0.394260|0.141046|0.018747|3.8e-05|
-1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)
+Tacotron2 checkpoint contains files listed below.
+```text
+tacotron2_ljspeech_ckpt_0.2.0
+├── default.yaml # default config used to train Tacotron2
+├── phone_id_map.txt # phone vocabulary file when training Tacotron2
+├── snapshot_iter_60300.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained Tacotron2 and parallel wavegan models.
+```bash
+source path.sh
-2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_ljspeech \
+ --am_config=tacotron2_ljspeech_ckpt_0.2.0/default.yaml \
+ --am_ckpt=tacotron2_ljspeech_ckpt_0.2.0/snapshot_iter_60300.pdz \
+ --am_stat=tacotron2_ljspeech_ckpt_0.2.0/speech_stats.npy \
+ --voc=pwgan_ljspeech\
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=exp/default/test_e2e \
+ --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
+```
diff --git a/examples/ljspeech/tts0/conf/default.yaml b/examples/ljspeech/tts0/conf/default.yaml
new file mode 100644
index 00000000..d76ebd43
--- /dev/null
+++ b/examples/ljspeech/tts0/conf/default.yaml
@@ -0,0 +1,87 @@
+# This configuration is for Paddle to train Tacotron 2. Compared to the
+# original paper, this configuration additionally use the guided attention
+# loss to accelerate the learning of the diagonal attention. It requires
+# only a single GPU with 12 GB memory and it takes ~1 day to finish the
+# training on Titan V.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 22050 # Sampling rate.
+n_fft: 1024 # FFT size (samples).
+n_shift: 256 # Hop size (samples). 11.6ms
+win_length: null # Window length (samples).
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 64
+num_workers: 2
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model: # keyword arguments for the selected model
+ embed_dim: 512 # char or phn embedding dimension
+ elayers: 1 # number of blstm layers in encoder
+ eunits: 512 # number of blstm units
+ econv_layers: 3 # number of convolutional layers in encoder
+ econv_chans: 512 # number of channels in convolutional layer
+ econv_filts: 5 # filter size of convolutional layer
+ atype: location # attention function type
+ adim: 512 # attention dimension
+ aconv_chans: 32 # number of channels in convolutional layer of attention
+ aconv_filts: 15 # filter size of convolutional layer of attention
+ cumulate_att_w: True # whether to cumulate attention weight
+ dlayers: 2 # number of lstm layers in decoder
+ dunits: 1024 # number of lstm units in decoder
+ prenet_layers: 2 # number of layers in prenet
+ prenet_units: 256 # number of units in prenet
+ postnet_layers: 5 # number of layers in postnet
+ postnet_chans: 512 # number of channels in postnet
+ postnet_filts: 5 # filter size of postnet layer
+ output_activation: null # activation function for the final output
+ use_batch_norm: True # whether to use batch normalization in encoder
+ use_concate: True # whether to concatenate encoder embedding with decoder outputs
+ use_residual: False # whether to use residual connection in encoder
+ dropout_rate: 0.5 # dropout rate
+ zoneout_rate: 0.1 # zoneout rate
+ reduction_factor: 1 # reduction factor
+ spk_embed_dim: null # speaker embedding dimension
+
+
+###########################################################
+# UPDATER SETTING #
+###########################################################
+updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+ bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
+ use_guided_attn_loss: True # whether to use guided attention loss
+ guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
+ guided_attn_loss_lambda: 1.0 # strength of guided attention loss
+
+
+##########################################################
+# OPTIMIZER SETTING #
+##########################################################
+optimizer:
+ optim: adam # optimizer type
+ learning_rate: 1.0e-03 # learning rate
+ epsilon: 1.0e-06 # epsilon
+ weight_decay: 0.0 # weight decay coefficient
+
+###########################################################
+# TRAINING SETTING #
+###########################################################
+max_epoch: 300
+num_snapshots: 5
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+seed: 42
diff --git a/examples/ljspeech/tts0/local/preprocess.sh b/examples/ljspeech/tts0/local/preprocess.sh
index c39a3172..e0e4bc7a 100755
--- a/examples/ljspeech/tts0/local/preprocess.sh
+++ b/examples/ljspeech/tts0/local/preprocess.sh
@@ -1,8 +1,62 @@
#!/bin/bash
-preprocess_path=$1
+stage=0
+stop_stage=100
-python3 ${BIN_DIR}/preprocess.py \
- --input=~/datasets/LJSpeech-1.1 \
- --output=${preprocess_path} \
- -v \
\ No newline at end of file
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # get durations from MFA's result
+ echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+ --inputdir=./ljspeech_alignment \
+ --output=durations.txt \
+ --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=ljspeech \
+ --rootdir=~/datasets/LJSpeech-1.1/ \
+ --dumpdir=dump \
+ --dur-file=durations.txt \
+ --config=${config_path} \
+ --num-cpu=20 \
+ --cut-sil=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # get features' stats(mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="speech"
+
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize and convert phone to id, dev and test should use train's stats
+ echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+fi
diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh
index 3f5f9c06..0d005820 100755
--- a/examples/ljspeech/tts0/local/synthesize.sh
+++ b/examples/ljspeech/tts0/local/synthesize.sh
@@ -1,11 +1,20 @@
#!/bin/bash
-train_output_path=$1
-ckpt_name=$2
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
-python3 ${BIN_DIR}/synthesize.py \
- --config=${train_output_path}/config.yaml \
- --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
- --input=${BIN_DIR}/../sentences_en.txt \
- --output=${train_output_path}/test \
- --ngpu=1
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize.py \
+ --am=tacotron2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_ljspeech \
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh
new file mode 100755
index 00000000..73dfff60
--- /dev/null
+++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+# TODO: dygraph to static graph is not good for tacotron2_ljspeech now
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_ljspeech \
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ # --inference_dir=${train_output_path}/inference
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
index a94f955a..f90db915 100755
--- a/examples/ljspeech/tts0/local/train.sh
+++ b/examples/ljspeech/tts0/local/train.sh
@@ -1,9 +1,12 @@
#!/bin/bash
-preprocess_path=$1
+config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
- --data=${preprocess_path} \
- --output=${train_output_path} \
- --ngpu=1 \
\ No newline at end of file
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=1 \
+ --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/run.sh b/examples/ljspeech/tts0/run.sh
index 47c76c3d..c64fa888 100755
--- a/examples/ljspeech/tts0/run.sh
+++ b/examples/ljspeech/tts0/run.sh
@@ -3,13 +3,13 @@
set -e
source path.sh
-gpus=0
+gpus=0,1
stage=0
stop_stage=100
-preprocess_path=preprocessed_ljspeech
-train_output_path=output
-ckpt_name=step-35000
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_201.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
@@ -18,16 +18,20 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
- ./local/preprocess.sh ${preprocess_path} || exit -1
+ ./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
- CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # train model, all `ckpt` under `train_output_path/checkpoints/` dir
- CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1
+ # synthesize, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml
index 6b495eff..456b6a1e 100644
--- a/examples/ljspeech/tts1/conf/default.yaml
+++ b/examples/ljspeech/tts1/conf/default.yaml
@@ -63,9 +63,9 @@ model: # keyword arguments for the selected model
# UPDATER SETTING #
###########################################################
updater:
- use_masking: true # whether to apply masking for padded part in loss calculation
+ use_masking: True # whether to apply masking for padded part in loss calculation
loss_type: L1
- use_guided_attn_loss: true # whether to use guided attention loss
+ use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma in guided attention loss
guided_attn_loss_lambda: 10.0 # lambda in guided attention loss
modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index f3602c34..f5e919c0 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -1,4 +1,4 @@
-# FastSpeech2 with the LJSpeech-1.1
+# FastSpeech2 with LJSpeech-1.1
This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Dataset
diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml
index 872dafcb..5305c912 100644
--- a/examples/ljspeech/tts3/conf/default.yaml
+++ b/examples/ljspeech/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
diff --git a/examples/ljspeech/voc0/run.sh b/examples/ljspeech/voc0/run.sh
index ddd82cb4..b040c0b2 100755
--- a/examples/ljspeech/voc0/run.sh
+++ b/examples/ljspeech/voc0/run.sh
@@ -10,7 +10,7 @@ stop_stage=100
preprocess_path=preprocessed_ljspeech
train_output_path=output
# mel generated by Tacotron2
-input_mel_path=../tts0/output/test
+input_mel_path=${preprocess_path}/mel_test
ckpt_name=step-10000
# with the following command, you can choose the stage range you want to run
@@ -28,5 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ mkdir -p ${preprocess_path}/mel_test
+ cp ${preprocess_path}/mel/LJ050-001*.npy ${preprocess_path}/mel_test/
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1
fi
diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml
index 2d39beb7..d30960d6 100644
--- a/examples/ljspeech/voc1/conf/default.yaml
+++ b/examples/ljspeech/voc1/conf/default.yaml
@@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
- use_weight_norm: true # Whether to use weight norm.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift
@@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Number of output channels.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of chnn layers.
- bias: true # Whether to use bias parameter in conv.
- use_weight_norm: true # Whether to use weight norm.
+ bias: True # Whether to use bias parameter in conv.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
index fb8b321c..f6e185ff 100644
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
@@ -162,39 +162,17 @@ class DeepSpeech2Model(nn.Layer):
return loss
@paddle.no_grad()
- def decode(self, audio, audio_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes):
- # init once
+ def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8
- self.decoder.init_decode(
- beam_alpha=beam_alpha,
- beam_beta=beam_beta,
- lang_model_path=lang_model_path,
- vocab_list=vocab_list,
- decoding_method=decoding_method)
+ # Make sure the decoder has been initialized
eouts, eouts_len = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
- print("probs.shape", probs.shape)
- return self.decoder.decode_probs(
- probs.numpy(), eouts_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes)
-
- def decode_probs_split(self, probs_split, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size,
- cutoff_prob, cutoff_top_n, num_processes):
- self.decoder.init_decode(
- beam_alpha=beam_alpha,
- beam_beta=beam_beta,
- lang_model_path=lang_model_path,
- vocab_list=vocab_list,
- decoding_method=decoding_method)
- return self.decoder.decode_probs_split(
- probs_split, vocab_list, decoding_method, lang_model_path,
- beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n,
- num_processes)
+ batch_size = probs.shape[0]
+ self.decoder.reset_decoder(batch_size=batch_size)
+ self.decoder.next(probs, eouts_len)
+ trans_best, trans_beam = self.decoder.decode()
+ return trans_best
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py
index 2a38fb5c..11b85442 100644
--- a/examples/other/1xt2x/src_deepspeech2x/test_model.py
+++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py
@@ -254,12 +254,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
- vocab_list = self.test_loader.collate_fn.vocab_list
-
target_transcripts = self.ordid2token(texts, texts_len)
- result_transcripts = self.compute_result_transcripts(audio, audio_len,
- vocab_list, cfg)
+ result_transcripts = self.compute_result_transcripts(audio, audio_len)
+
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
errors, len_ref = errors_func(target, result)
@@ -280,19 +278,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
error_rate=errors_sum / len_refs,
error_rate_type=cfg.error_rate_type)
- def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
- result_transcripts = self.model.decode(
- audio,
- audio_len,
- vocab_list,
- decoding_method=cfg.decoding_method,
- lang_model_path=cfg.lang_model_path,
- beam_alpha=cfg.alpha,
- beam_beta=cfg.beta,
- beam_size=cfg.beam_size,
- cutoff_prob=cfg.cutoff_prob,
- cutoff_top_n=cfg.cutoff_top_n,
- num_processes=cfg.num_proc_bsearch)
+ def compute_result_transcripts(self, audio, audio_len):
+ result_transcripts = self.model.decode(audio, audio_len)
+
result_transcripts = [
self._text_featurizer.detokenize(item)
for item in result_transcripts
@@ -307,6 +295,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cfg = self.config
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
+
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.test_loader.collate_fn.vocab_list
+ decode_batch_size = self.test_loader.batch_size
+ self.model.decoder.init_decoder(
+ decode_batch_size, vocab_list, decode_cfg.decoding_method,
+ decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
+ decode_cfg.beam_size, decode_cfg.cutoff_prob,
+ decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
+
with open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
@@ -326,6 +325,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
+ self.model.decoder.del_decoder()
def run_test(self):
self.resume_or_scratch()
diff --git a/examples/other/g2p/run.sh b/examples/other/g2p/run.sh
index 214b8b3d..9794e791 100755
--- a/examples/other/g2p/run.sh
+++ b/examples/other/g2p/run.sh
@@ -4,6 +4,10 @@ source path.sh
USE_SCLITE=true
# test g2p
+if [ ! -d ~/datasets/BZNSYP ];then
+ echo "Please download BZNSYP dataset"
+ exit
+fi
echo "Start get g2p test data ..."
python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p
echo "Start test g2p ..."
diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml
index ed081cf4..7d8d1daf 100644
--- a/examples/ted_en_zh/st0/conf/tuning/decode.yaml
+++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml
@@ -1,8 +1,9 @@
-batch_size: 5
+batch_size: 1
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
beam_size: 10
word_reward: 0.7
+maxlenratio: 0.3
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml
index d6104dbc..4f10acf7 100644
--- a/examples/ted_en_zh/st1/conf/tuning/decode.yaml
+++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml
@@ -1,9 +1,10 @@
-batch_size: 5
+batch_size: 1
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
beam_size: 10
word_reward: 0.7
+maxlenratio: 0.3
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
diff --git a/examples/thchs30/align0/README.md b/examples/thchs30/align0/README.md
index da56fffc..5195ab80 100644
--- a/examples/thchs30/align0/README.md
+++ b/examples/thchs30/align0/README.md
@@ -27,7 +27,7 @@ cd a0
应用程序会自动下载 THCHS-30数据集,处理成 MFA 所需的文件格式并开始训练,您可以修改 `run.sh` 中的参数 `LEXICON_NAME` 来决定您需要强制对齐的级别(word、syllable 和 phone)
## MFA 所使用的字典
---
-MFA 字典的格式请参考: [MFA 官方文档 Dictionary format ](https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html)
+MFA 字典的格式请参考: [MFA 官方文档](https://montreal-forced-aligner.readthedocs.io/en/latest/)
phone.lexicon 直接使用的是 `THCHS-30/data_thchs30/lm_phone/lexicon.txt`
word.lexicon 考虑到了中文的多音字,使用**带概率的字典**, 生成规则请参考 `local/gen_word2phone.py`
`syllable.lexicon` 获取自 [DNSun/thchs30-pinyin2tone](https://github.com/DNSun/thchs30-pinyin2tone)
@@ -39,4 +39,4 @@ word.lexicon 考虑到了中文的多音字,使用**带概率的字典**, 生
**syllabel 级别:** [syllable.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/syllable.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/syllable/thchs30_model.zip)
**word 级别:** [word.lexicon](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/word.lexicon)、[对齐结果](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_alignment.tar.gz)、[模型](https://paddlespeech.bj.bcebos.com/MFA/THCHS30/word/thchs30_model.zip)
-随后,您可以参考 [MFA 官方文档 Align using pretrained models](https://montreal-forced-aligner.readthedocs.io/en/stable/aligning.html#align-using-pretrained-models) 使用我们给您提供好的模型直接对自己的数据集进行强制对齐,注意,您需要使用和模型对应的 lexicon 文件,当文本是汉字时,您需要用空格把不同的**汉字**(而不是词语)分开
+随后,您可以参考 [MFA 官方文档](https://montreal-forced-aligner.readthedocs.io/en/latest/) 使用我们给您提供好的模型直接对自己的数据集进行强制对齐,注意,您需要使用和模型对应的 lexicon 文件,当文本是汉字时,您需要用空格把不同的**汉字**(而不是词语)分开
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 74c1086a..157949d1 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -240,13 +240,14 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--am_ckpt=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \
--am_stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \
--voc=pwgan_vctk \
- --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
- --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
- --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
+ --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \
--speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \
- --spk_id=0
+ --spk_id=0 \
+ --inference_dir=exp/default/inference
```
diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml
index 2738e7c2..1bca9107 100644
--- a/examples/vctk/tts3/conf/default.yaml
+++ b/examples/vctk/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
- stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
- stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
diff --git a/examples/vctk/tts3/local/inference.sh b/examples/vctk/tts3/local/inference.sh
new file mode 100755
index 00000000..caef89d8
--- /dev/null
+++ b/examples/vctk/tts3/local/inference.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_vctk \
+ --voc=pwgan_vctk \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --speaker_dict=dump/speaker_id_map.txt \
+ --spk_id=0 \
+ --lang=en
+fi
+
diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh
index 51bb9e19..60d56d1c 100755
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
@@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
- --spk_id=0
+ --spk_id=0 \
+ --inference_dir=${train_output_path}/inference
diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml
index 59ce3825..af859d4c 100644
--- a/examples/vctk/voc1/conf/default.yaml
+++ b/examples/vctk/voc1/conf/default.yaml
@@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
- use_weight_norm: true # Whether to use weight norm.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Number of output channels.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of chnn layers.
- bias: true # Whether to use bias parameter in conv.
- use_weight_norm: true # Whether to use weight norm.
+ bias: True # Whether to use bias parameter in conv.
+ use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
new file mode 100644
index 00000000..2c8ad138
--- /dev/null
+++ b/examples/voxceleb/README.md
@@ -0,0 +1,8 @@
+
+dataset info: refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about)
+
+sv0 - speaker verification with softmax backend etc, all python code
+ for more info refer to the sv0/readme.txt
+
+sv1 - dependence on kaldi, speaker verification with plda/sc backend,
+ for more info refer to the sv1/readme.txt
diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
new file mode 100644
index 00000000..c92ede1a
--- /dev/null
+++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python3
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Make VoxCeleb1 trial of kaldi format
+this script creates the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt
+to kaldi trial format
+"""
+
+import argparse
+import codecs
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument("--voxceleb_trial",
+ default="voxceleb1_test_v2",
+ type=str,
+ help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt")
+parser.add_argument("--trial",
+ default="data/test/trial",
+ type=str,
+ help="Kaldi format trial file")
+args = parser.parse_args()
+
+def main(voxceleb_trial, trial):
+ """
+ VoxCeleb provides several trial files, whose format is different from the kaldi format.
+
+ VoxCeleb format's meaning is as following:
+ --------------------------------
+ target_or_nontarget path1 path2
+ --------------------------------
+ target_or_nontarget is an integer: 1 target path1 is equal to path2
+ 0 nontarget path1 is unequal to path2
+ path1: spkr_id/rec_id/name
+ path2: spkr_id/rec_id/name
+
+ Kaldi format's meaning is as following:
+ ---------------------------------------
+ utt_id1 utt_id2 target_or_nontarget
+ ---------------------------------------
+ utt_id1: utterance identification or speaker identification
+ utt_id2: utterance identification or speaker identification
+ target_or_nontarget is a string: 'target' utt_id1 is equal to utt_id2
+ 'nontarget' utt_id1 is unequal to utt_id2
+ """
+ print("Start converting the voxceleb trial to kaldi format")
+ if not os.path.exists(voxceleb_trial):
+ raise RuntimeError("{} does not exist. Please input the correct file path".format(voxceleb_trial))
+
+ trial_dirname = os.path.dirname(trial)
+ if not os.path.exists(trial_dirname):
+ os.mkdir(trial_dirname)
+
+ with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
+ codecs.open(trial, 'w', encoding='utf-8') as w:
+ for line in f:
+ target_or_nontarget, path1, path2 = line.strip().split()
+
+ utt_id1 = "-".join(path1.split("/"))
+ utt_id2 = "-".join(path2.split("/"))
+ target = "nontarget"
+ if int(target_or_nontarget):
+ target = "target"
+ w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
+ print("Converted the voxceleb trial to kaldi format successfully")
+
+if __name__ == "__main__":
+ main(args.voxceleb_trial, args.trial)
diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md
index 4dc68c6f..825c32f0 100644
--- a/paddleaudio/CHANGELOG.md
+++ b/paddleaudio/CHANGELOG.md
@@ -1,2 +1 @@
# Changelog
-
diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py
index b717777d..2685cf57 100644
--- a/paddleaudio/__init__.py
+++ b/paddleaudio/__init__.py
@@ -13,5 +13,3 @@
# limitations under the License.
from .backends import *
from .features import *
-
-__version__ = '0.1.0'
diff --git a/paddleaudio/features/core.py b/paddleaudio/features/core.py
index d3c2e290..01925ec6 100644
--- a/paddleaudio/features/core.py
+++ b/paddleaudio/features/core.py
@@ -415,11 +415,11 @@ def mfcc(x,
**kwargs)
# librosa mfcc:
- spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
+ spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512,
win_length=512,
hop_length=320,
n_mels=64, fmin=50)
- b = librosa.feature.mfcc(x,
+ b = librosa.feature.mfcc(y=x,
sr=16000,
S=spect,
n_mfcc=20,
diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index 8d32f287..185a92b8 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-__version__ = '0.1.0'
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index aa4e31d9..ef769fbc 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import io
import os
import sys
from typing import List
@@ -23,9 +22,9 @@ import librosa
import numpy as np
import paddle
import soundfile
-import yaml
from yacs.config import CfgNode
+from ..download import get_path_from_url
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
@@ -64,14 +63,61 @@ pretrained_models = {
'ckpt_path':
'exp/transformer/checkpoints/avg_10',
},
+ "deepspeech2offline_aishell-zh-16k": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ '932c3593d62fe5c741b59b31318aa314',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2/checkpoints/avg_1',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+ "deepspeech2online_aishell-zh-16k": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ 'd5e076217cf60486519f72c217d21b9b',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2_online/checkpoints/avg_1',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+ "deepspeech2offline_librispeech-en-16k": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ 'f5666c81ad015c8de03aac2bc92e5762',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2/checkpoints/avg_1',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
+ 'lm_md5':
+ '099a601759d467cd0a8523ff939819c5'
+ },
}
model_alias = {
- "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
- "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
- "conformer": "paddlespeech.s2t.models.u2:U2Model",
- "transformer": "paddlespeech.s2t.models.u2:U2Model",
- "wenetspeech": "paddlespeech.s2t.models.u2:U2Model",
+ "deepspeech2offline":
+ "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
+ "deepspeech2online":
+ "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
+ "conformer":
+ "paddlespeech.s2t.models.u2:U2Model",
+ "transformer":
+ "paddlespeech.s2t.models.u2:U2Model",
+ "wenetspeech":
+ "paddlespeech.s2t.models.u2:U2Model",
}
@@ -95,7 +141,8 @@ class ASRExecutor(BaseExecutor):
'--lang',
type=str,
default='zh',
- help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]')
+ help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]'
+ )
self.parser.add_argument(
"--sample_rate",
type=int,
@@ -111,7 +158,10 @@ class ASRExecutor(BaseExecutor):
'--decode_method',
type=str,
default='attention_rescoring',
- choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'],
+ choices=[
+ 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention',
+ 'attention_rescoring'
+ ],
help='only support transformer and conformer model')
self.parser.add_argument(
'--ckpt_path',
@@ -135,8 +185,9 @@ class ASRExecutor(BaseExecutor):
"""
Download and returns pretrained resources path of current task.
"""
- assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
- tag)
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, 'The model "{}" you want to use is not supported, please choose other models.\nThe supported models include:\n\t\t{}\n'.format(
+ tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -187,13 +238,21 @@ class ASRExecutor(BaseExecutor):
if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
from paddlespeech.s2t.io.collator import SpeechCollator
self.vocab = self.config.vocab_filepath
- self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path)
+ self.config.decode.lang_model_path = os.path.join(
+ MODEL_HOME, 'language_model',
+ self.config.decode.lang_model_path)
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(
- unit_type=self.config.unit_type,
- vocab=self.vocab)
+ unit_type=self.config.unit_type, vocab=self.vocab)
+ lm_url = pretrained_models[tag]['lm_url']
+ lm_md5 = pretrained_models[tag]['lm_md5']
+ self.download_lm(
+ lm_url,
+ os.path.dirname(self.config.decode.lang_model_path), lm_md5)
+
elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
- self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix)
+ self.config.spm_model_prefix = os.path.join(
+ self.res_path, self.config.spm_model_prefix)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
@@ -252,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio = audio[:, 0]
# pcm16 -> pcm 32
audio = self._pcm16to32(audio)
- audio = librosa.resample(audio, audio_sample_rate,
- self.sample_rate)
+ audio = librosa.resample(
+ audio,
+ orig_sr=audio_sample_rate,
+ target_sr=self.sample_rate)
audio_sample_rate = self.sample_rate
# pcm32 -> pcm 16
audio = self._pcm32to16(audio)
@@ -284,18 +345,15 @@ class ASRExecutor(BaseExecutor):
audio = self._inputs["audio"]
audio_len = self._inputs["audio_len"]
if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
- result_transcripts = self.model.decode(
- audio,
- audio_len,
- self.text_feature.vocab_list,
- decoding_method=cfg.decoding_method,
- lang_model_path=cfg.lang_model_path,
- beam_alpha=cfg.alpha,
- beam_beta=cfg.beta,
- beam_size=cfg.beam_size,
- cutoff_prob=cfg.cutoff_prob,
- cutoff_top_n=cfg.cutoff_top_n,
- num_processes=cfg.num_proc_bsearch)
+ decode_batch_size = audio.shape[0]
+ self.model.decoder.init_decoder(
+ decode_batch_size, self.text_feature.vocab_list,
+ cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
+ cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
+ cfg.num_proc_bsearch)
+
+ result_transcripts = self.model.decode(audio, audio_len)
+ self.model.decoder.del_decoder()
self._outputs["result"] = result_transcripts[0]
elif "conformer" in model_type or "transformer" in model_type:
@@ -319,6 +377,13 @@ class ASRExecutor(BaseExecutor):
"""
return self._outputs["result"]
+ def download_lm(self, url, lm_dir, md5sum):
+ download_path = get_path_from_url(
+ url=url,
+ root_dir=lm_dir,
+ md5sum=md5sum,
+ decompress=False, )
+
def _pcm16to32(self, audio):
assert (audio.dtype == np.int16)
audio = audio.astype("float32")
@@ -411,7 +476,7 @@ class ASRExecutor(BaseExecutor):
try:
res = self(audio_file, model, lang, sample_rate, config, ckpt_path,
- decode_method, force_yes, device)
+ decode_method, force_yes, device)
logger.info('ASR Result: {}'.format(res))
return True
except Exception as e:
@@ -435,7 +500,8 @@ class ASRExecutor(BaseExecutor):
audio_file = os.path.abspath(audio_file)
self._check(audio_file, sample_rate, force_yes)
paddle.set_device(device)
- self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path)
+ self._init_from_path(model, lang, sample_rate, config, decode_method,
+ ckpt_path)
self.preprocess(model, audio_file)
self.infer(model)
res = self.postprocess() # Retrieve result of asr.
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index 52bc1972..5839ff30 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -114,8 +114,9 @@ class CLSExecutor(BaseExecutor):
"""
Download and returns pretrained resources path of current task.
"""
- assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
- tag)
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, 'The model "{}" you want to use is not supported, please choose other models.\nThe supported models include:\n\t\t{}\n'.format(
+ tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index 1276424c..1709c754 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -112,8 +112,9 @@ class STExecutor(BaseExecutor):
"""
Download and returns pretrained resources path of current task.
"""
- assert tag in pretrained_models, "Can not find pretrained resources of {}.".format(
- tag)
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, 'The model "{}" you want to use is not supported, please choose other models.\nThe supported models include:\n\t\t{}\n'.format(
+ tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -173,8 +174,8 @@ class STExecutor(BaseExecutor):
self.config.decode.decoding_method = "fullsentence"
with UpdateConfig(self.config):
- self.config.cmvn_path = os.path.join(
- res_path, self.config.cmvn_path)
+ self.config.cmvn_path = os.path.join(res_path,
+ self.config.cmvn_path)
self.config.spm_model_prefix = os.path.join(
res_path, self.config.spm_model_prefix)
self.text_feature = TextFeaturizer(
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 1cef8fcf..b0977c88 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -124,8 +124,9 @@ class TextExecutor(BaseExecutor):
"""
Download and returns pretrained resources path of current task.
"""
- assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
- tag)
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, 'The model "{}" you want to use is not supported, please choose other models.\nThe supported models include:\n\t\t{}\n'.format(
+ tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index a39a5c4e..dfd6a42f 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -117,6 +117,36 @@ pretrained_models = {
'speaker_dict':
'speaker_id_map.txt',
},
+ # tacotron2
+ "tacotron2_csmsc-zh": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
+ 'md5':
+ '0df4b6f0bcbe0d73c5ed6df8867ab91a',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_30600.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+ "tacotron2_ljspeech-en": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
+ 'md5':
+ '6a5eddd81ae0e81d16959b97481135f3',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_60300.pdz',
+ 'speech_stats':
+ 'speech_stats.npy',
+ 'phones_dict':
+ 'phone_id_map.txt',
+ },
+
# pwgan
"pwgan_csmsc-zh": {
'url':
@@ -205,6 +235,20 @@ pretrained_models = {
'speech_stats':
'feats_stats.npy',
},
+
+ # wavernn
+ "wavernn_csmsc-zh": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
+ 'md5':
+ 'ee37b752f09bcba8f2af3b777ca38e13',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_400000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ }
}
model_alias = {
@@ -217,6 +261,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+ "tacotron2":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
+ "tacotron2_inference":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@@ -234,6 +282,10 @@ model_alias = {
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
+ "wavernn":
+ "paddlespeech.t2s.models.wavernn:WaveRNN",
+ "wavernn_inference":
+ "paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
@@ -253,9 +305,13 @@ class TTSExecutor(BaseExecutor):
type=str,
default='fastspeech2_csmsc',
choices=[
- 'speedyspeech_csmsc', 'fastspeech2_csmsc',
- 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
- 'fastspeech2_vctk'
+ 'speedyspeech_csmsc',
+ 'fastspeech2_csmsc',
+ 'fastspeech2_ljspeech',
+ 'fastspeech2_aishell3',
+ 'fastspeech2_vctk',
+ 'tacotron2_csmsc',
+ 'tacotron2_ljspeech',
],
help='Choose acoustic model type of tts task.')
self.parser.add_argument(
@@ -300,8 +356,14 @@ class TTSExecutor(BaseExecutor):
type=str,
default='pwgan_csmsc',
choices=[
- 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
- 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc'
+ 'pwgan_csmsc',
+ 'pwgan_ljspeech',
+ 'pwgan_aishell3',
+ 'pwgan_vctk',
+ 'mb_melgan_csmsc',
+ 'style_melgan_csmsc',
+ 'hifigan_csmsc',
+ 'wavernn_csmsc',
],
help='Choose vocoder type of tts task.')
@@ -340,8 +402,9 @@ class TTSExecutor(BaseExecutor):
"""
Download and returns pretrained resources path of current task.
"""
- assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
- tag)
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, 'The model "{}" you want to use is not supported, please choose other models.\nThe supported models include:\n\t\t{}\n'.format(
+ tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -368,7 +431,7 @@ class TTSExecutor(BaseExecutor):
"""
Init model and other resources from a specific path.
"""
- if hasattr(self, 'am') and hasattr(self, 'voc'):
+ if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'):
logger.info('Models had been initialized.')
return
# am
@@ -488,6 +551,8 @@ class TTSExecutor(BaseExecutor):
vocab_size=vocab_size,
tone_size=tone_size,
**self.am_config["model"])
+ elif am_name == 'tacotron2':
+ am = am_class(idim=vocab_size, odim=odim, **self.am_config["model"])
am.set_state_dict(paddle.load(self.am_ckpt)["main_params"])
am.eval()
@@ -505,10 +570,15 @@ class TTSExecutor(BaseExecutor):
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference',
model_alias)
- voc = voc_class(**self.voc_config["generator_params"])
- voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
- voc.remove_weight_norm()
- voc.eval()
+ if voc_name != 'wavernn':
+ voc = voc_class(**self.voc_config["generator_params"])
+ voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
+ voc.remove_weight_norm()
+ voc.eval()
+ else:
+ voc = voc_class(**self.voc_config["model"])
+ voc.set_state_dict(paddle.load(self.voc_ckpt)["main_params"])
+ voc.eval()
voc_mu, voc_std = np.load(self.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py
index 63b670c8..d7dcc90c 100644
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -24,14 +24,17 @@ from typing import Any
from typing import Dict
import paddle
-import paddleaudio
import requests
import yaml
from paddle.framework import load
+import paddleaudio
from . import download
-from .. import __version__
from .entry import commands
+try:
+ from .. import __version__
+except ImportError:
+ __version__ = "0.0.0" # for develop branch
requests.adapters.DEFAULT_RETRIES = 3
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/__init__.py b/paddlespeech/s2t/decoders/ctcdecoder/__init__.py
index 185a92b8..37ceae6e 100644
--- a/paddlespeech/s2t/decoders/ctcdecoder/__init__.py
+++ b/paddlespeech/s2t/decoders/ctcdecoder/__init__.py
@@ -11,3 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from .swig_wrapper import ctc_beam_search_decoding
+from .swig_wrapper import ctc_beam_search_decoding_batch
+from .swig_wrapper import ctc_greedy_decoding
+from .swig_wrapper import CTCBeamSearchDecoder
+from .swig_wrapper import Scorer
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py
index d883d430..9e2a8506 100644
--- a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py
+++ b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrapper for various CTC decoders in SWIG."""
-import swig_decoders
+import paddlespeech_ctcdecoders
-class Scorer(swig_decoders.Scorer):
+class Scorer(paddlespeech_ctcdecoders.Scorer):
"""Wrapper for Scorer.
:param alpha: Parameter associated with language model. Don't use
@@ -26,14 +26,17 @@ class Scorer(swig_decoders.Scorer):
:type beta: float
:model_path: Path to load language model.
:type model_path: str
+ :param vocabulary: Vocabulary list.
+ :type vocabulary: list
"""
def __init__(self, alpha, beta, model_path, vocabulary):
- swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
+ paddlespeech_ctcdecoders.Scorer.__init__(self, alpha, beta, model_path,
+ vocabulary)
-def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
- """Wrapper for ctc best path decoder in swig.
+def ctc_greedy_decoding(probs_seq, vocabulary, blank_id):
+ """Wrapper for ctc best path decoding function in swig.
:param probs_seq: 2-D list of probability distributions over each time
step, with each element being a list of normalized
@@ -44,19 +47,19 @@ def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
:return: Decoding result string.
:rtype: str
"""
- result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
- blank_id)
+ result = paddlespeech_ctcdecoders.ctc_greedy_decoding(probs_seq.tolist(),
+ vocabulary, blank_id)
return result
-def ctc_beam_search_decoder(probs_seq,
- vocabulary,
- beam_size,
- cutoff_prob=1.0,
- cutoff_top_n=40,
- ext_scoring_func=None,
- blank_id=0):
- """Wrapper for the CTC Beam Search Decoder.
+def ctc_beam_search_decoding(probs_seq,
+ vocabulary,
+ beam_size,
+ cutoff_prob=1.0,
+ cutoff_top_n=40,
+ ext_scoring_func=None,
+ blank_id=0):
+ """Wrapper for the CTC Beam Search Decoding function.
:param probs_seq: 2-D list of probability distributions over each time
step, with each element being a list of normalized
@@ -81,22 +84,22 @@ def ctc_beam_search_decoder(probs_seq,
results, in descending order of the probability.
:rtype: list
"""
- beam_results = swig_decoders.ctc_beam_search_decoder(
+ beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding(
probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
ext_scoring_func, blank_id)
beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
return beam_results
-def ctc_beam_search_decoder_batch(probs_split,
- vocabulary,
- beam_size,
- num_processes,
- cutoff_prob=1.0,
- cutoff_top_n=40,
- ext_scoring_func=None,
- blank_id=0):
- """Wrapper for the batched CTC beam search decoder.
+def ctc_beam_search_decoding_batch(probs_split,
+ vocabulary,
+ beam_size,
+ num_processes,
+ cutoff_prob=1.0,
+ cutoff_top_n=40,
+ ext_scoring_func=None,
+ blank_id=0):
+ """Wrapper for the batched CTC beam search decoding function.
:param probs_seq: 3-D list with each element as an instance of 2-D list
of probabilities used by ctc_beam_search_decoder().
@@ -126,9 +129,31 @@ def ctc_beam_search_decoder_batch(probs_split,
"""
probs_split = [probs_seq.tolist() for probs_seq in probs_split]
- batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
+ batch_beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func, blank_id)
batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results]
return batch_beam_results
+
+
+class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
+ """Wrapper for CtcBeamSearchDecoderBatch.
+ Args:
+ vocab_list (list): Vocabulary list.
+ beam_size (int): Width for beam search.
+ num_processes (int): Number of parallel processes.
+ cutoff_prob (float): Cutoff probability in vocabulary pruning,
+ default 1.0, no pruning.
+ cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n
+ characters with highest probs in vocabulary will be
+ used in beam search, default 40.
+ ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count
+ or language model.
+ """
+
+ def __init__(self, vocab_list, batch_size, beam_size, num_processes,
+ cutoff_prob, cutoff_top_n, _ext_scorer, blank_id):
+ paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch.__init__(
+ self, vocab_list, batch_size, beam_size, num_processes, cutoff_prob,
+ cutoff_top_n, _ext_scorer, blank_id)
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 049311c7..3e9ede76 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -267,12 +267,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
- vocab_list = self.test_loader.collate_fn.vocab_list
-
target_transcripts = self.ordid2token(texts, texts_len)
- result_transcripts = self.compute_result_transcripts(
- audio, audio_len, vocab_list, decode_cfg)
+ result_transcripts = self.compute_result_transcripts(audio, audio_len)
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
@@ -296,21 +293,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
error_rate=errors_sum / len_refs,
error_rate_type=decode_cfg.error_rate_type)
- def compute_result_transcripts(self, audio, audio_len, vocab_list,
- decode_cfg):
- result_transcripts = self.model.decode(
- audio,
- audio_len,
- vocab_list,
- decoding_method=decode_cfg.decoding_method,
- lang_model_path=decode_cfg.lang_model_path,
- beam_alpha=decode_cfg.alpha,
- beam_beta=decode_cfg.beta,
- beam_size=decode_cfg.beam_size,
- cutoff_prob=decode_cfg.cutoff_prob,
- cutoff_top_n=decode_cfg.cutoff_top_n,
- num_processes=decode_cfg.num_proc_bsearch)
-
+ def compute_result_transcripts(self, audio, audio_len):
+ result_transcripts = self.model.decode(audio, audio_len)
return result_transcripts
@mp_tools.rank_zero_only
@@ -320,6 +304,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.model.eval()
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
+
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.test_loader.collate_fn.vocab_list
+ decode_batch_size = self.test_loader.batch_size
+ self.model.decoder.init_decoder(
+ decode_batch_size, vocab_list, decode_cfg.decoding_method,
+ decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
+ decode_cfg.beam_size, decode_cfg.cutoff_prob,
+ decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
+
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
@@ -339,6 +334,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
+ self.model.decoder.del_decoder()
@paddle.no_grad()
def export(self):
@@ -377,6 +373,22 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
self.model.eval()
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
+
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.test_loader.collate_fn.vocab_list
+ if self.args.model_type == "online":
+ decode_batch_size = 1
+ elif self.args.model_type == "offline":
+ decode_batch_size = self.test_loader.batch_size
+ else:
+ raise Exception("wrong model type")
+ self.model.decoder.init_decoder(
+ decode_batch_size, vocab_list, decode_cfg.decoding_method,
+ decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
+ decode_cfg.beam_size, decode_cfg.cutoff_prob,
+ decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
+
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
@@ -388,7 +400,6 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
error_rate_type = metrics['error_rate_type']
logger.info("Error rate [%s] (%d/?) = %f" %
(error_rate_type, num_ins, errors_sum / len_refs))
-
# logging
msg = "Test: "
msg += "epoch: {}, ".format(self.epoch)
@@ -398,30 +409,31 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
logger.info(msg)
if self.args.enable_auto_log is True:
self.autolog.report()
+ self.model.decoder.del_decoder()
- def compute_result_transcripts(self, audio, audio_len, vocab_list,
- decode_cfg):
+ def compute_result_transcripts(self, audio, audio_len):
if self.args.model_type == "online":
- output_probs, output_lens = self.static_forward_online(audio,
- audio_len)
+ output_probs, output_lens, trans_batch = self.static_forward_online(
+ audio, audio_len, decoder_chunk_size=1)
+ result_transcripts = [trans[-1] for trans in trans_batch]
elif self.args.model_type == "offline":
output_probs, output_lens = self.static_forward_offline(audio,
audio_len)
+ batch_size = output_probs.shape[0]
+ self.model.decoder.reset_decoder(batch_size=batch_size)
+
+ self.model.decoder.next(output_probs, output_lens)
+
+ trans_best, trans_beam = self.model.decoder.decode()
+
+ result_transcripts = trans_best
+
else:
raise Exception("wrong model type")
self.predictor.clear_intermediate_tensor()
self.predictor.try_shrink_memory()
- self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta,
- decode_cfg.lang_model_path, vocab_list,
- decode_cfg.decoding_method)
-
- result_transcripts = self.model.decoder.decode_probs(
- output_probs, output_lens, vocab_list, decode_cfg.decoding_method,
- decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
- decode_cfg.beam_size, decode_cfg.cutoff_prob,
- decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
#replace the with ' '
result_transcripts = [
self._text_featurizer.detokenize(sentence)
@@ -451,6 +463,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
-------
output_probs(numpy.array): shape[B, T, vocab_size]
output_lens(numpy.array): shape[B]
+ trans(list(list(str))): shape[B, T]
"""
output_probs_list = []
output_lens_list = []
@@ -464,14 +477,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
batch_size, Tmax, x_dim = x_batch.shape
x_len_batch = audio_len.numpy().astype(np.int64)
if (Tmax - chunk_size) % chunk_stride != 0:
- padding_len_batch = chunk_stride - (
- Tmax - chunk_size
- ) % chunk_stride # The length of padding for the batch
+ # The length of padding for the batch
+ padding_len_batch = chunk_stride - (Tmax - chunk_size
+ ) % chunk_stride
else:
padding_len_batch = 0
x_list = np.split(x_batch, batch_size, axis=0)
x_len_list = np.split(x_len_batch, batch_size, axis=0)
+ trans_batch = []
for x, x_len in zip(x_list, x_len_list):
if self.args.enable_auto_log is True:
self.autolog.times.start()
@@ -504,12 +518,14 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
h_box_handle = self.predictor.get_input_handle(input_names[2])
c_box_handle = self.predictor.get_input_handle(input_names[3])
+ trans = []
probs_chunk_list = []
probs_chunk_lens_list = []
if self.args.enable_auto_log is True:
# record the model preprocessing time
self.autolog.times.stamp()
+ self.model.decoder.reset_decoder(batch_size=1)
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
@@ -518,9 +534,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
x_chunk_lens = 0
else:
x_chunk_lens = min(x_len - i * chunk_stride, chunk_size)
-
- if (x_chunk_lens <
- receptive_field_length): #means the number of input frames in the chunk is not enough for predicting one prob
+ #means the number of input frames in the chunk is not enough for predicting one prob
+ if (x_chunk_lens < receptive_field_length):
break
x_chunk_lens = np.array([x_chunk_lens])
audio_handle.reshape(x_chunk.shape)
@@ -549,9 +564,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
output_chunk_lens = output_lens_handle.copy_to_cpu()
chunk_state_h_box = output_state_h_handle.copy_to_cpu()
chunk_state_c_box = output_state_c_handle.copy_to_cpu()
-
+ self.model.decoder.next(output_chunk_probs, output_chunk_lens)
probs_chunk_list.append(output_chunk_probs)
probs_chunk_lens_list.append(output_chunk_lens)
+ trans_best, trans_beam = self.model.decoder.decode()
+ trans.append(trans_best[0])
+ trans_batch.append(trans)
output_probs = np.concatenate(probs_chunk_list, axis=1)
output_lens = np.sum(probs_chunk_lens_list, axis=0)
vocab_size = output_probs.shape[2]
@@ -573,7 +591,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
self.autolog.times.end()
output_probs = np.concatenate(output_probs_list, axis=0)
output_lens = np.concatenate(output_lens_list, axis=0)
- return output_probs, output_lens
+ return output_probs, output_lens, trans_batch
def static_forward_offline(self, audio, audio_len):
"""
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 85bb877b..d7bee6d7 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -175,7 +175,7 @@ class U2Trainer(Trainer):
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
- observation['ips,sent./sec'] = observation[
+ observation['ips,samples/s'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k.split(',')[0]}: "
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index b03ca38b..6a32eda7 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -285,7 +285,7 @@ class U2STTrainer(Trainer):
subsampling_factor=1,
load_aux_output=load_transcript,
num_encs=1,
- dist_sampler=True)
+ dist_sampler=False)
logger.info("Setup train/valid Dataloader!")
else:
# test dataset, return raw text
@@ -408,6 +408,7 @@ class U2STTester(U2STTrainer):
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size,
word_reward=decode_cfg.word_reward,
+ maxlenratio=decode_cfg.maxlenratio,
decoding_chunk_size=decode_cfg.decoding_chunk_size,
num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
simulate_streaming=decode_cfg.simulate_streaming)
@@ -435,6 +436,7 @@ class U2STTester(U2STTrainer):
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size,
word_reward=decode_cfg.word_reward,
+ maxlenratio=decode_cfg.maxlenratio,
decoding_chunk_size=decode_cfg.decoding_chunk_size,
num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks,
simulate_streaming=decode_cfg.simulate_streaming)
diff --git a/paddlespeech/s2t/io/batchfy.py b/paddlespeech/s2t/io/batchfy.py
index f59fb24c..f3630f2e 100644
--- a/paddlespeech/s2t/io/batchfy.py
+++ b/paddlespeech/s2t/io/batchfy.py
@@ -419,7 +419,7 @@ def make_batchset(
# sort it by input lengths (long to short)
sorted_data = sorted(
d.items(),
- key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
+ key=lambda data: float(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 920de34f..55aa13ff 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -61,7 +61,7 @@ class BatchDataLoader():
def __init__(self,
json_file: str,
train_mode: bool,
- sortagrad: bool=False,
+ sortagrad: int=0,
batch_size: int=0,
maxlen_in: float=float('inf'),
maxlen_out: float=float('inf'),
diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py
index ac55af12..89752bb9 100644
--- a/paddlespeech/s2t/io/sampler.py
+++ b/paddlespeech/s2t/io/sampler.py
@@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
"""
rng = np.random.RandomState(epoch)
shift_len = rng.randint(0, batch_size - 1)
- batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
+ batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
rng.shuffle(batch_indices)
batch_indices = [item for batch in batch_indices for item in batch]
assert clipped is False
diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py
index 8d5959c8..b3222067 100644
--- a/paddlespeech/s2t/models/ds2/__init__.py
+++ b/paddlespeech/s2t/models/ds2/__init__.py
@@ -16,7 +16,7 @@ from .deepspeech2 import DeepSpeech2Model
from paddlespeech.s2t.utils import dynamic_pip_install
try:
- import swig_decoders
+ import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index 4a4d67ce..9c6b66c2 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -164,24 +164,18 @@ class DeepSpeech2Model(nn.Layer):
return loss
@paddle.no_grad()
- def decode(self, audio, audio_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes):
- # init once
+ def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8
- self.decoder.init_decode(
- beam_alpha=beam_alpha,
- beam_beta=beam_beta,
- lang_model_path=lang_model_path,
- vocab_list=vocab_list,
- decoding_method=decoding_method)
+ # Make sure the decoder has been initialized
eouts, eouts_len = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
- return self.decoder.decode_probs(
- probs.numpy(), eouts_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes)
+ batch_size = probs.shape[0]
+ self.decoder.reset_decoder(batch_size=batch_size)
+ self.decoder.next(probs, eouts_len)
+ trans_best, trans_beam = self.decoder.decode()
+
+ return trans_best
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
diff --git a/paddlespeech/s2t/models/ds2_online/__init__.py b/paddlespeech/s2t/models/ds2_online/__init__.py
index 2d304237..c5fdab1b 100644
--- a/paddlespeech/s2t/models/ds2_online/__init__.py
+++ b/paddlespeech/s2t/models/ds2_online/__init__.py
@@ -16,7 +16,7 @@ from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
try:
- import swig_decoders
+ import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index 5e4981c0..9574a62b 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -293,25 +293,17 @@ class DeepSpeech2ModelOnline(nn.Layer):
return loss
@paddle.no_grad()
- def decode(self, audio, audio_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes):
- # init once
+ def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8
- self.decoder.init_decode(
- beam_alpha=beam_alpha,
- beam_beta=beam_beta,
- lang_model_path=lang_model_path,
- vocab_list=vocab_list,
- decoding_method=decoding_method)
-
+ # Make sure the decoder has been initialized
eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
probs = self.decoder.softmax(eouts)
- return self.decoder.decode_probs(
- probs.numpy(), eouts_len, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
- cutoff_top_n, num_processes)
+ batch_size = probs.shape[0]
+ self.decoder.reset_decoder(batch_size=batch_size)
+ self.decoder.next(probs, eouts_len)
+ trans_best, trans_beam = self.decoder.decode()
+ return trans_best
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index ff4012e8..91079812 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -32,7 +32,7 @@ from paddlespeech.s2t.frontend.utility import IGNORE_ID
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.models.asr_interface import ASRInterface
from paddlespeech.s2t.modules.cmvn import GlobalCMVN
-from paddlespeech.s2t.modules.ctc import CTCDecoder
+from paddlespeech.s2t.modules.ctc import CTCDecoderBase
from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
@@ -63,7 +63,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
vocab_size: int,
encoder: TransformerEncoder,
decoder: TransformerDecoder,
- ctc: CTCDecoder,
+ ctc: CTCDecoderBase,
ctc_weight: float=0.5,
ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0,
@@ -663,7 +663,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
# (num_hyps, max_hyps_len, vocab_size)
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
hyps_lens)
- decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
+ decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
return decoder_out
@paddle.no_grad()
@@ -840,7 +840,7 @@ class U2Model(U2DecodeModel):
model_conf = configs.get('model_conf', dict())
dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
- ctc = CTCDecoder(
+ ctc = CTCDecoderBase(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 79ca423f..f7b05714 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -28,7 +28,7 @@ from paddle import nn
from paddlespeech.s2t.frontend.utility import IGNORE_ID
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.modules.cmvn import GlobalCMVN
-from paddlespeech.s2t.modules.ctc import CTCDecoder
+from paddlespeech.s2t.modules.ctc import CTCDecoderBase
from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
@@ -56,7 +56,7 @@ class U2STBaseModel(nn.Layer):
encoder: TransformerEncoder,
st_decoder: TransformerDecoder,
decoder: TransformerDecoder=None,
- ctc: CTCDecoder=None,
+ ctc: CTCDecoderBase=None,
ctc_weight: float=0.0,
asr_weight: float=0.0,
ignore_id: int=IGNORE_ID,
@@ -264,14 +264,17 @@ class U2STBaseModel(nn.Layer):
speech_lengths: paddle.Tensor,
beam_size: int=10,
word_reward: float=0.0,
+ maxlenratio: float=0.5,
decoding_chunk_size: int=-1,
num_decoding_left_chunks: int=-1,
simulate_streaming: bool=False, ) -> paddle.Tensor:
- """ Apply beam search on attention decoder
+ """ Apply beam search on attention decoder with length penalty
Args:
speech (paddle.Tensor): (batch, max_len, feat_dim)
speech_length (paddle.Tensor): (batch, )
beam_size (int): beam size for beam search
+ word_reward (float): word reward used in beam search
+ maxlenratio (float): max length ratio to bound the length of translated text
decoding_chunk_size (int): decoding chunk for dynamic chunk
trained model.
<0: for decoding, use full chunk.
@@ -284,90 +287,89 @@ class U2STBaseModel(nn.Layer):
"""
assert speech.shape[0] == speech_lengths.shape[0]
assert decoding_chunk_size != 0
+ assert speech.shape[0] == 1
device = speech.place
- batch_size = speech.shape[0]
# Let's assume B = batch_size and N = beam_size
- # 1. Encoder
+ # 1. Encoder and init hypothesis
encoder_out, encoder_mask = self._forward_encoder(
speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim)
- maxlen = encoder_out.shape[1]
- encoder_dim = encoder_out.shape[2]
- running_size = batch_size * beam_size
- encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
- running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
- encoder_mask = encoder_mask.unsqueeze(1).repeat(
- 1, beam_size, 1, 1).view(running_size, 1,
- maxlen) # (B*N, 1, max_len)
-
- hyps = paddle.ones(
- [running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1)
- # log scale score
- scores = paddle.to_tensor(
- [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float)
- scores = scores.to(device).repeat(batch_size).unsqueeze(1).to(
- device) # (B*N, 1)
- end_flag = paddle.zeros_like(scores, dtype=paddle.bool) # (B*N, 1)
- cache: Optional[List[paddle.Tensor]] = None
+
+ maxlen = max(int(encoder_out.shape[1] * maxlenratio), 5)
+
+ hyp = {"score": 0.0, "yseq": [self.sos], "cache": None}
+ hyps = [hyp]
+ ended_hyps = []
+ cur_best_score = -float("inf")
+ cache = None
+
# 2. Decoder forward step by step
for i in range(1, maxlen + 1):
- # Stop if all batch and all beam produce eos
- # TODO(Hui Zhang): if end_flag.sum() == running_size:
- if end_flag.cast(paddle.int64).sum() == running_size:
- break
+ ys = paddle.ones((len(hyps), i), dtype=paddle.long)
+
+ if hyps[0]["cache"] is not None:
+ cache = [
+ paddle.ones(
+ (len(hyps), i - 1, hyp_cache.shape[-1]),
+ dtype=paddle.float32) for hyp_cache in hyps[0]["cache"]
+ ]
+ for j, hyp in enumerate(hyps):
+ ys[j, :] = paddle.to_tensor(hyp["yseq"])
+ if hyps[0]["cache"] is not None:
+ for k in range(len(cache)):
+ cache[k][j] = hyps[j]["cache"][k]
+ ys_mask = subsequent_mask(i).unsqueeze(0).to(device)
- # 2.1 Forward decoder step
- hyps_mask = subsequent_mask(i).unsqueeze(0).repeat(
- running_size, 1, 1).to(device) # (B*N, i, i)
- # logp: (B*N, vocab)
logp, cache = self.st_decoder.forward_one_step(
- encoder_out, encoder_mask, hyps, hyps_mask, cache)
-
- # 2.2 First beam prune: select topk best prob at current time
- top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N)
- top_k_logp += word_reward
- top_k_logp = mask_finished_scores(top_k_logp, end_flag)
- top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos)
-
- # 2.3 Seconde beam prune: select topk score with history
- scores = scores + top_k_logp # (B*N, N), broadcast add
- scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N)
- scores, offset_k_index = scores.topk(k=beam_size) # (B, N)
- scores = scores.view(-1, 1) # (B*N, 1)
-
- # 2.4. Compute base index in top_k_index,
- # regard top_k_index as (B*N*N),regard offset_k_index as (B*N),
- # then find offset_k_index in top_k_index
- base_k_index = paddle.arange(batch_size).view(-1, 1).repeat(
- 1, beam_size) # (B, N)
- base_k_index = base_k_index * beam_size * beam_size
- best_k_index = base_k_index.view(-1) + offset_k_index.view(
- -1) # (B*N)
-
- # 2.5 Update best hyps
- best_k_pred = paddle.index_select(
- top_k_index.view(-1), index=best_k_index, axis=0) # (B*N)
- best_hyps_index = best_k_index // beam_size
- last_best_k_hyps = paddle.index_select(
- hyps, index=best_hyps_index, axis=0) # (B*N, i)
- hyps = paddle.cat(
- (last_best_k_hyps, best_k_pred.view(-1, 1)),
- dim=1) # (B*N, i+1)
-
- # 2.6 Update end flag
- end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1)
+ encoder_out.repeat(len(hyps), 1, 1),
+ encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache)
+
+ hyps_best_kept = []
+ for j, hyp in enumerate(hyps):
+ top_k_logp, top_k_index = logp[j:j + 1].topk(beam_size)
+
+ for b in range(beam_size):
+ new_hyp = {}
+ new_hyp["score"] = hyp["score"] + float(top_k_logp[0, b])
+ new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
+ new_hyp["yseq"][:len(hyp["yseq"])] = hyp["yseq"]
+ new_hyp["yseq"][len(hyp["yseq"])] = int(top_k_index[0, b])
+ new_hyp["cache"] = [cache_[j] for cache_ in cache]
+ # will be (2 x beam) hyps at most
+ hyps_best_kept.append(new_hyp)
+
+ hyps_best_kept = sorted(
+ hyps_best_kept, key=lambda x: -x["score"])[:beam_size]
+
+ # sort and get nbest
+ hyps = hyps_best_kept
+ if i == maxlen:
+ for hyp in hyps:
+ hyp["yseq"].append(self.eos)
+
+ # finalize the ended hypotheses with word reward (by length)
+ remained_hyps = []
+ for hyp in hyps:
+ if hyp["yseq"][-1] == self.eos:
+ hyp["score"] += (i - 1) * word_reward
+ cur_best_score = max(cur_best_score, hyp["score"])
+ ended_hyps.append(hyp)
+ else:
+ # keep the hypothesis only while optimality is still guaranteed
+ if hyp["score"] + maxlen * word_reward > cur_best_score:
+ remained_hyps.append(hyp)
+
+ # stop prediction when there is no unended hypothesis
+ if not remained_hyps:
+ break
+ hyps = remained_hyps
# 3. Select best of best
- scores = scores.view(batch_size, beam_size)
- # TODO: length normalization
- best_index = paddle.argmax(scores, axis=-1).long() # (B)
- best_hyps_index = best_index + paddle.arange(
- batch_size, dtype=paddle.long) * beam_size
- best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0)
- best_hyps = best_hyps[:, 1:]
- return best_hyps
+ best_hyp = max(ended_hyps, key=lambda x: x["score"])
+
+ return paddle.to_tensor([best_hyp["yseq"][1:]])
# @jit.to_static
def subsampling_rate(self) -> int:
@@ -472,6 +474,7 @@ class U2STBaseModel(nn.Layer):
decoding_method: str,
beam_size: int,
word_reward: float=0.0,
+ maxlenratio: float=0.5,
decoding_chunk_size: int=-1,
num_decoding_left_chunks: int=-1,
simulate_streaming: bool=False):
@@ -507,6 +510,7 @@ class U2STBaseModel(nn.Layer):
feats_lengths,
beam_size=beam_size,
word_reward=word_reward,
+ maxlenratio=maxlenratio,
decoding_chunk_size=decoding_chunk_size,
num_decoding_left_chunks=num_decoding_left_chunks,
simulate_streaming=simulate_streaming)
@@ -591,7 +595,7 @@ class U2STModel(U2STBaseModel):
model_conf = configs['model_conf']
dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
- ctc = CTCDecoder(
+ ctc = CTCDecoderBase(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
index 1f983807..2094182a 100644
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@@ -25,17 +25,19 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
try:
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401
except ImportError:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package_name = 'paddlespeech_ctcdecoders'
dynamic_pip_install.install(package_name)
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401
- from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import Scorer # noqa: F401
+ from paddlespeech.s2t.decoders.ctcdecoder import CTCBeamSearchDecoder # noqa: F401
except Exception as e:
logger.info("paddlespeech_ctcdecoders not installed!")
@@ -139,9 +141,11 @@ class CTCDecoder(CTCDecoderBase):
super().__init__(*args, **kwargs)
# CTCDecoder LM Score handle
self._ext_scorer = None
+ self.beam_search_decoder = None
- def _decode_batch_greedy(self, probs_split, vocab_list):
- """Decode by best path for a batch of probs matrix input.
+ def _decode_batch_greedy_offline(self, probs_split, vocab_list):
+ """This function will be deprecated in future.
+ Decode by best path for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrix, and each consists
of prob vectors for one speech utterancce.
:param probs_split: List of matrix
@@ -152,7 +156,7 @@ class CTCDecoder(CTCDecoderBase):
"""
results = []
for i, probs in enumerate(probs_split):
- output_transcription = ctc_greedy_decoder(
+ output_transcription = ctc_greedy_decoding(
probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
results.append(output_transcription)
return results
@@ -194,10 +198,12 @@ class CTCDecoder(CTCDecoderBase):
logger.info("no language model provided, "
"decoding by pure beam search without scorer.")
- def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
- beam_size, cutoff_prob, cutoff_top_n,
- vocab_list, num_processes):
- """Decode by beam search for a batch of probs matrix input.
+ def _decode_batch_beam_search_offline(
+ self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob,
+ cutoff_top_n, vocab_list, num_processes):
+ """
+ This function will be deprecated in the future.
+ Decode by beam search for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrix, and each consists
of prob vectors for one speech utterancce.
:param probs_split: List of matrix
@@ -226,7 +232,7 @@ class CTCDecoder(CTCDecoderBase):
# beam search decode
num_processes = min(num_processes, len(probs_split))
- beam_search_results = ctc_beam_search_decoder_batch(
+ beam_search_results = ctc_beam_search_decoding_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
@@ -239,30 +245,69 @@ class CTCDecoder(CTCDecoderBase):
results = [result[0][1] for result in beam_search_results]
return results
- def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
- decoding_method):
+ def init_decoder(self, batch_size, vocab_list, decoding_method,
+ lang_model_path, beam_alpha, beam_beta, beam_size,
+ cutoff_prob, cutoff_top_n, num_processes):
+ """
+ init ctc decoders
+ Args:
+ batch_size(int): Batch size for input data
+ vocab_list (list): List of tokens in the vocabulary, for decoding
+ decoding_method (str): ctc_beam_search
+ lang_model_path (str): language model path
+ beam_alpha (float): beam_alpha
+ beam_beta (float): beam_beta
+ beam_size (int): beam_size
+ cutoff_prob (float): cutoff probability in beam search
+ cutoff_top_n (int): cutoff_top_n
+ num_processes (int): num_processes
+
+ Raises:
+ ValueError: when decoding_method not support.
+ Returns:
+ CTCBeamSearchDecoder
+ """
+ self.batch_size = batch_size
+ self.vocab_list = vocab_list
+ self.decoding_method = decoding_method
+ self.beam_size = beam_size
+ self.cutoff_prob = cutoff_prob
+ self.cutoff_top_n = cutoff_top_n
+ self.num_processes = num_processes
if decoding_method == "ctc_beam_search":
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
+ if self.beam_search_decoder is None:
+ self.beam_search_decoder = self.get_decoder(
+ vocab_list, batch_size, beam_alpha, beam_beta, beam_size,
+ num_processes, cutoff_prob, cutoff_top_n)
+ return self.beam_search_decoder
+ elif decoding_method == "ctc_greedy":
+ self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
+ vocab_list)
+ else:
+ raise ValueError(f"Not support: {decoding_method}")
- def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
- lang_model_path, beam_alpha, beam_beta, beam_size,
- cutoff_prob, cutoff_top_n, num_processes):
- """ctc decoding with probs.
-
+ def decode_probs_offline(self, probs, logits_lens, vocab_list,
+ decoding_method, lang_model_path, beam_alpha,
+ beam_beta, beam_size, cutoff_prob, cutoff_top_n,
+ num_processes):
+ """
+ This function will be deprecated in the future.
+ ctc decoding with probs.
Args:
probs (Tensor): activation after softmax
logits_lens (Tensor): audio output lens
- vocab_list ([type]): [description]
- decoding_method ([type]): [description]
- lang_model_path ([type]): [description]
- beam_alpha ([type]): [description]
- beam_beta ([type]): [description]
- beam_size ([type]): [description]
- cutoff_prob ([type]): [description]
- cutoff_top_n ([type]): [description]
- num_processes ([type]): [description]
+ vocab_list (list): List of tokens in the vocabulary, for decoding
+ decoding_method (str): ctc_beam_search
+ lang_model_path (str): language model path
+ beam_alpha (float): beam_alpha
+ beam_beta (float): beam_beta
+ beam_size (int): beam_size
+ cutoff_prob (float): cutoff probability in beam search
+ cutoff_top_n (int): cutoff_top_n
+ num_processes (int): num_processes
Raises:
ValueError: when decoding_method not support.
@@ -270,13 +315,14 @@ class CTCDecoder(CTCDecoderBase):
Returns:
List[str]: transcripts.
"""
-
+ logger.warning(
+ "This function will be deprecated in future: decode_probs_offline")
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
if decoding_method == "ctc_greedy":
- result_transcripts = self._decode_batch_greedy(
+ result_transcripts = self._decode_batch_greedy_offline(
probs_split=probs_split, vocab_list=vocab_list)
elif decoding_method == "ctc_beam_search":
- result_transcripts = self._decode_batch_beam_search(
+ result_transcripts = self._decode_batch_beam_search_offline(
probs_split=probs_split,
beam_alpha=beam_alpha,
beam_beta=beam_beta,
@@ -288,3 +334,136 @@ class CTCDecoder(CTCDecoderBase):
else:
raise ValueError(f"Not support: {decoding_method}")
return result_transcripts
+
+ def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta,
+ beam_size, num_processes, cutoff_prob, cutoff_top_n):
+ """
+ init get ctc decoder
+ Args:
+ vocab_list (list): List of tokens in the vocabulary, for decoding.
+ batch_size(int): Batch size for input data
+ beam_alpha (float): beam_alpha
+ beam_beta (float): beam_beta
+ beam_size (int): beam_size
+ num_processes (int): num_processes
+ cutoff_prob (float): cutoff probability in beam search
+ cutoff_top_n (int): cutoff_top_n
+
+ Raises:
+ ValueError: when decoding_method not support.
+
+ Returns:
+ CTCBeamSearchDecoder
+ """
+ num_processes = min(num_processes, batch_size)
+ if self._ext_scorer is not None:
+ self._ext_scorer.reset_params(beam_alpha, beam_beta)
+ if self.decoding_method == "ctc_beam_search":
+ beam_search_decoder = CTCBeamSearchDecoder(
+ vocab_list, batch_size, beam_size, num_processes, cutoff_prob,
+ cutoff_top_n, self._ext_scorer, self.blank_id)
+ else:
+ raise ValueError(f"Not support: {decoding_method}")
+ return beam_search_decoder
+
+ def next(self, probs, logits_lens):
+ """
+ Input probs into ctc decoder
+ Args:
+ probs (list(list(float))): probs for a batch of data
+ logits_lens (list(int)): logits lens for a batch of data
+ Raises:
+ Exception: when the ctc decoder is not initialized
+ ValueError: when decoding_method not support.
+ """
+
+ if self.beam_search_decoder is None:
+ raise Exception(
+ "You need to initialize the beam_search_decoder firstly")
+ beam_search_decoder = self.beam_search_decoder
+
+ has_value = (logits_lens > 0).tolist()
+ has_value = [
+ "true" if has_value[i] is True else "false"
+ for i in range(len(has_value))
+ ]
+ probs_split = [
+ probs[i, :l, :].tolist() if has_value[i] else probs[i].tolist()
+ for i, l in enumerate(logits_lens)
+ ]
+ if self.decoding_method == "ctc_beam_search":
+ beam_search_decoder.next(probs_split, has_value)
+ else:
+ raise ValueError(f"Not support: {decoding_method}")
+
+ return
+
+ def decode(self):
+ """
+ Get the decoding result
+ Raises:
+ Exception: when the ctc decoder is not initialized
+ ValueError: when decoding_method not support.
+ Returns:
+ results_best (list(str)): The best result for a batch of data
+ results_beam (list(list(str))): The beam search result for a batch of data
+ """
+ if self.beam_search_decoder is None:
+ raise Exception(
+ "You need to initialize the beam_search_decoder firstly")
+
+ beam_search_decoder = self.beam_search_decoder
+ if self.decoding_method == "ctc_beam_search":
+ batch_beam_results = beam_search_decoder.decode()
+ batch_beam_results = [[(res[0], res[1]) for res in beam_results]
+ for beam_results in batch_beam_results]
+ results_best = [result[0][1] for result in batch_beam_results]
+ results_beam = [[trans[1] for trans in result]
+ for result in batch_beam_results]
+
+ else:
+ raise ValueError(f"Not support: {decoding_method}")
+
+ return results_best, results_beam
+
+ def reset_decoder(self,
+ batch_size=-1,
+ beam_size=-1,
+ num_processes=-1,
+ cutoff_prob=-1.0,
+ cutoff_top_n=-1):
+ if batch_size > 0:
+ self.batch_size = batch_size
+ if beam_size > 0:
+ self.beam_size = beam_size
+ if num_processes > 0:
+ self.num_processes = num_processes
+ if cutoff_prob > 0:
+ self.cutoff_prob = cutoff_prob
+ if cutoff_top_n > 0:
+ self.cutoff_top_n = cutoff_top_n
+ """
+ Reset the decoder state
+ Args:
+ batch_size(int): Batch size for input data
+ beam_size (int): beam_size
+ num_processes (int): num_processes
+ cutoff_prob (float): cutoff probability in beam search
+ cutoff_top_n (int): cutoff_top_n
+ Raises:
+ Exception: when the ctc decoder is not initialized
+ """
+ if self.beam_search_decoder is None:
+ raise Exception(
+ "You need to initialize the beam_search_decoder firstly")
+ self.beam_search_decoder.reset_state(
+ self.batch_size, self.beam_size, self.num_processes,
+ self.cutoff_prob, self.cutoff_top_n)
+
+ def del_decoder(self):
+ """
+ Delete the decoder
+ """
+ if self.beam_search_decoder is not None:
+ del self.beam_search_decoder
+ self.beam_search_decoder = None
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index cac5e570..de90c9ef 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -252,8 +252,7 @@ class Trainer():
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
logger.info(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
- sys.exit(
- f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+ sys.exit(0)
def do_train(self):
"""The training process control by epoch."""
@@ -282,7 +281,7 @@ class Trainer():
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
- observation['ips[sent./sec]'] = observation[
+ observation['ips samples/s'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py
index 226885f3..9e41b824 100644
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used.
- y = librosa.resample(x, ratio, 1, res_type=self.res_type)
+ y = librosa.resample(
+ x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
if self.keep_length:
diff = abs(len(x) - len(y))
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index a6346c34..889cd349 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -38,7 +38,7 @@ def stft(x,
x = np.stack(
[
librosa.stft(
- x[:, ch],
+ y=x[:, ch],
n_fft=n_fft,
hop_length=n_shift,
win_length=win_length,
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x = np.stack(
[
librosa.istft(
- x[:, ch].T, # [Time, Freq] -> [Freq, Time]
+ stft_matrix=x[:, ch].T, # [Time, Freq] -> [Freq, Time]
hop_length=n_shift,
win_length=win_length,
window=window,
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq)
spc = np.abs(x_stft)
# mel_basis: (Mel_freq, Freq)
- mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
+ mel_basis = librosa.filters.mel(
+ sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))
diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py
index 8a0acc48..7d93c026 100644
--- a/paddlespeech/t2s/__init__.py
+++ b/paddlespeech/t2s/__init__.py
@@ -13,7 +13,6 @@
# limitations under the License.
import logging
-from . import data
from . import datasets
from . import exps
from . import frontend
diff --git a/paddlespeech/t2s/audio/__init__.py b/paddlespeech/t2s/audio/__init__.py
index 7747b794..0deefc8b 100644
--- a/paddlespeech/t2s/audio/__init__.py
+++ b/paddlespeech/t2s/audio/__init__.py
@@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
+from .codec import *
from .spec_normalizer import LogMagnitude
from .spec_normalizer import NormalizerBase
diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py
index ab9a45d3..59ea8c87 100644
--- a/paddlespeech/t2s/audio/audio.py
+++ b/paddlespeech/t2s/audio/audio.py
@@ -53,8 +53,8 @@ class AudioProcessor(object):
def _create_mel_filter(self):
mel_filter = librosa.filters.mel(
- self.sample_rate,
- self.n_fft,
+ sr=self.sample_rate,
+ n_fft=self.n_fft,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax)
diff --git a/paddlespeech/t2s/audio/codec.py b/paddlespeech/t2s/audio/codec.py
new file mode 100644
index 00000000..2a759ce4
--- /dev/null
+++ b/paddlespeech/t2s/audio/codec.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import numpy as np
+import paddle
+
+
+# x: [0, 2**bits-1], return: [-1, 1]
+def label_2_float(x, bits):
+ return 2 * x / (2**bits - 1.) - 1.
+
+
+# x: [-1, 1], return: [0, 2**bits-1]
+def float_2_label(x, bits):
+ assert abs(x).max() <= 1.0
+ x = (x + 1.) * (2**bits - 1) / 2
+ return x.clip(0, 2**bits - 1)
+
+
+# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1]
+# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+# be careful the input `mu` here, which is +1 than that of the link above
+def encode_mu_law(x, mu):
+ mu = mu - 1
+ fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
+ return np.floor((fx + 1) / 2 * mu + 0.5)
+
+
+# from_labels = True:
+# y: [0, 2**bits-1], mu: 2**bits, return: [-1, 1]
+# from_labels = False:
+# y: [-1, 1], return: [-1, 1]
+def decode_mu_law(y, mu, from_labels=True):
+ # TODO: get rid of log2 - makes no sense
+ if from_labels:
+ y = label_2_float(y, math.log2(mu))
+ mu = mu - 1
+ x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
+ return x
diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py
index fc64a82f..caf20aac 100644
--- a/paddlespeech/t2s/datasets/__init__.py
+++ b/paddlespeech/t2s/datasets/__init__.py
@@ -11,5 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from .common import *
from .ljspeech import *
diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py
index 526871a2..4e3ad3c1 100644
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
@@ -14,7 +14,77 @@
import numpy as np
import paddle
-from paddlespeech.t2s.data.batch import batch_sequences
+from paddlespeech.t2s.datasets.batch import batch_sequences
+
+
+def tacotron2_single_spk_batch_fn(examples):
+ # fields = ["text", "text_lengths", "speech", "speech_lengths"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ speech_lengths = [
+ np.array(item["speech_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ speech = batch_sequences(speech)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ speech = paddle.to_tensor(speech)
+ text_lengths = paddle.to_tensor(text_lengths)
+ speech_lengths = paddle.to_tensor(speech_lengths)
+
+ batch = {
+ "text": text,
+ "text_lengths": text_lengths,
+ "speech": speech,
+ "speech_lengths": speech_lengths,
+ }
+ return batch
+
+
+def tacotron2_multi_spk_batch_fn(examples):
+ # fields = ["text", "text_lengths", "speech", "speech_lengths"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ speech_lengths = [
+ np.array(item["speech_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ speech = batch_sequences(speech)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ speech = paddle.to_tensor(speech)
+ text_lengths = paddle.to_tensor(text_lengths)
+ speech_lengths = paddle.to_tensor(speech_lengths)
+
+ batch = {
+ "text": text,
+ "text_lengths": text_lengths,
+ "speech": speech,
+ "speech_lengths": speech_lengths,
+ }
+ # spk_emb has a higher priority than spk_id
+ if "spk_emb" in examples[0]:
+ spk_emb = [
+ np.array(item["spk_emb"], dtype=np.float32) for item in examples
+ ]
+ spk_emb = batch_sequences(spk_emb)
+ spk_emb = paddle.to_tensor(spk_emb)
+ batch["spk_emb"] = spk_emb
+ elif "spk_id" in examples[0]:
+ spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
+ spk_id = paddle.to_tensor(spk_id)
+ batch["spk_id"] = spk_id
+ return batch
def speedyspeech_single_spk_batch_fn(examples):
@@ -56,7 +126,7 @@ def speedyspeech_single_spk_batch_fn(examples):
def speedyspeech_multi_spk_batch_fn(examples):
- # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
+ # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
diff --git a/paddlespeech/t2s/data/batch.py b/paddlespeech/t2s/datasets/batch.py
similarity index 100%
rename from paddlespeech/t2s/data/batch.py
rename to paddlespeech/t2s/datasets/batch.py
diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py
deleted file mode 100644
index d6fa3a84..00000000
--- a/paddlespeech/t2s/datasets/common.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from pathlib import Path
-from typing import List
-
-import librosa
-import numpy as np
-from paddle.io import Dataset
-
-__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"]
-
-
-class AudioSegmentDataset(Dataset):
- """A simple dataset adaptor for audio files to train vocoders.
- Read -> trim silence -> normalize -> extract a segment
- """
-
- def __init__(self,
- file_paths: List[Path],
- sample_rate: int,
- length: int,
- top_db: float):
- self.file_paths = file_paths
- self.sr = sample_rate
- self.top_db = top_db
- self.length = length # samples in the clip
-
- def __getitem__(self, i):
- fpath = self.file_paths[i]
- y, sr = librosa.load(fpath, self.sr)
- y, _ = librosa.effects.trim(y, top_db=self.top_db)
- y = librosa.util.normalize(y)
- y = y.astype(np.float32)
-
- # pad or trim
- if y.size <= self.length:
- y = np.pad(y, [0, self.length - len(y)], mode='constant')
- else:
- start = np.random.randint(0, 1 + len(y) - self.length)
- y = y[start:start + self.length]
- return y
-
- def __len__(self):
- return len(self.file_paths)
-
-
-class AudioDataset(Dataset):
- """A simple dataset adaptor for the audio files.
- Read -> trim silence -> normalize
- """
-
- def __init__(self,
- file_paths: List[Path],
- sample_rate: int,
- top_db: float=60):
- self.file_paths = file_paths
- self.sr = sample_rate
- self.top_db = top_db
-
- def __getitem__(self, i):
- fpath = self.file_paths[i]
- y, sr = librosa.load(fpath, self.sr)
- y, _ = librosa.effects.trim(y, top_db=self.top_db)
- y = librosa.util.normalize(y)
- y = y.astype(np.float32)
- return y
-
- def __len__(self):
- return len(self.file_paths)
-
-
-class AudioFolderDataset(AudioDataset):
- def __init__(
- self,
- root,
- sample_rate,
- top_db=60,
- extension=".wav", ):
- root = Path(root).expanduser()
- file_paths = sorted(list(root.rglob("*{}".format(extension))))
- super().__init__(file_paths, sample_rate, top_db)
diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py
index b0e4c891..c9815af2 100644
--- a/paddlespeech/t2s/datasets/data_table.py
+++ b/paddlespeech/t2s/datasets/data_table.py
@@ -22,26 +22,17 @@ from paddle.io import Dataset
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
-
- Parameters
- ----------
- data : List[Dict[str, Any]]
- Metadata, a list of meta datum, each of which is composed of
- several fields
- fields : List[str], optional
- Fields to use, if not specified, all the fields in the data are
- used, by default None
- converters : Dict[str, Callable], optional
- Converters used to process each field, by default None
- use_cache : bool, optional
- Whether to use cache, by default False
-
- Raises
- ------
- ValueError
- If there is some field that does not exist in data.
- ValueError
- If there is some field in converters that does not exist in fields.
+ Args:
+ data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields
+ fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None
+ converters (Dict[str, Callable], optional): Converters used to process each field, by default None
+ use_cache (bool, optional): Whether to use cache, by default False
+
+ Raises:
+ ValueError:
+ If there is some field that does not exist in data.
+ ValueError:
+ If there is some field in converters that does not exist in fields.
"""
def __init__(self,
@@ -95,15 +86,11 @@ class DataTable(Dataset):
"""Convert a meta datum to an example by applying the corresponding
converters to each fields requested.
- Parameters
- ----------
- meta_datum : Dict[str, Any]
- Meta datum
+ Args:
+ meta_datum (Dict[str, Any]): Meta datum
- Returns
- -------
- Dict[str, Any]
- Converted example
+ Returns:
+ Dict[str, Any]: Converted example
"""
example = {}
for field in self.fields:
@@ -118,16 +105,11 @@ class DataTable(Dataset):
def __getitem__(self, idx: int) -> Dict[str, Any]:
"""Get an example given an index.
+ Args:
+ idx (int): Index of the example to get
- Parameters
- ----------
- idx : int
- Index of the example to get
-
- Returns
- -------
- Dict[str, Any]
- A converted example
+ Returns:
+ Dict[str, Any]: A converted example
"""
if self.use_cache and self.caches[idx] is not None:
return self.caches[idx]
diff --git a/paddlespeech/t2s/data/dataset.py b/paddlespeech/t2s/datasets/dataset.py
similarity index 99%
rename from paddlespeech/t2s/data/dataset.py
rename to paddlespeech/t2s/datasets/dataset.py
index 2d6c03cb..f81c2877 100644
--- a/paddlespeech/t2s/data/dataset.py
+++ b/paddlespeech/t2s/datasets/dataset.py
@@ -258,4 +258,4 @@ class ChainDataset(Dataset):
return dataset[i]
i -= len(dataset)
- raise IndexError("dataset index out of range")
+ raise IndexError("dataset index out of range")
\ No newline at end of file
diff --git a/paddlespeech/t2s/data/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
similarity index 100%
rename from paddlespeech/t2s/data/get_feats.py
rename to paddlespeech/t2s/datasets/get_feats.py
diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py
index 8b01f6c3..445b69bd 100644
--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@@ -18,14 +18,10 @@ import re
def get_phn_dur(file_name):
'''
read MFA duration.txt
- Parameters
- ----------
- file_name : str or Path
- path of gen_duration_from_textgrid.py's result
- Returns
- ----------
- Dict
- sentence: {'utt': ([char], [int])}
+ Args:
+ file_name (str or Path): path of gen_duration_from_textgrid.py's result
+ Returns:
+ Dict: sentence: {'utt': ([char], [int])}
'''
f = open(file_name, 'r')
sentence = {}
@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
def merge_silence(sentence):
'''
merge silences
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': (([char], [int]), str)}
+ Args:
+ sentence (Dict): sentence: {'utt': (([char], [int]), str)}
'''
for utt in sentence:
cur_phn, cur_dur, speaker = sentence[utt]
@@ -81,12 +75,9 @@ def merge_silence(sentence):
def get_input_token(sentence, output_path, dataset="baker"):
'''
get phone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- output_path : str or path
- path to save phone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+        output_path (str or path): path to save phone_id_map
'''
phn_token = set()
for utt in sentence:
@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
dataset="baker"):
'''
get phone set and tone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- phones_output_path : str or path
- path to save phone_id_map
- tones_output_path : str or path
- path to save tone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+ phones_output_path (str or path): path to save phone_id_map
+ tones_output_path (str or path): path to save tone_id_map
'''
phn_token = set()
tone_token = set()
@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
def compare_duration_and_mel_length(sentences, utt, mel):
'''
check duration error, correct sentences[utt] if possible, else pop sentences[utt]
- Parameters
- ----------
- sentences : Dict
- sentences[utt] = [phones_list ,durations_list]
- utt : str
- utt_id
- mel : np.ndarry
- features (num_frames, n_mels)
+ Args:
+ sentences (Dict): sentences[utt] = [phones_list ,durations_list]
+ utt (str): utt_id
+        mel (np.ndarray): features (num_frames, n_mels)
'''
if utt in sentences:
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index 2e4f740f..08748de0 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -14,6 +14,10 @@
import numpy as np
import paddle
+from paddlespeech.t2s.audio.codec import encode_mu_law
+from paddlespeech.t2s.audio.codec import float_2_label
+from paddlespeech.t2s.audio.codec import label_2_float
+
class Clip(object):
"""Collate functor for training vocoders.
@@ -25,15 +29,11 @@ class Clip(object):
hop_size=256,
aux_context_window=0, ):
"""Initialize customized collater for DataLoader.
+ Args:
- Parameters
- ----------
- batch_max_steps : int
- The maximum length of input signal in batch.
- hop_size : int
- Hop size of auxiliary features.
- aux_context_window : int
- Context window size for auxiliary feature conv.
+ batch_max_steps (int): The maximum length of input signal in batch.
+ hop_size (int): Hop size of auxiliary features.
+ aux_context_window (int): Context window size for auxiliary feature conv.
"""
if batch_max_steps % hop_size != 0:
@@ -49,29 +49,26 @@ class Clip(object):
self.end_offset = -(self.batch_max_frames + aux_context_window)
self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
- def __call__(self, examples):
+ def __call__(self, batch):
"""Convert into batch tensors.
- Parameters
- ----------
- batch : list
- list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+ Args:
+ batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
- Returns
- ----------
- Tensor
- Auxiliary feature batch (B, C, T'), where
- T = (T' - 2 * aux_context_window) * hop_size.
- Tensor
- Target signal batch (B, 1, T).
+ Returns:
+ Tensor:
+ Auxiliary feature batch (B, C, T'), where
+ T = (T' - 2 * aux_context_window) * hop_size.
+ Tensor:
+ Target signal batch (B, 1, T).
"""
# check length
- examples = [
- self._adjust_length(b['wave'], b['feats']) for b in examples
+ batch = [
+ self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
- xs, cs = [b[0] for b in examples], [b[1] for b in examples]
+ xs, cs = [b[0] for b in batch], [b[1] for b in batch]
# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
@@ -89,7 +86,7 @@ class Clip(object):
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
- # convert each batch to tensor, asuume that each item in batch has the same length
+ # convert each batch to tensor, assume that each item in batch has the same length
y_batch = paddle.to_tensor(
y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T)
c_batch = paddle.to_tensor(
@@ -100,11 +97,10 @@ class Clip(object):
def _adjust_length(self, x, c):
"""Adjust the audio and feature lengths.
- Note
- -------
- Basically we assume that the length of x and c are adjusted
- through preprocessing stage, but if we use other library processed
- features, this process will be needed.
+ Note:
+ Basically we assume that the length of x and c are adjusted
+ through preprocessing stage, but if we use other library processed
+ features, this process will be needed.
"""
if len(x) < c.shape[0] * self.hop_size:
@@ -120,3 +116,105 @@ class Clip(object):
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
return x, c
+
+
+class WaveRNNClip(Clip):
+ def __init__(self,
+ mode: str='RAW',
+ batch_max_steps: int=4500,
+ hop_size: int=300,
+ aux_context_window: int=2,
+ bits: int=9,
+ mu_law: bool=True):
+ self.mode = mode
+ self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
+ self.batch_max_steps = batch_max_steps
+ self.hop_size = hop_size
+ self.aux_context_window = aux_context_window
+ self.mu_law = mu_law
+ self.batch_max_frames = batch_max_steps // hop_size
+ self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
+ if self.mode == 'MOL':
+ self.bits = 16
+ else:
+ self.bits = bits
+
+ def to_quant(self, wav):
+ if self.mode == 'RAW':
+ if self.mu_law:
+ quant = encode_mu_law(wav, mu=2**self.bits)
+ else:
+ quant = float_2_label(wav, bits=self.bits)
+ elif self.mode == 'MOL':
+ quant = float_2_label(wav, bits=16)
+ quant = quant.astype(np.int64)
+ return quant
+
+ def __call__(self, batch):
+ # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length
+ # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
+ """Convert into batch tensors.
+ Args:
+ batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
+
+ Returns:
+ Tensor: Input signal batch (B, 1, T).
+ Tensor: Target signal batch (B, 1, T).
+ Tensor: Auxiliary feature batch (B, C, T'),
+ where T = (T' - 2 * aux_context_window) * hop_size.
+
+ """
+ # check length
+ batch = [
+ self._adjust_length(b['wave'], b['feats']) for b in batch
+ if b['feats'].shape[0] > self.mel_threshold
+ ]
+ wav, mel = [b[0] for b in batch], [b[1] for b in batch]
+        # the mel needs to be transposed here
+ mel = [x.T for x in mel]
+ max_offsets = [
+ x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
+ for x in mel
+ ]
+ # the slice point of mel selecting randomly
+ mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
+ # the slice point of wav selecting randomly, which is behind 2(=pad) frames
+ sig_offsets = [(offset + self.aux_context_window) * self.hop_size
+ for offset in mel_offsets]
+ # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad
+ mels = [
+ x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
+ for i, x in enumerate(mel)
+ ]
+ # label.shape[1] = voc_seq_len + 1
+ wav = [self.to_quant(x) for x in wav]
+
+ labels = [
+ x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
+ for i, x in enumerate(wav)
+ ]
+
+ mels = np.stack(mels).astype(np.float32)
+ labels = np.stack(labels).astype(np.int64)
+
+ mels = paddle.to_tensor(mels)
+ labels = paddle.to_tensor(labels, dtype='int64')
+ # x is input, y is label
+ x = labels[:, :self.batch_max_steps]
+ y = labels[:, 1:]
+ '''
+ mode = RAW:
+ mu_law = True:
+ quant: bits = 9 0, 1, 2, ..., 509, 510, 511 int
+ mu_law = False
+ quant bits = 9 [0, 511] float
+ mode = MOL:
+            quant: bits = 16 [0, 65536] float
+ '''
+        # x should be normalized to [-1, 1] in RAW mode
+ x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
+        # y should be normalized to [-1, 1] in MOL mode
+ if self.mode == 'MOL':
+ y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
+
+ return x, y, mels
diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
index 4ddd19f7..3fded29b 100644
--- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
@@ -15,20 +15,21 @@
# for mb melgan finetune
# 长度和原本的 mel 不一致怎么办?
import argparse
+import os
from pathlib import Path
import numpy as np
import paddle
import yaml
-from yacs.config import CfgNode
from tqdm import tqdm
-import os
+from yacs.config import CfgNode
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.utils import str2bool
def evaluate(args, fastspeech2_config):
@@ -50,11 +51,14 @@ def evaluate(args, fastspeech2_config):
spk_id_list = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id_list)
else:
- spk_num=None
+ spk_num = None
odim = fastspeech2_config.n_mels
model = FastSpeech2(
- idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num)
+ idim=vocab_size,
+ odim=odim,
+ **fastspeech2_config["model"],
+ spk_num=spk_num)
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
@@ -99,9 +103,15 @@ def evaluate(args, fastspeech2_config):
else:
train_wav_files += wav_files
- train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files]
- dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files]
- test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files]
+ train_wav_files = [
+ os.path.basename(str(str_path)) for str_path in train_wav_files
+ ]
+ dev_wav_files = [
+ os.path.basename(str(str_path)) for str_path in dev_wav_files
+ ]
+ test_wav_files = [
+ os.path.basename(str(str_path)) for str_path in test_wav_files
+ ]
for i, utt_id in enumerate(tqdm(sentences)):
phones = sentences[utt_id][0]
@@ -122,7 +132,8 @@ def evaluate(args, fastspeech2_config):
phone_ids = paddle.to_tensor(np.array(phone_ids))
if args.speaker_dict:
- speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0])
+ speaker_id = int(
+ [item[1] for item in spk_id_list if speaker == item[0]][0])
speaker_id = paddle.to_tensor(speaker_id)
else:
speaker_id = None
@@ -143,7 +154,8 @@ def evaluate(args, fastspeech2_config):
sub_output_dir.mkdir(parents=True, exist_ok=True)
with paddle.no_grad():
- mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id)
+ mel = fastspeech2_inference(
+ phone_ids, durations=durations, spk_id=speaker_id)
np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
@@ -175,12 +187,9 @@ def main():
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
-
+
parser.add_argument(
- "--speaker-dict",
- type=str,
- default=None,
- help="speaker id map file.")
+ "--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")
@@ -188,9 +197,6 @@ def main():
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--cut-sil",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index b874b3a7..5bda7545 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -27,14 +27,15 @@ import tqdm
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.data.get_feats import Energy
-from paddlespeech.t2s.data.get_feats import LogMelFBank
-from paddlespeech.t2s.data.get_feats import Pitch
+from paddlespeech.t2s.datasets.get_feats import Energy
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import Pitch
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.utils import str2bool
def process_sentence(config: Dict[str, Any],
@@ -203,9 +204,6 @@ def main():
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--cut-sil",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index 1dfa575a..10e023d0 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -38,6 +38,7 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
@@ -159,9 +160,8 @@ def train_sp(args, config):
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
- # print(trainer.extensions)
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
trainer.run()
@@ -182,9 +182,6 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--voice-cloning",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
index 9ac6cbd3..c70821e7 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
@@ -231,9 +231,9 @@ def train_sp(args, config):
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots),
- trigger=(config.save_interval_steps, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()
diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
index 3d0ff7d3..27ffded6 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
@@ -219,9 +219,9 @@ def train_sp(args, config):
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots),
- trigger=(config.save_interval_steps, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()
diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
index f5affb50..def30e67 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
@@ -23,7 +23,7 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.data.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
index a7881d6b..92de7a2c 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
@@ -41,6 +41,7 @@ from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
@@ -193,19 +194,16 @@ def train_sp(args, config):
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots),
- trigger=(config.save_interval_steps, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
- # print(trainer.extensions.keys())
print("Trainer Done!")
trainer.run()
def main():
# parse args and config and redirect to train_sp
- def str2bool(str):
- return True if str.lower() == 'true' else False
parser = argparse.ArgumentParser(
description="Train a ParallelWaveGAN model.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
index 782fbdf2..4871bca7 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@@ -27,9 +27,10 @@ import tqdm
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.data.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.utils import str2bool
def process_sentence(config: Dict[str, Any],
@@ -165,9 +166,6 @@ def main():
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--cut-sil",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index b162260d..be3ba742 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -212,9 +212,9 @@ def train_sp(args, config):
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots),
- trigger=(config.save_interval_steps, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index e1d5306c..26d7e2c0 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -14,9 +14,11 @@
import argparse
from pathlib import Path
+import numpy
import soundfile as sf
from paddle import inference
+from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -29,20 +31,38 @@ def main():
'--am',
type=str,
default='fastspeech2_csmsc',
- choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'],
+ choices=[
+ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_aishell3',
+ 'fastspeech2_vctk', 'tacotron2_csmsc'
+ ],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ '--spk_id',
+ type=int,
+ default=0,
+ help='spk id for multi speaker acoustic model')
# voc
parser.add_argument(
'--voc',
type=str,
default='pwgan_csmsc',
- choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
+ choices=[
+ 'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3',
+ 'pwgan_vctk', 'wavernn_csmsc'
+ ],
help='Choose vocoder type of tts task.')
# other
+ parser.add_argument(
+ '--lang',
+ type=str,
+ default='zh',
+ help='Choose model language. zh or en')
parser.add_argument(
"--text",
type=str,
@@ -53,8 +73,12 @@ def main():
args, _ = parser.parse_known_args()
- frontend = Frontend(
- phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
+ # frontend
+ if args.lang == 'zh':
+ frontend = Frontend(
+ phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
+ elif args.lang == 'en':
+ frontend = English(phone_vocab_path=args.phones_dict)
print("frontend done!")
# model: {model_name}_{dataset}
@@ -83,30 +107,53 @@ def main():
print("in new inference")
+ # construct dataset for evaluation
+ sentences = []
with open(args.text, 'rt') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
- sentence = "".join(items[1:])
+ if args.lang == 'zh':
+ sentence = "".join(items[1:])
+ elif args.lang == 'en':
+ sentence = " ".join(items[1:])
sentences.append((utt_id, sentence))
get_tone_ids = False
+ get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
+ if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
+ get_spk_id = True
+ spk_id = numpy.array([args.spk_id])
am_input_names = am_predictor.get_input_names()
-
+ print("am_input_names:", am_input_names)
+ merge_sentences = True
for utt_id, sentence in sentences:
- input_ids = frontend.get_input_ids(
- sentence, merge_sentences=True, get_tone_ids=get_tone_ids)
- phone_ids = input_ids["phone_ids"]
+ if args.lang == 'zh':
+ input_ids = frontend.get_input_ids(
+ sentence,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids)
+ phone_ids = input_ids["phone_ids"]
+ elif args.lang == 'en':
+ input_ids = frontend.get_input_ids(
+ sentence, merge_sentences=merge_sentences)
+ phone_ids = input_ids["phone_ids"]
+ else:
+ print("lang should in {'zh', 'en'}!")
+
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
-
+ if get_spk_id:
+ spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
+ spk_id_handle.reshape(spk_id.shape)
+ spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
index b6440fd6..31b7d2ea 100644
--- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
@@ -30,6 +30,7 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.speedyspeech import SpeedySpeech
from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference
from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.utils import str2bool
def evaluate(args, speedyspeech_config):
@@ -213,9 +214,6 @@ def main():
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--cut-sil",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py
index a427c469..249a4d6d 100644
--- a/paddlespeech/t2s/exps/speedyspeech/normalize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py
@@ -23,6 +23,7 @@ from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.utils import str2bool
def main():
@@ -55,9 +56,6 @@ def main():
default=1,
help="logging level. higher is more logging. (default=1)")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--use-relative-path",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
index 9ff77144..3f81c4e1 100644
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -27,12 +27,13 @@ import tqdm
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.data.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.utils import str2bool
def process_sentence(config: Dict[str, Any],
@@ -190,9 +191,6 @@ def main():
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--cut-sil",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 448cd7bb..bda5370c 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -38,6 +38,7 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
@@ -170,8 +171,8 @@ def train_sp(args, config):
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
trainer.run()
@@ -186,9 +187,6 @@ def main():
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--use-relative-path",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index f5477470..1c42a87c 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -25,6 +25,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.utils import str2bool
model_alias = {
# acoustic model
@@ -36,6 +37,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+ "tacotron2":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
+ "tacotron2_inference":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@@ -91,6 +96,11 @@ def evaluate(args):
print("spk_num:", spk_num)
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
+ elif am_name == 'tacotron2':
+ fields = ["utt_id", "text"]
+ if args.voice_cloning:
+ print("voice cloning!")
+ fields += ["spk_emb"]
test_dataset = DataTable(data=test_metadata, fields=fields)
@@ -117,6 +127,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
+ elif am_name == 'tacotron2':
+ am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@@ -168,6 +180,13 @@ def evaluate(args):
phone_ids = paddle.to_tensor(datum["phones"])
tone_ids = paddle.to_tensor(datum["tones"])
mel = am_inference(phone_ids, tone_ids)
+ elif am_name == 'tacotron2':
+ phone_ids = paddle.to_tensor(datum["text"])
+ spk_emb = None
+ # multi speaker
+ if args.voice_cloning and "spk_emb" in datum:
+ spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+ mel = am_inference(phone_ids, spk_emb=spk_emb)
# vocoder
wav = voc_inference(mel)
sf.write(
@@ -188,7 +207,8 @@ def main():
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
- 'fastspeech2_aishell3', 'fastspeech2_vctk'
+ 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc',
+ 'tacotron2_ljspeech', 'tacotron2_aishell3'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@@ -214,9 +234,6 @@ def main():
parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.")
- def str2bool(str):
- return True if str.lower() == 'true' else False
-
parser.add_argument(
"--voice-cloning",
type=str2bool,
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 15ed1e4d..75c631b8 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -38,6 +38,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+ "tacotron2":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
+ "tacotron2_inference":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@@ -55,6 +59,10 @@ model_alias = {
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
+ "wavernn":
+ "paddlespeech.t2s.models.wavernn:WaveRNN",
+ "wavernn_inference":
+ "paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
@@ -125,7 +133,12 @@ def evaluate(args):
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
elif am_name == 'speedyspeech':
am = am_class(
- vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
+ vocab_size=vocab_size,
+ tone_size=tone_size,
+ spk_num=spk_num,
+ **am_config["model"])
+ elif am_name == 'tacotron2':
+ am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@@ -142,10 +155,16 @@ def evaluate(args):
voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
- voc = voc_class(**voc_config["generator_params"])
- voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
- voc.remove_weight_norm()
- voc.eval()
+ if voc_name != 'wavernn':
+ voc = voc_class(**voc_config["generator_params"])
+ voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+ voc.remove_weight_norm()
+ voc.eval()
+ else:
+ voc = voc_class(**voc_config["model"])
+ voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+ voc.eval()
+
voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
@@ -159,29 +178,42 @@ def evaluate(args):
# acoustic model
if am_name == 'fastspeech2':
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
- print(
- "Haven't test dygraph to static for multi speaker fastspeech2 now!"
- )
+ am_inference = jit.to_static(
+ am_inference,
+ input_spec=[
+ InputSpec([-1], dtype=paddle.int64),
+ InputSpec([1], dtype=paddle.int64)
+ ])
else:
am_inference = jit.to_static(
am_inference,
input_spec=[InputSpec([-1], dtype=paddle.int64)])
- paddle.jit.save(am_inference,
- os.path.join(args.inference_dir, args.am))
- am_inference = paddle.jit.load(
- os.path.join(args.inference_dir, args.am))
+
elif am_name == 'speedyspeech':
+ if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
+ am_inference = jit.to_static(
+ am_inference,
+ input_spec=[
+ InputSpec([-1], dtype=paddle.int64), # text
+ InputSpec([-1], dtype=paddle.int64), # tone
+ None, # duration
+ InputSpec([-1], dtype=paddle.int64) # spk_id
+ ])
+ else:
+ am_inference = jit.to_static(
+ am_inference,
+ input_spec=[
+ InputSpec([-1], dtype=paddle.int64),
+ InputSpec([-1], dtype=paddle.int64)
+ ])
+
+ elif am_name == 'tacotron2':
am_inference = jit.to_static(
- am_inference,
- input_spec=[
- InputSpec([-1], dtype=paddle.int64),
- InputSpec([-1], dtype=paddle.int64)
- ])
+ am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
- paddle.jit.save(am_inference,
- os.path.join(args.inference_dir, args.am))
- am_inference = paddle.jit.load(
- os.path.join(args.inference_dir, args.am))
+ paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am))
+ am_inference = paddle.jit.load(
+ os.path.join(args.inference_dir, args.am))
# vocoder
voc_inference = jit.to_static(
@@ -197,6 +229,11 @@ def evaluate(args):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False
+ # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
+ # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
+ if am_name == 'tacotron2':
+ merge_sentences = True
+
for utt_id, sentence in sentences:
get_tone_ids = False
if am_name == 'speedyspeech':
@@ -229,7 +266,14 @@ def evaluate(args):
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = tone_ids[i]
- mel = am_inference(part_phone_ids, part_tone_ids)
+ if am_dataset in {"aishell3", "vctk"}:
+ spk_id = paddle.to_tensor(args.spk_id)
+ mel = am_inference(part_phone_ids, part_tone_ids,
+ spk_id)
+ else:
+ mel = am_inference(part_phone_ids, part_tone_ids)
+ elif am_name == 'tacotron2':
+ mel = am_inference(part_phone_ids)
# vocoder
wav = voc_inference(mel)
if flags == 0:
@@ -254,8 +298,9 @@ def main():
type=str,
default='fastspeech2_csmsc',
choices=[
- 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
- 'fastspeech2_aishell3', 'fastspeech2_vctk'
+ 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
+ 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
+ 'tacotron2_csmsc', 'tacotron2_ljspeech'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@@ -292,7 +337,8 @@ def main():
default='pwgan_csmsc',
choices=[
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
- 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc'
+ 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
+ 'wavernn_csmsc'
],
help='Choose vocoder type of tts task.')
diff --git a/paddlespeech/t2s/exps/tacotron2/config.py b/paddlespeech/t2s/exps/tacotron2/config.py
deleted file mode 100644
index 0ce2df36..00000000
--- a/paddlespeech/t2s/exps/tacotron2/config.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from yacs.config import CfgNode as CN
-
-_C = CN()
-_C.data = CN(
- dict(
- batch_size=32, # batch size
- valid_size=64, # the first N examples are reserved for validation
- sample_rate=22050, # Hz, sample rate
- n_fft=1024, # fft frame size
- win_length=1024, # window size
- hop_length=256, # hop size between ajacent frame
- fmax=8000, # Hz, max frequency when converting to mel
- fmin=0, # Hz, min frequency when converting to mel
- n_mels=80, # mel bands
- padding_idx=0, # text embedding's padding index
- ))
-
-_C.model = CN(
- dict(
- vocab_size=37, # set this according to the frontend's vocab size
- n_tones=None,
- reduction_factor=1, # reduction factor
- d_encoder=512, # embedding & encoder's internal size
- encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
- encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
- d_prenet=256, # hidden size of decoder prenet
- d_attention_rnn=1024, # hidden size of the first rnn layer in tacotron2 decoder
- d_decoder_rnn=1024, # hidden size of the second rnn layer in tacotron2 decoder
- d_attention=128, # hidden size of decoder location linear layer
- attention_filters=32, # number of filter in decoder location conv layer
- attention_kernel_size=31, # kernel size of decoder location conv layer
- d_postnet=512, # hidden size of decoder postnet
- postnet_kernel_size=5, # kernel size of conv layers in postnet
- postnet_conv_layers=5, # number of conv layer in decoder postnet
- p_encoder_dropout=0.5, # droput probability in encoder
- p_prenet_dropout=0.5, # droput probability in decoder prenet
- p_attention_dropout=0.1, # droput probability of first rnn layer in decoder
- p_decoder_dropout=0.1, # droput probability of second rnn layer in decoder
- p_postnet_dropout=0.5, # droput probability in decoder postnet
- d_global_condition=None,
- use_stop_token=True, # wherther to use binary classifier to predict when to stop
- use_guided_attention_loss=False, # whether to use guided attention loss
- guided_attention_loss_sigma=0.2 # sigma in guided attention loss
- ))
-
-_C.training = CN(
- dict(
- lr=1e-3, # learning rate
- weight_decay=1e-6, # the coeff of weight decay
- grad_clip_thresh=1.0, # the clip norm of grad clip.
- plot_interval=1000, # plot attention and spectrogram
- valid_interval=1000, # validation
- save_interval=1000, # checkpoint
- max_iteration=500000, # max iteration to train
- ))
-
-
-def get_cfg_defaults():
- """Get a yacs CfgNode object with default values for my_project."""
- # Return a clone so that the defaults will not be altered
- # This is for the "local variable" use pattern
- return _C.clone()
diff --git a/paddlespeech/t2s/exps/tacotron2/ljspeech.py b/paddlespeech/t2s/exps/tacotron2/ljspeech.py
deleted file mode 100644
index 08db2a64..00000000
--- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import pickle
-from pathlib import Path
-
-import numpy as np
-from paddle.io import Dataset
-
-from paddlespeech.t2s.data.batch import batch_spec
-from paddlespeech.t2s.data.batch import batch_text_id
-
-
-class LJSpeech(Dataset):
- """A simple dataset adaptor for the processed ljspeech dataset."""
-
- def __init__(self, root):
- self.root = Path(root).expanduser()
- records = []
- with open(self.root / "metadata.pkl", 'rb') as f:
- metadata = pickle.load(f)
- for mel_name, text, ids in metadata:
- mel_name = self.root / "mel" / (mel_name + ".npy")
- records.append((mel_name, text, ids))
- self.records = records
-
- def __getitem__(self, i):
- mel_name, _, ids = self.records[i]
- mel = np.load(mel_name)
- return ids, mel
-
- def __len__(self):
- return len(self.records)
-
-
-class LJSpeechCollector(object):
- """A simple callable to batch LJSpeech examples."""
-
- def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
- self.padding_idx = padding_idx
- self.padding_value = padding_value
- self.padding_stop_token = padding_stop_token
-
- def __call__(self, examples):
- texts = []
- mels = []
- text_lens = []
- mel_lens = []
-
- for data in examples:
- text, mel = data
- text = np.array(text, dtype=np.int64)
- text_lens.append(len(text))
- mels.append(mel)
- texts.append(text)
- mel_lens.append(mel.shape[1])
-
- # Sort by text_len in descending order
- texts = [
- i for i, _ in sorted(
- zip(texts, text_lens), key=lambda x: x[1], reverse=True)
- ]
- mels = [
- i for i, _ in sorted(
- zip(mels, text_lens), key=lambda x: x[1], reverse=True)
- ]
-
- mel_lens = [
- i for i, _ in sorted(
- zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
- ]
-
- mel_lens = np.array(mel_lens, dtype=np.int64)
- text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64)
-
- # Pad sequence with largest len of the batch
- texts, _ = batch_text_id(texts, pad_id=self.padding_idx)
- mels, _ = batch_spec(mels, pad_value=self.padding_value)
- mels = np.transpose(mels, axes=(0, 2, 1))
-
- return texts, mels, text_lens, mel_lens
diff --git a/paddlespeech/t2s/exps/tacotron2/normalize.py b/paddlespeech/t2s/exps/tacotron2/normalize.py
new file mode 120000
index 00000000..64848f89
--- /dev/null
+++ b/paddlespeech/t2s/exps/tacotron2/normalize.py
@@ -0,0 +1 @@
+../transformer_tts/normalize.py
\ No newline at end of file
diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py
index 480b3331..7f41089e 100644
--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,86 +13,314 @@
# limitations under the License.
import argparse
import os
-import pickle
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+import jsonlines
+import librosa
import numpy as np
import tqdm
+import yaml
+from yacs.config import CfgNode
-from paddlespeech.t2s.audio import AudioProcessor
-from paddlespeech.t2s.audio import LogMagnitude
-from paddlespeech.t2s.datasets import LJSpeechMetaData
-from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
-from paddlespeech.t2s.frontend import EnglishCharacter
-
-
-def create_dataset(config, source_path, target_path, verbose=False):
- # create output dir
- target_path = Path(target_path).expanduser()
- mel_path = target_path / "mel"
- os.makedirs(mel_path, exist_ok=True)
-
- meta_data = LJSpeechMetaData(source_path)
- frontend = EnglishCharacter()
- processor = AudioProcessor(
- sample_rate=config.data.sample_rate,
- n_fft=config.data.n_fft,
- n_mels=config.data.n_mels,
- win_length=config.data.win_length,
- hop_length=config.data.hop_length,
- fmax=config.data.fmax,
- fmin=config.data.fmin)
- normalizer = LogMagnitude()
-
- records = []
- for (fname, text, _) in tqdm.tqdm(meta_data):
- wav = processor.read_wav(fname)
- mel = processor.mel_spectrogram(wav)
- mel = normalizer.transform(mel)
- ids = frontend(text)
- mel_name = os.path.splitext(os.path.basename(fname))[0]
-
- # save mel spectrogram
- records.append((mel_name, text, ids))
- np.save(mel_path / mel_name, mel)
- if verbose:
- print("save mel spectrograms into {}".format(mel_path))
-
- # save meta data as pickle archive
- with open(target_path / "metadata.pkl", 'wb') as f:
- pickle.dump(records, f)
- if verbose:
- print("saved metadata into {}".format(target_path / "metadata.pkl"))
-
- print("Done.")
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
+from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.utils import str2bool
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="create dataset")
+def process_sentence(config: Dict[str, Any],
+ fp: Path,
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None):
+ utt_id = fp.stem
+ # for vctk
+ if utt_id.endswith("_mic2"):
+ utt_id = utt_id[:-5]
+ record = None
+ if utt_id in sentences:
+ # reading, resampling may occur
+ wav, _ = librosa.load(str(fp), sr=config.fs)
+ if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+ return record
+ assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
+ assert np.abs(wav).max(
+ ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ speaker = sentences[utt_id][2]
+ d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
+ # little imprecise than use *.TextGrid directly
+ times = librosa.frames_to_time(
+ d_cumsum, sr=config.fs, hop_length=config.n_shift)
+ if cut_sil:
+ start = 0
+ end = d_cumsum[-1]
+ if phones[0] == "sil" and len(durations) > 1:
+ start = times[1]
+ durations = durations[1:]
+ phones = phones[1:]
+ if phones[-1] == 'sil' and len(durations) > 1:
+ end = times[-2]
+ durations = durations[:-1]
+ phones = phones[:-1]
+ sentences[utt_id][0] = phones
+ sentences[utt_id][1] = durations
+ start, end = librosa.time_to_samples([start, end], sr=config.fs)
+ wav = wav[start:end]
+ # extract mel feats
+ logmel = mel_extractor.get_log_mel_fbank(wav)
+ # change duration according to mel_length
+ compare_duration_and_mel_length(sentences, utt_id, logmel)
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ num_frames = logmel.shape[0]
+ assert sum(durations) == num_frames
+ mel_dir = output_dir / "data_speech"
+ mel_dir.mkdir(parents=True, exist_ok=True)
+ mel_path = mel_dir / (utt_id + "_speech.npy")
+ np.save(mel_path, logmel)
+ record = {
+ "utt_id": utt_id,
+ "phones": phones,
+ "text_lengths": len(phones),
+ "speech_lengths": num_frames,
+ "speech": str(mel_path),
+ "speaker": speaker
+ }
+ if spk_emb_dir:
+ if speaker in os.listdir(spk_emb_dir):
+ embed_name = utt_id + ".npy"
+ embed_path = spk_emb_dir / speaker / embed_name
+ if embed_path.is_file():
+ record["spk_emb"] = str(embed_path)
+ else:
+ return None
+ return record
+
+
+def process_sentences(config,
+ fps: List[Path],
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ nprocs: int=1,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None):
+ if nprocs == 1:
+ results = []
+ for fp in fps:
+ record = process_sentence(config, fp, sentences, output_dir,
+ mel_extractor, cut_sil, spk_emb_dir)
+ if record:
+ results.append(record)
+ else:
+ with ThreadPoolExecutor(nprocs) as pool:
+ futures = []
+ with tqdm.tqdm(total=len(fps)) as progress:
+ for fp in fps:
+ future = pool.submit(process_sentence, config, fp,
+ sentences, output_dir, mel_extractor,
+ cut_sil, spk_emb_dir)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ results = []
+ for ft in futures:
+ record = ft.result()
+ if record:
+ results.append(record)
+
+ results.sort(key=itemgetter("utt_id"))
+ with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+ for item in results:
+ writer.write(item)
+ print("Done")
+
+
+def main():
+ # parse config and args
+ parser = argparse.ArgumentParser(
+ description="Preprocess audio and then extract features.")
+
parser.add_argument(
- "--config",
+ "--dataset",
+ default="baker",
type=str,
- metavar="FILE",
- help="extra config to overwrite the default config")
+ help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
+
parser.add_argument(
- "--input", type=str, help="path of the ljspeech dataset")
+ "--rootdir", default=None, type=str, help="directory to dataset.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump feature files.")
+ parser.add_argument(
+ "--dur-file", default=None, type=str, help="path to durations.txt.")
+
+ parser.add_argument("--config", type=str, help="fastspeech2 config file.")
+
parser.add_argument(
- "--output", type=str, help="path to save output dataset")
+ "--verbose",
+ type=int,
+ default=1,
+ help="logging level. higher is more logging. (default=1)")
parser.add_argument(
- "--opts",
- nargs=argparse.REMAINDER,
- help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
- )
+ "--num-cpu", type=int, default=1, help="number of process.")
+
parser.add_argument(
- "-v", "--verbose", action="store_true", help="print msg")
+ "--cut-sil",
+ type=str2bool,
+ default=True,
+ help="whether cut sil in the edge of audio")
- config = get_cfg_defaults()
+ parser.add_argument(
+ "--spk_emb_dir",
+ default=None,
+ type=str,
+ help="directory to speaker embedding files.")
args = parser.parse_args()
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
- print(config.data)
-
- create_dataset(config, args.input, args.output, args.verbose)
+
+ rootdir = Path(args.rootdir).expanduser()
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+ dur_file = Path(args.dur_file).expanduser()
+
+ if args.spk_emb_dir:
+ spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
+ else:
+ spk_emb_dir = None
+
+ assert rootdir.is_dir()
+ assert dur_file.is_file()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ if args.verbose > 1:
+ print(vars(args))
+ print(config)
+
+ sentences, speaker_set = get_phn_dur(dur_file)
+
+ merge_silence(sentences)
+ phone_id_map_path = dumpdir / "phone_id_map.txt"
+ speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+ get_input_token(sentences, phone_id_map_path, args.dataset)
+ get_spk_id_map(speaker_set, speaker_id_map_path)
+
+ if args.dataset == "baker":
+ wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
+ # split data into 3 sections
+ num_train = 9800
+ num_dev = 100
+ train_wav_files = wav_files[:num_train]
+ dev_wav_files = wav_files[num_train:num_train + num_dev]
+ test_wav_files = wav_files[num_train + num_dev:]
+ elif args.dataset == "aishell3":
+ sub_num_dev = 5
+ wav_dir = rootdir / "train" / "wav"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+
+ elif args.dataset == "ljspeech":
+ wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
+ # split data into 3 sections
+ num_train = 12900
+ num_dev = 100
+ train_wav_files = wav_files[:num_train]
+ dev_wav_files = wav_files[num_train:num_train + num_dev]
+ test_wav_files = wav_files[num_train + num_dev:]
+ elif args.dataset == "vctk":
+ sub_num_dev = 5
+ wav_dir = rootdir / "wav48_silence_trimmed"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+
+ else:
+ print("dataset should in {baker, aishell3, ljspeech, vctk} now!")
+
+ train_dump_dir = dumpdir / "train" / "raw"
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
+ dev_dump_dir = dumpdir / "dev" / "raw"
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
+ test_dump_dir = dumpdir / "test" / "raw"
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ # Extractor
+ mel_extractor = LogMelFBank(
+ sr=config.fs,
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window,
+ n_mels=config.n_mels,
+ fmin=config.fmin,
+ fmax=config.fmax)
+
+ # process for the 3 sections
+ if train_wav_files:
+ process_sentences(
+ config,
+ train_wav_files,
+ sentences,
+ train_dump_dir,
+ mel_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir)
+ if dev_wav_files:
+ process_sentences(
+ config,
+ dev_wav_files,
+ sentences,
+ dev_dump_dir,
+ mel_extractor,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir)
+ if test_wav_files:
+ process_sentences(
+ config,
+ test_wav_files,
+ sentences,
+ test_dump_dir,
+ mel_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb b/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb
deleted file mode 100644
index cc424311..00000000
--- a/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb
+++ /dev/null
@@ -1,342 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## TTS with Tacotron2 + Waveflow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import paddle\n",
- "from matplotlib import pyplot as plt\n",
- "from IPython import display as ipd\n",
- "%matplotlib inline\n",
- "\n",
- "from paddlespeech.t2s.utils import display\n",
- "from paddlespeech.t2s.utils import layer_tools\n",
- "paddle.set_device(\"gpu:0\")\n",
- "\n",
- "import sys\n",
- "sys.path.append(\"../..\")\n",
- "import examples"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Tacotron2: synthesizer model\n",
- "\n",
- "Tacotron2 is used here as a phonemes to spectrogram model. Here we will use an alternative config. In this config, the tacotron2 model does not have a binary classifier to predict whether the generation should stop.\n",
- "\n",
- "Instead, the peak position is used as the criterion. When the peak position of the attention reaches the end of the encoder outputs, it implies that the content is exhausted. So we stop the generated after 10 frames."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "from paddlespeech.t2s.models.tacotron2 import Tacotron2\n",
- "from paddlespeech.t2s.frontend import EnglishCharacter"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "data:\n",
- " batch_size: 32\n",
- " fmax: 8000\n",
- " fmin: 0\n",
- " hop_length: 256\n",
- " n_fft: 1024\n",
- " n_mels: 80\n",
- " padding_idx: 0\n",
- " sample_rate: 22050\n",
- " valid_size: 64\n",
- " win_length: 1024\n",
- "model:\n",
- " attention_filters: 32\n",
- " attention_kernel_size: 31\n",
- " d_attention: 128\n",
- " d_attention_rnn: 1024\n",
- " d_decoder_rnn: 1024\n",
- " d_encoder: 512\n",
- " d_global_condition: None\n",
- " d_postnet: 512\n",
- " d_prenet: 256\n",
- " encoder_conv_layers: 3\n",
- " encoder_kernel_size: 5\n",
- " guided_attention_loss_sigma: 0.2\n",
- " n_tones: None\n",
- " p_attention_dropout: 0.1\n",
- " p_decoder_dropout: 0.1\n",
- " p_encoder_dropout: 0.5\n",
- " p_postnet_dropout: 0.5\n",
- " p_prenet_dropout: 0.5\n",
- " postnet_conv_layers: 5\n",
- " postnet_kernel_size: 5\n",
- " reduction_factor: 1\n",
- " use_guided_attention_loss: True\n",
- " use_stop_token: False\n",
- " vocab_size: 37\n",
- "training:\n",
- " grad_clip_thresh: 1.0\n",
- " lr: 0.001\n",
- " max_iteration: 500000\n",
- " plot_interval: 1000\n",
- " save_interval: 1000\n",
- " valid_interval: 1000\n",
- " weight_decay: 1e-06\n"
- ]
- }
- ],
- "source": [
- "from examples.tacotron2 import config as tacotron2_config\n",
- "synthesizer_config = tacotron2_config.get_cfg_defaults()\n",
- "synthesizer_config.merge_from_file(\"configs/alternative.yaml\")\n",
- "print(synthesizer_config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[checkpoint] Rank 0: loaded model from ../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000.pdparams\n"
- ]
- }
- ],
- "source": [
- "frontend = EnglishCharacter()\n",
- "model = Tacotron2.from_pretrained(\n",
- " synthesizer_config, \"../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000\")\n",
- "model.eval()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " 36%|███▋ | 365/1000 [00:01<00:02, 256.89it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "content exhausted!\n"
- ]
- }
- ],
- "source": [
- "sentence = \"Life was like a box of chocolates, you never know what you're gonna get.\" \n",
- "sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)\n",
- "\n",
- "with paddle.no_grad():\n",
- " outputs = model.infer(sentence)\n",
- "mel_output = outputs[\"mel_outputs_postnet\"][0].numpy().T\n",
- "alignment = outputs[\"alignments\"][0].numpy().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEYCAYAAAB2qXBEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de5xcdZ3n/9f7nKrq+yWdhCY3SLgblQAygKCAgAquP5jZUQd0d+M8WJlx1NHxssDM6qi7s4Pjb1DHZXQyiuKsI+JtwBkUWQRUHDDhFm7BhEDIPeTe6U53V9X57B/ndLrS6Ut1d1XX6c7nmUc9+pxT5/Lp09X9zfl8bzIznHPOuYkKah2Ac8656c0LEuecc5PiBYlzzrlJ8YLEOefcpHhB4pxzblK8IHHOOTcpVStIJJ0q6YmS135JH5HUIeleSWuTr7OqFYNzzrnq01T0I5EUApuBc4EPALvN7CZJNwCzzOz6qgfhnHOuKqaqIHkL8JdmdoGk54GLzWyrpHnAA2Z26mjH51Rn9TRVPU7n3FFEIJUkZcKQqCFLoVG0dxwgwAgVESoiMtEU9KH4sEOKiO6oDoCsighDGNs25+naXRAV8NY3Ndmu3cWy9390dd89ZnZ5Ja5drswUXedq4DvJcqeZbU2WtwGdYx1cTxPn6tJqxeacOwopk0G53KH1YFY7Pa9dwI4zs/zeH/ySxqCftkwP7WEPXcV6zmtYT52KhBr8z3dXlOXhgycSWcD87B6yKpBTkQ9e9WLF4ty5u8gj9ywse//svBfmVOziZap6QSIpB1wJ3Dj0PTMzScM+Ekm6DrgOoJ7GqsbonHPpZRQtqnUQo5qKJ5IrgMfMbHuyvl3SvJLU1o7hDjKzFcAKgFZ1+IBgzrnKkFAuR9DcBMfMpvvEWWy+KMTm9ZLZkKFxKzz2nqVobxcEAdbbh8KAO7sWY/15sAiL4j9Jymbi82QyUChAvoAVCuw6uK1i4RoQke4/gVPR/PcaBtNaAHcBy5Pl5cCdUxCDc85NW9E4/tVCVZ9IJDUBbwb+qGTzTcAdkq4FNgDvqmYMzjk3nRlGMeWjtFe1IDGzbmD2kG27AK85d865MqU9tTVVrbacc672JDIL5lOc287+E5t55cyA4JQDhGuamXNnjtbn9hDs2ktx9x4sXwCLYLinASUteyODfAEgrj8BCMPB9yvAgKIXJM455ybDn0icc85NmAH5o7mOxDnnqko6MvWUpJUUhvHXTPxnTg0NbLv6NP7Hx77Bqu4T+MGLy2j4+SwWrNhL4eWnAbAwJEo6KSoQhIMdFoksTnUpgEAQGVbIU9zff0RYZuX3RB+LYZ7acs45NwkGxXSXI16QOOdcmsUdEtPNCxLn3PQ1kNYaaCV1aD0ABYQL53HwxDlsuizHf3nb/dz6+BK+/OplEBnzeQEsolAsHjrOCgWsZJ1CYfTrS0deu+JEkcq1AqsGL0iccy7FjLh6Js28IHHOuZTzJxLnnKuCoL4eMzvsv+tBexvWOZv9r2rj4OyAPWcWIBLZPfDQu8/g1A1rifr74zTUaCmp4VqDlb4HoAAFilNhCuKJSgZadVWu0VbSIdELEuecc5MQmRckzjnnJsifSJxzzk2KIfIW1jqMUXlB4pybdpTJsOHjZ3HJVY/yjo5VBIpYnDnA2nwbTx48nnu2L2X37ll03NtM68sF6h94nKi39/CTjNZcN6nzUBjGvdiLxbgupPRYK3Jo4sLSnuwV7NUO/kTinHNu0kTRpmIOwonzgsQ551Is7tnuBYlzzk2MhMKQ/EXL2HpeHflX97C4cxfZoEj7P0S8eOUsPrfvPCxfGJJ62sxxtunQallDjJQ06yWKz2VRGWmq4eYeqXAHQk9tOeecmzAzT20555ybpMifSJxzrkwSQXMzwZwODp44h6BgbD+nnsuv/nd+sPosWn7TSM+WenL7i7T/+lkKXV2VHyyxnHRWSbwK
QwjDeGreQ9srF07casufSJxzzk2Yp7acc85Ngrfacs45NymG6Pee7c45NzJlcyiXRbkcam9l89sXELx5F7ObdtCWO8gLv13M6tfByfboYaPyRplM3FS3Uj3JJ1LXYoYVCkdOgFXpapuUp7aqGp2kdknfl7RG0nOSXi+pQ9K9ktYmX2dVMwbnnJvOBirby33VQrWv+iXgp2Z2GrAMeA64AbjPzE4G7kvWnXPODcMQRSv/VQtVS21JagMuBN4LYGb9QL+kq4CLk91uAx4Arq9WHM65lApCglyWYO4cotmtdC9qpvuYkMxBo/0LTeSe20tPUZy6d3U8gRUcln6yyOKJpWyUSahmiKO5sn0J8ArwDUnLgEeBDwOdZrY12Wcb0DncwZKuA64DqKeximE651x6mZH65r/VjC4DnAV8xczOBLoZksay+L8Zw/5XwsxWmNnZZnZ2lroqhumcc2kmonG8aqGaTySbgE1m9kiy/n3igmS7pHlmtlXSPGBHFWNwzqVNEBI0NVJ87Qm8fHETjefvJJfpZc+BgN5tTSz4OeQef4HCvv3x/iOlrSzConT/T70SjKP4icTMtgEbJZ2abLoUeBa4C1iebFsO3FmtGJxzbiZIe6utavcj+RDwbUk5YD3wh8SF1x2SrgU2AO+qcgzOOTdtGSKqUWusclW1IDGzJ4Czh3nr0mpe1zmXMhJhSwu2eD6985rZ8boc/W1Gbi+0f76J3IZdtHVtwQ72Eh08SLGMVlgKQ1CQ9EeMZmzLLQPylu6+4+mOzjnnjnryia2cc85NnJH+IVK8IHHOuZTzJxLn3FFJ2RxYBGFIeMxcDp7ayf7jc8igbo8x+9kiLat3UNy8lWJk8ZzrNo66jjAZEbcwc+tHIJ5qt5JPJJIuJx6+KgS+ZmY3DXn/OOJRR9qTfW4ws7tHO6cXJM45l3KV6kciKQRuAd5M3NdvpaS7zOzZkt3+O3CHmX1F0lLgbmDxaOdNd+LNOeeOcvHEVhXr2X4OsM7M1ifjH94OXDXMJVuT5TZgy1gn9ScS51xlBSGZ+cey79yF9BwTsGdZkbqOgxQLRWyL0bAjYP4vDhCu3URhz754jnSVOfCiFM9BkrD+/hmd1oqNe6rdOZJWlayvMLMVyfICYGPJe5uAc4cc/2ngZ5I+BDQBl411QS9InHMuxeJWW+OqbN9pZsP13yvXNcA3zexvJb0e+CdJrzGzaKQDvCBxzrmUq+DQJ5uBRSXrC5Ntpa4FLgcws3+XVA/MYZRxEb0gcc5NnkSm8xgKxx1D93GNvPLOgxw3Zwu7drfTvLKFBV8zwk1bKe7aEw+2WChw2AS55aanzA5NrWv9M7u11oAKD5GyEjhZ0hLiAuRq4N1D9nmZePSRb0p6FVBPPCXIiLwgcc65FDODvIUVOpcVJH0QuIe4ae+tZvaMpM8Cq8zsLuBjwD9K+jPizNp7zUYvsb0gcc65lKvkoI1Jn5C7h2z7VMnys8AF4zmnFyTOOZdicWor3T01vCBxzo1PMpIvgC2ez64zZ9H19gN86czbeaj7FFbuPp7oH44juH01xyUtTQ0oVDyOgJk86m8pHyLFOefchE2g+e+U84LEOedSzVNbzrmZQkK5HMHxC+l6zRwOzA/pWhIRZYyOnzVz8zXLkoEXt9Ay9qgakzfQI/4oUMbQJzXlBYlzzqWYGRQ9teWcc24yPLXlnJu+gpCgvo5gdgf9J8xl3+J6upaI3mMKqFhkzqqA2U/shbUbiAqTaJdV7qCNQx0FLbYq3LO9Krwgcc65FDOg4E8kzjnnJsNTW8656SUIUTZD0NwEx8zm5Svn0jfLKMwqEBw02p6HJd/dA1t3EHUfJCrkj0wxjSdVdWiOkaOjc+G4mae2nHPOTcLADIlpVtWCRNJLQBdQBApmdrakDuC7xHMAvwS8y8z2VDMO55ybztL+RDIVibc3mdkZJTN23QDcZ2YnA/cl684554YxMERKua9aqEVq6yrg4mT5NuAB4PoaxOGc
A5AI6uoIjj2G3efPZ//xAf2zjHDxAU6fv4Xup2fRuCHDggehccNueGUP0d59cS/2kYynrqNksio3vKP9icSIJ5F/VNJ1ybZOM9uaLG8DOoc7UNJ1klZJWpWnr8phOudcOg30Izman0jeYGabJR0D3CtpTembZmaShv2vi5mtAFYAtKrDm3I4545aR3Vlu5ltTr7ukPQj4Bxgu6R5ZrZV0jxGmVDeOVdFQUjwmpM5/bY1vLphHaHW8vcvXkTPi3PI7QppvbuZA490cOraxw6lsYpmR81AialhR3FqS1KTpJaBZeAtwNPAXcDyZLflwJ3VisE556Y7AwpRUParFqr5RNIJ/Ejx/14ywD+b2U8lrQTukHQtsAF4VxVjcM65ae2oHmvLzNYDy4bZvgu4tFrXdc4NL5zdAZ1z6Dm+ja7jMuw9zYiaigT/8XiePthJtL+L5t4NnMIGsAhIUlkwmM4a6IWevA94b/QpYEdrQeKcc64yZkRlu6RjiSvKDVhpZtuqGpVzzjkgfuCb9qktSf8V+BTwc0DAlyV91sxurXZwzrkKkAgaG+k+/yS2XBBSbIogMpo2Biz86V4KGzbF09aWy9NaU24mpLY+AZyZ1G0gaTbwa8ALEuecq7qZUdm+i3jgxQFdyTbnnHNTYCY8kawDHpF0J3EdyVXAakkfBTCzm6sYn3POHdUGBm1Ms3IKkheS14CBDoQtlQ/HOVcRQUjQUE/QOZeouZ6+Y5t5+fciWp7JMOsho+mFPdjGrUTdPeXXjyjp7DZQRzJa/UgQDi6bT1g1KZb+2zdmQWJmnwGQ1GhmPdUPyTnn3AADiimfanfM6CS9XtKzwJpkfZmkv696ZM4552CGjP77ReCtxGNkYWZPSrqwqlE558ZFmQxWLKIwJOw8hmc/uZBrzn2Y1zU9xbZCG/e+spRX/XErxTXrwIxxz/4x3jlDxtOc2I1p2qe2AMxsow4f8dM/Jc45N0VmQqutjZLOB0xSFvgw8Fx1w3LOOQfJw+AMKEj+GPgSsADYDPwM+JNqBuWcG4M0mO+QUC6H9fYRtLfRdfZCml7K8NjXl/HY47lkLpEdYNvHPpdLpZnQ/PdUM3tP6QZJFwAPVSck55xzpdJezpfTpuzLZW5zzjlXBWYq+1ULIz6RSHo9cD4wd6AXe6IVCIc/yjnnXCUZtSsgyjVaaisHNCf7lPZi3w+8o5pBOedGIaEwhDAkaG2F9hbWfLKdH194C2vzc/noQ4s55X1PYPn+I44bNkeS9ryJI+0/oRELEjN7EHhQ0jfNbAOApABoNrP9UxWgc84d1QwsqtwTiaTLiRtQhcDXzOymYfZ5F/Dp+Oo8aWbvHu2c5dSR/LWkVklNwNPAs5I+Md7gnXPOTUyl6kgkhcAtwBXAUuAaSUuH7HMycCNwgZm9GvjIWPGV02prqZntl/Qe4CfADcCjwOfLONY5N0HKZCAMsf44RZU5biE7bmng9tfeSrdl6AgKbC/mWNvfyd984Wo+/v7LsL4+TgmfPjKtBSOmsJTJYJEdPmHVKPu7qVfBH8U5wDozWw8g6XbiEd2fLdnnfcAtZrYnvrbtGOuk5TyRZJOOiL8L3GVmedKfsnPOuRnBGPcTyRxJq0pe15WcbgGwsWR9U7Kt1CnAKZIekvRwkgobVTlPJP8AvAQ8CfxC0vHEFe7OOeeqzYDxtdraaWZnT+KKGeBk4GJgIfHf/dea2d7RDhiVmf0d8HclmzZIetMkgnTOjUWCMCR/wWvY8oY6eo/rh0ic/L/6+cCjl0JkWCF/KOcxVw8TDeQ/CoVxXcfGs7+riQqmtjYDi0rWFybbSm0CHkmyTy9K+i1xwbJypJOWM4x8p6SvS/pJsr4UWD7O4J1zzk2UjeM1upXAyZKWSMoBV5OM7F7iX4ifRpA0hzjVtX60k5ZTR/JN4B5gfrL+W8qoxR8gKZT0uKR/TdaXSHpE0jpJ302+Geecc8Mqv35krFZbZlYAPkj8N/054A4ze0bSZyVd
mex2D7ArmYfqfuATZrZrtPOWU0cyx8zukHTjQCCSxjOM/MBowa3J+ueAL5jZ7ZK+ClwLfGUc53OutgamVCgZNPEQs8PXS6anVRiCAhQGEARYvgAWxYMqJscFdXUEc2ZzcOk8+ltDdvx+L/m9IY0v5Jj7ZJ7wseeJ+vpGjmukHIgUxzK0ZZabHirYvMnM7gbuHrLtUyXLBnw0eZWlnCeSbkmzSb4VSecB+8o5uaSFwH8AvpasC7gE+H6yy23ErcGcc84Nx6bxWFslPkqcQztR0kPAXMofIuWLwH9jcIiV2cDe5PEKhm96BkDSZO06gHoay7ycc87NQNN4rC0AzOwxSRcBpwICnk9q80cl6e3ADjN7VNLF4w3MzFYAKwBa1eH9VpxzR6+U/wUcsyBJutS/DVic7P8WSZjZzWMcegFwpaS3AfXEdSRfAtolZZKnkuGanjmXbkPrIUZbL5nnPK4LKXDov2ESymRRGPcu3/3OM9n5OmPeaTvo7jvA/q4GTvmrPmzNM/F87Llc3AN9qKF1NiPFPJ451126pLwgKaeO5MfAe4nTUi0lr1GZ2Y1mttDMFhM3Mft5MkHW/QymxpYDd44/bOecO0oMdEgs91UD5dSRLDSz0yt4zeuB2yX9T+Bx4OsVPLdzzs04aR/2rJyC5CeS3mJmP5voRczsAeCBZHk98cBhzk0fYzWtHem9IIyb3JqhTDbujQ6ggHBWGxw7F6KI7pNmsfOtvdQ/00DdXbNoe3En83v3Utj+CkRxSsr6+gjq67HCMKms0uVyUl1D4x/P/m7qpfxHU05B8jDwo2QukjxxhbuZWevohznnnKuI6d5qC7gZeD3wVNJRxTnn3BRSyv/yllOQbASe9kLEzVjDpXaSnuAKQxQGqK0V+vNEPT2HHxqGBJ1zsV17IAyJenpQLgdRhBobUVMD9PVjvX1EJy7A6jJk128jmtvOmmvbOfE1m7lo7loe3r2E096bJdqxE6KIohlKptS1JLWFFPdqH0+rsXL4r3a6lTeGVk2VU5CsBx5IBm08NDZDGc1/nXPOTVrtWmOVq5yC5MXklUtezjnnplLKh0grp2f7Z6YiEOeccyOYrqktSV80s49I+jHDfBtmduUwhzmXHsON0qsABcn2ZCRe1dfBMXMozGkmOFggqs/Q355jzylZ8i1QrDPybRHWXCTYlyHsFVEmrgAt1hnZ/SKqm0+xziCweJC9rEFoIMg294OM/MEsRKK5o4N8PqTtwYDsikZ+ta4N6992RF3FML90Vb1dLqXGP0PilBvtieSfkq///1QE4pxzbnjTttWWmT2aLJ5hZl8qfU/Sh4EHqxmYc865xHQtSEosJx5ssdR7h9nmXM0F9fUQhmA2OHFUZChQ3Cw3ac5LGKKWZqK2Jkxi8yXtHFjWS6bOaGroQ+qlu6uR4v4sQV9A4+aQsC+kd45RrDdUEASQ2ydUEBYa9fsDolz8XpCHTA9ku41Mb0ihXjRvLYBBlGsmu79I7pEnKXZ31/qWOTdpo9WRXAO8G1giqXRO3xZgd7UDc845F5u2qS3g18BWYA7wtyXbu4DV1QzKOedciela2W5mG4ANxMOjOJdOGmyBhUXxnB1maH4nL/3BMfzOFU/zPxfcTQB0RQE9liFvIf2EPHnweJ7oWkRXoY7st9o47XMHKLQ1kNlbRPu7OWb7CwBYoTDy9Sch5V0DXFrMkJ7tzjnnaskLEuecc5OhlD++jlqQJNPsfiuZ2dC52lNpZ8KQcE4HUWcHfXMa2PWaOnrO6aGzYz9dvVnq74UtN57EHz18GVYs+U20KD6+vg41NkB/nmMW7EG9/WS278T680T9/fHUuCNd3zsHuqmU8o/bqAWJmRUlHS8pZ2b9UxWUc865mGx6t9oasB54KGkCfKjRu4/+65xzU2S6ttoq8ULyCoj7kDjnnJtK0/2JZGD0X0mNZtYz1v7OVYsyGdTQQNDWijXUke9sZdtZjRxYZLDgIFGhl2PurqduS0jbizux
rh0Ud+8lioap6wAs3w9dXfHKvv1xc1wrrUsZ5rfX60ZcDaQ9tRWMtYOk10t6FliTrC+T9PdVj8w551zMxvGqgTELEuCLwFuBXQBm9iRwYTWDcs45l7DBCvdyXrVQVj8SM9soHVbZM3yuwLnJCEKCXBZIepMnvdUHBl3c9v5zqL9iB5fNf57O7H5e6p3NDx85m1lPhRxzRxE9tYaoPw8WUUiOLTsVNUL6y7lUmO6pLWCjpPMBk5SV9HHgubEOklQv6TeSnpT0jKSBupYlkh6RtE7SdyX59L3OOTeaGZDa+mPgA8ACYDNwRrI+lj7gEjNblhxzuaTzgM8BXzCzk4A9wLUTCdw5544W0z61ZWY7gXH3bDczAw4kq9nkZcAlxMPTA9wGfBr4ynjP72YWZXMEHe1Ex3VSrAvZ8bpGll39NDfO+ynzM3Fa9Yx7zqD9y7N4/NnFRLv3Eh04wCmsBAXxf8ZsSO/1cQUwTDt9b6Hl0iLlH8XR5iP5MqOEb2Z/OtbJkyFWHgVOAm4h7o+y18wGhlPdRPyk45xzbjjToGf7aKmtVcSFQD1wFrA2eZ0BlFWvYWZFMzsDWAicA5xWbmCSrpO0StKqPH3lHuacczNPyutIRpuP5DYASe8H3jDwFCHpq8Avx3MRM9sr6X7iuU3aJWWS8y0krncZ7pgVwAqAVnWkvDx2zrkqSvlfwHKa/84CWhmcXrc52TYqSXOBfFKINABvJq5ovx94B3A78Xzwd04gbpdSQX394ERQClA2AxIKw3jCqfp6or37DqvDCBob6b7wNF6+yjh36QvMyXWz4YVT2fjpU/jYv0PxQDdYxKmZ1RCIYrEYj8w7UIdhFWi66/UhLqXE9E5tDbgJeFzSNyXdBjwG/K8yjpsH3C9pNbASuNfM/hW4HviopHXAbODrEwvdOeeOEhVMbUm6XNLzSReMG0bZ7/clmaSzxzpnOa22viHpJ8C5SZjXm9m2Mo5bDZw5zPb1xPUlzjnnxlLByvakAdQtxBmiTcBKSXeZ2bND9msBPgw8Us55y50h8RzgjcmyAT8u8zh3tJAIW1qI+vritJMCFERYsYhyOchm0OxZYMb7HnqEixq2klX8QLyzWORdq49l3v/pYP+fh+x5pY8lrAYOH0LB8lWeEkfyFJdLp8p9LM8B1iX/oUfS7cBVwLND9vsfxFURnyjnpOUM2ngTccn0bPL6U0nlpLacc85VQuVSWwuAjSXrR3TBkHQWsMjM/q3c8Mp5InkbcIYlvb2SepLHgT8v9yLOOecmbpxzts+RtKpkfUXSCnbs60gBcDPw3vFcsNzUVjuDrbbaxnMBN8MFIWFbK9FJC+n5qy6uXriSjswB6pXnxOwuioiuKMe2Qjsb8x3sKzRy65svYsXGLYcNlDgniOdPK9Zy8ERPa7k0Gn//kJ1mNlIF+WZgUcn60C4YLcBrgAeSgXqPBe6SdKWZlRZOhymnIPlr4lZb9xO3RLsQGLGm3znnXGVVsPnvSuBkSUuIC5CrGRyyCjPbB8w5dF3pAeDjoxUiUF6rre8kJ/udZFNZrbacc85VSIUKEjMrSPogcA8QArea2TOSPgusMrO7JnLeMQsSSb8H/HzgApLaJf2umf3LRC7ophkJ5XIEJxwHr+ymuGs3CkN63n4W26/pZfmrHqEts4Nf7hG7P7GIO586gejAgThNNDAQ4hEpo41HXIaoOPzAiZOM3dNVbiaoZIdEM7sbuHvItk+NsO/F5ZyznA6Jf5k87gyceC/wl+Wc3DnnXAVM17G2SgxX2JRbSe+cc24yalhAlKucJ5JVkm6WdGLyupl4VGDnnHNVpnG+aqGcJ4sPAZ8Evpus30t5MyS6aU6ZDMpkCObO4eW3z6V3zhwKswqEzXlm/zSk8zv1/GrtGWh/N0QRmf4tFAfqR6D29RO1vr5zlZLyj3I5rba68ea+zjlXM2kf/becVlunAB8HFpfub2aXVC8s55xzh4xz5uipVk5q63vAV4GvcfgYem6m
CkIynXMpzp/NgeOa2XNqSPYA1O8S0d4s9bszzH54B/byZop9fYMppCAsL500XLPcSjb99Wa/biaZBlPtllOQFMzsK1WPxDnn3PBmQEHyY0l/AvwIBidPN7PdIx/inHOuUmbCE8ny5GvpuPQGnFD5cFy1KZuDQBDZkfN7JOmlA79/NlsuL3LF6U/zq81LyN3TTue3VmP9eayQBwUULToyfTSZARcrmYrytJabaVL+kS6n1daSqQjEOefc8NL+RDJih0RJ/61k+Z1D3vOJrZxzbiqMZ3iUGhU4o/Vsv7pk+cYh711ehVicc84NJ+UFyWipLY2wPNy6q5XSEXZLm9AqGLbOwgp5guZmJFEsFg+NuqtMlmDxQvJfzdMWbKT3O8fz0p/lmNe1BoUhUaFQcpIR6kIkgro6rBjFdSkDcQAKhEUWfy0919Dvxes3nDuMSH9qa7SCxEZYHm7dOedctaT8L+5oBckySfuJC8SGZJlkvb7qkTnnnANAKX9SH7EgMbNwKgNxE1T6ATtsuYiyOYKGkjI/m4EgxHp6sIF9JRSGKJdFhSLZD9ZTXLOOOdoeN/EFrFiEIESBDqWqrFiEYZoAR729Q+KL02DJqQ59HfN7cc7FDJTyIVLKGUZ+QiQtknS/pGclPSPpw8n2Dkn3SlqbfJ1VrRicc25GSHlle9UKEqAAfMzMlgLnAR+QtJR4JOH7zOxk4D58ZGHnnBuVrPxXLVRtpkMz2wpsTZa7JD0HLACuAi5OdrsNeAC4vlpxzHilLZ0Up54UhiibQS3NcLAXtTRjrU1QjOhdPIuX35rhLRc+wSc7H6QtyNFjeV4pii/uuJSX37c4Pp+VzKGugCCXxQqFuDd80spLYZaoPz/YOsxTU85VR8p/taZkylxJi4EzgUeAzqSQAdgGdE5FDM45Ny3NkNF/J0VSM/AD4CNmtl8lfR3MzKThb5Gk64DrAOpprHaYzjmXXkdzQSIpS1yIfNvMfphs3i5pnpltlTQP2DHcsWa2AlgB0KqOlN/GKSQRtrWy9T+9mr3L8lzy2uc4rXkrV7Wspr6kTN5dzHJv91I2981iX97oyvfTlu3lwfUtHP9148VPhby3542Hp6PUC6wZXD80ZW6RqLd42HbL92P56powZAwAABPNSURBVH6rzrnp0SGxmq22BHwdeM7Mbi556y4GRxReDtxZrRicc25GMCv/VQPVfCK5APjPwFOSnki2/TlwE3CHpGuBDcC7qhiDc85Ne2l/Iqlmq61fMfKYXJdW67rOOTej1LB/SLmmpNWWmzhlMug1p9B1UgsHOwJ65ovsGXvo+Mc88771PFvDLJt75vBzXYoVo8O6jquuDjU3xSuR0d0dcXK0hqg/TzTcJFTefNe5VNIk5oybCl6QOOdcyh21qS3nnHMVYKQ+W+AFSUopm4t7py+azws3Zjj12A1s2tdG75Y2Ov65jcZV6yke6EZhiBXyh6dQk7lJ7GAv6uvDIhtMeSkYY+RE51za+BOJc865yfGCxDnn3ERNhw6JXpCkQTLXhxUKEIT0/ofX8fJVEbdd8jXu3HMW+Y8uo+/BbcxlG3MhTls1NKDXLSVY+zLRgSieH6RUMvDiEVmskabJdc6lUw07GpbLCxLnnEs5fyJxzjk3OV6QuDFFRcwEQUhm0Xya1u/jVTcbf/2n5wOQ0W+JSkZNViZL1NsHK5+iGISD84EMNZAyG2i1lUyTO9wUueNSOgeKc67q/InEOefcxBlQTHdJ4gWJc86lXNqfSKo5Z7tzzrlKqOAw8pIul/S8pHWSbhjm/Y9KelbSakn3STp+rHN6QVJLQYiyOTKLFtL1rnPh3nmceedLvHBNBz3HtxH19hH19MTNgjX4o7JCyTzpI9WPAEEuS9DYSNBQT1BXF8/lHobxuTTSwMxl8PoR56aUrPzXqOeRQuAW4ApgKXCNpKVDdnscONvMTge+D/zNWPF5QeKcc2lm43yN7hxgnZmtN7N+4HbgqsMuZ3a/mfUkqw8D
C8c6qdeROOdcisU928eVBZgjaVXJ+opk6nKABcDGkvc2AeeOcq5rgZ+MdUEvSKZSEKJshqC5CSKjcNpx7DuxgX0nBPTPirCvLeTRu/awpPAYli8cSltZX9+ELhf19kJvbyW/A+dcLYxvnNWdZnb2ZC8p6T8BZwMXjbWvFyTOOZdy43wiGc1mYFHJ+sJk2+HXky4D/gK4yMzG/J+s15E451yaVbaOZCVwsqQlknLA1cBdpTtIOhP4B+BKM9tRToj+RFJJ0qH5PpTLEbS2wtxZvPCe2Zx4/gZuOeEOQsG2Yh0hxj1dr+XuLa/GftPJCf/SR/CLJ4iSuUTKETQ1xWkvBVi+v8rfnHOuNio3aKOZFSR9ELgHCIFbzewZSZ8FVpnZXcDngWbge4r/Fr1sZleOdl4vSJxzLuUUVa7JvZndDdw9ZNunSpYvG+85vSBxzrk0M1DKJzX1gsQ559Iu5Z2AvSAZamBk25FGuE3qQRQM1oeggKChnsKrl7B2eR3/39mPs3z2L3mqbyHrejtZf+cb6b55IX9y31uwfCHumQ5gRhPraWL94dco80MT9fSk/gPmnKuAlP+aV63VlqRbJe2Q9HTJtg5J90pam3ydVa3rO+fcTCGzsl+1UM3mv98ELh+y7QbgPjM7GbgvWXfOOTeaCg7aWA1VS22Z2S8kLR6y+Srg4mT5NuAB4PpqxTBhw6W1Bpr2AspmUCZD0N4GxSLRsbPZdXobXceJ1jXwzPdO55OPNiZprALH5x+BqDjOzqll8LSWczOfMd6e7VNuqutIOs1sa7K8DegcaUdJ1wHXAdTTOAWhOedc+ojapazKVbPKdjMzaeRBj5NBxlYAtKoj3XfROeeqyQuSw2yXNM/MtkqaB5TV/X7KqLQlVtK7XAFBLgvZLHbq8Rw8tpFXzszwznc8yIl1z3Nibge/7jmZW587n/r7W5h3x/MUd+2mONk50SH1Hx7n3BRJ+d+CqR5r6y5gebK8HLhziq/vnHPTi4GKVvarFqrZ/Pc7wL8Dp0raJOla4CbgzZLWApcl684550ZzFLfaumaEty6t1jUnLAhRIMI5s7HWZqKWevaf3ELmYMTuV2XoX9bNgjl7MTtAoC4Kj85n5TtO5ZGXmrDCIjDjOJ4CaXIprQEpf4x1zk2l2hUQ5fKe7c45l2aGFyTOOecmyfuROOecmwzvR5JWEgpDCEOUyaD6OmhsoNjeyIHFTWw/DzpO2kvP+ll0/LyR4it1ND+3m2j9y5xgmykWi0c+bqb8h+2cm6ZS/rfl6C1InHNuOjCgghNbVYMXJM45l2reait9kl7jYccs1NpC1NJA37HNdC3MUqyD/lahIpz0z11kNh9gbt8uoq4DUCxSjOzQ/CPOOTdlvCBxzjk3YQYU091sywsS55xLtSQTkmJHTUESNDailmZ6lx3HK2fk4PV7WdS+lygqcrA/z84ts2h+PsfsZwo0/XYX0YZNFPr7h5+XpHQ55Y+czrkZIOV/Z46agsQ556Ylb7XlnHNu0vyJxDnn3KR4QVIlI82rDiiTjb/W16FMBjIZus9bwiunZ7AzuujdI467tRnWFwl37qF9307a2QxhGDfz7c9DVBz+upZMoJzyH6xzbqbwfiTOOecmw4DIW20555ybDH8iqRCJoK4OggA1NaFcln3nLWLHO3p532t/xX9seZIA2FBopSPsoV5F9kY5Qoy8hdy08W1sefgETvpMHlvzHFYsHjkJVblNe1P+Q3XOzTAp/5szfQoS55w7GplhxRHqbFPCCxLnnEs770cyeaec3sO//nQVW4sH2VJooNtyvNDfyZfXtHPMt1u4/9Nnct/62XGpXTqUwGGPg9s5ge2jTzSW8sdH59xRKuV/m6ZFQeKcc0ctM2+15ZxzbpL8iWTyfru6kbcvPBvlcgQN9Vi+gBrqWaRXIL+Z4t59ZZ9LdXUoDFFdHRSLhzohRn19KJMhOtiLAmGFwpHHZpLblRxjkcUdF4MQZTOHb0tamamhAcIA
6zlI1NMzZnxh5zEwZxZs3Epx//7hv4dsjgNXnknTD38Tbyj3QyYRNDdjpy0m3LaHwuatgx0vk6mHD33fQRjH09GO6uuJdu7CCoX4fYkD7zyX9l+8SLS/C8sXwKIktTiBD7xEOLtjzHukTGbw/g4IwkP3G7P4ZxGI8NhOyGYobtqKFfIlJwkG058DsQ6cY4opkxn2cza4Q/I9SSiTJZzfifX0UnzllZEPqasjaG0ddZ+BawctLdjx82DtBqLu7nEGX8aApck+QWMjWrII7d5HtGcvhGH8cx6Y12e89z753QpmtVPYum1chwb19US9vUeeMpsjaGrAFs4j6OqmsGFj+bE0Ng7evyoN5Gr+ROKcc27i0t+zvSZT/Um6XNLzktZJuqEWMTjn3LQwMPpvua8amPInEkkhcAvwZmATsFLSXWb27FTH4pxz00LKJ7aSTfEjk6TXA582s7cm6zcCmNlfj3RMqzrsXF06mMcvFlEYxrnXtlbs4EHsYC8EAUFHO7ZvP1aMUBhgvX2YGZYvENTXER08CGYomwOL4nqR/jiHrkBxPUgQEPX2Df7wFAzm4QfyuqV59mS7spm4viDJ+Sqbwwp5grq6Q9cgKsbXUIDl+4Ekb9vXd3jOHuLvsbWZ4q7dR9yToKWFwrITyW3cBWFI4aWNcUwD5xgpV5vcQ2AwPz/Qo18BQS57eA55oF6lvx/r6xu8fn19/D0NyW8HTU0Es9pBorjjFay/P743YRjXVYz0eQtCgob6wTqqYnHwvpdcEzj8Xo1mpIE9U5ImUCaDFYsEDQ3x5w0GP0MDSn+myT2ygfq8kp/HcPVbpfVf8bmG/zwEjY1YX198fFLHZJEdfu2xvo9CIf5ch+Fhn5NDSuuyFKBgcBQJG/hf9NDrjfGzClpaIJ+Pr93QQHTgQDxgq0WHf7ZHq4sZeH/gd7au7tD9D+fOxroOxD+jujqKB7oPrwsd6ZzAI3Yf+223hn1znFqD2XZe5q1l739v/juPmtnZlbh2uWpRR7IAKK3J2gScO3QnSdcB1wHU0zg1kTnnXNp4z/aJM7MVwAqIn0hqHI5zztWOp7aGXHACqS1JXcDzUxPhhM0BdtY6iDJMhzg9xsrwGCtjIjEeb2ZzK3FxST9NYijXTjO7vBLXLlctCpIM8FvgUmAzsBJ4t5k9M8oxq6Y65zde0yFGmB5xeoyV4TFWxnSIsdamPLVlZgVJHwTuAULg1tEKEeecc+lWkzoSM7sbuLsW13bOOVdZNemQOAErah1AGaZDjDA94vQYK8NjrIzpEGNNTXkdiXPOuZllujyROOecS6nUFyRpHZdL0kuSnpL0hKRVybYOSfdKWpt8nTXFMd0qaYekp0u2DRuTYn+X3NfVks6qYYyflrQ5uZdPSHpbyXs3JjE+L6n87r2Ti3GRpPslPSvpGUkfTran5l6OEmPa7mW9pN9IejKJ8zPJ9iWSHkni+a6kXLK9Lllfl7y/uIYxflPSiyX38oxke01+d1LNzFL7Im7V9QJwApADngSW1jquJLaXgDlDtv0NcEOyfAPwuSmO6ULgLODpsWIC3gb8BBBwHvBIDWP8NPDxYfZdmvzM64AlyWchnIIY5wFnJcstxM3Vl6bpXo4SY9rupYDmZDkLPJLcozuAq5PtXwXenyz/CfDVZPlq4Ls1jPGbwDuG2b8mvztpfqX9ieQcYJ2ZrTezfuB24KoaxzSaq4DbkuXbgN+dyoub2S+AoQNzjRTTVcC3LPYw0C5pXo1iHMlVwO1m1mdmLwLriD8TVWVmW83ssWS5C3iOeGif1NzLUWIcSa3upZnZgWQ1m7wMuAT4frJ96L0cuMffBy6VVJExqyYQ40hq8ruTZmkvSIYbl2u0X5apZMDPJD2ajAsG0GlmW5PlbUBnbUI7zEgxpe3efjBJE9xakhKseYxJauVM4v+lpvJeDokRUnYvJYWSngB2APcSPw3tNbOB0SlLYzkUZ/L+PmD2VMdoZgP38q+Se/kFSXVDYxwm
/qNS2guSNHuDmZ0FXAF8QNKFpW9a/AycqiZxaYwp8RXgROAMYCvwt7UNJyapGfgB8BEzO2y6yrTcy2FiTN29NLOimZ0BLCR+CjqtxiEdYWiMkl4D3Egc6+8AHcD1NQwx1dJekGwGFpWsL0y21ZyZbU6+7gB+RPwLsn3gETf5uqN2ER4yUkypubdmtj35RY6Af2Qw5VKzGCVlif9Af9vMfphsTtW9HC7GNN7LAWa2F7gfeD1xOmigQ3RpLIfiTN5vA3bVIMbLk/ShmVkf8A1SdC/TJu0FyUrg5KSFR4648u2uGseEpCZJLQPLwFuAp4ljW57sthy4szYRHmakmO4C/kvSAuU8YF9J2mZKDckv/x7xvYQ4xquTljxLgJOB30xBPAK+DjxnZjeXvJWaezlSjCm8l3MltSfLDcQT2j1H/Mf6HcluQ+/lwD1+B/Dz5OlvqmNcU/KfBhHX4ZTey1T87qRGrWv7x3oRt5D4LXFe9S9qHU8S0wnELWCeBJ4ZiIs4l3sfsBb4v0DHFMf1HeJ0Rp44b3vtSDERtzi5JbmvTwFn1zDGf0piWE38SzqvZP+/SGJ8HrhiimJ8A3HaajXwRPJ6W5ru5Sgxpu1eng48nsTzNPCpZPsJxAXZOuB7QF2yvT5ZX5e8f0INY/x5ci+fBv4Pgy27avK7k+aX92x3zjk3KWlPbTnnnEs5L0icc85NihckzjnnJsULEuecc5PiBYlzzrlJ8YLEVZSkYjJS6jPJaKofk1S1z5mkxSoZSXiC5/jzIeu/nlxUo15rsaR3V+v8ztWCFySu0g6a2Rlm9mrijl1XAH9Z45gOKelNXeqwgsTMzq9iCIsBL0jcjOIFiasai4ePuY54EEElA+N9XtLKZCC8PxrYV9L1iud3eVLSTcm2MyQ9nOz7Iw3O//G6ZL8ngQ+UnGPY80u6WNIvJd0FPFsaY3KthuQp6tvJtgMlxz0o6U5J6yXdJOk9iueueErSicl+cyX9ILnuSkkXJNsv0uBcFo8noyHcBLwx2fZnY8T8C0n/pnj+kK9W88nOuUmpdY9If82sF3BgmG17iUfKvQ7478m2OmAV8dwYVwC/BhqT9wZ6jK8GLkqWPwt8sWT7hcny50nmNhnl/BcD3cCScmIeWE+O20s890cd8XhKn0ne+3BJPP9MPIgnwHHEw5YA/Bi4IFluBjLJOf+15FqjxdxL3AM8JB4194i5MfzlrzS8hnvMd65a3gKcLmlgjKU24jGfLgO+YWY9AGa2W1Ib0G5mDyb73gZ8LxkTqd3ieU0gHhLkijHO3w/8xuJ5OMZrpSXjKEl6AfhZsv0p4E3J8mXAUg1Om9GqeFTeh4CbkyedH5rZJh05tcZYMa9Prv0d4mFRvj/0BM7VmhckrqoknQAUiUfKFfAhM7tnyD6VmvZ1pPNfTPxEMhF9JctRyXrE4O9PAJxnZr1Djr1J0r8Rj4H10Ajf52gxDx2/yMczcqnkOVdXNZLmEk+j+r/NzIB7gPcrHv4cSacoHj35XuAPJTUm2zvMbB+wR9Ibk9P9Z+BBi4f53ivpDcn295RccqTzjyU/cMwE/Qz40MCKBuf2PtHMnjKzzxGPZH0a0EU8NW45MZ+jeOTrAPgD4FeTiNG5qvEnEldpDYpnmssCBeLU08Aw518jbrX0WDI09yvA75rZT5M/vqsk9QN3E7ekWg58NSlg1gN/mJznD4FbJRmDqaYRz19GzCuA1ZIeM7P3jLn3kf4UuEXSauLfqV8Afwx8RNKbiJ9eniGe5zsCiklDgW8CXxol5pXA/wZOIh52/UcTiM25qvPRf51LoSS19XEze3utY3FuLJ7acs45Nyn+ROKcc25S/InEOefcpHhB4pxzblK8IHHOOTcpXpA455ybFC9InHPOTYoXJM455ybl/wG2SGZbPnFFOwAAAABJRU5ErkJggg==\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "fig = display.plot_alignment(alignment)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## WaveFlow: vocoder model\n",
- "Generated spectrogram is converted to raw audio using a pretrained waveflow model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "data:\n",
- " batch_size: 8\n",
- " clip_frames: 65\n",
- " fmax: 8000\n",
- " fmin: 0\n",
- " hop_length: 256\n",
- " n_fft: 1024\n",
- " n_mels: 80\n",
- " sample_rate: 22050\n",
- " valid_size: 16\n",
- " win_length: 1024\n",
- "model:\n",
- " channels: 128\n",
- " kernel_size: [3, 3]\n",
- " n_flows: 8\n",
- " n_group: 16\n",
- " n_layers: 8\n",
- " sigma: 1.0\n",
- " upsample_factors: [16, 16]\n",
- "training:\n",
- " lr: 0.0002\n",
- " max_iteration: 3000000\n",
- " save_interval: 10000\n",
- " valid_interval: 1000\n"
- ]
- }
- ],
- "source": [
- "from examples.waveflow import config as waveflow_config\n",
- "vocoder_config = waveflow_config.get_cfg_defaults()\n",
- "print(vocoder_config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[checkpoint] Rank 0: loaded model from ../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\n"
- ]
- }
- ],
- "source": [
- "vocoder = ConditionalWaveFlow.from_pretrained(\n",
- " vocoder_config, \n",
- " \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000\")\n",
- "layer_tools.recursively_remove_weight_norm(vocoder)\n",
- "vocoder.eval()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "time: 9.412613868713379s\n"
- ]
- }
- ],
- "source": [
- "audio = vocoder.infer(paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1]))\n",
- "wav = audio[0].numpy()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- " Your browser does not support the audio element.\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ipd.Audio(wav, rate=22050)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.py b/paddlespeech/t2s/exps/tacotron2/synthesize.py
deleted file mode 100644
index c73c32d2..00000000
--- a/paddlespeech/t2s/exps/tacotron2/synthesize.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-from pathlib import Path
-
-import numpy as np
-import paddle
-from matplotlib import pyplot as plt
-
-from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
-from paddlespeech.t2s.frontend import EnglishCharacter
-from paddlespeech.t2s.models.tacotron2 import Tacotron2
-from paddlespeech.t2s.utils import display
-
-
-def main(config, args):
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
- paddle.set_device("gpu")
- else:
- print("ngpu should >= 0 !")
-
- # model
- frontend = EnglishCharacter()
- model = Tacotron2.from_pretrained(config, args.checkpoint_path)
- model.eval()
-
- # inputs
- input_path = Path(args.input).expanduser()
- sentences = []
- with open(input_path, "rt") as f:
- for line in f:
- line_list = line.strip().split()
- utt_id = line_list[0]
- sentence = " ".join(line_list[1:])
- sentences.append((utt_id, sentence))
-
- if args.output is None:
- output_dir = input_path.parent / "synthesis"
- else:
- output_dir = Path(args.output).expanduser()
- output_dir.mkdir(exist_ok=True)
-
- for i, sentence in enumerate(sentences):
- sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)
- outputs = model.infer(sentence)
- mel_output = outputs["mel_outputs_postnet"][0].numpy().T
- alignment = outputs["alignments"][0].numpy().T
-
- np.save(str(output_dir / f"sentence_{i}"), mel_output)
- display.plot_alignment(alignment)
- plt.savefig(str(output_dir / f"sentence_{i}.png"))
- if args.verbose:
- print("spectrogram saved at {}".format(output_dir /
- f"sentence_{i}.npy"))
-
-
-if __name__ == "__main__":
- config = get_cfg_defaults()
-
- parser = argparse.ArgumentParser(
- description="generate mel spectrogram with TransformerTTS.")
- parser.add_argument(
- "--config",
- type=str,
- metavar="FILE",
- help="extra config to overwrite the default config")
- parser.add_argument(
- "--checkpoint_path", type=str, help="path of the checkpoint to load.")
- parser.add_argument("--input", type=str, help="path of the text sentences")
- parser.add_argument("--output", type=str, help="path to save outputs")
- parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
- parser.add_argument(
- "--opts",
- nargs=argparse.REMAINDER,
- help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
- )
- parser.add_argument(
- "-v", "--verbose", action="store_true", help="print msg")
-
- args = parser.parse_args()
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
- print(config)
- print(args)
-
- main(config, args)
diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
index 8198348f..69ff80e4 100644
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,210 +11,192 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import time
-from collections import defaultdict
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+import jsonlines
import numpy as np
import paddle
+import yaml
+from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
+from yacs.config import CfgNode
-from paddlespeech.t2s.data import dataset
-from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
-from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeech
-from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeechCollector
+from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.tacotron2 import Tacotron2
-from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
-from paddlespeech.t2s.training.cli import default_argument_parser
-from paddlespeech.t2s.training.experiment import ExperimentBase
-from paddlespeech.t2s.utils import display
-from paddlespeech.t2s.utils import mp_tools
-
-
-class Experiment(ExperimentBase):
- def compute_losses(self, inputs, outputs):
- texts, mel_targets, plens, slens = inputs
-
- mel_outputs = outputs["mel_output"]
- mel_outputs_postnet = outputs["mel_outputs_postnet"]
- attention_weight = outputs["alignments"]
- if self.config.model.use_stop_token:
- stop_logits = outputs["stop_logits"]
- else:
- stop_logits = None
-
- losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
- attention_weight, slens, plens, stop_logits)
- return losses
-
- def train_batch(self):
- start = time.time()
- batch = self.read_batch()
- data_loader_time = time.time() - start
-
- self.optimizer.clear_grad()
- self.model.train()
- texts, mels, text_lens, output_lens = batch
- outputs = self.model(texts, text_lens, mels, output_lens)
- losses = self.compute_losses(batch, outputs)
- loss = losses["loss"]
- loss.backward()
- self.optimizer.step()
- iteration_time = time.time() - start
-
- losses_np = {k: float(v) for k, v in losses.items()}
- # logging
- msg = "Rank: {}, ".format(dist.get_rank())
- msg += "step: {}, ".format(self.iteration)
- msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
- iteration_time)
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in losses_np.items())
- self.logger.info(msg)
-
- if dist.get_rank() == 0:
- for k, v in losses_np.items():
- self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)
-
- @mp_tools.rank_zero_only
- @paddle.no_grad()
- def valid(self):
- valid_losses = defaultdict(list)
- for i, batch in enumerate(self.valid_loader):
- texts, mels, text_lens, output_lens = batch
- outputs = self.model(texts, text_lens, mels, output_lens)
- losses = self.compute_losses(batch, outputs)
- for k, v in losses.items():
- valid_losses[k].append(float(v))
-
- attention_weights = outputs["alignments"]
- self.visualizer.add_figure(
- f"valid_sentence_{i}_alignments",
- display.plot_alignment(attention_weights[0].numpy().T),
- self.iteration)
- self.visualizer.add_figure(
- f"valid_sentence_{i}_target_spectrogram",
- display.plot_spectrogram(mels[0].numpy().T), self.iteration)
- self.visualizer.add_figure(
- f"valid_sentence_{i}_predicted_spectrogram",
- display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
- .numpy().T), self.iteration)
-
- # write visual log
- valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
-
- # logging
- msg = "Valid: "
- msg += "step: {}, ".format(self.iteration)
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in valid_losses.items())
- self.logger.info(msg)
-
- for k, v in valid_losses.items():
- self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)
-
- def setup_model(self):
- config = self.config
- model = Tacotron2(
- vocab_size=config.model.vocab_size,
- d_mels=config.data.n_mels,
- d_encoder=config.model.d_encoder,
- encoder_conv_layers=config.model.encoder_conv_layers,
- encoder_kernel_size=config.model.encoder_kernel_size,
- d_prenet=config.model.d_prenet,
- d_attention_rnn=config.model.d_attention_rnn,
- d_decoder_rnn=config.model.d_decoder_rnn,
- attention_filters=config.model.attention_filters,
- attention_kernel_size=config.model.attention_kernel_size,
- d_attention=config.model.d_attention,
- d_postnet=config.model.d_postnet,
- postnet_kernel_size=config.model.postnet_kernel_size,
- postnet_conv_layers=config.model.postnet_conv_layers,
- reduction_factor=config.model.reduction_factor,
- p_encoder_dropout=config.model.p_encoder_dropout,
- p_prenet_dropout=config.model.p_prenet_dropout,
- p_attention_dropout=config.model.p_attention_dropout,
- p_decoder_dropout=config.model.p_decoder_dropout,
- p_postnet_dropout=config.model.p_postnet_dropout,
- use_stop_token=config.model.use_stop_token)
-
- if self.parallel:
- model = paddle.DataParallel(model)
-
- grad_clip = paddle.nn.ClipGradByGlobalNorm(
- config.training.grad_clip_thresh)
- optimizer = paddle.optimizer.Adam(
- learning_rate=config.training.lr,
- parameters=model.parameters(),
- weight_decay=paddle.regularizer.L2Decay(
- config.training.weight_decay),
- grad_clip=grad_clip)
- criterion = Tacotron2Loss(
- use_stop_token_loss=config.model.use_stop_token,
- use_guided_attention_loss=config.model.use_guided_attention_loss,
- sigma=config.model.guided_attention_loss_sigma)
- self.model = model
- self.optimizer = optimizer
- self.criterion = criterion
-
- def setup_dataloader(self):
- args = self.args
- config = self.config
- ljspeech_dataset = LJSpeech(args.data)
-
- valid_set, train_set = dataset.split(ljspeech_dataset,
- config.data.valid_size)
- batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)
-
- if not self.parallel:
- self.train_loader = DataLoader(
- train_set,
- batch_size=config.data.batch_size,
- shuffle=True,
- drop_last=True,
- collate_fn=batch_fn)
- else:
- sampler = DistributedBatchSampler(
- train_set,
- batch_size=config.data.batch_size,
- shuffle=True,
- drop_last=True)
- self.train_loader = DataLoader(
- train_set, batch_sampler=sampler, collate_fn=batch_fn)
-
- self.valid_loader = DataLoader(
- valid_set,
- batch_size=config.data.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=batch_fn)
-
-
-def main_sp(config, args):
- exp = Experiment(config, args)
- exp.setup()
- exp.resume_or_load()
- exp.run()
-
-
-def main(config, args):
- if args.ngpu > 1:
- dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
+from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator
+from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import build_optimizers
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.t2s.utils import str2bool
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
else:
- main_sp(config, args)
-
+ paddle.set_device("gpu")
+ world_size = paddle.distributed.get_world_size()
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+
+ # dataloader has been too verbose
+ logging.getLogger("DataLoader").disabled = True
+
+ fields = [
+ "text",
+ "text_lengths",
+ "speech",
+ "speech_lengths",
+ ]
+
+ converters = {
+ "speech": np.load,
+ }
+ if args.voice_cloning:
+ print("Training voice cloning!")
+ collate_fn = tacotron2_multi_spk_batch_fn
+ fields += ["spk_emb"]
+ converters["spk_emb"] = np.load
+ else:
+ print("single speaker tacotron2!")
+ collate_fn = tacotron2_single_spk_batch_fn
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = DataTable(
+ data=train_metadata,
+ fields=fields,
+ converters=converters, )
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = DataTable(
+ data=dev_metadata,
+ fields=fields,
+ converters=converters, )
+
+ # collate function and dataloader
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=True)
+
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ shuffle=False,
+ drop_last=False,
+ batch_size=config.batch_size,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+ print("dataloaders done!")
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ odim = config.n_mels
+ model = Tacotron2(idim=vocab_size, odim=odim, **config["model"])
+ if world_size > 1:
+ model = DataParallel(model)
+ print("model done!")
+
+ optimizer = build_optimizers(model, **config["optimizer"])
+ print("optimizer done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
+ updater = Tacotron2Updater(
+ model=model,
+ optimizer=optimizer,
+ dataloader=train_dataloader,
+ output_dir=output_dir,
+ **config["updater"])
+
+ trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+
+ evaluator = Tacotron2Evaluator(
+ model, dev_dataloader, output_dir=output_dir, **config["updater"])
+
+ if dist.get_rank() == 0:
+ trainer.extend(evaluator, trigger=(1, "epoch"))
+ trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+ trainer.run()
+
+
+def main():
+ # parse args and config and redirect to train_sp
+ parser = argparse.ArgumentParser(description="Train a Tacotron2 model.")
+ parser.add_argument("--config", type=str, help="tacotron2 config file.")
+ parser.add_argument("--train-metadata", type=str, help="training data.")
+ parser.add_argument("--dev-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument(
+ "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+
+ parser.add_argument(
+ "--voice-cloning",
+ type=str2bool,
+ default=False,
+ help="whether training voice cloning model.")
-if __name__ == "__main__":
- config = get_cfg_defaults()
- parser = default_argument_parser()
args = parser.parse_args()
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+
+ with open(args.config) as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
print(config)
- print(args)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
- main(config, args)
+ # dispatch
+ if args.ngpu > 1:
+ dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+ else:
+ train_sp(args, config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/transformer_tts/normalize.py b/paddlespeech/t2s/exps/transformer_tts/normalize.py
index 4bb77c79..87e975b8 100644
--- a/paddlespeech/t2s/exps/transformer_tts/normalize.py
+++ b/paddlespeech/t2s/exps/transformer_tts/normalize.py
@@ -130,6 +130,9 @@ def main():
"speech_lengths": item['speech_lengths'],
"speech": str(speech_path),
}
+ # add spk_emb for voice cloning
+ if "spk_emb" in item:
+ record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
index 93158b67..9aa87e91 100644
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@@ -26,20 +26,17 @@ import tqdm
import yaml
from yacs.config import CfgNode as Configuration
-from paddlespeech.t2s.data.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.frontend import English
def get_lj_sentences(file_name, frontend):
- '''
- read MFA duration.txt
- Parameters
- ----------
- file_name : str or Path
- Returns
- ----------
- Dict
- sentence: {'utt': ([char], [int])}
+ '''read MFA duration.txt
+
+ Args:
+ file_name (str or Path)
+ Returns:
+ Dict: sentence: {'utt': ([char], [int])}
'''
f = open(file_name, 'r')
sentence = {}
@@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend):
def get_input_token(sentence, output_path):
- '''
- get phone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], str)}
- output_path : str or path
- path to save phone_id_map
+ '''get phone set from training data and save it
+
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], str)}
+ output_path (str or path): path to save phone_id_map
'''
phn_token = set()
for utt in sentence:
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index 8695c06a..d521ce89 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -148,9 +148,8 @@ def train_sp(args, config):
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
- # print(trainer.extensions)
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
trainer.run()
diff --git a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py
similarity index 57%
rename from paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
rename to paddlespeech/t2s/exps/voice_cloning.py
index 9fbd4964..3de30774 100644
--- a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning.py
@@ -21,17 +21,43 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
-from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
-from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
-from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
+model_alias = {
+ # acoustic model
+ "fastspeech2":
+ "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
+ "fastspeech2_inference":
+ "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
+ "tacotron2":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
+ "tacotron2_inference":
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
+ # voc
+ "pwgan":
+ "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
+ "pwgan_inference":
+ "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
+}
+
+
+def voice_cloning(args):
+ # Init body.
+ with open(args.am_config) as f:
+ am_config = CfgNode(yaml.safe_load(f))
+ with open(args.voc_config) as f:
+ voc_config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(am_config)
+ print(voc_config)
-def voice_cloning(args, fastspeech2_config, pwg_config):
# speaker encoder
p = SpeakerVerificationPreprocessor(
sampling_rate=16000,
@@ -57,40 +83,52 @@ def voice_cloning(args, fastspeech2_config, pwg_config):
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
- odim = fastspeech2_config.n_mels
- model = FastSpeech2(
- idim=vocab_size, odim=odim, **fastspeech2_config["model"])
- model.set_state_dict(
- paddle.load(args.fastspeech2_checkpoint)["main_params"])
- model.eval()
-
- vocoder = PWGGenerator(**pwg_config["generator_params"])
- vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
- vocoder.remove_weight_norm()
- vocoder.eval()
- print("model done!")
+ # acoustic model
+ odim = am_config.n_mels
+ # model: {model_name}_{dataset}
+ am_name = args.am[:args.am.rindex('_')]
+ am_dataset = args.am[args.am.rindex('_') + 1:]
+
+ am_class = dynamic_import(am_name, model_alias)
+ am_inference_class = dynamic_import(am_name + '_inference', model_alias)
+
+ if am_name == 'fastspeech2':
+ am = am_class(
+ idim=vocab_size, odim=odim, spk_num=None, **am_config["model"])
+ elif am_name == 'tacotron2':
+ am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
+
+ am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
+ am.eval()
+ am_mu, am_std = np.load(args.am_stat)
+ am_mu = paddle.to_tensor(am_mu)
+ am_std = paddle.to_tensor(am_std)
+ am_normalizer = ZScore(am_mu, am_std)
+ am_inference = am_inference_class(am_normalizer, am)
+ am_inference.eval()
+ print("acoustic model done!")
+
+ # vocoder
+ # model: {model_name}_{dataset}
+ voc_name = args.voc[:args.voc.rindex('_')]
+ voc_class = dynamic_import(voc_name, model_alias)
+ voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
+ voc = voc_class(**voc_config["generator_params"])
+ voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+ voc.remove_weight_norm()
+ voc.eval()
+ voc_mu, voc_std = np.load(args.voc_stat)
+ voc_mu = paddle.to_tensor(voc_mu)
+ voc_std = paddle.to_tensor(voc_std)
+ voc_normalizer = ZScore(voc_mu, voc_std)
+ voc_inference = voc_inference_class(voc_normalizer, voc)
+ voc_inference.eval()
+ print("voc done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
- stat = np.load(args.fastspeech2_stat)
- mu, std = stat
- mu = paddle.to_tensor(mu)
- std = paddle.to_tensor(std)
- fastspeech2_normalizer = ZScore(mu, std)
-
- stat = np.load(args.pwg_stat)
- mu, std = stat
- mu = paddle.to_tensor(mu)
- std = paddle.to_tensor(std)
- pwg_normalizer = ZScore(mu, std)
-
- fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
- fastspeech2_inference.eval()
- pwg_inference = PWGInference(pwg_normalizer, vocoder)
- pwg_inference.eval()
-
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
@@ -112,24 +150,23 @@ def voice_cloning(args, fastspeech2_config, pwg_config):
# print("spk_emb shape: ", spk_emb.shape)
with paddle.no_grad():
- wav = pwg_inference(
- fastspeech2_inference(phone_ids, spk_emb=spk_emb))
+ wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
- samplerate=fastspeech2_config.fs)
+ samplerate=am_config.fs)
print(f"{utt_id} done!")
# Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
random_spk_emb = np.random.rand(256) * 0.2
random_spk_emb = paddle.to_tensor(random_spk_emb)
utt_id = "random_spk_emb"
with paddle.no_grad():
- wav = pwg_inference(fastspeech2_inference(phone_ids, spk_emb=spk_emb))
+ wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
- samplerate=fastspeech2_config.fs)
+ samplerate=am_config.fs)
print(f"{utt_id} done!")
@@ -137,32 +174,53 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="")
parser.add_argument(
- "--fastspeech2-config", type=str, help="fastspeech2 config file.")
- parser.add_argument(
- "--fastspeech2-checkpoint",
+ '--am',
type=str,
- help="fastspeech2 checkpoint to load.")
+ default='fastspeech2_csmsc',
+ choices=['fastspeech2_aishell3', 'tacotron2_aishell3'],
+ help='Choose acoustic model type of tts task.')
parser.add_argument(
- "--fastspeech2-stat",
+ '--am_config',
type=str,
- help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
- )
+ default=None,
+ help='Config of acoustic model. Use deault config when it is None.')
parser.add_argument(
- "--pwg-config", type=str, help="parallel wavegan config file.")
- parser.add_argument(
- "--pwg-checkpoint",
+ '--am_ckpt',
type=str,
- help="parallel wavegan generator parameters to load.")
+ default=None,
+ help='Checkpoint file of acoustic model.')
parser.add_argument(
- "--pwg-stat",
+ "--am_stat",
type=str,
- help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
+ # vocoder
+ parser.add_argument(
+ '--voc',
+ type=str,
+ default='pwgan_csmsc',
+ choices=['pwgan_aishell3'],
+ help='Choose vocoder type of tts task.')
+
+ parser.add_argument(
+ '--voc_config',
+ type=str,
+ default=None,
+ help='Config of voc. Use deault config when it is None.')
+ parser.add_argument(
+ '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
+ parser.add_argument(
+ "--voc_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training voc."
+ )
parser.add_argument(
"--text",
type=str,
@@ -190,18 +248,7 @@ def main():
else:
print("ngpu should >= 0 !")
- with open(args.fastspeech2_config) as f:
- fastspeech2_config = CfgNode(yaml.safe_load(f))
- with open(args.pwg_config) as f:
- pwg_config = CfgNode(yaml.safe_load(f))
-
- print("========Args========")
- print(yaml.safe_dump(vars(args)))
- print("========Config========")
- print(fastspeech2_config)
- print(pwg_config)
-
- voice_cloning(args, fastspeech2_config, pwg_config)
+ voice_cloning(args)
if __name__ == "__main__":
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py
deleted file mode 100644
index da95582d..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import pickle
-from pathlib import Path
-
-import numpy as np
-from paddle.io import Dataset
-
-from paddlespeech.t2s.data import batch_spec
-from paddlespeech.t2s.data import batch_text_id
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones
-from paddlespeech.t2s.frontend import Vocab
-
-voc_phones = Vocab(sorted(list(_phones)))
-print("vocab_phones:\n", voc_phones)
-voc_tones = Vocab(sorted(list(_tones)))
-print("vocab_tones:\n", voc_tones)
-
-
-class AiShell3(Dataset):
- """Processed AiShell3 dataset."""
-
- def __init__(self, root):
- super().__init__()
- self.root = Path(root).expanduser()
- self.embed_dir = self.root / "embed"
- self.mel_dir = self.root / "mel"
-
- with open(self.root / "metadata.pickle", 'rb') as f:
- self.records = pickle.load(f)
-
- def __getitem__(self, index):
- metadatum = self.records[index]
- sentence_id = metadatum["sentence_id"]
- speaker_id = sentence_id[:7]
- phones = metadatum["phones"]
- tones = metadatum["tones"]
- phones = np.array(
- [voc_phones.lookup(item) for item in phones], dtype=np.int64)
- tones = np.array(
- [voc_tones.lookup(item) for item in tones], dtype=np.int64)
- mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
- embed = np.load(
- str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
- return phones, tones, mel, embed
-
- def __len__(self):
- return len(self.records)
-
-
-def collate_aishell3_examples(examples):
- phones, tones, mel, embed = list(zip(*examples))
-
- text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
- spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
- T_dec = np.max(spec_lengths)
- stop_tokens = (
- np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
- phones, _ = batch_text_id(phones)
- tones, _ = batch_text_id(tones)
- mel, _ = batch_spec(mel)
- mel = np.transpose(mel, (0, 2, 1))
- embed = np.stack(embed)
- # 7 fields
- # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
- return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
-
-
-if __name__ == "__main__":
- dataset = AiShell3("~/datasets/aishell3/train")
- example = dataset[0]
-
- examples = [dataset[i] for i in range(10)]
- batch = collate_aishell3_examples(examples)
-
- for field in batch:
- print(field.shape, field.dtype)
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
deleted file mode 100644
index 12de3bb7..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-from typing import Tuple
-
-from pypinyin import lazy_pinyin
-from pypinyin import Style
-
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable
-
-
-def convert_to_pinyin(text: str) -> List[str]:
- """convert text into list of syllables, other characters that are not chinese, thus
- cannot be converted to pinyin are splited.
- """
- syllables = lazy_pinyin(
- text, style=Style.TONE3, neutral_tone_with_five=True)
- return syllables
-
-
-def convert_sentence(text: str) -> List[Tuple[str]]:
- """convert a sentence into two list: phones and tones"""
- syllables = convert_to_pinyin(text)
- phones = []
- tones = []
- for syllable in syllables:
- p, t = split_syllable(syllable)
- phones.extend(p)
- tones.extend(t)
-
- return phones, tones
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py
deleted file mode 100644
index 8d8c9c4e..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/config.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from yacs.config import CfgNode as CN
-
-_C = CN()
-_C.data = CN(
- dict(
- batch_size=32, # batch size
- valid_size=64, # the first N examples are reserved for validation
- sample_rate=22050, # Hz, sample rate
- n_fft=1024, # fft frame size
- win_length=1024, # window size
- hop_length=256, # hop size between ajacent frame
- fmax=8000, # Hz, max frequency when converting to mel
- fmin=0, # Hz, min frequency when converting to mel
- d_mels=80, # mel bands
- padding_idx=0, # text embedding's padding index
- ))
-
-_C.model = CN(
- dict(
- vocab_size=70,
- n_tones=10,
- reduction_factor=1, # reduction factor
- d_encoder=512, # embedding & encoder's internal size
- encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
- encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
- d_prenet=256, # hidden size of decoder prenet
- # hidden size of the first rnn layer in tacotron2 decoder
- d_attention_rnn=1024,
- # hidden size of the second rnn layer in tacotron2 decoder
- d_decoder_rnn=1024,
- d_attention=128, # hidden size of decoder location linear layer
- attention_filters=32, # number of filter in decoder location conv layer
- attention_kernel_size=31, # kernel size of decoder location conv layer
- d_postnet=512, # hidden size of decoder postnet
- postnet_kernel_size=5, # kernel size of conv layers in postnet
- postnet_conv_layers=5, # number of conv layer in decoder postnet
- p_encoder_dropout=0.5, # droput probability in encoder
- p_prenet_dropout=0.5, # droput probability in decoder prenet
-
- # droput probability of first rnn layer in decoder
- p_attention_dropout=0.1,
- # droput probability of second rnn layer in decoder
- p_decoder_dropout=0.1,
- p_postnet_dropout=0.5, # droput probability in decoder postnet
- guided_attention_loss_sigma=0.2,
- d_global_condition=256,
-
- # whether to use a classifier to predict stop probability
- use_stop_token=False,
- # whether to use guided attention loss in training
- use_guided_attention_loss=True, ))
-
-_C.training = CN(
- dict(
- lr=1e-3, # learning rate
- weight_decay=1e-6, # the coeff of weight decay
- grad_clip_thresh=1.0, # the clip norm of grad clip.
- valid_interval=1000, # validation
- save_interval=1000, # checkpoint
- max_iteration=500000, # max iteration to train
- ))
-
-
-def get_cfg_defaults():
- """Get a yacs CfgNode object with default values for my_project."""
- # Return a clone so that the defaults will not be altered
- # This is for the "local variable" use pattern
- return _C.clone()
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
deleted file mode 100644
index d12466f6..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import multiprocessing as mp
-from functools import partial
-from pathlib import Path
-
-import numpy as np
-import tqdm
-
-from paddlespeech.t2s.audio import AudioProcessor
-from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude
-from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
-
-
-def extract_mel(fname: Path,
- input_dir: Path,
- output_dir: Path,
- p: AudioProcessor,
- n: NormalizerBase):
- relative_path = fname.relative_to(input_dir)
- out_path = (output_dir / relative_path).with_suffix(".npy")
- out_path.parent.mkdir(parents=True, exist_ok=True)
- wav = p.read_wav(fname)
- mel = p.mel_spectrogram(wav)
- mel = n.transform(mel)
- np.save(out_path, mel)
-
-
-def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
- input_dir = Path(input_dir).expanduser()
- fnames = list(input_dir.rglob(f"*{extension}"))
- output_dir = Path(output_dir).expanduser()
- output_dir.mkdir(parents=True, exist_ok=True)
-
- p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
- config.hop_length, config.d_mels, config.fmin,
- config.fmax)
- n = LogMagnitude(1e-5)
-
- func = partial(
- extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
-
- with mp.Pool(16) as pool:
- list(
- tqdm.tqdm(
- pool.imap(func, fnames), total=len(fnames), unit="utterance"))
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
- )
- parser.add_argument(
- "--config",
- type=str,
- help="yaml config file to overwrite the default config")
- parser.add_argument(
- "--input",
- type=str,
- default="~/datasets/aishell3/train/normalized_wav",
- help="path of the processed wav folder")
- parser.add_argument(
- "--output",
- type=str,
- default="~/datasets/aishell3/train/mel",
- help="path of the folder to save mel spectrograms")
- parser.add_argument(
- "--opts",
- nargs=argparse.REMAINDER,
- help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
- )
- default_config = get_cfg_defaults()
-
- args = parser.parse_args()
- if args.config:
- default_config.merge_from_file(args.config)
- if args.opts:
- default_config.merge_from_list(args.opts)
- default_config.freeze()
- audio_config = default_config.data
-
- extract_mel_multispeaker(audio_config, args.input, args.output)
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
deleted file mode 100644
index cc56b55d..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
+++ /dev/null
@@ -1,4150 +0,0 @@
-zhi1 zh iii1
-zhi2 zh iii2
-zhi3 zh iii3
-zhi4 zh iii4
-zhi5 zh iii5
-chi1 ch iii1
-chi2 ch iii2
-chi3 ch iii3
-chi4 ch iii4
-chi5 ch iii5
-shi1 sh iii1
-shi2 sh iii2
-shi3 sh iii3
-shi4 sh iii4
-shi5 sh iii5
-ri1 r iii1
-ri2 r iii2
-ri3 r iii3
-ri4 r iii4
-ri5 r iii5
-zi1 z ii1
-zi2 z ii2
-zi3 z ii3
-zi4 z ii4
-zi5 z ii5
-ci1 c ii1
-ci2 c ii2
-ci3 c ii3
-ci4 c ii4
-ci5 c ii5
-si1 s ii1
-si2 s ii2
-si3 s ii3
-si4 s ii4
-si5 s ii5
-a1 a1
-a2 a2
-a3 a3
-a4 a4
-a5 a5
-ba1 b a1
-ba2 b a2
-ba3 b a3
-ba4 b a4
-ba5 b a5
-pa1 p a1
-pa2 p a2
-pa3 p a3
-pa4 p a4
-pa5 p a5
-ma1 m a1
-ma2 m a2
-ma3 m a3
-ma4 m a4
-ma5 m a5
-fa1 f a1
-fa2 f a2
-fa3 f a3
-fa4 f a4
-fa5 f a5
-da1 d a1
-da2 d a2
-da3 d a3
-da4 d a4
-da5 d a5
-ta1 t a1
-ta2 t a2
-ta3 t a3
-ta4 t a4
-ta5 t a5
-na1 n a1
-na2 n a2
-na3 n a3
-na4 n a4
-na5 n a5
-la1 l a1
-la2 l a2
-la3 l a3
-la4 l a4
-la5 l a5
-ga1 g a1
-ga2 g a2
-ga3 g a3
-ga4 g a4
-ga5 g a5
-ka1 k a1
-ka2 k a2
-ka3 k a3
-ka4 k a4
-ka5 k a5
-ha1 h a1
-ha2 h a2
-ha3 h a3
-ha4 h a4
-ha5 h a5
-zha1 zh a1
-zha2 zh a2
-zha3 zh a3
-zha4 zh a4
-zha5 zh a5
-cha1 ch a1
-cha2 ch a2
-cha3 ch a3
-cha4 ch a4
-cha5 ch a5
-sha1 sh a1
-sha2 sh a2
-sha3 sh a3
-sha4 sh a4
-sha5 sh a5
-za1 z a1
-za2 z a2
-za3 z a3
-za4 z a4
-za5 z a5
-ca1 c a1
-ca2 c a2
-ca3 c a3
-ca4 c a4
-ca5 c a5
-sa1 s a1
-sa2 s a2
-sa3 s a3
-sa4 s a4
-sa5 s a5
-o1 o1
-o2 o2
-o3 o3
-o4 o4
-o5 o5
-bo1 b uo1
-bo2 b uo2
-bo3 b uo3
-bo4 b uo4
-bo5 b uo5
-po1 p uo1
-po2 p uo2
-po3 p uo3
-po4 p uo4
-po5 p uo5
-mo1 m uo1
-mo2 m uo2
-mo3 m uo3
-mo4 m uo4
-mo5 m uo5
-fo1 f uo1
-fo2 f uo2
-fo3 f uo3
-fo4 f uo4
-fo5 f uo5
-lo1 l o1
-lo2 l o2
-lo3 l o3
-lo4 l o4
-lo5 l o5
-e1 e1
-e2 e2
-e3 e3
-e4 e4
-e5 e5
-me1 m e1
-me2 m e2
-me3 m e3
-me4 m e4
-me5 m e5
-de1 d e1
-de2 d e2
-de3 d e3
-de4 d e4
-de5 d e5
-te1 t e1
-te2 t e2
-te3 t e3
-te4 t e4
-te5 t e5
-ne1 n e1
-ne2 n e2
-ne3 n e3
-ne4 n e4
-ne5 n e5
-le1 l e1
-le2 l e2
-le3 l e3
-le4 l e4
-le5 l e5
-ge1 g e1
-ge2 g e2
-ge3 g e3
-ge4 g e4
-ge5 g e5
-ke1 k e1
-ke2 k e2
-ke3 k e3
-ke4 k e4
-ke5 k e5
-he1 h e1
-he2 h e2
-he3 h e3
-he4 h e4
-he5 h e5
-zhe1 zh e1
-zhe2 zh e2
-zhe3 zh e3
-zhe4 zh e4
-zhe5 zh e5
-che1 ch e1
-che2 ch e2
-che3 ch e3
-che4 ch e4
-che5 ch e5
-she1 sh e1
-she2 sh e2
-she3 sh e3
-she4 sh e4
-she5 sh e5
-re1 r e1
-re2 r e2
-re3 r e3
-re4 r e4
-re5 r e5
-ze1 z e1
-ze2 z e2
-ze3 z e3
-ze4 z e4
-ze5 z e5
-ce1 c e1
-ce2 c e2
-ce3 c e3
-ce4 c e4
-ce5 c e5
-se1 s e1
-se2 s e2
-se3 s e3
-se4 s e4
-se5 s e5
-ea1 ea1
-ea2 ea2
-ea3 ea3
-ea4 ea4
-ea5 ea5
-ai1 ai1
-ai2 ai2
-ai3 ai3
-ai4 ai4
-ai5 ai5
-bai1 b ai1
-bai2 b ai2
-bai3 b ai3
-bai4 b ai4
-bai5 b ai5
-pai1 p ai1
-pai2 p ai2
-pai3 p ai3
-pai4 p ai4
-pai5 p ai5
-mai1 m ai1
-mai2 m ai2
-mai3 m ai3
-mai4 m ai4
-mai5 m ai5
-dai1 d ai1
-dai2 d ai2
-dai3 d ai3
-dai4 d ai4
-dai5 d ai5
-tai1 t ai1
-tai2 t ai2
-tai3 t ai3
-tai4 t ai4
-tai5 t ai5
-nai1 n ai1
-nai2 n ai2
-nai3 n ai3
-nai4 n ai4
-nai5 n ai5
-lai1 l ai1
-lai2 l ai2
-lai3 l ai3
-lai4 l ai4
-lai5 l ai5
-gai1 g ai1
-gai2 g ai2
-gai3 g ai3
-gai4 g ai4
-gai5 g ai5
-kai1 k ai1
-kai2 k ai2
-kai3 k ai3
-kai4 k ai4
-kai5 k ai5
-hai1 h ai1
-hai2 h ai2
-hai3 h ai3
-hai4 h ai4
-hai5 h ai5
-zhai1 zh ai1
-zhai2 zh ai2
-zhai3 zh ai3
-zhai4 zh ai4
-zhai5 zh ai5
-chai1 ch ai1
-chai2 ch ai2
-chai3 ch ai3
-chai4 ch ai4
-chai5 ch ai5
-shai1 sh ai1
-shai2 sh ai2
-shai3 sh ai3
-shai4 sh ai4
-shai5 sh ai5
-zai1 z ai1
-zai2 z ai2
-zai3 z ai3
-zai4 z ai4
-zai5 z ai5
-cai1 c ai1
-cai2 c ai2
-cai3 c ai3
-cai4 c ai4
-cai5 c ai5
-sai1 s ai1
-sai2 s ai2
-sai3 s ai3
-sai4 s ai4
-sai5 s ai5
-ei1 ei1
-ei2 ei2
-ei3 ei3
-ei4 ei4
-ei5 ei5
-bei1 b ei1
-bei2 b ei2
-bei3 b ei3
-bei4 b ei4
-bei5 b ei5
-pei1 p ei1
-pei2 p ei2
-pei3 p ei3
-pei4 p ei4
-pei5 p ei5
-mei1 m ei1
-mei2 m ei2
-mei3 m ei3
-mei4 m ei4
-mei5 m ei5
-fei1 f ei1
-fei2 f ei2
-fei3 f ei3
-fei4 f ei4
-fei5 f ei5
-dei1 d ei1
-dei2 d ei2
-dei3 d ei3
-dei4 d ei4
-dei5 d ei5
-tei1 t ei1
-tei2 t ei2
-tei3 t ei3
-tei4 t ei4
-tei5 t ei5
-nei1 n ei1
-nei2 n ei2
-nei3 n ei3
-nei4 n ei4
-nei5 n ei5
-lei1 l ei1
-lei2 l ei2
-lei3 l ei3
-lei4 l ei4
-lei5 l ei5
-gei1 g ei1
-gei2 g ei2
-gei3 g ei3
-gei4 g ei4
-gei5 g ei5
-kei1 k ei1
-kei2 k ei2
-kei3 k ei3
-kei4 k ei4
-kei5 k ei5
-hei1 h ei1
-hei2 h ei2
-hei3 h ei3
-hei4 h ei4
-hei5 h ei5
-zhei1 zh ei1
-zhei2 zh ei2
-zhei3 zh ei3
-zhei4 zh ei4
-zhei5 zh ei5
-shei1 sh ei1
-shei2 sh ei2
-shei3 sh ei3
-shei4 sh ei4
-shei5 sh ei5
-zei1 z ei1
-zei2 z ei2
-zei3 z ei3
-zei4 z ei4
-zei5 z ei5
-ao1 au1
-ao2 au2
-ao3 au3
-ao4 au4
-ao5 au5
-bao1 b au1
-bao2 b au2
-bao3 b au3
-bao4 b au4
-bao5 b au5
-pao1 p au1
-pao2 p au2
-pao3 p au3
-pao4 p au4
-pao5 p au5
-mao1 m au1
-mao2 m au2
-mao3 m au3
-mao4 m au4
-mao5 m au5
-dao1 d au1
-dao2 d au2
-dao3 d au3
-dao4 d au4
-dao5 d au5
-tao1 t au1
-tao2 t au2
-tao3 t au3
-tao4 t au4
-tao5 t au5
-nao1 n au1
-nao2 n au2
-nao3 n au3
-nao4 n au4
-nao5 n au5
-lao1 l au1
-lao2 l au2
-lao3 l au3
-lao4 l au4
-lao5 l au5
-gao1 g au1
-gao2 g au2
-gao3 g au3
-gao4 g au4
-gao5 g au5
-kao1 k au1
-kao2 k au2
-kao3 k au3
-kao4 k au4
-kao5 k au5
-hao1 h au1
-hao2 h au2
-hao3 h au3
-hao4 h au4
-hao5 h au5
-zhao1 zh au1
-zhao2 zh au2
-zhao3 zh au3
-zhao4 zh au4
-zhao5 zh au5
-chao1 ch au1
-chao2 ch au2
-chao3 ch au3
-chao4 ch au4
-chao5 ch au5
-shao1 sh au1
-shao2 sh au2
-shao3 sh au3
-shao4 sh au4
-shao5 sh au5
-rao1 r au1
-rao2 r au2
-rao3 r au3
-rao4 r au4
-rao5 r au5
-zao1 z au1
-zao2 z au2
-zao3 z au3
-zao4 z au4
-zao5 z au5
-cao1 c au1
-cao2 c au2
-cao3 c au3
-cao4 c au4
-cao5 c au5
-sao1 s au1
-sao2 s au2
-sao3 s au3
-sao4 s au4
-sao5 s au5
-ou1 ou1
-ou2 ou2
-ou3 ou3
-ou4 ou4
-ou5 ou5
-pou1 p ou1
-pou2 p ou2
-pou3 p ou3
-pou4 p ou4
-pou5 p ou5
-mou1 m ou1
-mou2 m ou2
-mou3 m ou3
-mou4 m ou4
-mou5 m ou5
-fou1 f ou1
-fou2 f ou2
-fou3 f ou3
-fou4 f ou4
-fou5 f ou5
-dou1 d ou1
-dou2 d ou2
-dou3 d ou3
-dou4 d ou4
-dou5 d ou5
-tou1 t ou1
-tou2 t ou2
-tou3 t ou3
-tou4 t ou4
-tou5 t ou5
-nou1 n ou1
-nou2 n ou2
-nou3 n ou3
-nou4 n ou4
-nou5 n ou5
-lou1 l ou1
-lou2 l ou2
-lou3 l ou3
-lou4 l ou4
-lou5 l ou5
-gou1 g ou1
-gou2 g ou2
-gou3 g ou3
-gou4 g ou4
-gou5 g ou5
-kou1 k ou1
-kou2 k ou2
-kou3 k ou3
-kou4 k ou4
-kou5 k ou5
-hou1 h ou1
-hou2 h ou2
-hou3 h ou3
-hou4 h ou4
-hou5 h ou5
-zhou1 zh ou1
-zhou2 zh ou2
-zhou3 zh ou3
-zhou4 zh ou4
-zhou5 zh ou5
-chou1 ch ou1
-chou2 ch ou2
-chou3 ch ou3
-chou4 ch ou4
-chou5 ch ou5
-shou1 sh ou1
-shou2 sh ou2
-shou3 sh ou3
-shou4 sh ou4
-shou5 sh ou5
-rou1 r ou1
-rou2 r ou2
-rou3 r ou3
-rou4 r ou4
-rou5 r ou5
-zou1 z ou1
-zou2 z ou2
-zou3 z ou3
-zou4 z ou4
-zou5 z ou5
-cou1 c ou1
-cou2 c ou2
-cou3 c ou3
-cou4 c ou4
-cou5 c ou5
-sou1 s ou1
-sou2 s ou2
-sou3 s ou3
-sou4 s ou4
-sou5 s ou5
-an1 an1
-an2 an2
-an3 an3
-an4 an4
-an5 an5
-ban1 b an1
-ban2 b an2
-ban3 b an3
-ban4 b an4
-ban5 b an5
-pan1 p an1
-pan2 p an2
-pan3 p an3
-pan4 p an4
-pan5 p an5
-man1 m an1
-man2 m an2
-man3 m an3
-man4 m an4
-man5 m an5
-fan1 f an1
-fan2 f an2
-fan3 f an3
-fan4 f an4
-fan5 f an5
-dan1 d an1
-dan2 d an2
-dan3 d an3
-dan4 d an4
-dan5 d an5
-tan1 t an1
-tan2 t an2
-tan3 t an3
-tan4 t an4
-tan5 t an5
-nan1 n an1
-nan2 n an2
-nan3 n an3
-nan4 n an4
-nan5 n an5
-lan1 l an1
-lan2 l an2
-lan3 l an3
-lan4 l an4
-lan5 l an5
-gan1 g an1
-gan2 g an2
-gan3 g an3
-gan4 g an4
-gan5 g an5
-kan1 k an1
-kan2 k an2
-kan3 k an3
-kan4 k an4
-kan5 k an5
-han1 h an1
-han2 h an2
-han3 h an3
-han4 h an4
-han5 h an5
-zhan1 zh an1
-zhan2 zh an2
-zhan3 zh an3
-zhan4 zh an4
-zhan5 zh an5
-chan1 ch an1
-chan2 ch an2
-chan3 ch an3
-chan4 ch an4
-chan5 ch an5
-shan1 sh an1
-shan2 sh an2
-shan3 sh an3
-shan4 sh an4
-shan5 sh an5
-ran1 r an1
-ran2 r an2
-ran3 r an3
-ran4 r an4
-ran5 r an5
-zan1 z an1
-zan2 z an2
-zan3 z an3
-zan4 z an4
-zan5 z an5
-can1 c an1
-can2 c an2
-can3 c an3
-can4 c an4
-can5 c an5
-san1 s an1
-san2 s an2
-san3 s an3
-san4 s an4
-san5 s an5
-en1 en1
-en2 en2
-en3 en3
-en4 en4
-en5 en5
-ben1 b en1
-ben2 b en2
-ben3 b en3
-ben4 b en4
-ben5 b en5
-pen1 p en1
-pen2 p en2
-pen3 p en3
-pen4 p en4
-pen5 p en5
-men1 m en1
-men2 m en2
-men3 m en3
-men4 m en4
-men5 m en5
-fen1 f en1
-fen2 f en2
-fen3 f en3
-fen4 f en4
-fen5 f en5
-den1 d en1
-den2 d en2
-den3 d en3
-den4 d en4
-den5 d en5
-nen1 n en1
-nen2 n en2
-nen3 n en3
-nen4 n en4
-nen5 n en5
-gen1 g en1
-gen2 g en2
-gen3 g en3
-gen4 g en4
-gen5 g en5
-ken1 k en1
-ken2 k en2
-ken3 k en3
-ken4 k en4
-ken5 k en5
-hen1 h en1
-hen2 h en2
-hen3 h en3
-hen4 h en4
-hen5 h en5
-zhen1 zh en1
-zhen2 zh en2
-zhen3 zh en3
-zhen4 zh en4
-zhen5 zh en5
-chen1 ch en1
-chen2 ch en2
-chen3 ch en3
-chen4 ch en4
-chen5 ch en5
-shen1 sh en1
-shen2 sh en2
-shen3 sh en3
-shen4 sh en4
-shen5 sh en5
-ren1 r en1
-ren2 r en2
-ren3 r en3
-ren4 r en4
-ren5 r en5
-zen1 z en1
-zen2 z en2
-zen3 z en3
-zen4 z en4
-zen5 z en5
-cen1 c en1
-cen2 c en2
-cen3 c en3
-cen4 c en4
-cen5 c en5
-sen1 s en1
-sen2 s en2
-sen3 s en3
-sen4 s en4
-sen5 s en5
-ang1 ang1
-ang2 ang2
-ang3 ang3
-ang4 ang4
-ang5 ang5
-bang1 b ang1
-bang2 b ang2
-bang3 b ang3
-bang4 b ang4
-bang5 b ang5
-pang1 p ang1
-pang2 p ang2
-pang3 p ang3
-pang4 p ang4
-pang5 p ang5
-mang1 m ang1
-mang2 m ang2
-mang3 m ang3
-mang4 m ang4
-mang5 m ang5
-fang1 f ang1
-fang2 f ang2
-fang3 f ang3
-fang4 f ang4
-fang5 f ang5
-dang1 d ang1
-dang2 d ang2
-dang3 d ang3
-dang4 d ang4
-dang5 d ang5
-tang1 t ang1
-tang2 t ang2
-tang3 t ang3
-tang4 t ang4
-tang5 t ang5
-nang1 n ang1
-nang2 n ang2
-nang3 n ang3
-nang4 n ang4
-nang5 n ang5
-lang1 l ang1
-lang2 l ang2
-lang3 l ang3
-lang4 l ang4
-lang5 l ang5
-gang1 g ang1
-gang2 g ang2
-gang3 g ang3
-gang4 g ang4
-gang5 g ang5
-kang1 k ang1
-kang2 k ang2
-kang3 k ang3
-kang4 k ang4
-kang5 k ang5
-hang1 h ang1
-hang2 h ang2
-hang3 h ang3
-hang4 h ang4
-hang5 h ang5
-zhang1 zh ang1
-zhang2 zh ang2
-zhang3 zh ang3
-zhang4 zh ang4
-zhang5 zh ang5
-chang1 ch ang1
-chang2 ch ang2
-chang3 ch ang3
-chang4 ch ang4
-chang5 ch ang5
-shang1 sh ang1
-shang2 sh ang2
-shang3 sh ang3
-shang4 sh ang4
-shang5 sh ang5
-rang1 r ang1
-rang2 r ang2
-rang3 r ang3
-rang4 r ang4
-rang5 r ang5
-zang1 z ang1
-zang2 z ang2
-zang3 z ang3
-zang4 z ang4
-zang5 z ang5
-cang1 c ang1
-cang2 c ang2
-cang3 c ang3
-cang4 c ang4
-cang5 c ang5
-sang1 s ang1
-sang2 s ang2
-sang3 s ang3
-sang4 s ang4
-sang5 s ang5
-eng1 eng1
-eng2 eng2
-eng3 eng3
-eng4 eng4
-eng5 eng5
-beng1 b eng1
-beng2 b eng2
-beng3 b eng3
-beng4 b eng4
-beng5 b eng5
-peng1 p eng1
-peng2 p eng2
-peng3 p eng3
-peng4 p eng4
-peng5 p eng5
-meng1 m eng1
-meng2 m eng2
-meng3 m eng3
-meng4 m eng4
-meng5 m eng5
-feng1 f eng1
-feng2 f eng2
-feng3 f eng3
-feng4 f eng4
-feng5 f eng5
-deng1 d eng1
-deng2 d eng2
-deng3 d eng3
-deng4 d eng4
-deng5 d eng5
-teng1 t eng1
-teng2 t eng2
-teng3 t eng3
-teng4 t eng4
-teng5 t eng5
-neng1 n eng1
-neng2 n eng2
-neng3 n eng3
-neng4 n eng4
-neng5 n eng5
-leng1 l eng1
-leng2 l eng2
-leng3 l eng3
-leng4 l eng4
-leng5 l eng5
-geng1 g eng1
-geng2 g eng2
-geng3 g eng3
-geng4 g eng4
-geng5 g eng5
-keng1 k eng1
-keng2 k eng2
-keng3 k eng3
-keng4 k eng4
-keng5 k eng5
-heng1 h eng1
-heng2 h eng2
-heng3 h eng3
-heng4 h eng4
-heng5 h eng5
-zheng1 zh eng1
-zheng2 zh eng2
-zheng3 zh eng3
-zheng4 zh eng4
-zheng5 zh eng5
-cheng1 ch eng1
-cheng2 ch eng2
-cheng3 ch eng3
-cheng4 ch eng4
-cheng5 ch eng5
-sheng1 sh eng1
-sheng2 sh eng2
-sheng3 sh eng3
-sheng4 sh eng4
-sheng5 sh eng5
-reng1 r eng1
-reng2 r eng2
-reng3 r eng3
-reng4 r eng4
-reng5 r eng5
-zeng1 z eng1
-zeng2 z eng2
-zeng3 z eng3
-zeng4 z eng4
-zeng5 z eng5
-ceng1 c eng1
-ceng2 c eng2
-ceng3 c eng3
-ceng4 c eng4
-ceng5 c eng5
-seng1 s eng1
-seng2 s eng2
-seng3 s eng3
-seng4 s eng4
-seng5 s eng5
-er1 er1
-er2 er2
-er3 er3
-er4 er4
-er5 er5
-yi1 y i1
-yi2 y i2
-yi3 y i3
-yi4 y i4
-yi5 y i5
-bi1 b i1
-bi2 b i2
-bi3 b i3
-bi4 b i4
-bi5 b i5
-pi1 p i1
-pi2 p i2
-pi3 p i3
-pi4 p i4
-pi5 p i5
-mi1 m i1
-mi2 m i2
-mi3 m i3
-mi4 m i4
-mi5 m i5
-di1 d i1
-di2 d i2
-di3 d i3
-di4 d i4
-di5 d i5
-ti1 t i1
-ti2 t i2
-ti3 t i3
-ti4 t i4
-ti5 t i5
-ni1 n i1
-ni2 n i2
-ni3 n i3
-ni4 n i4
-ni5 n i5
-li1 l i1
-li2 l i2
-li3 l i3
-li4 l i4
-li5 l i5
-ji1 j i1
-ji2 j i2
-ji3 j i3
-ji4 j i4
-ji5 j i5
-qi1 q i1
-qi2 q i2
-qi3 q i3
-qi4 q i4
-qi5 q i5
-xi1 x i1
-xi2 x i2
-xi3 x i3
-xi4 x i4
-xi5 x i5
-ya1 y ia1
-ya2 y ia2
-ya3 y ia3
-ya4 y ia4
-ya5 y ia5
-dia1 d ia1
-dia2 d ia2
-dia3 d ia3
-dia4 d ia4
-dia5 d ia5
-lia1 l ia1
-lia2 l ia2
-lia3 l ia3
-lia4 l ia4
-lia5 l ia5
-jia1 j ia1
-jia2 j ia2
-jia3 j ia3
-jia4 j ia4
-jia5 j ia5
-qia1 q ia1
-qia2 q ia2
-qia3 q ia3
-qia4 q ia4
-qia5 q ia5
-xia1 x ia1
-xia2 x ia2
-xia3 x ia3
-xia4 x ia4
-xia5 x ia5
-yo1 y io1
-yo2 y io2
-yo3 y io3
-yo4 y io4
-yo5 y io5
-ye1 y ie1
-ye2 y ie2
-ye3 y ie3
-ye4 y ie4
-ye5 y ie5
-bie1 b ie1
-bie2 b ie2
-bie3 b ie3
-bie4 b ie4
-bie5 b ie5
-pie1 p ie1
-pie2 p ie2
-pie3 p ie3
-pie4 p ie4
-pie5 p ie5
-mie1 m ie1
-mie2 m ie2
-mie3 m ie3
-mie4 m ie4
-mie5 m ie5
-die1 d ie1
-die2 d ie2
-die3 d ie3
-die4 d ie4
-die5 d ie5
-tie1 t ie1
-tie2 t ie2
-tie3 t ie3
-tie4 t ie4
-tie5 t ie5
-nie1 n ie1
-nie2 n ie2
-nie3 n ie3
-nie4 n ie4
-nie5 n ie5
-lie1 l ie1
-lie2 l ie2
-lie3 l ie3
-lie4 l ie4
-lie5 l ie5
-jie1 j ie1
-jie2 j ie2
-jie3 j ie3
-jie4 j ie4
-jie5 j ie5
-qie1 q ie1
-qie2 q ie2
-qie3 q ie3
-qie4 q ie4
-qie5 q ie5
-xie1 x ie1
-xie2 x ie2
-xie3 x ie3
-xie4 x ie4
-xie5 x ie5
-yai1 y ai1
-yai2 y ai2
-yai3 y ai3
-yai4 y ai4
-yai5 y ai5
-yao1 y au1
-yao2 y au2
-yao3 y au3
-yao4 y au4
-yao5 y au5
-biao1 b iau1
-biao2 b iau2
-biao3 b iau3
-biao4 b iau4
-biao5 b iau5
-piao1 p iau1
-piao2 p iau2
-piao3 p iau3
-piao4 p iau4
-piao5 p iau5
-miao1 m iau1
-miao2 m iau2
-miao3 m iau3
-miao4 m iau4
-miao5 m iau5
-fiao1 f iau1
-fiao2 f iau2
-fiao3 f iau3
-fiao4 f iau4
-fiao5 f iau5
-diao1 d iau1
-diao2 d iau2
-diao3 d iau3
-diao4 d iau4
-diao5 d iau5
-tiao1 t iau1
-tiao2 t iau2
-tiao3 t iau3
-tiao4 t iau4
-tiao5 t iau5
-niao1 n iau1
-niao2 n iau2
-niao3 n iau3
-niao4 n iau4
-niao5 n iau5
-liao1 l iau1
-liao2 l iau2
-liao3 l iau3
-liao4 l iau4
-liao5 l iau5
-jiao1 j iau1
-jiao2 j iau2
-jiao3 j iau3
-jiao4 j iau4
-jiao5 j iau5
-qiao1 q iau1
-qiao2 q iau2
-qiao3 q iau3
-qiao4 q iau4
-qiao5 q iau5
-xiao1 x iau1
-xiao2 x iau2
-xiao3 x iau3
-xiao4 x iau4
-xiao5 x iau5
-you1 y iou1
-you2 y iou2
-you3 y iou3
-you4 y iou4
-you5 y iou5
-miu1 m iou1
-miu2 m iou2
-miu3 m iou3
-miu4 m iou4
-miu5 m iou5
-diu1 d iou1
-diu2 d iou2
-diu3 d iou3
-diu4 d iou4
-diu5 d iou5
-niu1 n iou1
-niu2 n iou2
-niu3 n iou3
-niu4 n iou4
-niu5 n iou5
-liu1 l iou1
-liu2 l iou2
-liu3 l iou3
-liu4 l iou4
-liu5 l iou5
-jiu1 j iou1
-jiu2 j iou2
-jiu3 j iou3
-jiu4 j iou4
-jiu5 j iou5
-qiu1 q iou1
-qiu2 q iou2
-qiu3 q iou3
-qiu4 q iou4
-qiu5 q iou5
-xiu1 xiou1
-xiu2 xiou2
-xiu3 xiou3
-xiu4 xiou4
-xiu5 xiou5
-yan1 y ian1
-yan2 y ian2
-yan3 y ian3
-yan4 y ian4
-yan5 y ian5
-bian1 b ian1
-bian2 b ian2
-bian3 b ian3
-bian4 b ian4
-bian5 b ian5
-pian1 p ian1
-pian2 p ian2
-pian3 p ian3
-pian4 p ian4
-pian5 p ian5
-mian1 m ian1
-mian2 m ian2
-mian3 m ian3
-mian4 m ian4
-mian5 m ian5
-dian1 d ian1
-dian2 d ian2
-dian3 d ian3
-dian4 d ian4
-dian5 d ian5
-tian1 t ian1
-tian2 t ian2
-tian3 t ian3
-tian4 t ian4
-tian5 t ian5
-nian1 n ian1
-nian2 n ian2
-nian3 n ian3
-nian4 n ian4
-nian5 n ian5
-lian1 l ian1
-lian2 l ian2
-lian3 l ian3
-lian4 l ian4
-lian5 l ian5
-jian1 j ian1
-jian2 j ian2
-jian3 j ian3
-jian4 j ian4
-jian5 j ian5
-qian1 q ian1
-qian2 q ian2
-qian3 q ian3
-qian4 q ian4
-qian5 q ian5
-xian1 x ian1
-xian2 x ian2
-xian3 x ian3
-xian4 x ian4
-xian5 x ian5
-yin1 y in1
-yin2 y in2
-yin3 y in3
-yin4 y in4
-yin5 y in5
-bin1 b in1
-bin2 b in2
-bin3 b in3
-bin4 b in4
-bin5 b in5
-pin1 p in1
-pin2 p in2
-pin3 p in3
-pin4 p in4
-pin5 p in5
-min1 m in1
-min2 m in2
-min3 m in3
-min4 m in4
-min5 m in5
-din1 d in1
-din2 d in2
-din3 d in3
-din4 d in4
-din5 d in5
-nin1 n in1
-nin2 n in2
-nin3 n in3
-nin4 n in4
-nin5 n in5
-lin1 l in1
-lin2 l in2
-lin3 l in3
-lin4 l in4
-lin5 l in5
-jin1 j in1
-jin2 j in2
-jin3 j in3
-jin4 j in4
-jin5 j in5
-qin1 q in1
-qin2 q in2
-qin3 q in3
-qin4 q in4
-qin5 q in5
-xin1 x in1
-xin2 x in2
-xin3 x in3
-xin4 x in4
-xin5 x in5
-yang1 y iang1
-yang2 y iang2
-yang3 y iang3
-yang4 y iang4
-yang5 y iang5
-biang1 b iang1
-biang2 b iang2
-biang3 b iang3
-biang4 b iang4
-biang5 b iang5
-niang1 n iang1
-niang2 n iang2
-niang3 n iang3
-niang4 n iang4
-niang5 n iang5
-liang1 l iang1
-liang2 l iang2
-liang3 l iang3
-liang4 l iang4
-liang5 l iang5
-jiang1 j iang1
-jiang2 j iang2
-jiang3 j iang3
-jiang4 j iang4
-jiang5 j iang5
-qiang1 q iang1
-qiang2 q iang2
-qiang3 q iang3
-qiang4 q iang4
-qiang5 q iang5
-xiang1 x iang1
-xiang2 x iang2
-xiang3 x iang3
-xiang4 x iang4
-xiang5 x iang5
-ying1 y ing1
-ying2 y ing2
-ying3 y ing3
-ying4 y ing4
-ying5 y ing5
-bing1 b ing1
-bing2 b ing2
-bing3 b ing3
-bing4 b ing4
-bing5 b ing5
-ping1 p ing1
-ping2 p ing2
-ping3 p ing3
-ping4 p ing4
-ping5 p ing5
-ming1 m ing1
-ming2 m ing2
-ming3 m ing3
-ming4 m ing4
-ming5 m ing5
-ding1 d ing1
-ding2 d ing2
-ding3 d ing3
-ding4 d ing4
-ding5 d ing5
-ting1 t ing1
-ting2 t ing2
-ting3 t ing3
-ting4 t ing4
-ting5 t ing5
-ning1 n ing1
-ning2 n ing2
-ning3 n ing3
-ning4 n ing4
-ning5 n ing5
-ling1 l ing1
-ling2 l ing2
-ling3 l ing3
-ling4 l ing4
-ling5 l ing5
-jing1 j ing1
-jing2 j ing2
-jing3 j ing3
-jing4 j ing4
-jing5 j ing5
-qing1 q ing1
-qing2 q ing2
-qing3 q ing3
-qing4 q ing4
-qing5 q ing5
-xing1 x ing1
-xing2 x ing2
-xing3 x ing3
-xing4 x ing4
-xing5 x ing5
-wu1 w u1
-wu2 w u2
-wu3 w u3
-wu4 w u4
-wu5 w u5
-bu1 b u1
-bu2 b u2
-bu3 b u3
-bu4 b u4
-bu5 b u5
-pu1 p u1
-pu2 p u2
-pu3 p u3
-pu4 p u4
-pu5 p u5
-mu1 m u1
-mu2 m u2
-mu3 m u3
-mu4 m u4
-mu5 m u5
-fu1 f u1
-fu2 f u2
-fu3 f u3
-fu4 f u4
-fu5 f u5
-du1 d u1
-du2 d u2
-du3 d u3
-du4 d u4
-du5 d u5
-tu1 t u1
-tu2 t u2
-tu3 t u3
-tu4 t u4
-tu5 t u5
-nu1 n u1
-nu2 n u2
-nu3 n u3
-nu4 n u4
-nu5 n u5
-lu1 l u1
-lu2 l u2
-lu3 l u3
-lu4 l u4
-lu5 l u5
-gu1 g u1
-gu2 g u2
-gu3 g u3
-gu4 g u4
-gu5 g u5
-ku1 k u1
-ku2 k u2
-ku3 k u3
-ku4 k u4
-ku5 k u5
-hu1 h u1
-hu2 h u2
-hu3 h u3
-hu4 h u4
-hu5 h u5
-zhu1 zh u1
-zhu2 zh u2
-zhu3 zh u3
-zhu4 zh u4
-zhu5 zh u5
-chu1 ch u1
-chu2 ch u2
-chu3 ch u3
-chu4 ch u4
-chu5 ch u5
-shu1 sh u1
-shu2 sh u2
-shu3 sh u3
-shu4 sh u4
-shu5 sh u5
-ru1 r u1
-ru2 r u2
-ru3 r u3
-ru4 r u4
-ru5 r u5
-zu1 z u1
-zu2 z u2
-zu3 z u3
-zu4 z u4
-zu5 z u5
-cu1 c u1
-cu2 c u2
-cu3 c u3
-cu4 c u4
-cu5 c u5
-su1 s u1
-su2 s u2
-su3 s u3
-su4 s u4
-su5 s u5
-wa1 w ua1
-wa2 w ua2
-wa3 w ua3
-wa4 w ua4
-wa5 w ua5
-gua1 g ua1
-gua2 g ua2
-gua3 g ua3
-gua4 g ua4
-gua5 g ua5
-kua1 k ua1
-kua2 k ua2
-kua3 k ua3
-kua4 k ua4
-kua5 k ua5
-hua1 h ua1
-hua2 h ua2
-hua3 h ua3
-hua4 h ua4
-hua5 h ua5
-zhua1 zh ua1
-zhua2 zh ua2
-zhua3 zh ua3
-zhua4 zh ua4
-zhua5 zh ua5
-chua1 ch ua1
-chua2 ch ua2
-chua3 ch ua3
-chua4 ch ua4
-chua5 ch ua5
-shua1 sh ua1
-shua2 sh ua2
-shua3 sh ua3
-shua4 sh ua4
-shua5 sh ua5
-wo1 w uo1
-wo2 w uo2
-wo3 w uo3
-wo4 w uo4
-wo5 w uo5
-duo1 d uo1
-duo2 d uo2
-duo3 d uo3
-duo4 d uo4
-duo5 d uo5
-tuo1 t uo1
-tuo2 t uo2
-tuo3 t uo3
-tuo4 t uo4
-tuo5 t uo5
-nuo1 n uo1
-nuo2 n uo2
-nuo3 n uo3
-nuo4 n uo4
-nuo5 n uo5
-luo1 l uo1
-luo2 l uo2
-luo3 l uo3
-luo4 l uo4
-luo5 l uo5
-guo1 g uo1
-guo2 g uo2
-guo3 g uo3
-guo4 g uo4
-guo5 g uo5
-kuo1 k uo1
-kuo2 k uo2
-kuo3 k uo3
-kuo4 k uo4
-kuo5 k uo5
-huo1 h uo1
-huo2 h uo2
-huo3 h uo3
-huo4 h uo4
-huo5 h uo5
-zhuo1 zh uo1
-zhuo2 zh uo2
-zhuo3 zh uo3
-zhuo4 zh uo4
-zhuo5 zh uo5
-chuo1 ch uo1
-chuo2 ch uo2
-chuo3 ch uo3
-chuo4 ch uo4
-chuo5 ch uo5
-shuo1 sh uo1
-shuo2 sh uo2
-shuo3 sh uo3
-shuo4 sh uo4
-shuo5 sh uo5
-ruo1 r uo1
-ruo2 r uo2
-ruo3 r uo3
-ruo4 r uo4
-ruo5 r uo5
-zuo1 z uo1
-zuo2 z uo2
-zuo3 z uo3
-zuo4 z uo4
-zuo5 z uo5
-cuo1 c uo1
-cuo2 c uo2
-cuo3 c uo3
-cuo4 c uo4
-cuo5 c uo5
-suo1 s uo1
-suo2 s uo2
-suo3 s uo3
-suo4 s uo4
-suo5 s uo5
-wai1 w uai1
-wai2 w uai2
-wai3 w uai3
-wai4 w uai4
-wai5 w uai5
-guai1 g uai1
-guai2 g uai2
-guai3 g uai3
-guai4 g uai4
-guai5 g uai5
-kuai1 k uai1
-kuai2 k uai2
-kuai3 k uai3
-kuai4 k uai4
-kuai5 k uai5
-huai1 h uai1
-huai2 h uai2
-huai3 h uai3
-huai4 h uai4
-huai5 h uai5
-zhuai1 zh uai1
-zhuai2 zh uai2
-zhuai3 zh uai3
-zhuai4 zh uai4
-zhuai5 zh uai5
-chuai1 ch uai1
-chuai2 ch uai2
-chuai3 ch uai3
-chuai4 ch uai4
-chuai5 ch uai5
-shuai1 sh uai1
-shuai2 sh uai2
-shuai3 sh uai3
-shuai4 sh uai4
-shuai5 sh uai5
-wei1 w uei1
-wei2 w uei2
-wei3 w uei3
-wei4 w uei4
-wei5 w uei5
-dui1 d uei1
-dui2 d uei2
-dui3 d uei3
-dui4 d uei4
-dui5 d uei5
-tui1 t uei1
-tui2 t uei2
-tui3 t uei3
-tui4 t uei4
-tui5 t uei5
-gui1 g uei1
-gui2 g uei2
-gui3 g uei3
-gui4 g uei4
-gui5 g uei5
-kui1 k uei1
-kui2 k uei2
-kui3 k uei3
-kui4 k uei4
-kui5 k uei5
-hui1 h uei1
-hui2 h uei2
-hui3 h uei3
-hui4 h uei4
-hui5 h uei5
-zhui1 zh uei1
-zhui2 zh uei2
-zhui3 zh uei3
-zhui4 zh uei4
-zhui5 zh uei5
-chui1 ch uei1
-chui2 ch uei2
-chui3 ch uei3
-chui4 ch uei4
-chui5 ch uei5
-shui1 sh uei1
-shui2 sh uei2
-shui3 sh uei3
-shui4 sh uei4
-shui5 sh uei5
-rui1 r uei1
-rui2 r uei2
-rui3 r uei3
-rui4 r uei4
-rui5 r uei5
-zui1 z uei1
-zui2 z uei2
-zui3 z uei3
-zui4 z uei4
-zui5 z uei5
-cui1 c uei1
-cui2 c uei2
-cui3 c uei3
-cui4 c uei4
-cui5 c uei5
-sui1 s uei1
-sui2 s uei2
-sui3 s uei3
-sui4 s uei4
-sui5 s uei5
-wan1 w uan1
-wan2 w uan2
-wan3 w uan3
-wan4 w uan4
-wan5 w uan5
-duan1 d uan1
-duan2 d uan2
-duan3 d uan3
-duan4 d uan4
-duan5 d uan5
-tuan1 t uan1
-tuan2 t uan2
-tuan3 t uan3
-tuan4 t uan4
-tuan5 t uan5
-nuan1 n uan1
-nuan2 n uan2
-nuan3 n uan3
-nuan4 n uan4
-nuan5 n uan5
-luan1 l uan1
-luan2 l uan2
-luan3 l uan3
-luan4 l uan4
-luan5 l uan5
-guan1 g uan1
-guan2 g uan2
-guan3 g uan3
-guan4 g uan4
-guan5 g uan5
-kuan1 k uan1
-kuan2 k uan2
-kuan3 k uan3
-kuan4 k uan4
-kuan5 k uan5
-huan1 h uan1
-huan2 h uan2
-huan3 h uan3
-huan4 h uan4
-huan5 h uan5
-zhuan1 zh uan1
-zhuan2 zh uan2
-zhuan3 zh uan3
-zhuan4 zh uan4
-zhuan5 zh uan5
-chuan1 ch uan1
-chuan2 ch uan2
-chuan3 ch uan3
-chuan4 ch uan4
-chuan5 ch uan5
-shuan1 sh uan1
-shuan2 sh uan2
-shuan3 sh uan3
-shuan4 sh uan4
-shuan5 sh uan5
-ruan1 r uan1
-ruan2 r uan2
-ruan3 r uan3
-ruan4 r uan4
-ruan5 r uan5
-zuan1 z uan1
-zuan2 z uan2
-zuan3 z uan3
-zuan4 z uan4
-zuan5 z uan5
-cuan1 c uan1
-cuan2 c uan2
-cuan3 c uan3
-cuan4 c uan4
-cuan5 c uan5
-suan1 s uan1
-suan2 s uan2
-suan3 s uan3
-suan4 s uan4
-suan5 s uan5
-wen1 w uen1
-wen2 w uen2
-wen3 w uen3
-wen4 w uen4
-wen5 w uen5
-dun1 d uen1
-dun2 d uen2
-dun3 d uen3
-dun4 d uen4
-dun5 d uen5
-tun1 t uen1
-tun2 t uen2
-tun3 t uen3
-tun4 t uen4
-tun5 t uen5
-nun1 n uen1
-nun2 n uen2
-nun3 n uen3
-nun4 n uen4
-nun5 n uen5
-lun1 l uen1
-lun2 l uen2
-lun3 l uen3
-lun4 l uen4
-lun5 l uen5
-gun1 g uen1
-gun2 g uen2
-gun3 g uen3
-gun4 g uen4
-gun5 g uen5
-kun1 k uen1
-kun2 k uen2
-kun3 k uen3
-kun4 k uen4
-kun5 k uen5
-hun1 h uen1
-hun2 h uen2
-hun3 h uen3
-hun4 h uen4
-hun5 h uen5
-zhun1 zh uen1
-zhun2 zh uen2
-zhun3 zh uen3
-zhun4 zh uen4
-zhun5 zh uen5
-chun1 ch uen1
-chun2 ch uen2
-chun3 ch uen3
-chun4 ch uen4
-chun5 ch uen5
-shun1 sh uen1
-shun2 sh uen2
-shun3 sh uen3
-shun4 sh uen4
-shun5 sh uen5
-run1 r uen1
-run2 r uen2
-run3 r uen3
-run4 r uen4
-run5 r uen5
-zun1 z uen1
-zun2 z uen2
-zun3 z uen3
-zun4 z uen4
-zun5 z uen5
-cun1 c uen1
-cun2 c uen2
-cun3 c uen3
-cun4 c uen4
-cun5 c uen5
-sun1 s uen1
-sun2 s uen2
-sun3 s uen3
-sun4 s uen4
-sun5 s uen5
-wang1 w uang1
-wang2 w uang2
-wang3 w uang3
-wang4 w uang4
-wang5 w uang5
-guang1 g uang1
-guang2 g uang2
-guang3 g uang3
-guang4 g uang4
-guang5 g uang5
-kuang1 k uang1
-kuang2 k uang2
-kuang3 k uang3
-kuang4 k uang4
-kuang5 k uang5
-huang1 h uang1
-huang2 h uang2
-huang3 h uang3
-huang4 h uang4
-huang5 h uang5
-zhuang1 zh uang1
-zhuang2 zh uang2
-zhuang3 zh uang3
-zhuang4 zh uang4
-zhuang5 zh uang5
-chuang1 ch uang1
-chuang2 ch uang2
-chuang3 ch uang3
-chuang4 ch uang4
-chuang5 ch uang5
-shuang1 sh uang1
-shuang2 sh uang2
-shuang3 sh uang3
-shuang4 sh uang4
-shuang5 sh uang5
-weng1 w ung1
-weng2 w ung2
-weng3 w ung3
-weng4 w ung4
-weng5 w ung5
-dong1 d ung1
-dong2 d ung2
-dong3 d ung3
-dong4 d ung4
-dong5 d ung5
-tong1 t ung1
-tong2 t ung2
-tong3 t ung3
-tong4 t ung4
-tong5 t ung5
-nong1 n ung1
-nong2 n ung2
-nong3 n ung3
-nong4 n ung4
-nong5 n ung5
-long1 l ung1
-long2 l ung2
-long3 l ung3
-long4 l ung4
-long5 l ung5
-gong1 g ung1
-gong2 g ung2
-gong3 g ung3
-gong4 g ung4
-gong5 g ung5
-kong1 k ung1
-kong2 k ung2
-kong3 k ung3
-kong4 k ung4
-kong5 k ung5
-hong1 h ung1
-hong2 h ung2
-hong3 h ung3
-hong4 h ung4
-hong5 h ung5
-zhong1 zh ung1
-zhong2 zh ung2
-zhong3 zh ung3
-zhong4 zh ung4
-zhong5 zh ung5
-chong1 ch ung1
-chong2 ch ung2
-chong3 ch ung3
-chong4 ch ung4
-chong5 ch ung5
-rong1 r ung1
-rong2 r ung2
-rong3 r ung3
-rong4 r ung4
-rong5 r ung5
-zong1 z ung1
-zong2 z ung2
-zong3 z ung3
-zong4 z ung4
-zong5 z ung5
-cong1 c ung1
-cong2 c ung2
-cong3 c ung3
-cong4 c ung4
-cong5 c ung5
-song1 s ung1
-song2 s ung2
-song3 s ung3
-song4 s ung4
-song5 s ung5
-yu1 y v1
-yu2 y v2
-yu3 y v3
-yu4 y v4
-yu5 y v5
-nv1 n v1
-nv2 n v2
-nv3 n v3
-nv4 n v4
-nv5 n v5
-lv1 l v1
-lv2 l v2
-lv3 l v3
-lv4 l v4
-lv5 l v5
-ju1 j v1
-ju2 j v2
-ju3 j v3
-ju4 j v4
-ju5 j v5
-qu1 q v1
-qu2 q v2
-qu3 q v3
-qu4 q v4
-qu5 q v5
-xu1 x v1
-xu2 x v2
-xu3 x v3
-xu4 x v4
-xu5 x v5
-yue1 y ve1
-yue2 y ve2
-yue3 y ve3
-yue4 y ve4
-yue5 y ve5
-nue1 n ve1
-nue2 n ve2
-nue3 n ve3
-nue4 n ve4
-nue5 n ve5
-nve1 n ve1
-nve2 n ve2
-nve3 n ve3
-nve4 n ve4
-nve5 n ve5
-lue1 l ve1
-lue2 l ve2
-lue3 l ve3
-lue4 l ve4
-lue5 l ve5
-lve1 l ve1
-lve2 l ve2
-lve3 l ve3
-lve4 l ve4
-lve5 l ve5
-jue1 j ve1
-jue2 j ve2
-jue3 j ve3
-jue4 j ve4
-jue5 j ve5
-que1 q ve1
-que2 q ve2
-que3 q ve3
-que4 q ve4
-que5 q ve5
-xue1 x ve1
-xue2 x ve2
-xue3 x ve3
-xue4 x ve4
-xue5 x ve5
-yuan1 y van1
-yuan2 y van2
-yuan3 y van3
-yuan4 y van4
-yuan5 y van5
-juan1 j van1
-juan2 j van2
-juan3 j van3
-juan4 j van4
-juan5 j van5
-quan1 q van1
-quan2 q van2
-quan3 q van3
-quan4 q van4
-quan5 q van5
-xuan1 x van1
-xuan2 x van2
-xuan3 x van3
-xuan4 x van4
-xuan5 x van5
-yun1 y vn1
-yun2 y vn2
-yun3 y vn3
-yun4 y vn4
-yun5 y vn5
-jun1 j vn1
-jun2 j vn2
-jun3 j vn3
-jun4 j vn4
-jun5 j vn5
-qun1 q vn1
-qun2 q vn2
-qun3 q vn3
-qun4 q vn4
-qun5 q vn5
-xun1 x vn1
-xun2 x vn2
-xun3 x vn3
-xun4 x vn4
-xun5 x vn5
-yong1 y vng1
-yong2 y vng2
-yong3 y vng3
-yong4 y vng4
-yong5 y vng5
-jiong1 j vng1
-jiong2 j vng2
-jiong3 j vng3
-jiong4 j vng4
-jiong5 j vng5
-qiong1 q vng1
-qiong2 q vng2
-qiong3 q vng3
-qiong4 q vng4
-qiong5 q vng5
-xiong1 x vng1
-xiong2 x vng2
-xiong3 x vng3
-xiong4 x vng4
-xiong5 x vng5
-zhir1 zh iii1 &r
-zhir2 zh iii2 &r
-zhir3 zh iii3 &r
-zhir4 zh iii4 &r
-zhir5 zh iii5 &r
-chir1 ch iii1 &r
-chir2 ch iii2 &r
-chir3 ch iii3 &r
-chir4 ch iii4 &r
-chir5 ch iii5 &r
-shir1 sh iii1 &r
-shir2 sh iii2 &r
-shir3 sh iii3 &r
-shir4 sh iii4 &r
-shir5 sh iii5 &r
-rir1 r iii1 &r
-rir2 r iii2 &r
-rir3 r iii3 &r
-rir4 r iii4 &r
-rir5 r iii5 &r
-zir1 z ii1 &r
-zir2 z ii2 &r
-zir3 z ii3 &r
-zir4 z ii4 &r
-zir5 z ii5 &r
-cir1 c ii1 &r
-cir2 c ii2 &r
-cir3 c ii3 &r
-cir4 c ii4 &r
-cir5 c ii5 &r
-sir1 s ii1 &r
-sir2 s ii2 &r
-sir3 s ii3 &r
-sir4 s ii4 &r
-sir5 s ii5 &r
-ar1 a1 &r
-ar2 a2 &r
-ar3 a3 &r
-ar4 a4 &r
-ar5 a5 &r
-bar1 b a1 &r
-bar2 b a2 &r
-bar3 b a3 &r
-bar4 b a4 &r
-bar5 b a5 &r
-par1 p a1 &r
-par2 p a2 &r
-par3 p a3 &r
-par4 p a4 &r
-par5 p a5 &r
-mar1 m a1 &r
-mar2 m a2 &r
-mar3 m a3 &r
-mar4 m a4 &r
-mar5 m a5 &r
-far1 f a1 &r
-far2 f a2 &r
-far3 f a3 &r
-far4 f a4 &r
-far5 f a5 &r
-dar1 d a1 &r
-dar2 d a2 &r
-dar3 d a3 &r
-dar4 d a4 &r
-dar5 d a5 &r
-tar1 t a1 &r
-tar2 t a2 &r
-tar3 t a3 &r
-tar4 t a4 &r
-tar5 t a5 &r
-nar1 n a1 &r
-nar2 n a2 &r
-nar3 n a3 &r
-nar4 n a4 &r
-nar5 n a5 &r
-lar1 l a1 &r
-lar2 l a2 &r
-lar3 l a3 &r
-lar4 l a4 &r
-lar5 l a5 &r
-gar1 g a1 &r
-gar2 g a2 &r
-gar3 g a3 &r
-gar4 g a4 &r
-gar5 g a5 &r
-kar1 k a1 &r
-kar2 k a2 &r
-kar3 k a3 &r
-kar4 k a4 &r
-kar5 k a5 &r
-har1 h a1 &r
-har2 h a2 &r
-har3 h a3 &r
-har4 h a4 &r
-har5 h a5 &r
-zhar1 zh a1 &r
-zhar2 zh a2 &r
-zhar3 zh a3 &r
-zhar4 zh a4 &r
-zhar5 zh a5 &r
-char1 ch a1 &r
-char2 ch a2 &r
-char3 ch a3 &r
-char4 ch a4 &r
-char5 ch a5 &r
-shar1 sh a1 &r
-shar2 sh a2 &r
-shar3 sh a3 &r
-shar4 sh a4 &r
-shar5 sh a5 &r
-zar1 z a1 &r
-zar2 z a2 &r
-zar3 z a3 &r
-zar4 z a4 &r
-zar5 z a5 &r
-car1 c a1 &r
-car2 c a2 &r
-car3 c a3 &r
-car4 c a4 &r
-car5 c a5 &r
-sar1 s a1 &r
-sar2 s a2 &r
-sar3 s a3 &r
-sar4 s a4 &r
-sar5 s a5 &r
-or1 o1 &r
-or2 o2 &r
-or3 o3 &r
-or4 o4 &r
-or5 o5 &r
-bor1 b uo1 &r
-bor2 b uo2 &r
-bor3 b uo3 &r
-bor4 b uo4 &r
-bor5 b uo5 &r
-por1 p uo1 &r
-por2 p uo2 &r
-por3 p uo3 &r
-por4 p uo4 &r
-por5 p uo5 &r
-mor1 m uo1 &r
-mor2 m uo2 &r
-mor3 m uo3 &r
-mor4 m uo4 &r
-mor5 m uo5 &r
-for1 f uo1 &r
-for2 f uo2 &r
-for3 f uo3 &r
-for4 f uo4 &r
-for5 f uo5 &r
-lor1 l o1 &r
-lor2 l o2 &r
-lor3 l o3 &r
-lor4 l o4 &r
-lor5 l o5 &r
-mer1 m e1 &r
-mer2 m e2 &r
-mer3 m e3 &r
-mer4 m e4 &r
-mer5 m e5 &r
-der1 d e1 &r
-der2 d e2 &r
-der3 d e3 &r
-der4 d e4 &r
-der5 d e5 &r
-ter1 t e1 &r
-ter2 t e2 &r
-ter3 t e3 &r
-ter4 t e4 &r
-ter5 t e5 &r
-ner1 n e1 &r
-ner2 n e2 &r
-ner3 n e3 &r
-ner4 n e4 &r
-ner5 n e5 &r
-ler1 l e1 &r
-ler2 l e2 &r
-ler3 l e3 &r
-ler4 l e4 &r
-ler5 l e5 &r
-ger1 g e1 &r
-ger2 g e2 &r
-ger3 g e3 &r
-ger4 g e4 &r
-ger5 g e5 &r
-ker1 k e1 &r
-ker2 k e2 &r
-ker3 k e3 &r
-ker4 k e4 &r
-ker5 k e5 &r
-her1 h e1 &r
-her2 h e2 &r
-her3 h e3 &r
-her4 h e4 &r
-her5 h e5 &r
-zher1 zh e1 &r
-zher2 zh e2 &r
-zher3 zh e3 &r
-zher4 zh e4 &r
-zher5 zh e5 &r
-cher1 ch e1 &r
-cher2 ch e2 &r
-cher3 ch e3 &r
-cher4 ch e4 &r
-cher5 ch e5 &r
-sher1 sh e1 &r
-sher2 sh e2 &r
-sher3 sh e3 &r
-sher4 sh e4 &r
-sher5 sh e5 &r
-rer1 r e1 &r
-rer2 r e2 &r
-rer3 r e3 &r
-rer4 r e4 &r
-rer5 r e5 &r
-zer1 z e1 &r
-zer2 z e2 &r
-zer3 z e3 &r
-zer4 z e4 &r
-zer5 z e5 &r
-cer1 c e1 &r
-cer2 c e2 &r
-cer3 c e3 &r
-cer4 c e4 &r
-cer5 c e5 &r
-ser1 s e1 &r
-ser2 s e2 &r
-ser3 s e3 &r
-ser4 s e4 &r
-ser5 s e5 &r
-air1 ai1 &r
-air2 ai2 &r
-air3 ai3 &r
-air4 ai4 &r
-air5 ai5 &r
-bair1 b ai1 &r
-bair2 b ai2 &r
-bair3 b ai3 &r
-bair4 b ai4 &r
-bair5 b ai5 &r
-pair1 p ai1 &r
-pair2 p ai2 &r
-pair3 p ai3 &r
-pair4 p ai4 &r
-pair5 p ai5 &r
-mair1 m ai1 &r
-mair2 m ai2 &r
-mair3 m ai3 &r
-mair4 m ai4 &r
-mair5 m ai5 &r
-dair1 d ai1 &r
-dair2 d ai2 &r
-dair3 d ai3 &r
-dair4 d ai4 &r
-dair5 d ai5 &r
-tair1 t ai1 &r
-tair2 t ai2 &r
-tair3 t ai3 &r
-tair4 t ai4 &r
-tair5 t ai5 &r
-nair1 n ai1 &r
-nair2 n ai2 &r
-nair3 n ai3 &r
-nair4 n ai4 &r
-nair5 n ai5 &r
-lair1 l ai1 &r
-lair2 l ai2 &r
-lair3 l ai3 &r
-lair4 l ai4 &r
-lair5 l ai5 &r
-gair1 g ai1 &r
-gair2 g ai2 &r
-gair3 g ai3 &r
-gair4 g ai4 &r
-gair5 g ai5 &r
-kair1 k ai1 &r
-kair2 k ai2 &r
-kair3 k ai3 &r
-kair4 k ai4 &r
-kair5 k ai5 &r
-hair1 h ai1 &r
-hair2 h ai2 &r
-hair3 h ai3 &r
-hair4 h ai4 &r
-hair5 h ai5 &r
-zhair1 zh ai1 &r
-zhair2 zh ai2 &r
-zhair3 zh ai3 &r
-zhair4 zh ai4 &r
-zhair5 zh ai5 &r
-chair1 ch ai1 &r
-chair2 ch ai2 &r
-chair3 ch ai3 &r
-chair4 ch ai4 &r
-chair5 ch ai5 &r
-shair1 sh ai1 &r
-shair2 sh ai2 &r
-shair3 sh ai3 &r
-shair4 sh ai4 &r
-shair5 sh ai5 &r
-zair1 z ai1 &r
-zair2 z ai2 &r
-zair3 z ai3 &r
-zair4 z ai4 &r
-zair5 z ai5 &r
-cair1 c ai1 &r
-cair2 c ai2 &r
-cair3 c ai3 &r
-cair4 c ai4 &r
-cair5 c ai5 &r
-sair1 s ai1 &r
-sair2 s ai2 &r
-sair3 s ai3 &r
-sair4 s ai4 &r
-sair5 s ai5 &r
-beir1 b ei1 &r
-beir2 b ei2 &r
-beir3 b ei3 &r
-beir4 b ei4 &r
-beir5 b ei5 &r
-peir1 p ei1 &r
-peir2 p ei2 &r
-peir3 p ei3 &r
-peir4 p ei4 &r
-peir5 p ei5 &r
-meir1 m ei1 &r
-meir2 m ei2 &r
-meir3 m ei3 &r
-meir4 m ei4 &r
-meir5 m ei5 &r
-feir1 f ei1 &r
-feir2 f ei2 &r
-feir3 f ei3 &r
-feir4 f ei4 &r
-feir5 f ei5 &r
-deir1 d ei1 &r
-deir2 d ei2 &r
-deir3 d ei3 &r
-deir4 d ei4 &r
-deir5 d ei5 &r
-teir1 t ei1 &r
-teir2 t ei2 &r
-teir3 t ei3 &r
-teir4 t ei4 &r
-teir5 t ei5 &r
-neir1 n ei1 &r
-neir2 n ei2 &r
-neir3 n ei3 &r
-neir4 n ei4 &r
-neir5 n ei5 &r
-leir1 l ei1 &r
-leir2 l ei2 &r
-leir3 l ei3 &r
-leir4 l ei4 &r
-leir5 l ei5 &r
-geir1 g ei1 &r
-geir2 g ei2 &r
-geir3 g ei3 &r
-geir4 g ei4 &r
-geir5 g ei5 &r
-keir1 k ei1 &r
-keir2 k ei2 &r
-keir3 k ei3 &r
-keir4 k ei4 &r
-keir5 k ei5 &r
-heir1 h ei1 &r
-heir2 h ei2 &r
-heir3 h ei3 &r
-heir4 h ei4 &r
-heir5 h ei5 &r
-zheir1 zh ei1 &r
-zheir2 zh ei2 &r
-zheir3 zh ei3 &r
-zheir4 zh ei4 &r
-zheir5 zh ei5 &r
-sheir1 sh ei1 &r
-sheir2 sh ei2 &r
-sheir3 sh ei3 &r
-sheir4 sh ei4 &r
-sheir5 sh ei5 &r
-zeir1 z ei1 &r
-zeir2 z ei2 &r
-zeir3 z ei3 &r
-zeir4 z ei4 &r
-zeir5 z ei5 &r
-aor1 au1 &r
-aor2 au2 &r
-aor3 au3 &r
-aor4 au4 &r
-aor5 au5 &r
-baor1 b au1 &r
-baor2 b au2 &r
-baor3 b au3 &r
-baor4 b au4 &r
-baor5 b au5 &r
-paor1 p au1 &r
-paor2 p au2 &r
-paor3 p au3 &r
-paor4 p au4 &r
-paor5 p au5 &r
-maor1 m au1 &r
-maor2 m au2 &r
-maor3 m au3 &r
-maor4 m au4 &r
-maor5 m au5 &r
-daor1 d au1 &r
-daor2 d au2 &r
-daor3 d au3 &r
-daor4 d au4 &r
-daor5 d au5 &r
-taor1 t au1 &r
-taor2 t au2 &r
-taor3 t au3 &r
-taor4 t au4 &r
-taor5 t au5 &r
-naor1 n au1 &r
-naor2 n au2 &r
-naor3 n au3 &r
-naor4 n au4 &r
-naor5 n au5 &r
-laor1 l au1 &r
-laor2 l au2 &r
-laor3 l au3 &r
-laor4 l au4 &r
-laor5 l au5 &r
-gaor1 g au1 &r
-gaor2 g au2 &r
-gaor3 g au3 &r
-gaor4 g au4 &r
-gaor5 g au5 &r
-kaor1 k au1 &r
-kaor2 k au2 &r
-kaor3 k au3 &r
-kaor4 k au4 &r
-kaor5 k au5 &r
-haor1 h au1 &r
-haor2 h au2 &r
-haor3 h au3 &r
-haor4 h au4 &r
-haor5 h au5 &r
-zhaor1 zh au1 &r
-zhaor2 zh au2 &r
-zhaor3 zh au3 &r
-zhaor4 zh au4 &r
-zhaor5 zh au5 &r
-chaor1 ch au1 &r
-chaor2 ch au2 &r
-chaor3 ch au3 &r
-chaor4 ch au4 &r
-chaor5 ch au5 &r
-shaor1 sh au1 &r
-shaor2 sh au2 &r
-shaor3 sh au3 &r
-shaor4 sh au4 &r
-shaor5 sh au5 &r
-raor1 r au1 &r
-raor2 r au2 &r
-raor3 r au3 &r
-raor4 r au4 &r
-raor5 r au5 &r
-zaor1 z au1 &r
-zaor2 z au2 &r
-zaor3 z au3 &r
-zaor4 z au4 &r
-zaor5 z au5 &r
-caor1 c au1 &r
-caor2 c au2 &r
-caor3 c au3 &r
-caor4 c au4 &r
-caor5 c au5 &r
-saor1 s au1 &r
-saor2 s au2 &r
-saor3 s au3 &r
-saor4 s au4 &r
-saor5 s au5 &r
-our1 ou1 &r
-our2 ou2 &r
-our3 ou3 &r
-our4 ou4 &r
-our5 ou5 &r
-pour1 p ou1 &r
-pour2 p ou2 &r
-pour3 p ou3 &r
-pour4 p ou4 &r
-pour5 p ou5 &r
-mour1 m ou1 &r
-mour2 m ou2 &r
-mour3 m ou3 &r
-mour4 m ou4 &r
-mour5 m ou5 &r
-four1 f ou1 &r
-four2 f ou2 &r
-four3 f ou3 &r
-four4 f ou4 &r
-four5 f ou5 &r
-dour1 d ou1 &r
-dour2 d ou2 &r
-dour3 d ou3 &r
-dour4 d ou4 &r
-dour5 d ou5 &r
-tour1 t ou1 &r
-tour2 t ou2 &r
-tour3 t ou3 &r
-tour4 t ou4 &r
-tour5 t ou5 &r
-nour1 n ou1 &r
-nour2 n ou2 &r
-nour3 n ou3 &r
-nour4 n ou4 &r
-nour5 n ou5 &r
-lour1 l ou1 &r
-lour2 l ou2 &r
-lour3 l ou3 &r
-lour4 l ou4 &r
-lour5 l ou5 &r
-gour1 g ou1 &r
-gour2 g ou2 &r
-gour3 g ou3 &r
-gour4 g ou4 &r
-gour5 g ou5 &r
-kour1 k ou1 &r
-kour2 k ou2 &r
-kour3 k ou3 &r
-kour4 k ou4 &r
-kour5 k ou5 &r
-hour1 h ou1 &r
-hour2 h ou2 &r
-hour3 h ou3 &r
-hour4 h ou4 &r
-hour5 h ou5 &r
-zhour1 zh ou1 &r
-zhour2 zh ou2 &r
-zhour3 zh ou3 &r
-zhour4 zh ou4 &r
-zhour5 zh ou5 &r
-chour1 ch ou1 &r
-chour2 ch ou2 &r
-chour3 ch ou3 &r
-chour4 ch ou4 &r
-chour5 ch ou5 &r
-shour1 sh ou1 &r
-shour2 sh ou2 &r
-shour3 sh ou3 &r
-shour4 sh ou4 &r
-shour5 sh ou5 &r
-rour1 r ou1 &r
-rour2 r ou2 &r
-rour3 r ou3 &r
-rour4 r ou4 &r
-rour5 r ou5 &r
-zour1 z ou1 &r
-zour2 z ou2 &r
-zour3 z ou3 &r
-zour4 z ou4 &r
-zour5 z ou5 &r
-cour1 c ou1 &r
-cour2 c ou2 &r
-cour3 c ou3 &r
-cour4 c ou4 &r
-cour5 c ou5 &r
-sour1 s ou1 &r
-sour2 s ou2 &r
-sour3 s ou3 &r
-sour4 s ou4 &r
-sour5 s ou5 &r
-anr1 an1 &r
-anr2 an2 &r
-anr3 an3 &r
-anr4 an4 &r
-anr5 an5 &r
-banr1 b an1 &r
-banr2 b an2 &r
-banr3 b an3 &r
-banr4 b an4 &r
-banr5 b an5 &r
-panr1 p an1 &r
-panr2 p an2 &r
-panr3 p an3 &r
-panr4 p an4 &r
-panr5 p an5 &r
-manr1 m an1 &r
-manr2 m an2 &r
-manr3 m an3 &r
-manr4 m an4 &r
-manr5 m an5 &r
-fanr1 f an1 &r
-fanr2 f an2 &r
-fanr3 f an3 &r
-fanr4 f an4 &r
-fanr5 f an5 &r
-danr1 d an1 &r
-danr2 d an2 &r
-danr3 d an3 &r
-danr4 d an4 &r
-danr5 d an5 &r
-tanr1 t an1 &r
-tanr2 t an2 &r
-tanr3 t an3 &r
-tanr4 t an4 &r
-tanr5 t an5 &r
-nanr1 n an1 &r
-nanr2 n an2 &r
-nanr3 n an3 &r
-nanr4 n an4 &r
-nanr5 n an5 &r
-lanr1 l an1 &r
-lanr2 l an2 &r
-lanr3 l an3 &r
-lanr4 l an4 &r
-lanr5 l an5 &r
-ganr1 g an1 &r
-ganr2 g an2 &r
-ganr3 g an3 &r
-ganr4 g an4 &r
-ganr5 g an5 &r
-kanr1 k an1 &r
-kanr2 k an2 &r
-kanr3 k an3 &r
-kanr4 k an4 &r
-kanr5 k an5 &r
-hanr1 h an1 &r
-hanr2 h an2 &r
-hanr3 h an3 &r
-hanr4 h an4 &r
-hanr5 h an5 &r
-zhanr1 zh an1 &r
-zhanr2 zh an2 &r
-zhanr3 zh an3 &r
-zhanr4 zh an4 &r
-zhanr5 zh an5 &r
-chanr1 ch an1 &r
-chanr2 ch an2 &r
-chanr3 ch an3 &r
-chanr4 ch an4 &r
-chanr5 ch an5 &r
-shanr1 sh an1 &r
-shanr2 sh an2 &r
-shanr3 sh an3 &r
-shanr4 sh an4 &r
-shanr5 sh an5 &r
-ranr1 r an1 &r
-ranr2 r an2 &r
-ranr3 r an3 &r
-ranr4 r an4 &r
-ranr5 r an5 &r
-zanr1 z an1 &r
-zanr2 z an2 &r
-zanr3 z an3 &r
-zanr4 z an4 &r
-zanr5 z an5 &r
-canr1 c an1 &r
-canr2 c an2 &r
-canr3 c an3 &r
-canr4 c an4 &r
-canr5 c an5 &r
-sanr1 s an1 &r
-sanr2 s an2 &r
-sanr3 s an3 &r
-sanr4 s an4 &r
-sanr5 s an5 &r
-benr1 b en1 &r
-benr2 b en2 &r
-benr3 b en3 &r
-benr4 b en4 &r
-benr5 b en5 &r
-penr1 p en1 &r
-penr2 p en2 &r
-penr3 p en3 &r
-penr4 p en4 &r
-penr5 p en5 &r
-menr1 m en1 &r
-menr2 m en2 &r
-menr3 m en3 &r
-menr4 m en4 &r
-menr5 m en5 &r
-fenr1 f en1 &r
-fenr2 f en2 &r
-fenr3 f en3 &r
-fenr4 f en4 &r
-fenr5 f en5 &r
-denr1 d en1 &r
-denr2 d en2 &r
-denr3 d en3 &r
-denr4 d en4 &r
-denr5 d en5 &r
-nenr1 n en1 &r
-nenr2 n en2 &r
-nenr3 n en3 &r
-nenr4 n en4 &r
-nenr5 n en5 &r
-genr1 g en1 &r
-genr2 g en2 &r
-genr3 g en3 &r
-genr4 g en4 &r
-genr5 g en5 &r
-kenr1 k en1 &r
-kenr2 k en2 &r
-kenr3 k en3 &r
-kenr4 k en4 &r
-kenr5 k en5 &r
-henr1 h en1 &r
-henr2 h en2 &r
-henr3 h en3 &r
-henr4 h en4 &r
-henr5 h en5 &r
-zhenr1 zh en1 &r
-zhenr2 zh en2 &r
-zhenr3 zh en3 &r
-zhenr4 zh en4 &r
-zhenr5 zh en5 &r
-chenr1 ch en1 &r
-chenr2 ch en2 &r
-chenr3 ch en3 &r
-chenr4 ch en4 &r
-chenr5 ch en5 &r
-shenr1 sh en1 &r
-shenr2 sh en2 &r
-shenr3 sh en3 &r
-shenr4 sh en4 &r
-shenr5 sh en5 &r
-renr1 r en1 &r
-renr2 r en2 &r
-renr3 r en3 &r
-renr4 r en4 &r
-renr5 r en5 &r
-zenr1 z en1 &r
-zenr2 z en2 &r
-zenr3 z en3 &r
-zenr4 z en4 &r
-zenr5 z en5 &r
-cenr1 c en1 &r
-cenr2 c en2 &r
-cenr3 c en3 &r
-cenr4 c en4 &r
-cenr5 c en5 &r
-senr1 s en1 &r
-senr2 s en2 &r
-senr3 s en3 &r
-senr4 s en4 &r
-senr5 s en5 &r
-angr1 ang1 &r
-angr2 ang2 &r
-angr3 ang3 &r
-angr4 ang4 &r
-angr5 ang5 &r
-bangr1 b ang1 &r
-bangr2 b ang2 &r
-bangr3 b ang3 &r
-bangr4 b ang4 &r
-bangr5 b ang5 &r
-pangr1 p ang1 &r
-pangr2 p ang2 &r
-pangr3 p ang3 &r
-pangr4 p ang4 &r
-pangr5 p ang5 &r
-mangr1 m ang1 &r
-mangr2 m ang2 &r
-mangr3 m ang3 &r
-mangr4 m ang4 &r
-mangr5 m ang5 &r
-fangr1 f ang1 &r
-fangr2 f ang2 &r
-fangr3 f ang3 &r
-fangr4 f ang4 &r
-fangr5 f ang5 &r
-dangr1 d ang1 &r
-dangr2 d ang2 &r
-dangr3 d ang3 &r
-dangr4 d ang4 &r
-dangr5 d ang5 &r
-tangr1 t ang1 &r
-tangr2 t ang2 &r
-tangr3 t ang3 &r
-tangr4 t ang4 &r
-tangr5 t ang5 &r
-nangr1 n ang1 &r
-nangr2 n ang2 &r
-nangr3 n ang3 &r
-nangr4 n ang4 &r
-nangr5 n ang5 &r
-langr1 l ang1 &r
-langr2 l ang2 &r
-langr3 l ang3 &r
-langr4 l ang4 &r
-langr5 l ang5 &r
-gangr1 g ang1 &r
-gangr2 g ang2 &r
-gangr3 g ang3 &r
-gangr4 g ang4 &r
-gangr5 g ang5 &r
-kangr1 k ang1 &r
-kangr2 k ang2 &r
-kangr3 k ang3 &r
-kangr4 k ang4 &r
-kangr5 k ang5 &r
-hangr1 h ang1 &r
-hangr2 h ang2 &r
-hangr3 h ang3 &r
-hangr4 h ang4 &r
-hangr5 h ang5 &r
-zhangr1 zh ang1 &r
-zhangr2 zh ang2 &r
-zhangr3 zh ang3 &r
-zhangr4 zh ang4 &r
-zhangr5 zh ang5 &r
-changr1 ch ang1 &r
-changr2 ch ang2 &r
-changr3 ch ang3 &r
-changr4 ch ang4 &r
-changr5 ch ang5 &r
-shangr1 sh ang1 &r
-shangr2 sh ang2 &r
-shangr3 sh ang3 &r
-shangr4 sh ang4 &r
-shangr5 sh ang5 &r
-rangr1 r ang1 &r
-rangr2 r ang2 &r
-rangr3 r ang3 &r
-rangr4 r ang4 &r
-rangr5 r ang5 &r
-zangr1 z ang1 &r
-zangr2 z ang2 &r
-zangr3 z ang3 &r
-zangr4 z ang4 &r
-zangr5 z ang5 &r
-cangr1 c ang1 &r
-cangr2 c ang2 &r
-cangr3 c ang3 &r
-cangr4 c ang4 &r
-cangr5 c ang5 &r
-sangr1 s ang1 &r
-sangr2 s ang2 &r
-sangr3 s ang3 &r
-sangr4 s ang4 &r
-sangr5 s ang5 &r
-bengr1 b eng1 &r
-bengr2 b eng2 &r
-bengr3 b eng3 &r
-bengr4 b eng4 &r
-bengr5 b eng5 &r
-pengr1 p eng1 &r
-pengr2 p eng2 &r
-pengr3 p eng3 &r
-pengr4 p eng4 &r
-pengr5 p eng5 &r
-mengr1 m eng1 &r
-mengr2 m eng2 &r
-mengr3 m eng3 &r
-mengr4 m eng4 &r
-mengr5 m eng5 &r
-fengr1 f eng1 &r
-fengr2 f eng2 &r
-fengr3 f eng3 &r
-fengr4 f eng4 &r
-fengr5 f eng5 &r
-dengr1 d eng1 &r
-dengr2 d eng2 &r
-dengr3 d eng3 &r
-dengr4 d eng4 &r
-dengr5 d eng5 &r
-tengr1 t eng1 &r
-tengr2 t eng2 &r
-tengr3 t eng3 &r
-tengr4 t eng4 &r
-tengr5 t eng5 &r
-nengr1 n eng1 &r
-nengr2 n eng2 &r
-nengr3 n eng3 &r
-nengr4 n eng4 &r
-nengr5 n eng5 &r
-lengr1 l eng1 &r
-lengr2 l eng2 &r
-lengr3 l eng3 &r
-lengr4 l eng4 &r
-lengr5 l eng5 &r
-gengr1 g eng1 &r
-gengr2 g eng2 &r
-gengr3 g eng3 &r
-gengr4 g eng4 &r
-gengr5 g eng5 &r
-kengr1 k eng1 &r
-kengr2 k eng2 &r
-kengr3 k eng3 &r
-kengr4 k eng4 &r
-kengr5 k eng5 &r
-hengr1 h eng1 &r
-hengr2 h eng2 &r
-hengr3 h eng3 &r
-hengr4 h eng4 &r
-hengr5 h eng5 &r
-zhengr1 zh eng1 &r
-zhengr2 zh eng2 &r
-zhengr3 zh eng3 &r
-zhengr4 zh eng4 &r
-zhengr5 zh eng5 &r
-chengr1 ch eng1 &r
-chengr2 ch eng2 &r
-chengr3 ch eng3 &r
-chengr4 ch eng4 &r
-chengr5 ch eng5 &r
-shengr1 sh eng1 &r
-shengr2 sh eng2 &r
-shengr3 sh eng3 &r
-shengr4 sh eng4 &r
-shengr5 sh eng5 &r
-rengr1 r eng1 &r
-rengr2 r eng2 &r
-rengr3 r eng3 &r
-rengr4 r eng4 &r
-rengr5 r eng5 &r
-zengr1 z eng1 &r
-zengr2 z eng2 &r
-zengr3 z eng3 &r
-zengr4 z eng4 &r
-zengr5 z eng5 &r
-cengr1 c eng1 &r
-cengr2 c eng2 &r
-cengr3 c eng3 &r
-cengr4 c eng4 &r
-cengr5 c eng5 &r
-sengr1 s eng1 &r
-sengr2 s eng2 &r
-sengr3 s eng3 &r
-sengr4 s eng4 &r
-sengr5 s eng5 &r
-yir1 y i1 &r
-yir2 y i2 &r
-yir3 y i3 &r
-yir4 y i4 &r
-yir5 y i5 &r
-bir1 b i1 &r
-bir2 b i2 &r
-bir3 b i3 &r
-bir4 b i4 &r
-bir5 b i5 &r
-pir1 p i1 &r
-pir2 p i2 &r
-pir3 p i3 &r
-pir4 p i4 &r
-pir5 p i5 &r
-mir1 m i1 &r
-mir2 m i2 &r
-mir3 m i3 &r
-mir4 m i4 &r
-mir5 m i5 &r
-dir1 d i1 &r
-dir2 d i2 &r
-dir3 d i3 &r
-dir4 d i4 &r
-dir5 d i5 &r
-tir1 t i1 &r
-tir2 t i2 &r
-tir3 t i3 &r
-tir4 t i4 &r
-tir5 t i5 &r
-nir1 n i1 &r
-nir2 n i2 &r
-nir3 n i3 &r
-nir4 n i4 &r
-nir5 n i5 &r
-lir1 l i1 &r
-lir2 l i2 &r
-lir3 l i3 &r
-lir4 l i4 &r
-lir5 l i5 &r
-jir1 j i1 &r
-jir2 j i2 &r
-jir3 j i3 &r
-jir4 j i4 &r
-jir5 j i5 &r
-qir1 q i1 &r
-qir2 q i2 &r
-qir3 q i3 &r
-qir4 q i4 &r
-qir5 q i5 &r
-xir1 x i1 &r
-xir2 x i2 &r
-xir3 x i3 &r
-xir4 x i4 &r
-xir5 x i5 &r
-yar1 y ia1 &r
-yar2 y ia2 &r
-yar3 y ia3 &r
-yar4 y ia4 &r
-yar5 y ia5 &r
-diar1 d ia1 &r
-diar2 d ia2 &r
-diar3 d ia3 &r
-diar4 d ia4 &r
-diar5 d ia5 &r
-liar1 l ia1 &r
-liar2 l ia2 &r
-liar3 l ia3 &r
-liar4 l ia4 &r
-liar5 l ia5 &r
-jiar1 j ia1 &r
-jiar2 j ia2 &r
-jiar3 j ia3 &r
-jiar4 j ia4 &r
-jiar5 j ia5 &r
-qiar1 q ia1 &r
-qiar2 q ia2 &r
-qiar3 q ia3 &r
-qiar4 q ia4 &r
-qiar5 q ia5 &r
-xiar1 x ia1 &r
-xiar2 x ia2 &r
-xiar3 x ia3 &r
-xiar4 x ia4 &r
-xiar5 x ia5 &r
-yor1 y io1 &r
-yor2 y io2 &r
-yor3 y io3 &r
-yor4 y io4 &r
-yor5 y io5 &r
-yer1 y ie1 &r
-yer2 y ie2 &r
-yer3 y ie3 &r
-yer4 y ie4 &r
-yer5 y ie5 &r
-bier1 b ie1 &r
-bier2 b ie2 &r
-bier3 b ie3 &r
-bier4 b ie4 &r
-bier5 b ie5 &r
-pier1 p ie1 &r
-pier2 p ie2 &r
-pier3 p ie3 &r
-pier4 p ie4 &r
-pier5 p ie5 &r
-mier1 m ie1 &r
-mier2 m ie2 &r
-mier3 m ie3 &r
-mier4 m ie4 &r
-mier5 m ie5 &r
-dier1 d ie1 &r
-dier2 d ie2 &r
-dier3 d ie3 &r
-dier4 d ie4 &r
-dier5 d ie5 &r
-tier1 t ie1 &r
-tier2 t ie2 &r
-tier3 t ie3 &r
-tier4 t ie4 &r
-tier5 t ie5 &r
-nier1 n ie1 &r
-nier2 n ie2 &r
-nier3 n ie3 &r
-nier4 n ie4 &r
-nier5 n ie5 &r
-lier1 l ie1 &r
-lier2 l ie2 &r
-lier3 l ie3 &r
-lier4 l ie4 &r
-lier5 l ie5 &r
-jier1 j ie1 &r
-jier2 j ie2 &r
-jier3 j ie3 &r
-jier4 j ie4 &r
-jier5 j ie5 &r
-qier1 q ie1 &r
-qier2 q ie2 &r
-qier3 q ie3 &r
-qier4 q ie4 &r
-qier5 q ie5 &r
-xier1 x ie1 &r
-xier2 x ie2 &r
-xier3 x ie3 &r
-xier4 x ie4 &r
-xier5 x ie5 &r
-yair1 y ai1 &r
-yair2 y ai2 &r
-yair3 y ai3 &r
-yair4 y ai4 &r
-yair5 y ai5 &r
-yaor1 y au1 &r
-yaor2 y au2 &r
-yaor3 y au3 &r
-yaor4 y au4 &r
-yaor5 y au5 &r
-biaor1 b iau1 &r
-biaor2 b iau2 &r
-biaor3 b iau3 &r
-biaor4 b iau4 &r
-biaor5 b iau5 &r
-piaor1 p iau1 &r
-piaor2 p iau2 &r
-piaor3 p iau3 &r
-piaor4 p iau4 &r
-piaor5 p iau5 &r
-miaor1 m iau1 &r
-miaor2 m iau2 &r
-miaor3 m iau3 &r
-miaor4 m iau4 &r
-miaor5 m iau5 &r
-fiaor1 f iau1 &r
-fiaor2 f iau2 &r
-fiaor3 f iau3 &r
-fiaor4 f iau4 &r
-fiaor5 f iau5 &r
-diaor1 d iau1 &r
-diaor2 d iau2 &r
-diaor3 d iau3 &r
-diaor4 d iau4 &r
-diaor5 d iau5 &r
-tiaor1 t iau1 &r
-tiaor2 t iau2 &r
-tiaor3 t iau3 &r
-tiaor4 t iau4 &r
-tiaor5 t iau5 &r
-niaor1 n iau1 &r
-niaor2 n iau2 &r
-niaor3 n iau3 &r
-niaor4 n iau4 &r
-niaor5 n iau5 &r
-liaor1 l iau1 &r
-liaor2 l iau2 &r
-liaor3 l iau3 &r
-liaor4 l iau4 &r
-liaor5 l iau5 &r
-jiaor1 j iau1 &r
-jiaor2 j iau2 &r
-jiaor3 j iau3 &r
-jiaor4 j iau4 &r
-jiaor5 j iau5 &r
-qiaor1 q iau1 &r
-qiaor2 q iau2 &r
-qiaor3 q iau3 &r
-qiaor4 q iau4 &r
-qiaor5 q iau5 &r
-xiaor1 x iau1 &r
-xiaor2 x iau2 &r
-xiaor3 x iau3 &r
-xiaor4 x iau4 &r
-xiaor5 x iau5 &r
-your1 y iou1 &r
-your2 y iou2 &r
-your3 y iou3 &r
-your4 y iou4 &r
-your5 y iou5 &r
-miur1 m iou1 &r
-miur2 m iou2 &r
-miur3 m iou3 &r
-miur4 m iou4 &r
-miur5 m iou5 &r
-diur1 d iou1 &r
-diur2 d iou2 &r
-diur3 d iou3 &r
-diur4 d iou4 &r
-diur5 d iou5 &r
-niur1 n iou1 &r
-niur2 n iou2 &r
-niur3 n iou3 &r
-niur4 n iou4 &r
-niur5 n iou5 &r
-liur1 l iou1 &r
-liur2 l iou2 &r
-liur3 l iou3 &r
-liur4 l iou4 &r
-liur5 l iou5 &r
-jiur1 j iou1 &r
-jiur2 j iou2 &r
-jiur3 j iou3 &r
-jiur4 j iou4 &r
-jiur5 j iou5 &r
-qiur1 q iou1 &r
-qiur2 q iou2 &r
-qiur3 q iou3 &r
-qiur4 q iou4 &r
-qiur5 q iou5 &r
-xiur1 x iou1 &r
-xiur2 x iou2 &r
-xiur3 x iou3 &r
-xiur4 x iou4 &r
-xiur5 x iou5 &r
-yanr1 y ian1 &r
-yanr2 y ian2 &r
-yanr3 y ian3 &r
-yanr4 y ian4 &r
-yanr5 y ian5 &r
-bianr1 b ian1 &r
-bianr2 b ian2 &r
-bianr3 b ian3 &r
-bianr4 b ian4 &r
-bianr5 b ian5 &r
-pianr1 p ian1 &r
-pianr2 p ian2 &r
-pianr3 p ian3 &r
-pianr4 p ian4 &r
-pianr5 p ian5 &r
-mianr1 m ian1 &r
-mianr2 m ian2 &r
-mianr3 m ian3 &r
-mianr4 m ian4 &r
-mianr5 m ian5 &r
-dianr1 d ian1 &r
-dianr2 d ian2 &r
-dianr3 d ian3 &r
-dianr4 d ian4 &r
-dianr5 d ian5 &r
-tianr1 t ian1 &r
-tianr2 t ian2 &r
-tianr3 t ian3 &r
-tianr4 t ian4 &r
-tianr5 t ian5 &r
-nianr1 n ian1 &r
-nianr2 n ian2 &r
-nianr3 n ian3 &r
-nianr4 n ian4 &r
-nianr5 n ian5 &r
-lianr1 l ian1 &r
-lianr2 l ian2 &r
-lianr3 l ian3 &r
-lianr4 l ian4 &r
-lianr5 l ian5 &r
-jianr1 j ian1 &r
-jianr2 j ian2 &r
-jianr3 j ian3 &r
-jianr4 j ian4 &r
-jianr5 j ian5 &r
-qianr1 q ian1 &r
-qianr2 q ian2 &r
-qianr3 q ian3 &r
-qianr4 q ian4 &r
-qianr5 q ian5 &r
-xianr1 x ian1 &r
-xianr2 x ian2 &r
-xianr3 x ian3 &r
-xianr4 x ian4 &r
-xianr5 x ian5 &r
-yinr1 y in1 &r
-yinr2 y in2 &r
-yinr3 y in3 &r
-yinr4 y in4 &r
-yinr5 y in5 &r
-binr1 b in1 &r
-binr2 b in2 &r
-binr3 b in3 &r
-binr4 b in4 &r
-binr5 b in5 &r
-pinr1 p in1 &r
-pinr2 p in2 &r
-pinr3 p in3 &r
-pinr4 p in4 &r
-pinr5 p in5 &r
-minr1 m in1 &r
-minr2 m in2 &r
-minr3 m in3 &r
-minr4 m in4 &r
-minr5 m in5 &r
-dinr1 d in1 &r
-dinr2 d in2 &r
-dinr3 d in3 &r
-dinr4 d in4 &r
-dinr5 d in5 &r
-ninr1 n in1 &r
-ninr2 n in2 &r
-ninr3 n in3 &r
-ninr4 n in4 &r
-ninr5 n in5 &r
-linr1 l in1 &r
-linr2 l in2 &r
-linr3 l in3 &r
-linr4 l in4 &r
-linr5 l in5 &r
-jinr1 j in1 &r
-jinr2 j in2 &r
-jinr3 j in3 &r
-jinr4 j in4 &r
-jinr5 j in5 &r
-qinr1 q in1 &r
-qinr2 q in2 &r
-qinr3 q in3 &r
-qinr4 q in4 &r
-qinr5 q in5 &r
-xinr1 x in1 &r
-xinr2 x in2 &r
-xinr3 x in3 &r
-xinr4 x in4 &r
-xinr5 x in5 &r
-yangr1 y iang1 &r
-yangr2 y iang2 &r
-yangr3 y iang3 &r
-yangr4 y iang4 &r
-yangr5 y iang5 &r
-biangr1 b iang1 &r
-biangr2 b iang2 &r
-biangr3 b iang3 &r
-biangr4 b iang4 &r
-biangr5 b iang5 &r
-niangr1 n iang1 &r
-niangr2 n iang2 &r
-niangr3 n iang3 &r
-niangr4 n iang4 &r
-niangr5 n iang5 &r
-liangr1 l iang1 &r
-liangr2 l iang2 &r
-liangr3 l iang3 &r
-liangr4 l iang4 &r
-liangr5 l iang5 &r
-jiangr1 j iang1 &r
-jiangr2 j iang2 &r
-jiangr3 j iang3 &r
-jiangr4 j iang4 &r
-jiangr5 j iang5 &r
-qiangr1 q iang1 &r
-qiangr2 q iang2 &r
-qiangr3 q iang3 &r
-qiangr4 q iang4 &r
-qiangr5 q iang5 &r
-xiangr1 x iang1 &r
-xiangr2 x iang2 &r
-xiangr3 x iang3 &r
-xiangr4 x iang4 &r
-xiangr5 x iang5 &r
-yingr1 y ing1 &r
-yingr2 y ing2 &r
-yingr3 y ing3 &r
-yingr4 y ing4 &r
-yingr5 y ing5 &r
-bingr1 b ing1 &r
-bingr2 b ing2 &r
-bingr3 b ing3 &r
-bingr4 b ing4 &r
-bingr5 b ing5 &r
-pingr1 p ing1 &r
-pingr2 p ing2 &r
-pingr3 p ing3 &r
-pingr4 p ing4 &r
-pingr5 p ing5 &r
-mingr1 m ing1 &r
-mingr2 m ing2 &r
-mingr3 m ing3 &r
-mingr4 m ing4 &r
-mingr5 m ing5 &r
-dingr1 d ing1 &r
-dingr2 d ing2 &r
-dingr3 d ing3 &r
-dingr4 d ing4 &r
-dingr5 d ing5 &r
-tingr1 t ing1 &r
-tingr2 t ing2 &r
-tingr3 t ing3 &r
-tingr4 t ing4 &r
-tingr5 t ing5 &r
-ningr1 n ing1 &r
-ningr2 n ing2 &r
-ningr3 n ing3 &r
-ningr4 n ing4 &r
-ningr5 n ing5 &r
-lingr1 l ing1 &r
-lingr2 l ing2 &r
-lingr3 l ing3 &r
-lingr4 l ing4 &r
-lingr5 l ing5 &r
-jingr1 j ing1 &r
-jingr2 j ing2 &r
-jingr3 j ing3 &r
-jingr4 j ing4 &r
-jingr5 j ing5 &r
-qingr1 q ing1 &r
-qingr2 q ing2 &r
-qingr3 q ing3 &r
-qingr4 q ing4 &r
-qingr5 q ing5 &r
-xingr1 x ing1 &r
-xingr2 x ing2 &r
-xingr3 x ing3 &r
-xingr4 x ing4 &r
-xingr5 x ing5 &r
-wur1 w u1 &r
-wur2 w u2 &r
-wur3 w u3 &r
-wur4 w u4 &r
-wur5 w u5 &r
-bur1 b u1 &r
-bur2 b u2 &r
-bur3 b u3 &r
-bur4 b u4 &r
-bur5 b u5 &r
-pur1 p u1 &r
-pur2 p u2 &r
-pur3 p u3 &r
-pur4 p u4 &r
-pur5 p u5 &r
-mur1 m u1 &r
-mur2 m u2 &r
-mur3 m u3 &r
-mur4 m u4 &r
-mur5 m u5 &r
-fur1 f u1 &r
-fur2 f u2 &r
-fur3 f u3 &r
-fur4 f u4 &r
-fur5 f u5 &r
-dur1 d u1 &r
-dur2 d u2 &r
-dur3 d u3 &r
-dur4 d u4 &r
-dur5 d u5 &r
-tur1 t u1 &r
-tur2 t u2 &r
-tur3 t u3 &r
-tur4 t u4 &r
-tur5 t u5 &r
-nur1 n u1 &r
-nur2 n u2 &r
-nur3 n u3 &r
-nur4 n u4 &r
-nur5 n u5 &r
-lur1 l u1 &r
-lur2 l u2 &r
-lur3 l u3 &r
-lur4 l u4 &r
-lur5 l u5 &r
-gur1 g u1 &r
-gur2 g u2 &r
-gur3 g u3 &r
-gur4 g u4 &r
-gur5 g u5 &r
-kur1 k u1 &r
-kur2 k u2 &r
-kur3 k u3 &r
-kur4 k u4 &r
-kur5 k u5 &r
-hur1 h u1 &r
-hur2 h u2 &r
-hur3 h u3 &r
-hur4 h u4 &r
-hur5 h u5 &r
-zhur1 zh u1 &r
-zhur2 zh u2 &r
-zhur3 zh u3 &r
-zhur4 zh u4 &r
-zhur5 zh u5 &r
-chur1 ch u1 &r
-chur2 ch u2 &r
-chur3 ch u3 &r
-chur4 ch u4 &r
-chur5 ch u5 &r
-shur1 sh u1 &r
-shur2 sh u2 &r
-shur3 sh u3 &r
-shur4 sh u4 &r
-shur5 sh u5 &r
-rur1 r u1 &r
-rur2 r u2 &r
-rur3 r u3 &r
-rur4 r u4 &r
-rur5 r u5 &r
-zur1 z u1 &r
-zur2 z u2 &r
-zur3 z u3 &r
-zur4 z u4 &r
-zur5 z u5 &r
-cur1 c u1 &r
-cur2 c u2 &r
-cur3 c u3 &r
-cur4 c u4 &r
-cur5 c u5 &r
-sur1 s u1 &r
-sur2 s u2 &r
-sur3 s u3 &r
-sur4 s u4 &r
-sur5 s u5 &r
-war1 w ua1 &r
-war2 w ua2 &r
-war3 w ua3 &r
-war4 w ua4 &r
-war5 w ua5 &r
-guar1 g ua1 &r
-guar2 g ua2 &r
-guar3 g ua3 &r
-guar4 g ua4 &r
-guar5 g ua5 &r
-kuar1 k ua1 &r
-kuar2 k ua2 &r
-kuar3 k ua3 &r
-kuar4 k ua4 &r
-kuar5 k ua5 &r
-huar1 h ua1 &r
-huar2 h ua2 &r
-huar3 h ua3 &r
-huar4 h ua4 &r
-huar5 h ua5 &r
-zhuar1 zh ua1 &r
-zhuar2 zh ua2 &r
-zhuar3 zh ua3 &r
-zhuar4 zh ua4 &r
-zhuar5 zh ua5 &r
-chuar1 ch ua1 &r
-chuar2 ch ua2 &r
-chuar3 ch ua3 &r
-chuar4 ch ua4 &r
-chuar5 ch ua5 &r
-shuar1 sh ua1 &r
-shuar2 sh ua2 &r
-shuar3 sh ua3 &r
-shuar4 sh ua4 &r
-shuar5 sh ua5 &r
-wor1 w uo1 &r
-wor2 w uo2 &r
-wor3 w uo3 &r
-wor4 w uo4 &r
-wor5 w uo5 &r
-duor1 d uo1 &r
-duor2 d uo2 &r
-duor3 d uo3 &r
-duor4 d uo4 &r
-duor5 d uo5 &r
-tuor1 t uo1 &r
-tuor2 t uo2 &r
-tuor3 t uo3 &r
-tuor4 t uo4 &r
-tuor5 t uo5 &r
-nuor1 n uo1 &r
-nuor2 n uo2 &r
-nuor3 n uo3 &r
-nuor4 n uo4 &r
-nuor5 n uo5 &r
-luor1 l uo1 &r
-luor2 l uo2 &r
-luor3 l uo3 &r
-luor4 l uo4 &r
-luor5 l uo5 &r
-guor1 g uo1 &r
-guor2 g uo2 &r
-guor3 g uo3 &r
-guor4 g uo4 &r
-guor5 g uo5 &r
-kuor1 k uo1 &r
-kuor2 k uo2 &r
-kuor3 k uo3 &r
-kuor4 k uo4 &r
-kuor5 k uo5 &r
-huor1 h uo1 &r
-huor2 h uo2 &r
-huor3 h uo3 &r
-huor4 h uo4 &r
-huor5 h uo5 &r
-zhuor1 zh uo1 &r
-zhuor2 zh uo2 &r
-zhuor3 zh uo3 &r
-zhuor4 zh uo4 &r
-zhuor5 zh uo5 &r
-chuor1 ch uo1 &r
-chuor2 ch uo2 &r
-chuor3 ch uo3 &r
-chuor4 ch uo4 &r
-chuor5 ch uo5 &r
-shuor1 sh uo1 &r
-shuor2 sh uo2 &r
-shuor3 sh uo3 &r
-shuor4 sh uo4 &r
-shuor5 sh uo5 &r
-ruor1 r uo1 &r
-ruor2 r uo2 &r
-ruor3 r uo3 &r
-ruor4 r uo4 &r
-ruor5 r uo5 &r
-zuor1 z uo1 &r
-zuor2 z uo2 &r
-zuor3 z uo3 &r
-zuor4 z uo4 &r
-zuor5 z uo5 &r
-cuor1 c uo1 &r
-cuor2 c uo2 &r
-cuor3 c uo3 &r
-cuor4 c uo4 &r
-cuor5 c uo5 &r
-suor1 s uo1 &r
-suor2 s uo2 &r
-suor3 s uo3 &r
-suor4 s uo4 &r
-suor5 s uo5 &r
-wair1 w uai1 &r
-wair2 w uai2 &r
-wair3 w uai3 &r
-wair4 w uai4 &r
-wair5 w uai5 &r
-guair1 g uai1 &r
-guair2 g uai2 &r
-guair3 g uai3 &r
-guair4 g uai4 &r
-guair5 g uai5 &r
-kuair1 k uai1 &r
-kuair2 k uai2 &r
-kuair3 k uai3 &r
-kuair4 k uai4 &r
-kuair5 k uai5 &r
-huair1 h uai1 &r
-huair2 h uai2 &r
-huair3 h uai3 &r
-huair4 h uai4 &r
-huair5 h uai5 &r
-zhuair1 zh uai1 &r
-zhuair2 zh uai2 &r
-zhuair3 zh uai3 &r
-zhuair4 zh uai4 &r
-zhuair5 zh uai5 &r
-chuair1 ch uai1 &r
-chuair2 ch uai2 &r
-chuair3 ch uai3 &r
-chuair4 ch uai4 &r
-chuair5 ch uai5 &r
-shuair1 sh uai1 &r
-shuair2 sh uai2 &r
-shuair3 sh uai3 &r
-shuair4 sh uai4 &r
-shuair5 sh uai5 &r
-weir1 w uei1 &r
-weir2 w uei2 &r
-weir3 w uei3 &r
-weir4 w uei4 &r
-weir5 w uei5 &r
-duir1 d uei1 &r
-duir2 d uei2 &r
-duir3 d uei3 &r
-duir4 d uei4 &r
-duir5 d uei5 &r
-tuir1 t uei1 &r
-tuir2 t uei2 &r
-tuir3 t uei3 &r
-tuir4 t uei4 &r
-tuir5 t uei5 &r
-guir1 g uei1 &r
-guir2 g uei2 &r
-guir3 g uei3 &r
-guir4 g uei4 &r
-guir5 g uei5 &r
-kuir1 k uei1 &r
-kuir2 k uei2 &r
-kuir3 k uei3 &r
-kuir4 k uei4 &r
-kuir5 k uei5 &r
-huir1 h uei1 &r
-huir2 h uei2 &r
-huir3 h uei3 &r
-huir4 h uei4 &r
-huir5 h uei5 &r
-zhuir1 zh uei1 &r
-zhuir2 zh uei2 &r
-zhuir3 zh uei3 &r
-zhuir4 zh uei4 &r
-zhuir5 zh uei5 &r
-chuir1 ch uei1 &r
-chuir2 ch uei2 &r
-chuir3 ch uei3 &r
-chuir4 ch uei4 &r
-chuir5 ch uei5 &r
-shuir1 sh uei1 &r
-shuir2 sh uei2 &r
-shuir3 sh uei3 &r
-shuir4 sh uei4 &r
-shuir5 sh uei5 &r
-ruir1 r uei1 &r
-ruir2 r uei2 &r
-ruir3 r uei3 &r
-ruir4 r uei4 &r
-ruir5 r uei5 &r
-zuir1 z uei1 &r
-zuir2 z uei2 &r
-zuir3 z uei3 &r
-zuir4 z uei4 &r
-zuir5 z uei5 &r
-cuir1 c uei1 &r
-cuir2 c uei2 &r
-cuir3 c uei3 &r
-cuir4 c uei4 &r
-cuir5 c uei5 &r
-suir1 s uei1 &r
-suir2 s uei2 &r
-suir3 s uei3 &r
-suir4 s uei4 &r
-suir5 s uei5 &r
-wanr1 w uan1 &r
-wanr2 w uan2 &r
-wanr3 w uan3 &r
-wanr4 w uan4 &r
-wanr5 w uan5 &r
-duanr1 d uan1 &r
-duanr2 d uan2 &r
-duanr3 d uan3 &r
-duanr4 d uan4 &r
-duanr5 d uan5 &r
-tuanr1 t uan1 &r
-tuanr2 t uan2 &r
-tuanr3 t uan3 &r
-tuanr4 t uan4 &r
-tuanr5 t uan5 &r
-nuanr1 n uan1 &r
-nuanr2 n uan2 &r
-nuanr3 n uan3 &r
-nuanr4 n uan4 &r
-nuanr5 n uan5 &r
-luanr1 l uan1 &r
-luanr2 l uan2 &r
-luanr3 l uan3 &r
-luanr4 l uan4 &r
-luanr5 l uan5 &r
-guanr1 g uan1 &r
-guanr2 g uan2 &r
-guanr3 g uan3 &r
-guanr4 g uan4 &r
-guanr5 g uan5 &r
-kuanr1 k uan1 &r
-kuanr2 k uan2 &r
-kuanr3 k uan3 &r
-kuanr4 k uan4 &r
-kuanr5 k uan5 &r
-huanr1 h uan1 &r
-huanr2 h uan2 &r
-huanr3 h uan3 &r
-huanr4 h uan4 &r
-huanr5 h uan5 &r
-zhuanr1 zh uan1 &r
-zhuanr2 zh uan2 &r
-zhuanr3 zh uan3 &r
-zhuanr4 zh uan4 &r
-zhuanr5 zh uan5 &r
-chuanr1 ch uan1 &r
-chuanr2 ch uan2 &r
-chuanr3 ch uan3 &r
-chuanr4 ch uan4 &r
-chuanr5 ch uan5 &r
-shuanr1 sh uan1 &r
-shuanr2 sh uan2 &r
-shuanr3 sh uan3 &r
-shuanr4 sh uan4 &r
-shuanr5 sh uan5 &r
-ruanr1 r uan1 &r
-ruanr2 r uan2 &r
-ruanr3 r uan3 &r
-ruanr4 r uan4 &r
-ruanr5 r uan5 &r
-zuanr1 z uan1 &r
-zuanr2 z uan2 &r
-zuanr3 z uan3 &r
-zuanr4 z uan4 &r
-zuanr5 z uan5 &r
-cuanr1 c uan1 &r
-cuanr2 c uan2 &r
-cuanr3 c uan3 &r
-cuanr4 c uan4 &r
-cuanr5 c uan5 &r
-suanr1 s uan1 &r
-suanr2 s uan2 &r
-suanr3 s uan3 &r
-suanr4 s uan4 &r
-suanr5 s uan5 &r
-wenr1 w uen1 &r
-wenr2 w uen2 &r
-wenr3 w uen3 &r
-wenr4 w uen4 &r
-wenr5 w uen5 &r
-dunr1 d uen1 &r
-dunr2 d uen2 &r
-dunr3 d uen3 &r
-dunr4 d uen4 &r
-dunr5 d uen5 &r
-tunr1 t uen1 &r
-tunr2 t uen2 &r
-tunr3 t uen3 &r
-tunr4 t uen4 &r
-tunr5 t uen5 &r
-nunr1 n uen1 &r
-nunr2 n uen2 &r
-nunr3 n uen3 &r
-nunr4 n uen4 &r
-nunr5 n uen5 &r
-lunr1 l uen1 &r
-lunr2 l uen2 &r
-lunr3 l uen3 &r
-lunr4 l uen4 &r
-lunr5 l uen5 &r
-gunr1 g uen1 &r
-gunr2 g uen2 &r
-gunr3 g uen3 &r
-gunr4 g uen4 &r
-gunr5 g uen5 &r
-kunr1 k uen1 &r
-kunr2 k uen2 &r
-kunr3 k uen3 &r
-kunr4 k uen4 &r
-kunr5 k uen5 &r
-hunr1 h uen1 &r
-hunr2 h uen2 &r
-hunr3 h uen3 &r
-hunr4 h uen4 &r
-hunr5 h uen5 &r
-zhunr1 zh uen1 &r
-zhunr2 zh uen2 &r
-zhunr3 zh uen3 &r
-zhunr4 zh uen4 &r
-zhunr5 zh uen5 &r
-chunr1 ch uen1 &r
-chunr2 ch uen2 &r
-chunr3 ch uen3 &r
-chunr4 ch uen4 &r
-chunr5 ch uen5 &r
-shunr1 sh uen1 &r
-shunr2 sh uen2 &r
-shunr3 sh uen3 &r
-shunr4 sh uen4 &r
-shunr5 sh uen5 &r
-runr1 r uen1 &r
-runr2 r uen2 &r
-runr3 r uen3 &r
-runr4 r uen4 &r
-runr5 r uen5 &r
-zunr1 z uen1 &r
-zunr2 z uen2 &r
-zunr3 z uen3 &r
-zunr4 z uen4 &r
-zunr5 z uen5 &r
-cunr1 c uen1 &r
-cunr2 c uen2 &r
-cunr3 c uen3 &r
-cunr4 c uen4 &r
-cunr5 c uen5 &r
-sunr1 s uen1 &r
-sunr2 s uen2 &r
-sunr3 s uen3 &r
-sunr4 s uen4 &r
-sunr5 s uen5 &r
-wangr1 w uang1 &r
-wangr2 w uang2 &r
-wangr3 w uang3 &r
-wangr4 w uang4 &r
-wangr5 w uang5 &r
-guangr1 g uang1 &r
-guangr2 g uang2 &r
-guangr3 g uang3 &r
-guangr4 g uang4 &r
-guangr5 g uang5 &r
-kuangr1 k uang1 &r
-kuangr2 k uang2 &r
-kuangr3 k uang3 &r
-kuangr4 k uang4 &r
-kuangr5 k uang5 &r
-huangr1 h uang1 &r
-huangr2 h uang2 &r
-huangr3 h uang3 &r
-huangr4 h uang4 &r
-huangr5 h uang5 &r
-zhuangr1 zh uang1 &r
-zhuangr2 zh uang2 &r
-zhuangr3 zh uang3 &r
-zhuangr4 zh uang4 &r
-zhuangr5 zh uang5 &r
-chuangr1 ch uang1 &r
-chuangr2 ch uang2 &r
-chuangr3 ch uang3 &r
-chuangr4 ch uang4 &r
-chuangr5 ch uang5 &r
-shuangr1 sh uang1 &r
-shuangr2 sh uang2 &r
-shuangr3 sh uang3 &r
-shuangr4 sh uang4 &r
-shuangr5 sh uang5 &r
-wengr1 w ung1 &r
-wengr2 w ung2 &r
-wengr3 w ung3 &r
-wengr4 w ung4 &r
-wengr5 w ung5 &r
-dongr1 d ung1 &r
-dongr2 d ung2 &r
-dongr3 d ung3 &r
-dongr4 d ung4 &r
-dongr5 d ung5 &r
-tongr1 t ung1 &r
-tongr2 t ung2 &r
-tongr3 t ung3 &r
-tongr4 t ung4 &r
-tongr5 t ung5 &r
-nongr1 n ung1 &r
-nongr2 n ung2 &r
-nongr3 n ung3 &r
-nongr4 n ung4 &r
-nongr5 n ung5 &r
-longr1 l ung1 &r
-longr2 l ung2 &r
-longr3 l ung3 &r
-longr4 l ung4 &r
-longr5 l ung5 &r
-gongr1 g ung1 &r
-gongr2 g ung2 &r
-gongr3 g ung3 &r
-gongr4 g ung4 &r
-gongr5 g ung5 &r
-kongr1 k ung1 &r
-kongr2 k ung2 &r
-kongr3 k ung3 &r
-kongr4 k ung4 &r
-kongr5 k ung5 &r
-hongr1 h ung1 &r
-hongr2 h ung2 &r
-hongr3 h ung3 &r
-hongr4 h ung4 &r
-hongr5 h ung5 &r
-zhongr1 zh ung1 &r
-zhongr2 zh ung2 &r
-zhongr3 zh ung3 &r
-zhongr4 zh ung4 &r
-zhongr5 zh ung5 &r
-chongr1 ch ung1 &r
-chongr2 ch ung2 &r
-chongr3 ch ung3 &r
-chongr4 ch ung4 &r
-chongr5 ch ung5 &r
-rongr1 r ung1 &r
-rongr2 r ung2 &r
-rongr3 r ung3 &r
-rongr4 r ung4 &r
-rongr5 r ung5 &r
-zongr1 z ung1 &r
-zongr2 z ung2 &r
-zongr3 z ung3 &r
-zongr4 z ung4 &r
-zongr5 z ung5 &r
-congr1 c ung1 &r
-congr2 c ung2 &r
-congr3 c ung3 &r
-congr4 c ung4 &r
-congr5 c ung5 &r
-songr1 s ung1 &r
-songr2 s ung2 &r
-songr3 s ung3 &r
-songr4 s ung4 &r
-songr5 s ung5 &r
-yur1 y v1 &r
-yur2 y v2 &r
-yur3 y v3 &r
-yur4 y v4 &r
-yur5 y v5 &r
-nvr1 n v1 &r
-nvr2 n v2 &r
-nvr3 n v3 &r
-nvr4 n v4 &r
-nvr5 n v5 &r
-lvr1 l v1 &r
-lvr2 l v2 &r
-lvr3 l v3 &r
-lvr4 l v4 &r
-lvr5 l v5 &r
-jur1 j v1 &r
-jur2 j v2 &r
-jur3 j v3 &r
-jur4 j v4 &r
-jur5 j v5 &r
-qur1 q v1 &r
-qur2 q v2 &r
-qur3 q v3 &r
-qur4 q v4 &r
-qur5 q v5 &r
-xur1 x v1 &r
-xur2 x v2 &r
-xur3 x v3 &r
-xur4 x v4 &r
-xur5 x v5 &r
-yuer1 y ve1 &r
-yuer2 y ve2 &r
-yuer3 y ve3 &r
-yuer4 y ve4 &r
-yuer5 y ve5 &r
-nuer1 n ve1 &r
-nuer2 n ve2 &r
-nuer3 n ve3 &r
-nuer4 n ve4 &r
-nuer5 n ve5 &r
-nver1 n ve1 &r
-nver2 n ve2 &r
-nver3 n ve3 &r
-nver4 n ve4 &r
-nver5 n ve5 &r
-luer1 l ve1 &r
-luer2 l ve2 &r
-luer3 l ve3 &r
-luer4 l ve4 &r
-luer5 l ve5 &r
-lver1 l ve1 &r
-lver2 l ve2 &r
-lver3 l ve3 &r
-lver4 l ve4 &r
-lver5 l ve5 &r
-juer1 j ve1 &r
-juer2 j ve2 &r
-juer3 j ve3 &r
-juer4 j ve4 &r
-juer5 j ve5 &r
-quer1 q ve1 &r
-quer2 q ve2 &r
-quer3 q ve3 &r
-quer4 q ve4 &r
-quer5 q ve5 &r
-xuer1 x ve1 &r
-xuer2 x ve2 &r
-xuer3 x ve3 &r
-xuer4 x ve4 &r
-xuer5 x ve5 &r
-yuanr1 y van1 &r
-yuanr2 y van2 &r
-yuanr3 y van3 &r
-yuanr4 y van4 &r
-yuanr5 y van5 &r
-juanr1 j van1 &r
-juanr2 j van2 &r
-juanr3 j van3 &r
-juanr4 j van4 &r
-juanr5 j van5 &r
-quanr1 q van1 &r
-quanr2 q van2 &r
-quanr3 q van3 &r
-quanr4 q van4 &r
-quanr5 q van5 &r
-xuanr1 x van1 &r
-xuanr2 x van2 &r
-xuanr3 x van3 &r
-xuanr4 x van4 &r
-xuanr5 x van5 &r
-yunr1 y vn1 &r
-yunr2 y vn2 &r
-yunr3 y vn3 &r
-yunr4 y vn4 &r
-yunr5 y vn5 &r
-junr1 j vn1 &r
-junr2 j vn2 &r
-junr3 j vn3 &r
-junr4 j vn4 &r
-junr5 j vn5 &r
-qunr1 q vn1 &r
-qunr2 q vn2 &r
-qunr3 q vn3 &r
-qunr4 q vn4 &r
-qunr5 q vn5 &r
-xunr1 x vn1 &r
-xunr2 x vn2 &r
-xunr3 x vn3 &r
-xunr4 x vn4 &r
-xunr5 x vn5 &r
-yongr1 y vng1 &r
-yongr2 y vng2 &r
-yongr3 y vng3 &r
-yongr4 y vng4 &r
-yongr5 y vng5 &r
-jiongr1 j vng1 &r
-jiongr2 j vng2 &r
-jiongr3 j vng3 &r
-jiongr4 j vng4 &r
-jiongr5 j vng5 &r
-qiongr1 q vng1 &r
-qiongr2 q vng2 &r
-qiongr3 q vng3 &r
-qiongr4 q vng4 &r
-qiongr5 q vng5 &r
-xiongr1 x vng1 &r
-xiongr2 x vng2 &r
-xiongr3 x vng3 &r
-xiongr4 x vng4 &r
-xiongr5 x vng5 &r
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
deleted file mode 100644
index ce117d42..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import pickle
-import re
-from pathlib import Path
-
-import tqdm
-import yaml
-
-zh_pattern = re.compile("[\u4e00-\u9fa5]")
-
-_tones = {'', '', ' ', '0', '1', '2', '3', '4', '5'}
-
-_pauses = {'%', '$'}
-
-_initials = {
- 'b',
- 'p',
- 'm',
- 'f',
- 'd',
- 't',
- 'n',
- 'l',
- 'g',
- 'k',
- 'h',
- 'j',
- 'q',
- 'x',
- 'zh',
- 'ch',
- 'sh',
- 'r',
- 'z',
- 'c',
- 's',
-}
-
-_finals = {
- 'ii',
- 'iii',
- 'a',
- 'o',
- 'e',
- 'ea',
- 'ai',
- 'ei',
- 'ao',
- 'ou',
- 'an',
- 'en',
- 'ang',
- 'eng',
- 'er',
- 'i',
- 'ia',
- 'io',
- 'ie',
- 'iai',
- 'iao',
- 'iou',
- 'ian',
- 'ien',
- 'iang',
- 'ieng',
- 'u',
- 'ua',
- 'uo',
- 'uai',
- 'uei',
- 'uan',
- 'uen',
- 'uang',
- 'ueng',
- 'v',
- 've',
- 'van',
- 'ven',
- 'veng',
-}
-
-_ernized_symbol = {'&r'}
-
-_specials = {'', '', '', ' '}
-
-_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
-
-
-def is_zh(word):
- global zh_pattern
- match = zh_pattern.search(word)
- return match is not None
-
-
-def ernized(syllable):
- return syllable[:2] != "er" and syllable[-2] == 'r'
-
-
-def convert(syllable):
- # expansion of o -> uo
- syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
- # syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
- # expansion for iong, ong
- syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
-
- # expansion for ing, in
- syllable = syllable.replace("ing", "ieng").replace("in", "ien")
-
- # expansion for un, ui, iu
- syllable = syllable.replace("un", "uen").replace("ui",
- "uei").replace("iu", "iou")
-
- # rule for variants of i
- syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
- .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
- .replace("ri", "riii")
-
- # rule for y preceding i, u
- syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
-
- # rule for w
- syllable = syllable.replace("wu", "u").replace("w", "u")
-
- # rule for v following j, q, x
- syllable = syllable.replace("ju", "jv").replace("qu",
- "qv").replace("xu", "xv")
-
- return syllable
-
-
-def split_syllable(syllable: str):
- """Split a syllable in pinyin into a list of phones and a list of tones.
- Initials have no tone, represented by '0', while finals have tones from
- '1,2,3,4,5'.
-
- e.g.
-
- zhang -> ['zh', 'ang'], ['0', '1']
- """
- if syllable in _pauses:
- # syllable, tone
- return [syllable], ['0']
-
- tone = syllable[-1]
- syllable = convert(syllable[:-1])
-
- phones = []
- tones = []
-
- global _initials
- if syllable[:2] in _initials:
- phones.append(syllable[:2])
- tones.append('0')
- phones.append(syllable[2:])
- tones.append(tone)
- elif syllable[0] in _initials:
- phones.append(syllable[0])
- tones.append('0')
- phones.append(syllable[1:])
- tones.append(tone)
- else:
- phones.append(syllable)
- tones.append(tone)
- return phones, tones
-
-
-def load_aishell3_transcription(line: str):
- sentence_id, pinyin, text = line.strip().split("|")
- syllables = pinyin.strip().split()
-
- results = []
-
- for syllable in syllables:
- if syllable in _pauses:
- results.append(syllable)
- elif not ernized(syllable):
- results.append(syllable)
- else:
- results.append(syllable[:-2] + syllable[-1])
- results.append('&r5')
-
- phones = []
- tones = []
- for syllable in results:
- p, t = split_syllable(syllable)
- phones.extend(p)
- tones.extend(t)
- for p in phones:
- assert p in _phones, p
- return {
- "sentence_id": sentence_id,
- "text": text,
- "syllables": results,
- "phones": phones,
- "tones": tones
- }
-
-
-def process_aishell3(dataset_root, output_dir):
- dataset_root = Path(dataset_root).expanduser()
- output_dir = Path(output_dir).expanduser()
- output_dir.mkdir(parents=True, exist_ok=True)
-
- prosody_label_path = dataset_root / "label_train-set.txt"
- with open(prosody_label_path, 'rt') as f:
- lines = [line.strip() for line in f]
-
- records = lines[5:]
-
- processed_records = []
- for record in tqdm.tqdm(records):
- new_record = load_aishell3_transcription(record)
- processed_records.append(new_record)
- print(new_record)
-
- with open(output_dir / "metadata.pickle", 'wb') as f:
- pickle.dump(processed_records, f)
-
- with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
- yaml.safe_dump(
- processed_records, f, default_flow_style=None, allow_unicode=True)
-
- print("metadata done!")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
- )
- parser.add_argument(
- "--input",
- type=str,
- default="~/datasets/aishell3/train",
- help="path of the training dataset,(contains a label_train-set.txt).")
- parser.add_argument(
- "--output",
- type=str,
- help="the directory to save the processed transcription."
- "If not provided, it would be the same as the input.")
- args = parser.parse_args()
- if args.output is None:
- args.output = args.input
-
- process_aishell3(args.input, args.output)
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py
deleted file mode 100644
index 56d8e4c3..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/process_wav.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-from functools import partial
-from multiprocessing import Pool
-from pathlib import Path
-
-import librosa
-import numpy as np
-import soundfile as sf
-from praatio import textgrid
-from tqdm import tqdm
-
-
-def get_valid_part(fpath):
- f = textgrid.openTextgrid(fpath, includeEmptyIntervals=True)
-
- start = 0
- phone_entry_list = f.tierDict['phones'].entryList
- first_entry = phone_entry_list[0]
- if first_entry.label == "sil":
- start = first_entry.end
-
- last_entry = phone_entry_list[-1]
- if last_entry.label == "sp":
- end = last_entry.start
- else:
- end = last_entry.end
- return start, end
-
-
-def process_utterance(fpath, source_dir, target_dir, alignment_dir):
- rel_path = fpath.relative_to(source_dir)
- opath = target_dir / rel_path
- apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
- opath.parent.mkdir(parents=True, exist_ok=True)
-
- start, end = get_valid_part(apath)
- wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
- normalized_wav = wav / np.max(wav) * 0.999
- sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
- # print(f"{fpath} => {opath}")
-
-
-def preprocess_aishell3(source_dir, target_dir, alignment_dir):
- source_dir = Path(source_dir).expanduser()
- target_dir = Path(target_dir).expanduser()
- alignment_dir = Path(alignment_dir).expanduser()
-
- wav_paths = list(source_dir.rglob("*.wav"))
- print(f"there are {len(wav_paths)} audio files in total")
- fx = partial(
- process_utterance,
- source_dir=source_dir,
- target_dir=target_dir,
- alignment_dir=alignment_dir)
- with Pool(16) as p:
- list(
- tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="Process audio in AiShell3, trim silence according to the alignment "
- "files generated by MFA, and normalize volume by peak.")
- parser.add_argument(
- "--input",
- type=str,
- default="~/datasets/aishell3/train/wav",
- help="path of the original audio folder in aishell3.")
- parser.add_argument(
- "--output",
- type=str,
- default="~/datasets/aishell3/train/normalized_wav",
- help="path of the folder to save the processed audio files.")
- parser.add_argument(
- "--alignment",
- type=str,
- default="~/datasets/aishell3/train/alignment",
- help="path of the alignment files.")
- args = parser.parse_args()
-
- preprocess_aishell3(args.input, args.output, args.alignment)
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
deleted file mode 100644
index ea5f12da..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import time
-from collections import defaultdict
-from pathlib import Path
-
-import numpy as np
-import paddle
-from matplotlib import pyplot as plt
-from paddle import distributed as dist
-from paddle.io import DataLoader
-from paddle.io import DistributedBatchSampler
-
-from paddlespeech.t2s.data import dataset
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
-from paddlespeech.t2s.models.tacotron2 import Tacotron2
-from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
-from paddlespeech.t2s.training.cli import default_argument_parser
-from paddlespeech.t2s.training.experiment import ExperimentBase
-from paddlespeech.t2s.utils import display
-from paddlespeech.t2s.utils import mp_tools
-
-
-class Experiment(ExperimentBase):
- def compute_losses(self, inputs, outputs):
- texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
-
- mel_outputs = outputs["mel_output"]
- mel_outputs_postnet = outputs["mel_outputs_postnet"]
- alignments = outputs["alignments"]
-
- losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
- alignments, output_lens, text_lens)
- return losses
-
- def train_batch(self):
- start = time.time()
- batch = self.read_batch()
- data_loader_time = time.time() - start
-
- self.optimizer.clear_grad()
- self.model.train()
- texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
- outputs = self.model(
- texts,
- text_lens,
- mels,
- output_lens,
- tones=tones,
- global_condition=utterance_embeds)
- losses = self.compute_losses(batch, outputs)
- loss = losses["loss"]
- loss.backward()
- self.optimizer.step()
- iteration_time = time.time() - start
-
- losses_np = {k: float(v) for k, v in losses.items()}
- # logging
- msg = "Rank: {}, ".format(dist.get_rank())
- msg += "step: {}, ".format(self.iteration)
- msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
- iteration_time)
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in losses_np.items())
- self.logger.info(msg)
-
- if dist.get_rank() == 0:
- for key, value in losses_np.items():
- self.visualizer.add_scalar(f"train_loss/{key}", value,
- self.iteration)
-
- @mp_tools.rank_zero_only
- @paddle.no_grad()
- def valid(self):
- valid_losses = defaultdict(list)
- for i, batch in enumerate(self.valid_loader):
- texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
- outputs = self.model(
- texts,
- text_lens,
- mels,
- output_lens,
- tones=tones,
- global_condition=utterance_embeds)
- losses = self.compute_losses(batch, outputs)
- for key, value in losses.items():
- valid_losses[key].append(float(value))
-
- attention_weights = outputs["alignments"]
- self.visualizer.add_figure(
- f"valid_sentence_{i}_alignments",
- display.plot_alignment(attention_weights[0].numpy().T),
- self.iteration)
- self.visualizer.add_figure(
- f"valid_sentence_{i}_target_spectrogram",
- display.plot_spectrogram(mels[0].numpy().T), self.iteration)
- mel_pred = outputs['mel_outputs_postnet']
- self.visualizer.add_figure(
- f"valid_sentence_{i}_predicted_spectrogram",
- display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
-
- # write visual log
- valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
-
- # logging
- msg = "Valid: "
- msg += "step: {}, ".format(self.iteration)
- msg += ', '.join('{}: {:>.6f}'.format(k, v)
- for k, v in valid_losses.items())
- self.logger.info(msg)
-
- for key, value in valid_losses.items():
- self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
-
- @mp_tools.rank_zero_only
- @paddle.no_grad()
- def eval(self):
- """Evaluation of Tacotron2 in autoregressive manner."""
- self.model.eval()
- mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
- mel_dir.mkdir(parents=True, exist_ok=True)
- for i, batch in enumerate(self.test_loader):
- texts, tones, mels, utterance_embeds, *_ = batch
- outputs = self.model.infer(
- texts, tones=tones, global_condition=utterance_embeds)
-
- display.plot_alignment(outputs["alignments"][0].numpy().T)
- plt.savefig(mel_dir / f"sentence_{i}.png")
- plt.close()
- np.save(mel_dir / f"sentence_{i}",
- outputs["mel_outputs_postnet"][0].numpy().T)
- print(f"sentence_{i}")
-
- def setup_model(self):
- config = self.config
- model = Tacotron2(
- vocab_size=config.model.vocab_size,
- n_tones=config.model.n_tones,
- d_mels=config.data.d_mels,
- d_encoder=config.model.d_encoder,
- encoder_conv_layers=config.model.encoder_conv_layers,
- encoder_kernel_size=config.model.encoder_kernel_size,
- d_prenet=config.model.d_prenet,
- d_attention_rnn=config.model.d_attention_rnn,
- d_decoder_rnn=config.model.d_decoder_rnn,
- attention_filters=config.model.attention_filters,
- attention_kernel_size=config.model.attention_kernel_size,
- d_attention=config.model.d_attention,
- d_postnet=config.model.d_postnet,
- postnet_kernel_size=config.model.postnet_kernel_size,
- postnet_conv_layers=config.model.postnet_conv_layers,
- reduction_factor=config.model.reduction_factor,
- p_encoder_dropout=config.model.p_encoder_dropout,
- p_prenet_dropout=config.model.p_prenet_dropout,
- p_attention_dropout=config.model.p_attention_dropout,
- p_decoder_dropout=config.model.p_decoder_dropout,
- p_postnet_dropout=config.model.p_postnet_dropout,
- d_global_condition=config.model.d_global_condition,
- use_stop_token=config.model.use_stop_token, )
-
- if self.parallel:
- model = paddle.DataParallel(model)
-
- grad_clip = paddle.nn.ClipGradByGlobalNorm(
- config.training.grad_clip_thresh)
- optimizer = paddle.optimizer.Adam(
- learning_rate=config.training.lr,
- parameters=model.parameters(),
- weight_decay=paddle.regularizer.L2Decay(
- config.training.weight_decay),
- grad_clip=grad_clip)
- criterion = Tacotron2Loss(
- use_stop_token_loss=config.model.use_stop_token,
- use_guided_attention_loss=config.model.use_guided_attention_loss,
- sigma=config.model.guided_attention_loss_sigma)
- self.model = model
- self.optimizer = optimizer
- self.criterion = criterion
-
- def setup_dataloader(self):
- args = self.args
- config = self.config
- aishell3_dataset = AiShell3(args.data)
-
- valid_set, train_set = dataset.split(aishell3_dataset,
- config.data.valid_size)
- batch_fn = collate_aishell3_examples
-
- if not self.parallel:
- self.train_loader = DataLoader(
- train_set,
- batch_size=config.data.batch_size,
- shuffle=True,
- drop_last=True,
- collate_fn=batch_fn)
- else:
- sampler = DistributedBatchSampler(
- train_set,
- batch_size=config.data.batch_size,
- shuffle=True,
- drop_last=True)
- self.train_loader = DataLoader(
- train_set, batch_sampler=sampler, collate_fn=batch_fn)
-
- self.valid_loader = DataLoader(
- valid_set,
- batch_size=config.data.batch_size,
- shuffle=False,
- drop_last=False,
- collate_fn=batch_fn)
-
- self.test_loader = DataLoader(
- valid_set,
- batch_size=1,
- shuffle=False,
- drop_last=False,
- collate_fn=batch_fn)
-
-
-def main_sp(config, args):
- exp = Experiment(config, args)
- exp.setup()
- exp.resume_or_load()
- if not args.test:
- exp.run()
- else:
- exp.eval()
-
-
-def main(config, args):
- if args.ngpu > 1:
- dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
- else:
- main_sp(config, args)
-
-
-if __name__ == "__main__":
- config = get_cfg_defaults()
- parser = default_argument_parser()
- parser.add_argument("--test", action="store_true")
- args = parser.parse_args()
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
- print(config)
- print(args)
-
- main(config, args)
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
deleted file mode 100644
index 4e6b8d36..00000000
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-from pathlib import Path
-
-import numpy as np
-import paddle
-import soundfile as sf
-from matplotlib import pyplot as plt
-
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
-from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
-from paddlespeech.t2s.models.tacotron2 import Tacotron2
-from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
-from paddlespeech.t2s.utils import display
-from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
-from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
-
-
-def voice_cloning(args):
- # speaker encoder
- p = SpeakerVerificationPreprocessor(
- sampling_rate=16000,
- audio_norm_target_dBFS=-30,
- vad_window_length=30,
- vad_moving_average_width=8,
- vad_max_silence_length=6,
- mel_window_length=25,
- mel_window_step=10,
- n_mels=40,
- partial_n_frames=160,
- min_pad_coverage=0.75,
- partial_overlap_ratio=0.5)
- print("Audio Processor Done!")
-
- speaker_encoder = LSTMSpeakerEncoder(
- n_mels=40, num_layers=3, hidden_size=256, output_size=256)
- speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
- speaker_encoder.eval()
- print("GE2E Done!")
-
- synthesizer = Tacotron2(
- vocab_size=68,
- n_tones=10,
- d_mels=80,
- d_encoder=512,
- encoder_conv_layers=3,
- encoder_kernel_size=5,
- d_prenet=256,
- d_attention_rnn=1024,
- d_decoder_rnn=1024,
- attention_filters=32,
- attention_kernel_size=31,
- d_attention=128,
- d_postnet=512,
- postnet_kernel_size=5,
- postnet_conv_layers=5,
- reduction_factor=1,
- p_encoder_dropout=0.5,
- p_prenet_dropout=0.5,
- p_attention_dropout=0.1,
- p_decoder_dropout=0.1,
- p_postnet_dropout=0.5,
- d_global_condition=256,
- use_stop_token=False, )
- synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
- synthesizer.eval()
- print("Tacotron2 Done!")
-
- # vocoder
- vocoder = ConditionalWaveFlow(
- upsample_factors=[16, 16],
- n_flows=8,
- n_layers=8,
- n_group=16,
- channels=128,
- n_mels=80,
- kernel_size=[3, 3])
- vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
- vocoder.eval()
- print("WaveFlow Done!")
-
- output_dir = Path(args.output_dir)
- output_dir.mkdir(parents=True, exist_ok=True)
-
- input_dir = Path(args.input_dir)
-
- # 因为 AISHELL-3 数据集中使用 % 和 $ 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 % 和 $ 来调节韵律。
- # 值得的注意的是,句子的有效字符集仅包含汉字和 %, $, 因此输入的句子只能包含这些字符。
- sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
- phones, tones = convert_sentence(sentence)
- phones = np.array(
- [voc_phones.lookup(item) for item in phones], dtype=np.int64)
- tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
- phones = paddle.to_tensor(phones).unsqueeze(0)
- tones = paddle.to_tensor(tones).unsqueeze(0)
-
- for name in os.listdir(input_dir):
- utt_id = name.split(".")[0]
- ref_audio_path = input_dir / name
- mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
- print("mel_sequences: ", mel_sequences.shape)
- with paddle.no_grad():
- embed = speaker_encoder.embed_utterance(
- paddle.to_tensor(mel_sequences))
- print("embed shape: ", embed.shape)
- utterance_embeds = paddle.unsqueeze(embed, 0)
- outputs = synthesizer.infer(
- phones, tones=tones, global_condition=utterance_embeds)
- mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
- alignment = outputs["alignments"][0].numpy().T
- display.plot_alignment(alignment)
- plt.savefig(str(output_dir / (utt_id + ".png")))
-
- with paddle.no_grad():
- wav = vocoder.infer(mel_input)
- wav = wav.numpy()[0]
- sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)
-
-
-def main():
- # parse args and config and redirect to train_sp
- parser = argparse.ArgumentParser(description="")
- parser.add_argument(
- "--ge2e_params_path", type=str, help="ge2e params path.")
- parser.add_argument(
- "--tacotron2_params_path", type=str, help="tacotron2 params path.")
- parser.add_argument(
- "--waveflow_params_path", type=str, help="waveflow params path.")
-
- parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
-
- parser.add_argument(
- "--input-dir",
- type=str,
- help="input dir of *.wav, the sample rate will be resample to 16k.")
- parser.add_argument("--output-dir", type=str, help="output dir.")
-
- args = parser.parse_args()
-
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
- paddle.set_device("gpu")
- else:
- print("ngpu should >= 0 !")
-
- voice_cloning(args)
-
-
-if __name__ == "__main__":
- main()
diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py
index 655b63da..a6efa9ec 100644
--- a/paddlespeech/t2s/exps/waveflow/ljspeech.py
+++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py
@@ -17,8 +17,8 @@ import numpy as np
import pandas
from paddle.io import Dataset
-from paddlespeech.t2s.data.batch import batch_spec
-from paddlespeech.t2s.data.batch import batch_wav
+from paddlespeech.t2s.datasets.batch import batch_spec
+from paddlespeech.t2s.datasets.batch import batch_wav
class LJSpeech(Dataset):
diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py
index d500336a..cf03f5ef 100644
--- a/paddlespeech/t2s/exps/waveflow/train.py
+++ b/paddlespeech/t2s/exps/waveflow/train.py
@@ -19,7 +19,7 @@ from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
-from paddlespeech.t2s.data import dataset
+from paddlespeech.t2s.datasets import dataset
from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults
from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeech
from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeechClipCollector
diff --git a/paddlespeech/t2s/exps/voice_cloning/__init__.py b/paddlespeech/t2s/exps/wavernn/__init__.py
similarity index 100%
rename from paddlespeech/t2s/exps/voice_cloning/__init__.py
rename to paddlespeech/t2s/exps/wavernn/__init__.py
diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py
new file mode 100644
index 00000000..4357b282
--- /dev/null
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from paddle import distributed as dist
+from timer import timer
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.wavernn import WaveRNN
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")
+
+ parser.add_argument("--config", type=str, help="Vocoder config file.")
+ parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
+ parser.add_argument("--test-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+ args = parser.parse_args()
+
+ with open(args.config) as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+ print("ngpu should >= 0 !")
+
+ model = WaveRNN(
+ hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
+ state_dict = paddle.load(args.checkpoint)
+ model.set_state_dict(state_dict["main_params"])
+
+ model.eval()
+
+ with jsonlines.open(args.test_metadata, 'r') as reader:
+ metadata = list(reader)
+ test_dataset = DataTable(
+ metadata,
+ fields=['utt_id', 'feats'],
+ converters={
+ 'utt_id': None,
+ 'feats': np.load,
+ })
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ N = 0
+ T = 0
+ for example in test_dataset:
+ utt_id = example['utt_id']
+ mel = example['feats']
+ mel = paddle.to_tensor(mel) # (T, C)
+ with timer() as t:
+ with paddle.no_grad():
+ wav = model.generate(
+ c=mel,
+ batched=config.inference.gen_batched,
+ target=config.inference.target,
+ overlap=config.inference.overlap,
+ mu_law=config.mu_law,
+ gen_display=True)
+ wav = wav.numpy()
+ N += wav.size
+ T += t.elapse
+ speed = wav.size / t.elapse
+ rtf = config.fs / speed
+ print(
+ f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+ sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
+ print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
new file mode 100644
index 00000000..8661d311
--- /dev/null
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -0,0 +1,212 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import yaml
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from paddle.optimizer import Adam
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip
+from paddlespeech.t2s.models.wavernn import WaveRNN
+from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator
+from paddlespeech.t2s.models.wavernn import WaveRNNUpdater
+from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ world_size = paddle.distributed.get_world_size()
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device("gpu")
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = DataTable(
+ data=train_metadata,
+ fields=["wave", "feats"],
+ converters={
+ "wave": np.load,
+ "feats": np.load,
+ }, )
+
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = DataTable(
+ data=dev_metadata,
+ fields=["wave", "feats"],
+ converters={
+ "wave": np.load,
+ "feats": np.load,
+ }, )
+
+ batch_fn = WaveRNNClip(
+ mode=config.model.mode,
+ aux_context_window=config.model.aux_context_window,
+ hop_size=config.n_shift,
+ batch_max_steps=config.batch_max_steps,
+ bits=config.model.bits)
+
+ # collate function and dataloader
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=True)
+ dev_sampler = DistributedBatchSampler(
+ dev_dataset,
+ batch_size=config.batch_size,
+ shuffle=False,
+ drop_last=False)
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=batch_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ collate_fn=batch_fn,
+ batch_sampler=dev_sampler,
+ num_workers=config.num_workers)
+
+ valid_generate_loader = DataLoader(dev_dataset, batch_size=1)
+
+ print("dataloaders done!")
+
+ model = WaveRNN(
+ hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
+ if world_size > 1:
+ model = DataParallel(model)
+ print("model done!")
+
+ if config.model.mode == 'RAW':
+ criterion = paddle.nn.CrossEntropyLoss(axis=1)
+ elif config.model.mode == 'MOL':
+ criterion = discretized_mix_logistic_loss
+ else:
+ criterion = None
+ RuntimeError('Unknown model mode value - ', config.model.mode)
+ print("criterions done!")
+ clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip)
+ optimizer = Adam(
+ parameters=model.parameters(),
+ learning_rate=config.learning_rate,
+ grad_clip=clip)
+
+ print("optimizer done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
+ updater = WaveRNNUpdater(
+ model=model,
+ optimizer=optimizer,
+ criterion=criterion,
+ dataloader=train_dataloader,
+ output_dir=output_dir,
+ mode=config.model.mode)
+
+ evaluator = WaveRNNEvaluator(
+ model=model,
+ dataloader=dev_dataloader,
+ criterion=criterion,
+ output_dir=output_dir,
+ valid_generate_loader=valid_generate_loader,
+ config=config)
+
+ trainer = Trainer(
+ updater,
+ stop_trigger=(config.train_max_steps, "iteration"),
+ out=output_dir)
+
+ if dist.get_rank() == 0:
+ trainer.extend(
+ evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+ trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
+
+ print("Trainer Done!")
+ trainer.run()
+
+
+def main():
+ # parse args and config and redirect to train_sp
+
+ parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
+ parser.add_argument(
+ "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--train-metadata", type=str, help="training data.")
+ parser.add_argument("--dev-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+ args = parser.parse_args()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
+
+ # dispatch
+ if args.ngpu > 1:
+ dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+ else:
+ train_sp(args, config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py
index 094a2bfa..7a81b645 100644
--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@@ -133,16 +133,11 @@ class ARPABET(Phonetics):
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
+ Args:
+ sentence (str): The input text sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
-
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
@@ -156,16 +151,12 @@ class ARPABET(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
-
- Parameters
- -----------
- phonemes: List[str]
- The list of pronunciation sequence.
+
+ Args:
+ phonemes (List[str]): The list of pronunciation sequence.
- Returns
- ----------
- List[int]
- The list of pronunciation id sequence.
+ Returns:
+ List[int]: The list of pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
@@ -173,30 +164,23 @@ class ARPABET(Phonetics):
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
+ Args:
+ ids( List[int]): The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Returns:
+ List[str]:
+ The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
+ Args:
+ sentence (str): The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(sentence, add_start_end=add_start_end))
@@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics):
def phoneticize(self, sentence, add_start_end=False):
""" Normalize the input text sequence and convert it into pronunciation sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
+ Args:
+ sentence (str): The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
phonemes = self.backend(sentence)
if add_start_end:
@@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
-
- Parameters
- -----------
- phonemes: List[str]
- The list of pronunciation sequence.
+
+ Args:
+ phonemes (List[str]): The list of pronunciation sequence.
- Returns
- ----------
- List[int]
- The list of pronunciation id sequence.
+ Returns:
+ List[int]: The list of pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
+ Args:
+ ids (List[int]): The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence, add_start_end=False):
""" Convert the input text sequence into pronunciation id sequence.
+ Args:
+ sentence (str): The input text sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
-
- Returns
- ----------
- List[str]
- The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation id sequence.
"""
return self.numericalize(
self.phoneticize(sentence, add_start_end=add_start_end))
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index 25413871..8e9f1173 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -65,14 +65,10 @@ class English(Phonetics):
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ sentence (str): The input text sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
start = self.vocab.start_symbol
end = self.vocab.end_symbol
@@ -83,11 +79,6 @@ class English(Phonetics):
return phonemes
def _p2id(self, phonemes: List[str]) -> np.array:
- # replace unk phone with sp
- phonemes = [
- phn if (phn in self.vocab_phones and phn not in self.punc) else "sp"
- for phn in phonemes
- ]
phone_ids = [self.vocab_phones[item] for item in phonemes]
return np.array(phone_ids, np.int64)
@@ -102,6 +93,12 @@ class English(Phonetics):
# remove start_symbol and end_symbol
phones = phones[1:-1]
phones = [phn for phn in phones if not phn.isspace()]
+ # replace unk phone with sp
+ phones = [
+ phn
+ if (phn in self.vocab_phones and phn not in self.punc) else "sp"
+ for phn in phones
+ ]
phones_list.append(phones)
if merge_sentences:
@@ -122,14 +119,10 @@ class English(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
- Parameters
- -----------
- phonemes: List[str]
- The list of pronunciation sequence.
- Returns
- ----------
- List[int]
- The list of pronunciation id sequence.
+ Args:
+ phonemes (List[str]): The list of pronunciation sequence.
+ Returns:
+ List[int]: The list of pronunciation id sequence.
"""
ids = [
self.vocab.lookup(item) for item in phonemes
@@ -139,27 +132,19 @@ class English(Phonetics):
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ ids (List[int]): The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation id sequence.
+ Args:
+ sentence(str): The input text sequence.
+ Returns:
+ List[str]: The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@@ -182,28 +167,21 @@ class EnglishCharacter(Phonetics):
def phoneticize(self, sentence):
""" Normalize the input text sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- str
- A text sequence after normalize.
+ Args:
+ sentence(str): The input text sequence.
+ Returns:
+ str: A text sequence after normalize.
"""
words = normalize(sentence)
return words
def numericalize(self, sentence):
""" Convert a text sequence into ids.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[int]
- List of a character id sequence.
+ Args:
+ sentence (str): The input text sequence.
+ Returns:
+ List[int]:
+ List of a character id sequence.
"""
ids = [
self.vocab.lookup(item) for item in sentence
@@ -213,27 +191,19 @@ class EnglishCharacter(Phonetics):
def reverse(self, ids):
""" Convert a character id sequence into text.
- Parameters
- -----------
- ids: List[int]
- List of a character id sequence.
- Returns
- ----------
- str
- The input text sequence.
+ Args:
+ ids (List[int]): List of a character id sequence.
+ Returns:
+ str: The input text sequence.
"""
return [self.vocab.reverse(i) for i in ids]
def __call__(self, sentence):
""" Normalize the input text sequence and convert it into character id sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[int]
- List of a character id sequence.
+ Args:
+ sentence (str): The input text sequence.
+ Returns:
+ List[int]: List of a character id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@@ -263,14 +233,10 @@ class Chinese(Phonetics):
def phoneticize(self, sentence):
""" Normalize the input text sequence and convert it into pronunciation sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ sentence(str): The input text sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
# simplified = self.opencc_backend.convert(sentence)
simplified = sentence
@@ -295,28 +261,20 @@ class Chinese(Phonetics):
def numericalize(self, phonemes):
""" Convert pronunciation sequence into pronunciation id sequence.
- Parameters
- -----------
- phonemes: List[str]
- The list of pronunciation sequence.
- Returns
- ----------
- List[int]
- The list of pronunciation id sequence.
+ Args:
+ phonemes(List[str]): The list of pronunciation sequence.
+ Returns:
+ List[int]: The list of pronunciation id sequence.
"""
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
def __call__(self, sentence):
""" Convert the input text sequence into pronunciation id sequence.
- Parameters
- -----------
- sentence: str
- The input text sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation id sequence.
+ Args:
+ sentence (str): The input text sequence.
+ Returns:
+ List[str]: The list of pronunciation id sequence.
"""
return self.numericalize(self.phoneticize(sentence))
@@ -328,13 +286,9 @@ class Chinese(Phonetics):
def reverse(self, ids):
""" Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ ids (List[int]): The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
"""
return [self.vocab.reverse(i) for i in ids]
diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py
index 9ef6b137..76bb3c7b 100644
--- a/paddlespeech/t2s/frontend/vocab.py
+++ b/paddlespeech/t2s/frontend/vocab.py
@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
class Vocab(object):
""" Vocabulary.
- Parameters
- -----------
- symbols: Iterable[str]
- Common symbols.
-
- padding_symbol: str, optional
- Symbol for pad. Defaults to "".
-
- unk_symbol: str, optional
- Symbol for unknow. Defaults to ""
-
- start_symbol: str, optional
- Symbol for start. Defaults to ""
-
- end_symbol: str, optional
- Symbol for end. Defaults to " "
+ Args:
+ symbols (Iterable[str]): Common symbols.
+ padding_symbol (str, optional): Symbol for pad. Defaults to "".
+ unk_symbol (str, optional): Symbol for unknow. Defaults to ""
+ start_symbol (str, optional): Symbol for start. Defaults to ""
+ end_symbol (str, optional): Symbol for end. Defaults to " "
"""
def __init__(self,
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index 8801baa0..bfa7d2b1 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
def replace_time(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
is_range = len(match.groups()) > 5
@@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
def replace_date(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
year = match.group(1)
month = match.group(3)
@@ -114,12 +110,10 @@ RE_DATE2 = re.compile(
def replace_date2(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
year = match.group(1)
month = match.group(3)
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 1e575c08..27a2f846 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
def replace_frac(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
sign = match.group(1)
nominator = match.group(2)
@@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
def replace_percentage(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
sign = match.group(1)
percent = match.group(2)
@@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)')
def replace_negative_num(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
sign = match.group(1)
number = match.group(2)
@@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
def replace_default_num(match):
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
number = match.group(0)
return verbalize_digit(number)
@@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
def replace_positive_quantifier(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
number = match.group(1)
match_2 = match.group(2)
@@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str:
def replace_number(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
sign = match.group(1)
number = match.group(2)
@@ -169,12 +157,10 @@ RE_RANGE = re.compile(
def replace_range(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
first, second = match.group(1), match.group(8)
first = RE_NUMBER.sub(replace_number, first)
@@ -222,7 +208,7 @@ def verbalize_digit(value_string: str, alt_one=False) -> str:
result_symbols = [DIGITS[digit] for digit in value_string]
result = ''.join(result_symbols)
if alt_one:
- result.replace("一", "幺")
+ result = result.replace("一", "幺")
return result
diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
index b7b69b41..06b5d41b 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str:
def replace_phone(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
return phone2str(match.group(0), mobile=False)
def replace_mobile(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
return phone2str(match.group(0))
diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
index d3805a32..268d7229 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
def replace_temperature(match) -> str:
"""
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
"""
sign = match.group(1)
temperature = match.group(2)
diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
index 9794a700..f9d1b8cb 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -55,14 +55,10 @@ class TextNormalizer():
def _split(self, text: str, lang="zh") -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations.
- Parameters
- ----------
- text : str
- The input text.
- Returns
- -------
- List[str]
- Sentences.
+ Args:
+ text (str): The input text.
+ Returns:
+ List[str]: Sentences.
"""
# Only for pure Chinese here
if lang == "zh":
diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py
index f268a4e3..41be7c1d 100644
--- a/paddlespeech/t2s/models/__init__.py
+++ b/paddlespeech/t2s/models/__init__.py
@@ -19,3 +19,4 @@ from .speedyspeech import *
from .tacotron2 import *
from .transformer_tts import *
from .waveflow import *
+from .wavernn import *
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 405ad957..73f5498e 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
class FastSpeech2(nn.Layer):
"""FastSpeech2 module.
-
+
This is a module of FastSpeech2 described in `FastSpeech 2: Fast and
High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and
energy, we use token-averaged value introduced in `FastPitch: Parallel
Text-to-speech with Pitch Prediction`_.
-
+
.. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`:
https://arxiv.org/abs/2006.04558
.. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`:
https://arxiv.org/abs/2006.06873
+ Args:
+
+ Returns:
+
"""
def __init__(
@@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer):
init_enc_alpha: float=1.0,
init_dec_alpha: float=1.0, ):
"""Initialize FastSpeech2 module.
- Parameters
- ----------
- idim : int
- Dimension of the inputs.
- odim : int
- Dimension of the outputs.
- adim : int
- Attention dimension.
- aheads : int
- Number of attention heads.
- elayers : int
- Number of encoder layers.
- eunits : int
- Number of encoder hidden units.
- dlayers : int
- Number of decoder layers.
- dunits : int
- Number of decoder hidden units.
- postnet_layers : int
- Number of postnet layers.
- postnet_chans : int
- Number of postnet channels.
- postnet_filts : int
- Kernel size of postnet.
- postnet_dropout_rate : float
- Dropout rate in postnet.
- use_scaled_pos_enc : bool
- Whether to use trainable scaled pos encoding.
- use_batch_norm : bool
- Whether to use batch normalization in encoder prenet.
- encoder_normalize_before : bool
- Whether to apply layernorm layer before encoder block.
- decoder_normalize_before : bool
- Whether to apply layernorm layer before
- decoder block.
- encoder_concat_after : bool
- Whether to concatenate attention layer's input and output in encoder.
- decoder_concat_after : bool
- Whether to concatenate attention layer's input and output in decoder.
- reduction_factor : int
- Reduction factor.
- encoder_type : str
- Encoder type ("transformer" or "conformer").
- decoder_type : str
- Decoder type ("transformer" or "conformer").
- transformer_enc_dropout_rate : float
- Dropout rate in encoder except attention and positional encoding.
- transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
- positional encoding.
- transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
- self-attention module.
- transformer_dec_dropout_rate (float): Dropout rate in decoder except
- attention & positional encoding.
- transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
- positional encoding.
- transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
- self-attention module.
- conformer_pos_enc_layer_type : str
- Pos encoding layer type in conformer.
- conformer_self_attn_layer_type : str
- Self-attention layer type in conformer
- conformer_activation_type : str
- Activation function type in conformer.
- use_macaron_style_in_conformer : bool
- Whether to use macaron style FFN.
- use_cnn_in_conformer : bool
- Whether to use CNN in conformer.
- zero_triu : bool
- Whether to use zero triu in relative self-attention module.
- conformer_enc_kernel_size : int
- Kernel size of encoder conformer.
- conformer_dec_kernel_size : int
- Kernel size of decoder conformer.
- duration_predictor_layers : int
- Number of duration predictor layers.
- duration_predictor_chans : int
- Number of duration predictor channels.
- duration_predictor_kernel_size : int
- Kernel size of duration predictor.
- duration_predictor_dropout_rate : float
- Dropout rate in duration predictor.
- pitch_predictor_layers : int
- Number of pitch predictor layers.
- pitch_predictor_chans : int
- Number of pitch predictor channels.
- pitch_predictor_kernel_size : int
- Kernel size of pitch predictor.
- pitch_predictor_dropout_rate : float
- Dropout rate in pitch predictor.
- pitch_embed_kernel_size : float
- Kernel size of pitch embedding.
- pitch_embed_dropout_rate : float
- Dropout rate for pitch embedding.
- stop_gradient_from_pitch_predictor : bool
- Whether to stop gradient from pitch predictor to encoder.
- energy_predictor_layers : int
- Number of energy predictor layers.
- energy_predictor_chans : int
- Number of energy predictor channels.
- energy_predictor_kernel_size : int
- Kernel size of energy predictor.
- energy_predictor_dropout_rate : float
- Dropout rate in energy predictor.
- energy_embed_kernel_size : float
- Kernel size of energy embedding.
- energy_embed_dropout_rate : float
- Dropout rate for energy embedding.
- stop_gradient_from_energy_predictor : bool
- Whether to stop gradient from energy predictor to encoder.
- spk_num : Optional[int]
- Number of speakers. If not None, assume that the spk_embed_dim is not None,
- spk_ids will be provided as the input and use spk_embedding_table.
- spk_embed_dim : Optional[int]
- Speaker embedding dimension. If not None,
- assume that spk_emb will be provided as the input or spk_num is not None.
- spk_embed_integration_type : str
- How to integrate speaker embedding.
- tone_num : Optional[int]
- Number of tones. If not None, assume that the
- tone_ids will be provided as the input and use tone_embedding_table.
- tone_embed_dim : Optional[int]
- Tone embedding dimension. If not None, assume that tone_num is not None.
- tone_embed_integration_type : str
- How to integrate tone embedding.
- init_type : str
- How to initialize transformer parameters.
- init_enc_alpha : float
- Initial value of alpha in scaled pos encoding of the encoder.
- init_dec_alpha : float
- Initial value of alpha in scaled pos encoding of the decoder.
+ Args:
+ idim (int): Dimension of the inputs.
+ odim (int): Dimension of the outputs.
+ adim (int): Attention dimension.
+ aheads (int): Number of attention heads.
+ elayers (int): Number of encoder layers.
+ eunits (int): Number of encoder hidden units.
+ dlayers (int): Number of decoder layers.
+ dunits (int): Number of decoder hidden units.
+ postnet_layers (int): Number of postnet layers.
+ postnet_chans (int): Number of postnet channels.
+ postnet_filts (int): Kernel size of postnet.
+ postnet_dropout_rate (float): Dropout rate in postnet.
+ use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
+ use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
+ encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block.
+ decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block.
+ encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder.
+ decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder.
+ reduction_factor (int): Reduction factor.
+ encoder_type (str): Encoder type ("transformer" or "conformer").
+ decoder_type (str): Decoder type ("transformer" or "conformer").
+ transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding.
+ transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding.
+ transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module.
+ transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding.
+ transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding.
+ transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module.
+ conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
+ conformer_self_attn_layer_type (str): Self-attention layer type in conformer
+ conformer_activation_type (str): Activation function type in conformer.
+ use_macaron_style_in_conformer (bool): Whether to use macaron style FFN.
+ use_cnn_in_conformer (bool): Whether to use CNN in conformer.
+ zero_triu (bool): Whether to use zero triu in relative self-attention module.
+ conformer_enc_kernel_size (int): Kernel size of encoder conformer.
+ conformer_dec_kernel_size (int): Kernel size of decoder conformer.
+ duration_predictor_layers (int): Number of duration predictor layers.
+ duration_predictor_chans (int): Number of duration predictor channels.
+ duration_predictor_kernel_size (int): Kernel size of duration predictor.
+ duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
+ pitch_predictor_layers (int): Number of pitch predictor layers.
+ pitch_predictor_chans (int): Number of pitch predictor channels.
+ pitch_predictor_kernel_size (int): Kernel size of pitch predictor.
+ pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor.
+ pitch_embed_kernel_size (float): Kernel size of pitch embedding.
+ pitch_embed_dropout_rate (float): Dropout rate for pitch embedding.
+ stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder.
+ energy_predictor_layers (int): Number of energy predictor layers.
+ energy_predictor_chans (int): Number of energy predictor channels.
+ energy_predictor_kernel_size (int): Kernel size of energy predictor.
+ energy_predictor_dropout_rate (float): Dropout rate in energy predictor.
+ energy_embed_kernel_size (float): Kernel size of energy embedding.
+ energy_embed_dropout_rate (float): Dropout rate for energy embedding.
+ stop_gradient_from_energy_predictor (bool): Whether to stop gradient from energy predictor to encoder.
+ spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None,
+ spk_ids will be provided as the input and use spk_embedding_table.
+ spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None,
+ assume that spk_emb will be provided as the input or spk_num is not None.
+ spk_embed_integration_type (str): How to integrate speaker embedding.
+ tone_num (Optional[int]): Number of tones. If not None, assume that the
+ tone_ids will be provided as the input and use tone_embedding_table.
+ tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None.
+ tone_embed_integration_type (str): How to integrate tone embedding.
+ init_type (str): How to initialize transformer parameters.
+ init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder.
+ init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
"""
assert check_argument_types()
@@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
- Parameters
- ----------
- text : Tensor(int64)
- Batch of padded token ids (B, Tmax).
- text_lengths : Tensor(int64)
- Batch of lengths of each input (B,).
- speech : Tensor
- Batch of padded target features (B, Lmax, odim).
- speech_lengths : Tensor(int64)
- Batch of the lengths of each target (B,).
- durations : Tensor(int64)
- Batch of padded durations (B, Tmax).
- pitch : Tensor
- Batch of padded token-averaged pitch (B, Tmax, 1).
- energy : Tensor
- Batch of padded token-averaged energy (B, Tmax, 1).
- tone_id : Tensor, optional(int64)
- Batch of padded tone ids (B, Tmax).
- spk_emb : Tensor, optional
- Batch of speaker embeddings (B, spk_embed_dim).
- spk_id : Tnesor, optional(int64)
- Batch of speaker ids (B,)
-
- Returns
- ----------
- Tensor
- mel outs before postnet
- Tensor
- mel outs after postnet
- Tensor
- duration predictor's output
- Tensor
- pitch predictor's output
- Tensor
- energy predictor's output
- Tensor
- speech
- Tensor
- speech_lengths, modified if reduction_factor > 1
+ Args:
+ text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+ text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
+ speech(Tensor): Batch of padded target features (B, Lmax, odim).
+ speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+ durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+ pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
+ energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
+ tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+ spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+ spk_id(Tensor, optional(int64)): Batch of speaker ids (B,).
+
+ Returns:
+ Tensors: mel outs before postnet, mel outs after postnet, duration predictor's output,
+ pitch predictor's output, energy predictor's output, speech, speech_lengths (modified if reduction_factor > 1).
"""
# input of embedding must be int64
@@ -556,8 +472,7 @@ class FastSpeech2(nn.Layer):
tone_id=tone_id)
# modify mod part of groundtruth
if self.reduction_factor > 1:
- olens = paddle.to_tensor(
- [olen - olen % self.reduction_factor for olen in olens.numpy()])
+ olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
@@ -627,7 +542,7 @@ class FastSpeech2(nn.Layer):
hs = hs + e_embs + p_embs
# (B, Lmax, adim)
- hs = self.length_regulator(hs, d_outs, alpha)
+ hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
else:
d_outs = self.duration_predictor(hs, d_masks)
# use groundtruth in training
@@ -638,7 +553,7 @@ class FastSpeech2(nn.Layer):
hs = hs + e_embs + p_embs
# (B, Lmax, adim)
- hs = self.length_regulator(hs, ds)
+ hs = self.length_regulator(hs, ds, is_inference=False)
# forward decoder
if olens is not None and not is_inference:
@@ -681,34 +596,22 @@ class FastSpeech2(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
- Parameters
- ----------
- text : Tensor(int64)
- Input sequence of characters (T,).
- speech : Tensor, optional
- Feature sequence to extract style (N, idim).
- durations : Tensor, optional (int64)
- Groundtruth of duration (T,).
- pitch : Tensor, optional
- Groundtruth of token-averaged pitch (T, 1).
- energy : Tensor, optional
- Groundtruth of token-averaged energy (T, 1).
- alpha : float, optional
- Alpha to control the speed.
- use_teacher_forcing : bool, optional
- Whether to use teacher forcing.
- If true, groundtruth of duration, pitch and energy will be used.
- spk_emb : Tensor, optional
- peaker embedding vector (spk_embed_dim,).
- spk_id : Tensor, optional(int64)
- Batch of padded spk ids (1,).
- tone_id : Tensor, optional(int64)
- Batch of padded tone ids (T,).
-
- Returns
- ----------
- Tensor
- Output sequence of features (L, odim).
+ Args:
+ text(Tensor(int64)): Input sequence of characters (T,).
+ speech(Tensor, optional): Feature sequence to extract style (N, idim).
+ durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+ pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
+ energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
+ alpha(float, optional): Alpha to control the speed.
+ use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+ If true, groundtruth of duration, pitch and energy will be used.
+ spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
+ spk_id(Tensor, optional(int64), optional): Batch of padded spk ids (1,). (Default value = None)
+ tone_id(Tensor, optional(int64), optional): Batch of padded tone ids (T,). (Default value = None)
+
+ Returns:
+ Tensor: Output sequence of features (L, odim).
+
"""
# input of embedding must be int64
x = paddle.cast(text, 'int64')
@@ -762,17 +665,13 @@ class FastSpeech2(nn.Layer):
def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states.
- Parameters
- ----------
- hs : Tensor
- Batch of hidden state sequences (B, Tmax, adim).
- spk_emb : Tensor
- Batch of speaker embeddings (B, spk_embed_dim).
-
- Returns
- ----------
- Tensor
- Batch of integrated hidden state sequences (B, Tmax, adim)
+ Args:
+ hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+ spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+ Returns:
+ Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
@@ -781,7 +680,7 @@ class FastSpeech2(nn.Layer):
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
- shape=[-1, hs.shape[1], -1])
+ shape=[-1, paddle.shape(hs)[1], -1])
hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
@@ -791,17 +690,13 @@ class FastSpeech2(nn.Layer):
def _integrate_with_tone_embed(self, hs, tone_embs):
"""Integrate speaker embedding with hidden states.
- Parameters
- ----------
- hs : Tensor
- Batch of hidden state sequences (B, Tmax, adim).
- tone_embs : Tensor
- Batch of speaker embeddings (B, Tmax, tone_embed_dim).
-
- Returns
- ----------
- Tensor
- Batch of integrated hidden state sequences (B, Tmax, adim)
+ Args:
+ hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+ tone_embs(Tensor): Batch of tone embeddings (B, Tmax, tone_embed_dim).
+
+ Returns:
+ Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
"""
if self.tone_embed_integration_type == "add":
# apply projection and then add to hidden states
@@ -820,24 +715,17 @@ class FastSpeech2(nn.Layer):
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention.
- Parameters
- ----------
- ilens : Tensor
- Batch of lengths (B,).
-
- Returns
- -------
- Tensor
- Mask tensor for self-attention.
- dtype=paddle.bool
+ Args:
+ ilens(Tensor): Batch of lengths (B,).
- Examples
- -------
- >>> ilens = [5, 3]
- >>> self._source_mask(ilens)
- tensor([[[1, 1, 1, 1, 1],
- [1, 1, 1, 0, 0]]]) bool
+ Returns:
+ Tensor: Mask tensor for self-attention. dtype=paddle.bool
+ Examples:
+ >>> ilens = [5, 3]
+ >>> self._source_mask(ilens)
+ tensor([[[1, 1, 1, 1, 1],
+ [1, 1, 1, 0, 0]]]) bool
"""
x_masks = make_non_pad_mask(ilens)
return x_masks.unsqueeze(-2)
@@ -911,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
spk_emb=None,
spk_id=None):
"""
- Parameters
- ----------
- text : Tensor(int64)
- Input sequence of characters (T,).
- speech : Tensor, optional
- Feature sequence to extract style (N, idim).
- durations : paddle.Tensor/np.ndarray, optional (int64)
- Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
- durations_scale: int/float, optional
- durations_bias: int/float, optional
- pitch : paddle.Tensor/np.ndarray, optional
- Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
- pitch_scale: int/float, optional
- In denormed HZ domain.
- pitch_bias: int/float, optional
- In denormed HZ domain.
- energy : paddle.Tensor/np.ndarray, optional
- Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
- energy_scale: int/float, optional
- In denormed domain.
- energy_bias: int/float, optional
- In denormed domain.
- robot : bool, optional
- Weather output robot style
- Returns
- ----------
- Tensor
- Output sequence of features (L, odim).
+
+ Args:
+ text(Tensor(int64)): Input sequence of characters (T,).
+ speech(Tensor, optional): Feature sequence to extract style (N, idim).
+ durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
+ durations_scale(int/float, optional):
+ durations_bias(int/float, optional):
+ pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
+ pitch_scale(int/float, optional): In denormed HZ domain.
+ pitch_bias(int/float, optional): In denormed HZ domain.
+ energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
+ energy_scale(int/float, optional): In denormed domain.
+ energy_bias(int/float, optional): In denormed domain.
+ robot (bool, optional): Whether to output robot style. (Default value = False)
+ spk_emb: (Default value = None)
+ spk_id: (Default value = None)
+
+ Returns:
+ Tensor: logmel
+
"""
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text,
@@ -1012,13 +892,9 @@ class FastSpeech2Loss(nn.Layer):
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
-
- Parameters
- ----------
- use_masking : bool
- Whether to apply masking for padded part in loss calculation.
- use_weighted_masking : bool
- Whether to weighted masking in loss calculation.
+ Args:
+ use_masking (bool): Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool): Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
@@ -1049,42 +925,22 @@ class FastSpeech2Loss(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Calculate forward propagation.
- Parameters
- ----------
- after_outs : Tensor
- Batch of outputs after postnets (B, Lmax, odim).
- before_outs : Tensor
- Batch of outputs before postnets (B, Lmax, odim).
- d_outs : Tensor
- Batch of outputs of duration predictor (B, Tmax).
- p_outs : Tensor
- Batch of outputs of pitch predictor (B, Tmax, 1).
- e_outs : Tensor
- Batch of outputs of energy predictor (B, Tmax, 1).
- ys : Tensor
- Batch of target features (B, Lmax, odim).
- ds : Tensor
- Batch of durations (B, Tmax).
- ps : Tensor
- Batch of target token-averaged pitch (B, Tmax, 1).
- es : Tensor
- Batch of target token-averaged energy (B, Tmax, 1).
- ilens : Tensor
- Batch of the lengths of each input (B,).
- olens : Tensor
- Batch of the lengths of each target (B,).
-
- Returns
- ----------
- Tensor
- L1 loss value.
- Tensor
- Duration predictor loss value.
- Tensor
- Pitch predictor loss value.
- Tensor
- Energy predictor loss value.
-
+ Args:
+ after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+ before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+ d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
+ p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
+ e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
+ ys(Tensor): Batch of target features (B, Lmax, odim).
+ ds(Tensor): Batch of durations (B, Tmax).
+ ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
+ es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
+ ilens(Tensor): Batch of the lengths of each input (B,).
+ olens(Tensor): Batch of the lengths of each target (B,).
+
+ Returns:
+ Tensor: L1 loss value. Tensor: Duration predictor loss value.
+ Tensor: Pitch predictor loss value. Tensor: Energy predictor loss value.
"""
# apply mask to remove padded part
if self.use_masking:
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
index 0dabf934..92aa9dfc 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
@@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+from pathlib import Path
from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
@@ -28,20 +32,17 @@ logger.setLevel(logging.INFO)
class FastSpeech2Updater(StandardUpdater):
def __init__(self,
- model,
- optimizer,
- dataloader,
+ model: Layer,
+ optimizer: Optimizer,
+ dataloader: DataLoader,
init_state=None,
- use_masking=False,
- use_weighted_masking=False,
- output_dir=None):
+ use_masking: bool=False,
+ use_weighted_masking: bool=False,
+ output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
- self.use_masking = use_masking
- self.use_weighted_masking = use_weighted_masking
self.criterion = FastSpeech2Loss(
- use_masking=self.use_masking,
- use_weighted_masking=self.use_weighted_masking)
+ use_masking=use_masking, use_weighted_masking=use_weighted_masking)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@@ -107,14 +108,12 @@ class FastSpeech2Updater(StandardUpdater):
class FastSpeech2Evaluator(StandardEvaluator):
def __init__(self,
- model,
- dataloader,
- use_masking=False,
- use_weighted_masking=False,
- output_dir=None):
+ model: Layer,
+ dataloader: DataLoader,
+ use_masking: bool=False,
+ use_weighted_masking: bool=False,
+ output_dir: Path=None):
super().__init__(model, dataloader)
- self.use_masking = use_masking
- self.use_weighted_masking = use_weighted_masking
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@@ -123,8 +122,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
self.msg = ""
self.criterion = FastSpeech2Loss(
- use_masking=self.use_masking,
- use_weighted_masking=self.use_weighted_masking)
+ use_masking=use_masking, use_weighted_masking=use_weighted_masking)
def evaluate_core(self, batch):
self.msg = "Evaluate: "
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
index 82dd66c1..116376ec 100644
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer):
use_weight_norm: bool=True,
init_type: str="xavier_uniform", ):
"""Initialize HiFiGANGenerator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels.
- channels : int
- Number of hidden representation channels.
- kernel_size : int
- Kernel size of initial and final conv layer.
- upsample_scales : list
- List of upsampling scales.
- upsample_kernel_sizes : list
- List of kernel sizes for upsampling layers.
- resblock_kernel_sizes : list
- List of kernel sizes for residual blocks.
- resblock_dilations : list
- List of dilation list for residual blocks.
- use_additional_convs : bool
- Whether to use additional conv layers in residual blocks.
- bias : bool
- Whether to add bias parameter in convolution layers.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
- use_weight_norm : bool
- Whether to use weight norm.
- If set to true, it will be applied to all of the conv layers.
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ channels (int): Number of hidden representation channels.
+ kernel_size (int): Kernel size of initial and final conv layer.
+ upsample_scales (list): List of upsampling scales.
+ upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+ resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
+ resblock_dilations (list): List of dilation list for residual blocks.
+ use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
+ use_weight_norm (bool): Whether to use weight norm.
+ If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
def forward(self, c):
"""Calculate forward propagation.
- Parameters
- ----------
- c : Tensor
- Input tensor (B, in_channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, out_channels, T).
+
+ Args:
+ c (Tensor): Input tensor (B, in_channels, T).
+ Returns:
+ Tensor: Output tensor (B, out_channels, T).
"""
c = self.input_conv(c)
for i in range(self.num_upsamples):
@@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer):
def inference(self, c):
"""Perform inference.
- Parameters
- ----------
- c : Tensor
- Input tensor (T, in_channels).
- normalize_before (bool): Whether to perform normalization.
- Returns
- ----------
- Tensor
- Output tensor (T ** prod(upsample_scales), out_channels).
+ Args:
+ c (Tensor): Input tensor (T, in_channels).
+ normalize_before (bool): Whether to perform normalization.
+ Returns:
+ Tensor:
+ Output tensor (T ** prod(upsample_scales), out_channels).
"""
c = self.forward(c.transpose([1, 0]).unsqueeze(0))
return c.squeeze(0).transpose([1, 0])
@@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
use_spectral_norm: bool=False,
init_type: str="xavier_uniform", ):
"""Initialize HiFiGANPeriodDiscriminator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels.
- period : int
- Period.
- kernel_sizes : list
- Kernel sizes of initial conv layers and the final conv layer.
- channels : int
- Number of initial channels.
- downsample_scales : list
- List of downsampling scales.
- max_downsample_channels : int
- Number of maximum downsampling channels.
- use_additional_convs : bool
- Whether to use additional conv layers in residual blocks.
- bias : bool
- Whether to add bias parameter in convolution layers.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
- use_weight_norm : bool
- Whether to use weight norm.
- If set to true, it will be applied to all of the conv layers.
- use_spectral_norm : bool
- Whether to use spectral norm.
- If set to true, it will be applied to all of the conv layers.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ period (int): Period.
+ kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
+ channels (int): Number of initial channels.
+ downsample_scales (list): List of downsampling scales.
+ max_downsample_channels (int): Number of maximum downsampling channels.
+ use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
+ use_weight_norm (bool): Whether to use weight norm.
+ If set to true, it will be applied to all of the conv layers.
+ use_spectral_norm (bool): Whether to use spectral norm.
+ If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- c : Tensor
- Input tensor (B, in_channels, T).
- Returns
- ----------
- list
- List of each layer's tensors.
+
+ Args:
+ c (Tensor): Input tensor (B, in_channels, T).
+ Returns:
+ list: List of each layer's tensors.
"""
# transform 1d to 2d -> (B, C, T/P, P)
b, c, t = paddle.shape(x)
@@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
},
init_type: str="xavier_uniform", ):
"""Initialize HiFiGANMultiPeriodDiscriminator module.
- Parameters
- ----------
- periods : list
- List of periods.
- discriminator_params : dict
- Parameters for hifi-gan period discriminator module.
- The period parameter will be overwritten.
+
+ Args:
+ periods (list): List of periods.
+ discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+ The period parameter will be overwritten.
"""
super().__init__()
# initialize parameters
@@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List
- List of list of each discriminator outputs, which consists of each layer output tensors.
+
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""
outs = []
for f in self.discriminators:
@@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer):
use_spectral_norm: bool=False,
init_type: str="xavier_uniform", ):
"""Initilize HiFiGAN scale discriminator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels.
- kernel_sizes : list
- List of four kernel sizes. The first will be used for the first conv layer,
- and the second is for downsampling part, and the remaining two are for output layers.
- channels : int
- Initial number of channels for conv layer.
- max_downsample_channels : int
- Maximum number of channels for downsampling layers.
- bias : bool
- Whether to add bias parameter in convolution layers.
- downsample_scales : list
- List of downsampling scales.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
- use_weight_norm : bool
- Whether to use weight norm.
- If set to true, it will be applied to all of the conv layers.
- use_spectral_norm : bool
- Whether to use spectral norm.
- If set to true, it will be applied to all of the conv layers.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
+ and the second is for downsampling part, and the remaining two are for output layers.
+ channels (int): Initial number of channels for conv layer.
+ max_downsample_channels (int): Maximum number of channels for downsampling layers.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ downsample_scales (list): List of downsampling scales.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
+ use_weight_norm (bool): Whether to use weight norm.
+ If set to true, it will be applied to all of the conv layers.
+ use_spectral_norm (bool): Whether to use spectral norm.
+ If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List
- List of output tensors of each layer.
+
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List: List of output tensors of each layer.
"""
outs = []
for f in self.layers:
@@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
follow_official_norm: bool=False,
init_type: str="xavier_uniform", ):
"""Initilize HiFiGAN multi-scale discriminator module.
- Parameters
- ----------
- scales : int
- Number of multi-scales.
- downsample_pooling : str
- Pooling module name for downsampling of the inputs.
- downsample_pooling_params : dict
- Parameters for the above pooling module.
- discriminator_params : dict
- Parameters for hifi-gan scale discriminator module.
- follow_official_norm : bool
- Whether to follow the norm setting of the official
- implementaion. The first discriminator uses spectral norm and the other
- discriminators use weight norm.
+
+ Args:
+ scales (int): Number of multi-scales.
+ downsample_pooling (str): Pooling module name for downsampling of the inputs.
+ downsample_pooling_params (dict): Parameters for the above pooling module.
+ discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+ follow_official_norm (bool): Whether to follow the norm setting of the official
+ implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
"""
super().__init__()
@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List
- List of list of each discriminator outputs, which consists of each layer output tensors.
+
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""
outs = []
for f in self.discriminators:
@@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
},
init_type: str="xavier_uniform", ):
"""Initilize HiFiGAN multi-scale + multi-period discriminator module.
- Parameters
- ----------
- scales : int
- Number of multi-scales.
- scale_downsample_pooling : str
- Pooling module name for downsampling of the inputs.
- scale_downsample_pooling_params : dict
- Parameters for the above pooling module.
- scale_discriminator_params : dict
- Parameters for hifi-gan scale discriminator module.
- follow_official_norm : bool): Whether to follow the norm setting of the official
- implementaion. The first discriminator uses spectral norm and the other
- discriminators use weight norm.
- periods : list
- List of periods.
- period_discriminator_params : dict
- Parameters for hifi-gan period discriminator module.
- The period parameter will be overwritten.
+
+ Args:
+ scales (int): Number of multi-scales.
+ scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+ scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+ scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
+ The first discriminator uses spectral norm and the other discriminators use weight norm.
+ periods (list): List of periods.
+ period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+ The period parameter will be overwritten.
"""
super().__init__()
@@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List:
- List of list of each discriminator outputs,
- which consists of each layer output tensors.
- Multi scale and multi period ones are concatenated.
+
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List:
+ List of list of each discriminator outputs,
+ which consists of each layer output tensors.
+ Multi scale and multi period ones are concatenated.
"""
msd_outs = self.msd(x)
mpd_outs = self.mpd(x)
diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py
index 3e90b691..6a139659 100644
--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
@@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer):
use_causal_conv: bool=False,
init_type: str="xavier_uniform", ):
"""Initialize MelGANGenerator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels,
- the number of sub-band is out_channels in multi-band melgan.
- kernel_size : int
- Kernel size of initial and final conv layer.
- channels : int
- Initial number of channels for conv layer.
- bias : bool
- Whether to add bias parameter in convolution layers.
- upsample_scales : List[int]
- List of upsampling scales.
- stack_kernel_size : int
- Kernel size of dilated conv layers in residual stack.
- stacks : int
- Number of stacks in a single residual stack.
- nonlinear_activation : Optional[str], optional
- Non linear activation in upsample network, by default None
- nonlinear_activation_params : Dict[str, Any], optional
- Parameters passed to the linear activation in the upsample network,
- by default {}
- pad : str
- Padding function module name before dilated convolution layer.
- pad_params : dict
- Hyperparameters for padding function.
- use_final_nonlinear_activation : nn.Layer
- Activation function for the final layer.
- use_weight_norm : bool
- Whether to use weight norm.
- If set to true, it will be applied to all of the conv layers.
- use_causal_conv : bool
- Whether to use causal convolution.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels,
+ the number of sub-band is out_channels in multi-band melgan.
+ kernel_size (int): Kernel size of initial and final conv layer.
+ channels (int): Initial number of channels for conv layer.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ upsample_scales (List[int]): List of upsampling scales.
+ stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
+ stacks (int): Number of stacks in a single residual stack.
+ nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+ nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
+ by default {}
+ pad (str): Padding function module name before dilated convolution layer.
+ pad_params (dict): Hyperparameters for padding function.
+ use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
+ use_weight_norm (bool): Whether to use weight norm.
+ If set to true, it will be applied to all of the conv layers.
+ use_causal_conv (bool): Whether to use causal convolution.
"""
super().__init__()
@@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer):
def forward(self, c):
"""Calculate forward propagation.
- Parameters
- ----------
- c : Tensor
- Input tensor (B, in_channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+ Args:
+ c (Tensor): Input tensor (B, in_channels, T).
+ Returns:
+ Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
"""
out = self.melgan(c)
return out
@@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer):
def inference(self, c):
"""Perform inference.
- Parameters
- ----------
- c : Union[Tensor, ndarray]
- Input tensor (T, in_channels).
- Returns
- ----------
- Tensor
- Output tensor (out_channels*T ** prod(upsample_scales), 1).
+
+ Args:
+ c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
+ Returns:
+ Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1).
"""
# pseudo batch
c = c.transpose([1, 0]).unsqueeze(0)
@@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer):
pad_params: Dict[str, Any]={"mode": "reflect"},
init_type: str="xavier_uniform", ):
"""Initilize MelGAN discriminator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels.
- kernel_sizes : List[int]
- List of two kernel sizes. The prod will be used for the first conv layer,
- and the first and the second kernel sizes will be used for the last two layers.
- For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
- the last two layers' kernel size will be 5 and 3, respectively.
- channels : int
- Initial number of channels for conv layer.
- max_downsample_channels : int
- Maximum number of channels for downsampling layers.
- bias : bool
- Whether to add bias parameter in convolution layers.
- downsample_scales : List[int]
- List of downsampling scales.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
- pad : str
- Padding function module name before dilated convolution layer.
- pad_params : dict
- Hyperparameters for padding function.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
+ and the first and the second kernel sizes will be used for the last two layers.
+ For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
+ the last two layers' kernel size will be 5 and 3, respectively.
+ channels (int): Initial number of channels for conv layer.
+ max_downsample_channels (int): Maximum number of channels for downsampling layers.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ downsample_scales (List[int]): List of downsampling scales.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
+ pad (str): Padding function module name before dilated convolution layer.
+ pad_params (dict): Hyperparameters for padding function.
"""
super().__init__()
@@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List
- List of output tensors of each layer (for feat_match_loss).
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List: List of output tensors of each layer (for feat_match_loss).
"""
outs = []
for f in self.layers:
@@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
use_weight_norm: bool=True,
init_type: str="xavier_uniform", ):
"""Initilize MelGAN multi-scale discriminator module.
- Parameters
- ----------
- in_channels : int
- Number of input channels.
- out_channels : int
- Number of output channels.
- scales : int
- Number of multi-scales.
- downsample_pooling : str
- Pooling module name for downsampling of the inputs.
- downsample_pooling_params : dict
- Parameters for the above pooling module.
- kernel_sizes : List[int]
- List of two kernel sizes. The sum will be used for the first conv layer,
- and the first and the second kernel sizes will be used for the last two layers.
- channels : int
- Initial number of channels for conv layer.
- max_downsample_channels : int
- Maximum number of channels for downsampling layers.
- bias : bool
- Whether to add bias parameter in convolution layers.
- downsample_scales : List[int]
- List of downsampling scales.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
- pad : str
- Padding function module name before dilated convolution layer.
- pad_params : dict
- Hyperparameters for padding function.
- use_causal_conv : bool
- Whether to use causal convolution.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ scales (int): Number of multi-scales.
+ downsample_pooling (str): Pooling module name for downsampling of the inputs.
+ downsample_pooling_params (dict): Parameters for the above pooling module.
+ kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
+ and the first and the second kernel sizes will be used for the last two layers.
+ channels (int): Initial number of channels for conv layer.
+ max_downsample_channels (int): Maximum number of channels for downsampling layers.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ downsample_scales (List[int]): List of downsampling scales.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
+ pad (str): Padding function module name before dilated convolution layer.
+ pad_params (dict): Hyperparameters for padding function.
+ use_causal_conv (bool): Whether to use causal convolution.
"""
super().__init__()
@@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input noise signal (B, 1, T).
- Returns
- ----------
- List
- List of list of each discriminator outputs, which consists of each layer output tensors.
+ Args:
+ x (Tensor): Input noise signal (B, 1, T).
+ Returns:
+ List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""
outs = []
for f in self.discriminators:
diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py
index bd451e1f..40a2f100 100644
--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer):
use_weight_norm: bool=True,
init_type: str="xavier_uniform", ):
"""Initilize Style MelGAN generator.
- Parameters
- ----------
- in_channels : int
- Number of input noise channels.
- aux_channels : int
- Number of auxiliary input channels.
- channels : int
- Number of channels for conv layer.
- out_channels : int
- Number of output channels.
- kernel_size : int
- Kernel size of conv layers.
- dilation : int
- Dilation factor for conv layers.
- bias : bool
- Whether to add bias parameter in convolution layers.
- noise_upsample_scales : list
- List of noise upsampling scales.
- noise_upsample_activation : str
- Activation function module name for noise upsampling.
- noise_upsample_activation_params : dict
- Hyperparameters for the above activation function.
- upsample_scales : list
- List of upsampling scales.
- upsample_mode : str
- Upsampling mode in TADE layer.
- gated_function : str
- Gated function in TADEResBlock ("softmax" or "sigmoid").
- use_weight_norm : bool
- Whether to use weight norm.
- If set to true, it will be applied to all of the conv layers.
+
+ Args:
+ in_channels (int): Number of input noise channels.
+ aux_channels (int): Number of auxiliary input channels.
+ channels (int): Number of channels for conv layer.
+ out_channels (int): Number of output channels.
+ kernel_size (int): Kernel size of conv layers.
+ dilation (int): Dilation factor for conv layers.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ noise_upsample_scales (list): List of noise upsampling scales.
+ noise_upsample_activation (str): Activation function module name for noise upsampling.
+ noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+ upsample_scales (list): List of upsampling scales.
+ upsample_mode (str): Upsampling mode in TADE layer.
+ gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+ use_weight_norm (bool): Whether to use weight norm.
+ If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
def forward(self, c, z=None):
"""Calculate forward propagation.
- Parameters
- ----------
- c : Tensor
- Auxiliary input tensor (B, channels, T).
- z : Tensor
- Input noise tensor (B, in_channels, 1).
- Returns
- ----------
- Tensor
- Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+ Args:
+ c (Tensor): Auxiliary input tensor (B, channels, T).
+ z (Tensor): Input noise tensor (B, in_channels, 1).
+ Returns:
+ Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
"""
# batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
if z is None:
@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
def inference(self, c):
"""Perform inference.
- Parameters
- ----------
- c : Tensor
- Input tensor (T, in_channels).
- Returns
- ----------
- Tensor
- Output tensor (T ** prod(upsample_scales), out_channels).
+ Args:
+ c (Tensor): Input tensor (T, in_channels).
+ Returns:
+ Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
"""
# (1, in_channels, T)
c = c.transpose([1, 0]).unsqueeze(0)
@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
use_weight_norm: bool=True,
init_type: str="xavier_uniform", ):
"""Initilize Style MelGAN discriminator.
- Parameters
- ----------
- repeats : int
- Number of repititons to apply RWD.
- window_sizes : list
- List of random window sizes.
- pqmf_params : list
- List of list of Parameters for PQMF modules
- discriminator_params : dict
- Parameters for base discriminator module.
- use_weight_nom : bool
- Whether to apply weight normalization.
+
+ Args:
+            repeats (int): Number of repetitions to apply RWD.
+ window_sizes (list): List of random window sizes.
+ pqmf_params (list): List of list of Parameters for PQMF modules
+ discriminator_params (dict): Parameters for base discriminator module.
+            use_weight_norm (bool): Whether to apply weight normalization.
"""
super().__init__()
@@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, 1, T).
- Returns
- ----------
- List
- List of discriminator outputs, #items in the list will be
- equal to repeats * #discriminators.
+ Args:
+ x (Tensor): Input tensor (B, 1, T).
+ Returns:
+ List: List of discriminator outputs, #items in the list will be
+ equal to repeats * #discriminators.
"""
outs = []
for _ in range(self.repeats):
diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
index 9eff4497..cc8460e4 100644
--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
@@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet
class PWGGenerator(nn.Layer):
"""Wave Generator for Parallel WaveGAN
- Parameters
- ----------
- in_channels : int, optional
- Number of channels of the input waveform, by default 1
- out_channels : int, optional
- Number of channels of the output waveform, by default 1
- kernel_size : int, optional
- Kernel size of the residual blocks inside, by default 3
- layers : int, optional
- Number of residual blocks inside, by default 30
- stacks : int, optional
- The number of groups to split the residual blocks into, by default 3
- Within each group, the dilation of the residual block grows
- exponentially.
- residual_channels : int, optional
- Residual channel of the residual blocks, by default 64
- gate_channels : int, optional
- Gate channel of the residual blocks, by default 128
- skip_channels : int, optional
- Skip channel of the residual blocks, by default 64
- aux_channels : int, optional
- Auxiliary channel of the residual blocks, by default 80
- aux_context_window : int, optional
- The context window size of the first convolution applied to the
- auxiliary input, by default 2
- dropout : float, optional
- Dropout of the residual blocks, by default 0.
- bias : bool, optional
- Whether to use bias in residual blocks, by default True
- use_weight_norm : bool, optional
- Whether to use weight norm in all convolutions, by default True
- use_causal_conv : bool, optional
- Whether to use causal padding in the upsample network and residual
- blocks, by default False
- upsample_scales : List[int], optional
- Upsample scales of the upsample network, by default [4, 4, 4, 4]
- nonlinear_activation : Optional[str], optional
- Non linear activation in upsample network, by default None
- nonlinear_activation_params : Dict[str, Any], optional
- Parameters passed to the linear activation in the upsample network,
- by default {}
- interpolate_mode : str, optional
- Interpolation mode of the upsample network, by default "nearest"
- freq_axis_kernel_size : int, optional
- Kernel size along the frequency axis of the upsample network, by default 1
+ Args:
+ in_channels (int, optional): Number of channels of the input waveform, by default 1
+ out_channels (int, optional): Number of channels of the output waveform, by default 1
+ kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
+ layers (int, optional): Number of residual blocks inside, by default 30
+ stacks (int, optional): The number of groups to split the residual blocks into, by default 3
+ Within each group, the dilation of the residual block grows exponentially.
+ residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+ gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+ skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+ aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+ aux_context_window (int, optional): The context window size of the first convolution applied to the
+ auxiliary input, by default 2
+ dropout (float, optional): Dropout of the residual blocks, by default 0.
+ bias (bool, optional): Whether to use bias in residual blocks, by default True
+ use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+ use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
+ blocks, by default False
+ upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+ nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+ nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
+ by default {}
+ interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+ freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
"""
def __init__(
@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
def forward(self, x, c):
"""Generate waveform.
- Parameters
- ----------
- x : Tensor
- Shape (N, C_in, T), The input waveform.
- c : Tensor
- Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
+ Args:
+ x(Tensor): Shape (N, C_in, T), The input waveform.
+ c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
is upsampled to match the time resolution of the input.
- Returns
- -------
- Tensor
- Shape (N, C_out, T), the generated waveform.
+ Returns:
+ Tensor: Shape (N, C_out, T), the generated waveform.
"""
c = self.upsample_net(c)
assert c.shape[-1] == x.shape[-1]
@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
self.apply(_remove_weight_norm)
def inference(self, c=None):
- """Waveform generation. This function is used for single instance
- inference.
- Parameters
- ----------
- c : Tensor, optional
- Shape (T', C_aux), the auxiliary input, by default None
- x : Tensor, optional
- Shape (T, C_in), the noise waveform, by default None
- If not provided, a sample is drawn from a gaussian distribution.
- Returns
- -------
- Tensor
- Shape (T, C_out), the generated waveform
+ """Waveform generation. This function is used for single instance inference.
+
+ Args:
+            c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None
+ x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
+
+ Returns:
+ Tensor: Shape (T, C_out), the generated waveform
"""
# when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
x = paddle.randn(
@@ -244,32 +213,21 @@ class PWGGenerator(nn.Layer):
class PWGDiscriminator(nn.Layer):
"""A convolutional discriminator for audio.
- Parameters
- ----------
- in_channels : int, optional
- Number of channels of the input audio, by default 1
- out_channels : int, optional
- Output feature size, by default 1
- kernel_size : int, optional
- Kernel size of convolutional sublayers, by default 3
- layers : int, optional
- Number of layers, by default 10
- conv_channels : int, optional
- Feature size of the convolutional sublayers, by default 64
- dilation_factor : int, optional
- The factor with which dilation of each convolutional sublayers grows
- exponentially if it is greater than 1, else the dilation of each
- convolutional sublayers grows linearly, by default 1
- nonlinear_activation : str, optional
- The activation after each convolutional sublayer, by default "leakyrelu"
- nonlinear_activation_params : Dict[str, Any], optional
- The parameters passed to the activation's initializer, by default
- {"negative_slope": 0.2}
- bias : bool, optional
- Whether to use bias in convolutional sublayers, by default True
- use_weight_norm : bool, optional
- Whether to use weight normalization at all convolutional sublayers,
- by default True
+ Args:
+ in_channels (int, optional): Number of channels of the input audio, by default 1
+ out_channels (int, optional): Output feature size, by default 1
+ kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
+ layers (int, optional): Number of layers, by default 10
+ conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
+ dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows
+ exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly,
+ by default 1
+ nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
+ nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default
+ {"negative_slope": 0.2}
+ bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
+ use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers,
+ by default True
"""
def __init__(
@@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer):
def forward(self, x):
"""
- Parameters
- ----------
- x : Tensor
- Shape (N, in_channels, num_samples), the input audio.
-
- Returns
- -------
- Tensor
- Shape (N, out_channels, num_samples), the predicted logits.
+
+ Args:
+ x (Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+ Returns:
+ Tensor: Shape (N, out_channels, num_samples), the predicted logits.
"""
return self.conv_layers(x)
@@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer):
class ResidualPWGDiscriminator(nn.Layer):
"""A wavenet-style discriminator for audio.
- Parameters
- ----------
- in_channels : int, optional
- Number of channels of the input audio, by default 1
- out_channels : int, optional
- Output feature size, by default 1
- kernel_size : int, optional
- Kernel size of residual blocks, by default 3
- layers : int, optional
- Number of residual blocks, by default 30
- stacks : int, optional
- Number of groups of residual blocks, within which the dilation
- of each residual blocks grows exponentially, by default 3
- residual_channels : int, optional
- Residual channels of residual blocks, by default 64
- gate_channels : int, optional
- Gate channels of residual blocks, by default 128
- skip_channels : int, optional
- Skip channels of residual blocks, by default 64
- dropout : float, optional
- Dropout probability of residual blocks, by default 0.
- bias : bool, optional
- Whether to use bias in residual blocks, by default True
- use_weight_norm : bool, optional
- Whether to use weight normalization in all convolutional layers,
- by default True
- use_causal_conv : bool, optional
- Whether to use causal convolution in residual blocks, by default False
- nonlinear_activation : str, optional
- Activation after convolutions other than those in residual blocks,
- by default "leakyrelu"
- nonlinear_activation_params : Dict[str, Any], optional
- Parameters to pass to the activation, by default {"negative_slope": 0.2}
+ Args:
+ in_channels (int, optional): Number of channels of the input audio, by default 1
+ out_channels (int, optional): Output feature size, by default 1
+ kernel_size (int, optional): Kernel size of residual blocks, by default 3
+ layers (int, optional): Number of residual blocks, by default 30
+ stacks (int, optional): Number of groups of residual blocks, within which the dilation
+ of each residual blocks grows exponentially, by default 3
+ residual_channels (int, optional): Residual channels of residual blocks, by default 64
+ gate_channels (int, optional): Gate channels of residual blocks, by default 128
+ skip_channels (int, optional): Skip channels of residual blocks, by default 64
+ dropout (float, optional): Dropout probability of residual blocks, by default 0.
+ bias (bool, optional): Whether to use bias in residual blocks, by default True
+ use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
+ by default True
+ use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
+ nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
+ by default "leakyrelu"
+ nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
+ by default {"negative_slope": 0.2}
"""
def __init__(
@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
def forward(self, x):
"""
- Parameters
- ----------
- x : Tensor
- Shape (N, in_channels, num_samples), the input audio.
-
- Returns
- -------
- Tensor
- Shape (N, out_channels, num_samples), the predicted logits.
+ Args:
+            x(Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+ Returns:
+ Tensor: Shape (N, out_channels, num_samples), the predicted logits.
"""
x = self.first_conv(x)
skip = 0
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
index cc9e2066..42e8f743 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@@ -14,28 +14,9 @@
import paddle
from paddle import nn
+from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
-
-
-def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
- """
- encodings: (B, T, C)
- durations: (B, T)
- """
- batch_size, t_enc = paddle.shape(durations)
- slens = paddle.sum(durations, -1)
- t_dec = paddle.max(slens)
- M = paddle.zeros([batch_size, t_dec, t_enc])
- for i in range(batch_size):
- k = 0
- for j in range(t_enc):
- d = durations[i, j]
- # If the d == 0, slice action is meaningless and not supported
- if d >= 1:
- M[0, k:k + d, j] = 1
- k += d
- encodings = paddle.matmul(M, encodings)
- return encodings
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
class ResidualBlock(nn.Layer):
@@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer):
class SpeedySpeech(nn.Layer):
- def __init__(self,
- vocab_size,
- encoder_hidden_size,
- encoder_kernel_size,
- encoder_dilations,
- duration_predictor_hidden_size,
- decoder_hidden_size,
- decoder_output_size,
- decoder_kernel_size,
- decoder_dilations,
- tone_size=None,
- spk_num=None):
+ def __init__(
+ self,
+ vocab_size,
+ encoder_hidden_size,
+ encoder_kernel_size,
+ encoder_dilations,
+ duration_predictor_hidden_size,
+ decoder_hidden_size,
+ decoder_output_size,
+ decoder_kernel_size,
+ decoder_dilations,
+ tone_size=None,
+ spk_num=None,
+ init_type: str="xavier_uniform", ):
super().__init__()
+
+ # initialize parameters
+ initialize(self, init_type)
+
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
encoder_hidden_size, encoder_kernel_size,
encoder_dilations, spk_num)
@@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer):
self.encoder = encoder
self.duration_predictor = duration_predictor
self.decoder = decoder
+ # define length regulator
+ self.length_regulator = LengthRegulator()
+
+ nn.initializer.set_global_initializer(None)
def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
# input of embedding must be int64
@@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer):
# expand encodings
durations_to_expand = durations
- encodings = expand(encodings, durations_to_expand)
+ encodings = self.length_regulator(encodings, durations_to_expand)
# decode
# remove positional encoding here
@@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer):
durations_to_expand = durations_to_expand.astype(paddle.int64)
else:
durations_to_expand = durations
- encodings = expand(encodings, durations_to_expand)
+ encodings = self.length_regulator(
+ encodings, durations_to_expand, is_inference=True)
shape = paddle.shape(encodings)
t_dec, feature_size = shape[1], shape[2]
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
index ee45cdc8..e30a3fe1 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
@@ -12,11 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+from pathlib import Path
import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
+from paddle.io import DataLoader
from paddle.nn import functional as F
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
from paddlespeech.t2s.modules.losses import masked_l1_loss
from paddlespeech.t2s.modules.losses import ssim
@@ -33,11 +37,11 @@ logger.setLevel(logging.INFO)
class SpeedySpeechUpdater(StandardUpdater):
def __init__(self,
- model,
- optimizer,
- dataloader,
+ model: Layer,
+ optimizer: Optimizer,
+ dataloader: DataLoader,
init_state=None,
- output_dir=None):
+ output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
@@ -103,7 +107,10 @@ class SpeedySpeechUpdater(StandardUpdater):
class SpeedySpeechEvaluator(StandardEvaluator):
- def __init__(self, model, dataloader, output_dir=None):
+ def __init__(self,
+ model: Layer,
+ dataloader: DataLoader,
+ output_dir: Path=None):
super().__init__(model, dataloader)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
diff --git a/paddlespeech/t2s/models/tacotron2.py b/paddlespeech/t2s/models/tacotron2.py
deleted file mode 100644
index 01ea4f7d..00000000
--- a/paddlespeech/t2s/models/tacotron2.py
+++ /dev/null
@@ -1,1074 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import paddle
-from paddle import nn
-from paddle.fluid.layers import sequence_mask
-from paddle.nn import functional as F
-from paddle.nn import initializer as I
-from tqdm import trange
-
-from paddlespeech.t2s.modules.conv import Conv1dBatchNorm
-from paddlespeech.t2s.modules.losses import guided_attention_loss
-from paddlespeech.t2s.utils import checkpoint
-
-__all__ = ["Tacotron2", "Tacotron2Loss"]
-
-
-class LocationSensitiveAttention(nn.Layer):
- """Location Sensitive Attention module.
-
- Reference: `Attention-Based Models for Speech Recognition `_
-
- Parameters
- -----------
- d_query: int
- The feature size of query.
- d_key : int
- The feature size of key.
- d_attention : int
- The feature size of dimension.
- location_filters : int
- Filter size of attention convolution.
- location_kernel_size : int
- Kernel size of attention convolution.
- """
-
- def __init__(self,
- d_query: int,
- d_key: int,
- d_attention: int,
- location_filters: int,
- location_kernel_size: int):
- super().__init__()
-
- self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
- self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
- self.value = nn.Linear(d_attention, 1, bias_attr=False)
-
- # Location Layer
- self.location_conv = nn.Conv1D(
- 2,
- location_filters,
- kernel_size=location_kernel_size,
- padding=int((location_kernel_size - 1) / 2),
- bias_attr=False,
- data_format='NLC')
- self.location_layer = nn.Linear(
- location_filters, d_attention, bias_attr=False)
-
- def forward(self,
- query,
- processed_key,
- value,
- attention_weights_cat,
- mask=None):
- """Compute context vector and attention weights.
-
- Parameters
- -----------
- query : Tensor [shape=(batch_size, d_query)]
- The queries.
- processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
- The keys after linear layer.
- value : Tensor [shape=(batch_size, time_steps_k, d_key)]
- The values.
- attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
- Attention weights concat.
- mask : Tensor, optional
- The mask. Shape should be (batch_size, times_steps_k, 1).
- Defaults to None.
-
- Returns
- ----------
- attention_context : Tensor [shape=(batch_size, d_attention)]
- The context vector.
- attention_weights : Tensor [shape=(batch_size, time_steps_k)]
- The attention weights.
- """
-
- processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
- processed_attention_weights = self.location_layer(
- self.location_conv(attention_weights_cat))
- # (B, T_enc, 1)
- alignment = self.value(
- paddle.tanh(processed_attention_weights + processed_key +
- processed_query))
-
- if mask is not None:
- alignment = alignment + (1.0 - mask) * -1e9
-
- attention_weights = F.softmax(alignment, axis=1)
- attention_context = paddle.matmul(
- attention_weights, value, transpose_x=True)
-
- attention_weights = paddle.squeeze(attention_weights, axis=-1)
- attention_context = paddle.squeeze(attention_context, axis=1)
-
- return attention_context, attention_weights
-
-
-class DecoderPreNet(nn.Layer):
- """Decoder prenet module for Tacotron2.
-
- Parameters
- ----------
- d_input: int
- The input feature size.
-
- d_hidden: int
- The hidden size.
-
- d_output: int
- The output feature size.
-
- dropout_rate: float
- The droput probability.
-
- """
-
- def __init__(self,
- d_input: int,
- d_hidden: int,
- d_output: int,
- dropout_rate: float):
- super().__init__()
-
- self.dropout_rate = dropout_rate
- self.linear1 = nn.Linear(d_input, d_hidden, bias_attr=False)
- self.linear2 = nn.Linear(d_hidden, d_output, bias_attr=False)
-
- def forward(self, x):
- """Calculate forward propagation.
-
- Parameters
- ----------
- x: Tensor [shape=(B, T_mel, C)]
- Batch of the sequences of padded mel spectrogram.
-
- Returns
- -------
- output: Tensor [shape=(B, T_mel, C)]
- Batch of the sequences of padded hidden state.
-
- """
-
- x = F.dropout(F.relu(self.linear1(x)), self.dropout_rate, training=True)
- output = F.dropout(
- F.relu(self.linear2(x)), self.dropout_rate, training=True)
- return output
-
-
-class DecoderPostNet(nn.Layer):
- """Decoder postnet module for Tacotron2.
-
- Parameters
- ----------
- d_mels: int
- The number of mel bands.
-
- d_hidden: int
- The hidden size of postnet.
-
- kernel_size: int
- The kernel size of the conv layer in postnet.
-
- num_layers: int
- The number of conv layers in postnet.
-
- dropout: float
- The droput probability.
-
- """
-
- def __init__(self,
- d_mels: int,
- d_hidden: int,
- kernel_size: int,
- num_layers: int,
- dropout: float):
- super().__init__()
- self.dropout = dropout
- self.num_layers = num_layers
-
- padding = int((kernel_size - 1) / 2)
-
- self.conv_batchnorms = nn.LayerList()
- k = math.sqrt(1.0 / (d_mels * kernel_size))
- self.conv_batchnorms.append(
- Conv1dBatchNorm(
- d_mels,
- d_hidden,
- kernel_size=kernel_size,
- padding=padding,
- bias_attr=I.Uniform(-k, k),
- data_format='NLC'))
-
- k = math.sqrt(1.0 / (d_hidden * kernel_size))
- self.conv_batchnorms.extend([
- Conv1dBatchNorm(
- d_hidden,
- d_hidden,
- kernel_size=kernel_size,
- padding=padding,
- bias_attr=I.Uniform(-k, k),
- data_format='NLC') for i in range(1, num_layers - 1)
- ])
-
- self.conv_batchnorms.append(
- Conv1dBatchNorm(
- d_hidden,
- d_mels,
- kernel_size=kernel_size,
- padding=padding,
- bias_attr=I.Uniform(-k, k),
- data_format='NLC'))
-
- def forward(self, x):
- """Calculate forward propagation.
-
- Parameters
- ----------
- x: Tensor [shape=(B, T_mel, C)]
- Output sequence of features from decoder.
-
- Returns
- -------
- output: Tensor [shape=(B, T_mel, C)]
- Output sequence of features after postnet.
-
- """
-
- for i in range(len(self.conv_batchnorms) - 1):
- x = F.dropout(
- F.tanh(self.conv_batchnorms[i](x)),
- self.dropout,
- training=self.training)
- output = F.dropout(
- self.conv_batchnorms[self.num_layers - 1](x),
- self.dropout,
- training=self.training)
- return output
-
-
-class Tacotron2Encoder(nn.Layer):
- """Tacotron2 encoder module for Tacotron2.
-
- Parameters
- ----------
- d_hidden: int
- The hidden size in encoder module.
-
- conv_layers: int
- The number of conv layers.
-
- kernel_size: int
- The kernel size of conv layers.
-
- p_dropout: float
- The droput probability.
- """
-
- def __init__(self,
- d_hidden: int,
- conv_layers: int,
- kernel_size: int,
- p_dropout: float):
- super().__init__()
-
- k = math.sqrt(1.0 / (d_hidden * kernel_size))
- self.conv_batchnorms = nn.LayerList([
- Conv1dBatchNorm(
- d_hidden,
- d_hidden,
- kernel_size,
- stride=1,
- padding=int((kernel_size - 1) / 2),
- bias_attr=I.Uniform(-k, k),
- data_format='NLC') for i in range(conv_layers)
- ])
- self.p_dropout = p_dropout
-
- self.hidden_size = int(d_hidden / 2)
- self.lstm = nn.LSTM(
- d_hidden, self.hidden_size, direction="bidirectional")
-
- def forward(self, x, input_lens=None):
- """Calculate forward propagation of tacotron2 encoder.
-
- Parameters
- ----------
- x: Tensor [shape=(B, T, C)]
- Input embeddings.
-
- text_lens: Tensor [shape=(B,)], optional
- Batch of lengths of each text input batch. Defaults to None.
-
- Returns
- -------
- output : Tensor [shape=(B, T, C)]
- Batch of the sequences of padded hidden states.
-
- """
- for conv_batchnorm in self.conv_batchnorms:
- x = F.dropout(
- F.relu(conv_batchnorm(x)),
- self.p_dropout,
- training=self.training)
-
- output, _ = self.lstm(inputs=x, sequence_length=input_lens)
- return output
-
-
-class Tacotron2Decoder(nn.Layer):
- """Tacotron2 decoder module for Tacotron2.
-
- Parameters
- ----------
- d_mels: int
- The number of mel bands.
-
- reduction_factor: int
- The reduction factor of tacotron.
-
- d_encoder: int
- The hidden size of encoder.
-
- d_prenet: int
- The hidden size in decoder prenet.
-
- d_attention_rnn: int
- The attention rnn layer hidden size.
-
- d_decoder_rnn: int
- The decoder rnn layer hidden size.
-
- d_attention: int
- The hidden size of the linear layer in location sensitive attention.
-
- attention_filters: int
- The filter size of the conv layer in location sensitive attention.
-
- attention_kernel_size: int
- The kernel size of the conv layer in location sensitive attention.
-
- p_prenet_dropout: float
- The droput probability in decoder prenet.
-
- p_attention_dropout: float
- The droput probability in location sensitive attention.
-
- p_decoder_dropout: float
- The droput probability in decoder.
-
- use_stop_token: bool
- Whether to use a binary classifier for stop token prediction.
- Defaults to False
- """
-
- def __init__(self,
- d_mels: int,
- reduction_factor: int,
- d_encoder: int,
- d_prenet: int,
- d_attention_rnn: int,
- d_decoder_rnn: int,
- d_attention: int,
- attention_filters: int,
- attention_kernel_size: int,
- p_prenet_dropout: float,
- p_attention_dropout: float,
- p_decoder_dropout: float,
- use_stop_token: bool=False):
- super().__init__()
- self.d_mels = d_mels
- self.reduction_factor = reduction_factor
- self.d_encoder = d_encoder
- self.d_attention_rnn = d_attention_rnn
- self.d_decoder_rnn = d_decoder_rnn
- self.p_attention_dropout = p_attention_dropout
- self.p_decoder_dropout = p_decoder_dropout
-
- self.prenet = DecoderPreNet(
- d_mels * reduction_factor,
- d_prenet,
- d_prenet,
- dropout_rate=p_prenet_dropout)
-
- # attention_rnn takes attention's context vector has an
- # auxiliary input
- self.attention_rnn = nn.LSTMCell(d_prenet + d_encoder, d_attention_rnn)
-
- self.attention_layer = LocationSensitiveAttention(
- d_attention_rnn, d_encoder, d_attention, attention_filters,
- attention_kernel_size)
-
- # decoder_rnn takes prenet's output and attention_rnn's input
- # as input
- self.decoder_rnn = nn.LSTMCell(d_attention_rnn + d_encoder,
- d_decoder_rnn)
- self.linear_projection = nn.Linear(d_decoder_rnn + d_encoder,
- d_mels * reduction_factor)
-
- self.use_stop_token = use_stop_token
- if use_stop_token:
- self.stop_layer = nn.Linear(d_decoder_rnn + d_encoder, 1)
-
- # states - temporary attributes
- self.attention_hidden = None
- self.attention_cell = None
-
- self.decoder_hidden = None
- self.decoder_cell = None
-
- self.attention_weights = None
- self.attention_weights_cum = None
- self.attention_context = None
-
- self.key = None
- self.mask = None
- self.processed_key = None
-
- def _initialize_decoder_states(self, key):
- """init states be used in decoder
- """
- batch_size, encoder_steps, _ = key.shape
-
- self.attention_hidden = paddle.zeros(
- shape=[batch_size, self.d_attention_rnn], dtype=key.dtype)
- self.attention_cell = paddle.zeros(
- shape=[batch_size, self.d_attention_rnn], dtype=key.dtype)
-
- self.decoder_hidden = paddle.zeros(
- shape=[batch_size, self.d_decoder_rnn], dtype=key.dtype)
- self.decoder_cell = paddle.zeros(
- shape=[batch_size, self.d_decoder_rnn], dtype=key.dtype)
-
- self.attention_weights = paddle.zeros(
- shape=[batch_size, encoder_steps], dtype=key.dtype)
- self.attention_weights_cum = paddle.zeros(
- shape=[batch_size, encoder_steps], dtype=key.dtype)
- self.attention_context = paddle.zeros(
- shape=[batch_size, self.d_encoder], dtype=key.dtype)
-
- self.key = key # [B, T, C]
- # pre-compute projected keys to improve efficiency
- self.processed_key = self.attention_layer.key_layer(key) # [B, T, C]
-
- def _decode(self, query):
- """decode one time step
- """
- cell_input = paddle.concat([query, self.attention_context], axis=-1)
-
- # The first lstm layer (or spec encoder lstm)
- _, (self.attention_hidden, self.attention_cell) = self.attention_rnn(
- cell_input, (self.attention_hidden, self.attention_cell))
- self.attention_hidden = F.dropout(
- self.attention_hidden,
- self.p_attention_dropout,
- training=self.training)
-
- # Loaction sensitive attention
- attention_weights_cat = paddle.stack(
- [self.attention_weights, self.attention_weights_cum], axis=-1)
- self.attention_context, self.attention_weights = self.attention_layer(
- self.attention_hidden, self.processed_key, self.key,
- attention_weights_cat, self.mask)
- self.attention_weights_cum += self.attention_weights
-
- # The second lstm layer (or spec decoder lstm)
- decoder_input = paddle.concat(
- [self.attention_hidden, self.attention_context], axis=-1)
- _, (self.decoder_hidden, self.decoder_cell) = self.decoder_rnn(
- decoder_input, (self.decoder_hidden, self.decoder_cell))
- self.decoder_hidden = F.dropout(
- self.decoder_hidden,
- p=self.p_decoder_dropout,
- training=self.training)
-
- # decode output one step
- decoder_hidden_attention_context = paddle.concat(
- [self.decoder_hidden, self.attention_context], axis=-1)
- decoder_output = self.linear_projection(
- decoder_hidden_attention_context)
- if self.use_stop_token:
- stop_logit = self.stop_layer(decoder_hidden_attention_context)
- return decoder_output, self.attention_weights, stop_logit
- return decoder_output, self.attention_weights
-
- def forward(self, keys, querys, mask):
- """Calculate forward propagation of tacotron2 decoder.
-
- Parameters
- ----------
- keys: Tensor[shape=(B, T_key, C)]
- Batch of the sequences of padded output from encoder.
-
- querys: Tensor[shape(B, T_query, C)]
- Batch of the sequences of padded mel spectrogram.
-
- mask: Tensor
- Mask generated with text length. Shape should be (B, T_key, 1).
-
- Returns
- -------
- mel_output: Tensor [shape=(B, T_query, C)]
- Output sequence of features.
-
- alignments: Tensor [shape=(B, T_query, T_key)]
- Attention weights.
- """
- self._initialize_decoder_states(keys)
- self.mask = mask
-
- querys = paddle.reshape(
- querys,
- [querys.shape[0], querys.shape[1] // self.reduction_factor, -1])
- start_step = paddle.zeros(
- shape=[querys.shape[0], 1, querys.shape[-1]], dtype=querys.dtype)
- querys = paddle.concat([start_step, querys], axis=1)
-
- querys = self.prenet(querys)
-
- mel_outputs, alignments = [], []
- stop_logits = []
- # Ignore the last time step
- while len(mel_outputs) < querys.shape[1] - 1:
- query = querys[:, len(mel_outputs), :]
- if self.use_stop_token:
- mel_output, attention_weights, stop_logit = self._decode(query)
- else:
- mel_output, attention_weights = self._decode(query)
- mel_outputs.append(mel_output)
- alignments.append(attention_weights)
- if self.use_stop_token:
- stop_logits.append(stop_logit)
-
- alignments = paddle.stack(alignments, axis=1)
- mel_outputs = paddle.stack(mel_outputs, axis=1)
- if self.use_stop_token:
- stop_logits = paddle.concat(stop_logits, axis=1)
- return mel_outputs, alignments, stop_logits
- return mel_outputs, alignments
-
- def infer(self, key, max_decoder_steps=1000):
- """Calculate forward propagation of tacotron2 decoder.
-
- Parameters
- ----------
- keys: Tensor [shape=(B, T_key, C)]
- Batch of the sequences of padded output from encoder.
-
- max_decoder_steps: int, optional
- Number of max step when synthesize. Defaults to 1000.
-
- Returns
- -------
- mel_output: Tensor [shape=(B, T_mel, C)]
- Output sequence of features.
-
- alignments: Tensor [shape=(B, T_mel, T_key)]
- Attention weights.
-
- """
- self._initialize_decoder_states(key)
- self.mask = None # mask is not needed for single instance inference
- encoder_steps = key.shape[1]
-
- # [B, C]
- start_step = paddle.zeros(
- shape=[key.shape[0], self.d_mels * self.reduction_factor],
- dtype=key.dtype)
- query = start_step # [B, C]
- first_hit_end = None
-
- mel_outputs, alignments = [], []
- stop_logits = []
- for i in trange(max_decoder_steps):
- query = self.prenet(query)
- if self.use_stop_token:
- mel_output, alignment, stop_logit = self._decode(query)
- else:
- mel_output, alignment = self._decode(query)
-
- mel_outputs.append(mel_output)
- alignments.append(alignment) # (B=1, T)
- if self.use_stop_token:
- stop_logits.append(stop_logit)
-
- if self.use_stop_token:
- if F.sigmoid(stop_logit) > 0.5:
- print("hit stop condition!")
- break
- else:
- if int(paddle.argmax(alignment[0])) == encoder_steps - 1:
- if first_hit_end is None:
- first_hit_end = i
- elif i > (first_hit_end + 20):
- print("content exhausted!")
- break
- if len(mel_outputs) == max_decoder_steps:
- print("Warning! Reached max decoder steps!!!")
- break
-
- query = mel_output
-
- alignments = paddle.stack(alignments, axis=1)
- mel_outputs = paddle.stack(mel_outputs, axis=1)
- if self.use_stop_token:
- stop_logits = paddle.concat(stop_logits, axis=1)
- return mel_outputs, alignments, stop_logits
- return mel_outputs, alignments
-
-
-class Tacotron2(nn.Layer):
- """Tacotron2 model for end-to-end text-to-speech (E2E-TTS).
-
- This is a model of Spectrogram prediction network in Tacotron2 described
- in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram
- Predictions `_,
- which converts the sequence of characters
- into the sequence of mel spectrogram.
-
- Parameters
- ----------
- vocab_size : int
- Vocabulary size of phons of the model.
-
- n_tones: int
- Vocabulary size of tones of the model. Defaults to None. If provided,
- the model has an extra tone embedding.
-
- d_mels: int
- Number of mel bands.
-
- d_encoder: int
- Hidden size in encoder module.
-
- encoder_conv_layers: int
- Number of conv layers in encoder.
-
- encoder_kernel_size: int
- Kernel size of conv layers in encoder.
-
- d_prenet: int
- Hidden size in decoder prenet.
-
- d_attention_rnn: int
- Attention rnn layer hidden size in decoder.
-
- d_decoder_rnn: int
- Decoder rnn layer hidden size in decoder.
-
- attention_filters: int
- Filter size of the conv layer in location sensitive attention.
-
- attention_kernel_size: int
- Kernel size of the conv layer in location sensitive attention.
-
- d_attention: int
- Hidden size of the linear layer in location sensitive attention.
-
- d_postnet: int
- Hidden size of postnet.
-
- postnet_kernel_size: int
- Kernel size of the conv layer in postnet.
-
- postnet_conv_layers: int
- Number of conv layers in postnet.
-
- reduction_factor: int
- Reduction factor of tacotron2.
-
- p_encoder_dropout: float
- Droput probability in encoder.
-
- p_prenet_dropout: float
- Droput probability in decoder prenet.
-
- p_attention_dropout: float
- Droput probability in location sensitive attention.
-
- p_decoder_dropout: float
- Droput probability in decoder.
-
- p_postnet_dropout: float
- Droput probability in postnet.
-
- d_global_condition: int
- Feature size of global condition. Defaults to None. If provided, The
- model assumes a global condition that is concatenated to the encoder
- outputs.
-
- """
-
- def __init__(self,
- vocab_size,
- n_tones=None,
- d_mels: int=80,
- d_encoder: int=512,
- encoder_conv_layers: int=3,
- encoder_kernel_size: int=5,
- d_prenet: int=256,
- d_attention_rnn: int=1024,
- d_decoder_rnn: int=1024,
- attention_filters: int=32,
- attention_kernel_size: int=31,
- d_attention: int=128,
- d_postnet: int=512,
- postnet_kernel_size: int=5,
- postnet_conv_layers: int=5,
- reduction_factor: int=1,
- p_encoder_dropout: float=0.5,
- p_prenet_dropout: float=0.5,
- p_attention_dropout: float=0.1,
- p_decoder_dropout: float=0.1,
- p_postnet_dropout: float=0.5,
- d_global_condition=None,
- use_stop_token=False):
- super().__init__()
-
- std = math.sqrt(2.0 / (vocab_size + d_encoder))
- val = math.sqrt(3.0) * std # uniform bounds for std
- self.embedding = nn.Embedding(
- vocab_size, d_encoder, weight_attr=I.Uniform(-val, val))
- if n_tones:
- self.embedding_tones = nn.Embedding(
- n_tones,
- d_encoder,
- padding_idx=0,
- weight_attr=I.Uniform(-0.1 * val, 0.1 * val))
- self.toned = n_tones is not None
-
- self.encoder = Tacotron2Encoder(d_encoder, encoder_conv_layers,
- encoder_kernel_size, p_encoder_dropout)
-
- # input augmentation scheme: concat global condition to the encoder output
- if d_global_condition is not None:
- d_encoder += d_global_condition
- self.decoder = Tacotron2Decoder(
- d_mels,
- reduction_factor,
- d_encoder,
- d_prenet,
- d_attention_rnn,
- d_decoder_rnn,
- d_attention,
- attention_filters,
- attention_kernel_size,
- p_prenet_dropout,
- p_attention_dropout,
- p_decoder_dropout,
- use_stop_token=use_stop_token)
- self.postnet = DecoderPostNet(
- d_mels=d_mels * reduction_factor,
- d_hidden=d_postnet,
- kernel_size=postnet_kernel_size,
- num_layers=postnet_conv_layers,
- dropout=p_postnet_dropout)
-
- def forward(self,
- text_inputs,
- text_lens,
- mels,
- output_lens=None,
- tones=None,
- global_condition=None):
- """Calculate forward propagation of tacotron2.
-
- Parameters
- ----------
- text_inputs: Tensor [shape=(B, T_text)]
- Batch of the sequencees of padded character ids.
-
- text_lens: Tensor [shape=(B,)]
- Batch of lengths of each text input batch.
-
- mels: Tensor [shape(B, T_mel, C)]
- Batch of the sequences of padded mel spectrogram.
-
- output_lens: Tensor [shape=(B,)], optional
- Batch of lengths of each mels batch. Defaults to None.
-
- tones: Tensor [shape=(B, T_text)]
- Batch of sequences of padded tone ids.
-
- global_condition: Tensor [shape(B, C)]
- Batch of global conditions. Defaults to None. If the
- `d_global_condition` of the model is not None, this input should be
- provided.
-
- use_stop_token: bool
- Whether to include a binary classifier to predict the stop token.
- Defaults to False.
-
- Returns
- -------
- outputs : Dict[str, Tensor]
-
- mel_output: output sequence of features (B, T_mel, C);
-
- mel_outputs_postnet: output sequence of features after postnet (B, T_mel, C);
-
- alignments: attention weights (B, T_mel, T_text);
-
- stop_logits: output sequence of stop logits (B, T_mel)
- """
- # input of embedding must be int64
- text_inputs = paddle.cast(text_inputs, 'int64')
- embedded_inputs = self.embedding(text_inputs)
- if self.toned:
- embedded_inputs += self.embedding_tones(tones)
-
- encoder_outputs = self.encoder(embedded_inputs, text_lens)
-
- if global_condition is not None:
- global_condition = global_condition.unsqueeze(1)
- global_condition = paddle.expand(global_condition,
- [-1, encoder_outputs.shape[1], -1])
- encoder_outputs = paddle.concat([encoder_outputs, global_condition],
- -1)
-
- # [B, T_enc, 1]
- mask = sequence_mask(
- text_lens, dtype=encoder_outputs.dtype).unsqueeze(-1)
- if self.decoder.use_stop_token:
- mel_outputs, alignments, stop_logits = self.decoder(
- encoder_outputs, mels, mask=mask)
- else:
- mel_outputs, alignments = self.decoder(
- encoder_outputs, mels, mask=mask)
- mel_outputs_postnet = self.postnet(mel_outputs)
- mel_outputs_postnet = mel_outputs + mel_outputs_postnet
-
- if output_lens is not None:
- # [B, T_dec, 1]
- mask = sequence_mask(output_lens).unsqueeze(-1)
- mel_outputs = mel_outputs * mask # [B, T, C]
- mel_outputs_postnet = mel_outputs_postnet * mask # [B, T, C]
- outputs = {
- "mel_output": mel_outputs,
- "mel_outputs_postnet": mel_outputs_postnet,
- "alignments": alignments
- }
- if self.decoder.use_stop_token:
- outputs["stop_logits"] = stop_logits
-
- return outputs
-
- @paddle.no_grad()
- def infer(self,
- text_inputs,
- max_decoder_steps=1000,
- tones=None,
- global_condition=None):
- """Generate the mel sepctrogram of features given the sequences of character ids.
-
- Parameters
- ----------
- text_inputs: Tensor [shape=(B, T_text)]
- Batch of the sequencees of padded character ids.
-
- max_decoder_steps: int, optional
- Number of max step when synthesize. Defaults to 1000.
-
- Returns
- -------
- outputs : Dict[str, Tensor]
-
- mel_output: output sequence of sepctrogram (B, T_mel, C);
-
- mel_outputs_postnet: output sequence of sepctrogram after postnet (B, T_mel, C);
-
- stop_logits: output sequence of stop logits (B, T_mel);
-
- alignments: attention weights (B, T_mel, T_text). This key is only
- present when `use_stop_token` is True.
- """
- # input of embedding must be int64
- text_inputs = paddle.cast(text_inputs, 'int64')
- embedded_inputs = self.embedding(text_inputs)
- if self.toned:
- embedded_inputs += self.embedding_tones(tones)
- encoder_outputs = self.encoder(embedded_inputs)
-
- if global_condition is not None:
- global_condition = global_condition.unsqueeze(1)
- global_condition = paddle.expand(global_condition,
- [-1, encoder_outputs.shape[1], -1])
- encoder_outputs = paddle.concat([encoder_outputs, global_condition],
- -1)
- if self.decoder.use_stop_token:
- mel_outputs, alignments, stop_logits = self.decoder.infer(
- encoder_outputs, max_decoder_steps=max_decoder_steps)
- else:
- mel_outputs, alignments = self.decoder.infer(
- encoder_outputs, max_decoder_steps=max_decoder_steps)
-
- mel_outputs_postnet = self.postnet(mel_outputs)
- mel_outputs_postnet = mel_outputs + mel_outputs_postnet
-
- outputs = {
- "mel_output": mel_outputs,
- "mel_outputs_postnet": mel_outputs_postnet,
- "alignments": alignments
- }
- if self.decoder.use_stop_token:
- outputs["stop_logits"] = stop_logits
-
- return outputs
-
- @classmethod
- def from_pretrained(cls, config, checkpoint_path):
- """Build a Tacotron2 model from a pretrained model.
-
- Parameters
- ----------
- config: yacs.config.CfgNode
- model configs
-
- checkpoint_path: Path or str
- the path of pretrained model checkpoint, without extension name
-
- Returns
- -------
- ConditionalWaveFlow
- The model built from pretrained result.
- """
- model = cls(vocab_size=config.model.vocab_size,
- n_tones=config.model.n_tones,
- d_mels=config.data.n_mels,
- d_encoder=config.model.d_encoder,
- encoder_conv_layers=config.model.encoder_conv_layers,
- encoder_kernel_size=config.model.encoder_kernel_size,
- d_prenet=config.model.d_prenet,
- d_attention_rnn=config.model.d_attention_rnn,
- d_decoder_rnn=config.model.d_decoder_rnn,
- attention_filters=config.model.attention_filters,
- attention_kernel_size=config.model.attention_kernel_size,
- d_attention=config.model.d_attention,
- d_postnet=config.model.d_postnet,
- postnet_kernel_size=config.model.postnet_kernel_size,
- postnet_conv_layers=config.model.postnet_conv_layers,
- reduction_factor=config.model.reduction_factor,
- p_encoder_dropout=config.model.p_encoder_dropout,
- p_prenet_dropout=config.model.p_prenet_dropout,
- p_attention_dropout=config.model.p_attention_dropout,
- p_decoder_dropout=config.model.p_decoder_dropout,
- p_postnet_dropout=config.model.p_postnet_dropout,
- d_global_condition=config.model.d_global_condition,
- use_stop_token=config.model.use_stop_token)
- checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
- return model
-
-
-class Tacotron2Loss(nn.Layer):
- """ Tacotron2 Loss module
- """
-
- def __init__(self,
- use_stop_token_loss=True,
- use_guided_attention_loss=False,
- sigma=0.2):
- """Tacotron 2 Criterion.
-
- Args:
- use_stop_token_loss (bool, optional): Whether to use a loss for stop token prediction. Defaults to True.
- use_guided_attention_loss (bool, optional): Whether to use a loss for attention weights. Defaults to False.
- sigma (float, optional): Hyper-parameter sigma for guided attention loss. Defaults to 0.2.
- """
- super().__init__()
- self.spec_criterion = nn.MSELoss()
- self.use_stop_token_loss = use_stop_token_loss
- self.use_guided_attention_loss = use_guided_attention_loss
- self.attn_criterion = guided_attention_loss
- self.stop_criterion = nn.BCEWithLogitsLoss()
- self.sigma = sigma
-
- def forward(self,
- mel_outputs,
- mel_outputs_postnet,
- mel_targets,
- attention_weights=None,
- slens=None,
- plens=None,
- stop_logits=None):
- """Calculate tacotron2 loss.
-
- Parameters
- ----------
- mel_outputs: Tensor [shape=(B, T_mel, C)]
- Output mel spectrogram sequence.
-
- mel_outputs_postnet: Tensor [shape(B, T_mel, C)]
- Output mel spectrogram sequence after postnet.
-
- mel_targets: Tensor [shape=(B, T_mel, C)]
- Target mel spectrogram sequence.
-
- attention_weights: Tensor [shape=(B, T_mel, T_enc)]
- Attention weights. This should be provided when
- `use_guided_attention_loss` is True.
-
- slens: Tensor [shape=(B,)]
- Number of frames of mel spectrograms. This should be provided when
- `use_guided_attention_loss` is True.
-
- plens: Tensor [shape=(B, )]
- Number of text or phone ids of each utterance. This should be
- provided when `use_guided_attention_loss` is True.
-
- stop_logits: Tensor [shape=(B, T_mel)]
- Stop logits of each mel spectrogram frame. This should be provided
- when `use_stop_token_loss` is True.
-
- Returns
- -------
- losses : Dict[str, Tensor]
-
- loss: the sum of the other three losses;
-
- mel_loss: MSE loss compute by mel_targets and mel_outputs;
-
- post_mel_loss: MSE loss compute by mel_targets and mel_outputs_postnet;
-
- guided_attn_loss: Guided attention loss for attention weights;
-
- stop_loss: Binary cross entropy loss for stop token prediction.
- """
- mel_loss = self.spec_criterion(mel_outputs, mel_targets)
- post_mel_loss = self.spec_criterion(mel_outputs_postnet, mel_targets)
- total_loss = mel_loss + post_mel_loss
- if self.use_guided_attention_loss:
- gal_loss = self.attn_criterion(attention_weights, slens, plens,
- self.sigma)
- total_loss += gal_loss
- if self.use_stop_token_loss:
- T_dec = mel_targets.shape[1]
- stop_labels = F.one_hot(slens - 1, num_classes=T_dec)
- stop_token_loss = self.stop_criterion(stop_logits, stop_labels)
- total_loss += stop_token_loss
-
- losses = {
- "loss": total_loss,
- "mel_loss": mel_loss,
- "post_mel_loss": post_mel_loss
- }
- if self.use_guided_attention_loss:
- losses["guided_attn_loss"] = gal_loss
- if self.use_stop_token_loss:
- losses["stop_loss"] = stop_token_loss
- return losses
diff --git a/paddlespeech/t2s/data/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py
similarity index 77%
rename from paddlespeech/t2s/data/__init__.py
rename to paddlespeech/t2s/models/tacotron2/__init__.py
index c605205d..ea63257c 100644
--- a/paddlespeech/t2s/data/__init__.py
+++ b/paddlespeech/t2s/models/tacotron2/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,7 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""t2s's infrastructure for data processing.
-"""
-from .batch import *
-from .dataset import *
+from .tacotron2 import *
+from .tacotron2_updater import *
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py
new file mode 100644
index 00000000..abb691b4
--- /dev/null
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@@ -0,0 +1,440 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tacotron 2 related modules for paddle"""
+import logging
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from typeguard import check_argument_types
+
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+from paddlespeech.t2s.modules.tacotron2.attentions import AttForward
+from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
+from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
+from paddlespeech.t2s.modules.tacotron2.decoder import Decoder
+from paddlespeech.t2s.modules.tacotron2.encoder import Encoder
+
+
+class Tacotron2(nn.Layer):
+ """Tacotron2 module for end-to-end text-to-speech.
+
+ This is a module of Spectrogram prediction network in Tacotron2 described
+ in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
+ which converts the sequence of characters into the sequence of Mel-filterbanks.
+
+ .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
+ https://arxiv.org/abs/1712.05884
+
+ """
+
+ def __init__(
+ self,
+ # network structure related
+ idim: int,
+ odim: int,
+ embed_dim: int=512,
+ elayers: int=1,
+ eunits: int=512,
+ econv_layers: int=3,
+ econv_chans: int=512,
+ econv_filts: int=5,
+ atype: str="location",
+ adim: int=512,
+ aconv_chans: int=32,
+ aconv_filts: int=15,
+ cumulate_att_w: bool=True,
+ dlayers: int=2,
+ dunits: int=1024,
+ prenet_layers: int=2,
+ prenet_units: int=256,
+ postnet_layers: int=5,
+ postnet_chans: int=512,
+ postnet_filts: int=5,
+ output_activation: str=None,
+ use_batch_norm: bool=True,
+ use_concate: bool=True,
+ use_residual: bool=False,
+ reduction_factor: int=1,
+ # extra embedding related
+ spk_num: Optional[int]=None,
+ lang_num: Optional[int]=None,
+ spk_embed_dim: Optional[int]=None,
+ spk_embed_integration_type: str="concat",
+ dropout_rate: float=0.5,
+ zoneout_rate: float=0.1,
+ # training related
+ init_type: str="xavier_uniform", ):
+ """Initialize Tacotron2 module.
+ Args:
+ idim (int): Dimension of the inputs.
+ odim (int): Dimension of the outputs.
+ embed_dim (int): Dimension of the token embedding.
+ elayers (int): Number of encoder blstm layers.
+ eunits (int): Number of encoder blstm units.
+ econv_layers (int): Number of encoder conv layers.
+ econv_filts (int): Number of encoder conv filter size.
+ econv_chans (int): Number of encoder conv filter channels.
+ dlayers (int): Number of decoder lstm layers.
+ dunits (int): Number of decoder lstm units.
+ prenet_layers (int): Number of prenet layers.
+ prenet_units (int): Number of prenet units.
+ postnet_layers (int): Number of postnet layers.
+ postnet_filts (int): Number of postnet filter size.
+ postnet_chans (int): Number of postnet filter channels.
+ output_activation (str): Name of activation function for outputs.
+ adim (int): Number of dimension of mlp in attention.
+ aconv_chans (int): Number of attention conv filter channels.
+ aconv_filts (int): Number of attention conv filter size.
+ cumulate_att_w (bool): Whether to cumulate previous attention weight.
+ use_batch_norm (bool): Whether to use batch normalization.
+ use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
+ reduction_factor (int): Reduction factor.
+ spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
+ sids will be provided as the input and use sid embedding layer.
+ lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
+                lids will be provided as the input and use lid embedding layer.
+ spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
+ assume that spk_emb will be provided as the input.
+ spk_embed_integration_type (str): How to integrate speaker embedding.
+ dropout_rate (float): Dropout rate.
+ zoneout_rate (float): Zoneout rate.
+ """
+ assert check_argument_types()
+ super().__init__()
+
+ # store hyperparameters
+ self.idim = idim
+ self.odim = odim
+ self.eos = idim - 1
+ self.cumulate_att_w = cumulate_att_w
+ self.reduction_factor = reduction_factor
+
+ # define activation function for the final output
+ if output_activation is None:
+ self.output_activation_fn = None
+ elif hasattr(F, output_activation):
+ self.output_activation_fn = getattr(F, output_activation)
+ else:
+ raise ValueError(f"there is no such an activation function. "
+ f"({output_activation})")
+
+ # set padding idx
+ padding_idx = 0
+ self.padding_idx = padding_idx
+
+ # initialize parameters
+ initialize(self, init_type)
+
+ # define network modules
+ self.enc = Encoder(
+ idim=idim,
+ embed_dim=embed_dim,
+ elayers=elayers,
+ eunits=eunits,
+ econv_layers=econv_layers,
+ econv_chans=econv_chans,
+ econv_filts=econv_filts,
+ use_batch_norm=use_batch_norm,
+ use_residual=use_residual,
+ dropout_rate=dropout_rate,
+ padding_idx=padding_idx, )
+
+ self.spk_num = None
+ if spk_num is not None and spk_num > 1:
+ self.spk_num = spk_num
+ self.sid_emb = nn.Embedding(spk_num, eunits)
+ self.lang_num = None
+ if lang_num is not None and lang_num > 1:
+ self.lang_num = lang_num
+ self.lid_emb = nn.Embedding(lang_num, eunits)
+
+ self.spk_embed_dim = None
+ if spk_embed_dim is not None and spk_embed_dim > 0:
+ self.spk_embed_dim = spk_embed_dim
+ self.spk_embed_integration_type = spk_embed_integration_type
+ if self.spk_embed_dim is None:
+ dec_idim = eunits
+ elif self.spk_embed_integration_type == "concat":
+ dec_idim = eunits + spk_embed_dim
+ elif self.spk_embed_integration_type == "add":
+ dec_idim = eunits
+ self.projection = nn.Linear(self.spk_embed_dim, eunits)
+ else:
+ raise ValueError(f"{spk_embed_integration_type} is not supported.")
+
+ if atype == "location":
+ att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
+ elif atype == "forward":
+ att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
+ if self.cumulate_att_w:
+ logging.warning("cumulation of attention weights is disabled "
+ "in forward attention.")
+ self.cumulate_att_w = False
+ elif atype == "forward_ta":
+ att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts,
+ odim)
+ if self.cumulate_att_w:
+ logging.warning("cumulation of attention weights is disabled "
+ "in forward attention.")
+ self.cumulate_att_w = False
+ else:
+ raise NotImplementedError("Support only location or forward")
+ self.dec = Decoder(
+ idim=dec_idim,
+ odim=odim,
+ att=att,
+ dlayers=dlayers,
+ dunits=dunits,
+ prenet_layers=prenet_layers,
+ prenet_units=prenet_units,
+ postnet_layers=postnet_layers,
+ postnet_chans=postnet_chans,
+ postnet_filts=postnet_filts,
+ output_activation_fn=self.output_activation_fn,
+ cumulate_att_w=self.cumulate_att_w,
+ use_batch_norm=use_batch_norm,
+ use_concate=use_concate,
+ dropout_rate=dropout_rate,
+ zoneout_rate=zoneout_rate,
+ reduction_factor=reduction_factor, )
+
+ nn.initializer.set_global_initializer(None)
+
+ def forward(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ speech: paddle.Tensor,
+ speech_lengths: paddle.Tensor,
+ spk_emb: Optional[paddle.Tensor]=None,
+ spk_id: Optional[paddle.Tensor]=None,
+ lang_id: Optional[paddle.Tensor]=None
+ ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
+ """Calculate forward propagation.
+
+ Args:
+ text (Tensor(int64)): Batch of padded character ids (B, T_text).
+ text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
+ speech (Tensor): Batch of padded target features (B, T_feats, odim).
+ speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
+ spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
+ spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
+ lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
+
+ Returns:
+ Tensor: Loss scalar value.
+ Dict: Statistics to be monitored.
+ Tensor: Weight value if not joint training else model outputs.
+
+ """
+ text = text[:, :text_lengths.max()]
+ speech = speech[:, :speech_lengths.max()]
+
+ batch_size = paddle.shape(text)[0]
+
+ # Add eos at the last of sequence
+ xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
+ for i, l in enumerate(text_lengths):
+ xs[i, l] = self.eos
+ ilens = text_lengths + 1
+
+ ys = speech
+ olens = speech_lengths
+
+ # make labels for stop prediction
+ stop_labels = make_pad_mask(olens - 1)
+        # bool tensors cannot be sliced in paddle, so cast to float32 first
+ stop_labels = paddle.cast(stop_labels, dtype='float32')
+ stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
+
+ # calculate tacotron2 outputs
+ after_outs, before_outs, logits, att_ws = self._forward(
+ xs=xs,
+ ilens=ilens,
+ ys=ys,
+ olens=olens,
+ spk_emb=spk_emb,
+ spk_id=spk_id,
+ lang_id=lang_id, )
+
+ # modify mod part of groundtruth
+ if self.reduction_factor > 1:
+ assert olens.ge(self.reduction_factor).all(
+ ), "Output length must be greater than or equal to reduction factor."
+ olens = olens - olens % self.reduction_factor
+ max_out = max(olens)
+ ys = ys[:, :max_out]
+ stop_labels = stop_labels[:, :max_out]
+ stop_labels = paddle.scatter(stop_labels, 1,
+ (olens - 1).unsqueeze(1), 1.0)
+ olens_in = olens // self.reduction_factor
+ else:
+ olens_in = olens
+ return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
+
+ def _forward(
+ self,
+ xs: paddle.Tensor,
+ ilens: paddle.Tensor,
+ ys: paddle.Tensor,
+ olens: paddle.Tensor,
+ spk_emb: paddle.Tensor,
+ spk_id: paddle.Tensor,
+ lang_id: paddle.Tensor,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+
+ hs, hlens = self.enc(xs, ilens)
+ if self.spk_num is not None:
+ sid_embs = self.sid_emb(spk_id.reshape([-1]))
+ hs = hs + sid_embs.unsqueeze(1)
+ if self.lang_num is not None:
+ lid_embs = self.lid_emb(lang_id.reshape([-1]))
+ hs = hs + lid_embs.unsqueeze(1)
+ if self.spk_embed_dim is not None:
+ hs = self._integrate_with_spk_embed(hs, spk_emb)
+
+ return self.dec(hs, hlens, ys)
+
+ def inference(
+ self,
+ text: paddle.Tensor,
+ speech: Optional[paddle.Tensor]=None,
+ spk_emb: Optional[paddle.Tensor]=None,
+ spk_id: Optional[paddle.Tensor]=None,
+ lang_id: Optional[paddle.Tensor]=None,
+ threshold: float=0.5,
+ minlenratio: float=0.0,
+ maxlenratio: float=10.0,
+ use_att_constraint: bool=False,
+ backward_window: int=1,
+ forward_window: int=3,
+ use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
+ """Generate the sequence of features given the sequences of characters.
+
+ Args:
+ text (Tensor(int64)): Input sequence of characters (T_text,).
+ speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
+            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
+ spk_id (Optional[Tensor]): Speaker ID (1,).
+ lang_id (Optional[Tensor]): Language ID (1,).
+ threshold (float): Threshold in inference.
+ minlenratio (float): Minimum length ratio in inference.
+ maxlenratio (float): Maximum length ratio in inference.
+ use_att_constraint (bool): Whether to apply attention constraint.
+ backward_window (int): Backward window in attention constraint.
+ forward_window (int): Forward window in attention constraint.
+ use_teacher_forcing (bool): Whether to use teacher forcing.
+
+ Returns:
+ Dict[str, Tensor]
+ Output dict including the following items:
+ * feat_gen (Tensor): Output sequence of features (T_feats, odim).
+ * prob (Tensor): Output sequence of stop probabilities (T_feats,).
+ * att_w (Tensor): Attention weights (T_feats, T).
+
+ """
+ x = text
+ y = speech
+
+ # add eos at the last of sequence
+ x = F.pad(x, [0, 1], "constant", self.eos)
+
+ # inference with teacher forcing
+ if use_teacher_forcing:
+ assert speech is not None, "speech must be provided with teacher forcing."
+
+ xs, ys = x.unsqueeze(0), y.unsqueeze(0)
+ spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
+ ilens = paddle.shape(xs)[1]
+ olens = paddle.shape(ys)[1]
+ outs, _, _, att_ws = self._forward(
+ xs=xs,
+ ilens=ilens,
+ ys=ys,
+ olens=olens,
+ spk_emb=spk_emb,
+ spk_id=spk_id,
+ lang_id=lang_id, )
+
+ return dict(feat_gen=outs[0], att_w=att_ws[0])
+
+ # inference
+ h = self.enc.inference(x)
+
+ if self.spk_num is not None:
+ sid_emb = self.sid_emb(spk_id.reshape([-1]))
+ h = h + sid_emb
+ if self.lang_num is not None:
+ lid_emb = self.lid_emb(lang_id.reshape([-1]))
+ h = h + lid_emb
+ if self.spk_embed_dim is not None:
+ hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0)
+ h = self._integrate_with_spk_embed(hs, spk_emb)[0]
+ out, prob, att_w = self.dec.inference(
+ h,
+ threshold=threshold,
+ minlenratio=minlenratio,
+ maxlenratio=maxlenratio,
+ use_att_constraint=use_att_constraint,
+ backward_window=backward_window,
+ forward_window=forward_window, )
+
+ return dict(feat_gen=out, prob=prob, att_w=att_w)
+
+ def _integrate_with_spk_embed(self,
+ hs: paddle.Tensor,
+ spk_emb: paddle.Tensor) -> paddle.Tensor:
+ """Integrate speaker embedding with hidden states.
+
+ Args:
+ hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
+ spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+ Returns:
+ Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
+ integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
+
+ """
+ if self.spk_embed_integration_type == "add":
+ # apply projection and then add to hidden states
+ spk_emb = self.projection(F.normalize(spk_emb))
+ hs = hs + spk_emb.unsqueeze(1)
+ elif self.spk_embed_integration_type == "concat":
+ # concat hidden states with spk embeds
+ spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
+ shape=[-1, paddle.shape(hs)[1], -1])
+ hs = paddle.concat([hs, spk_emb], axis=-1)
+ else:
+ raise NotImplementedError("support only add or concat.")
+
+ return hs
+
+
+class Tacotron2Inference(nn.Layer):
+ def __init__(self, normalizer, model):
+ super().__init__()
+ self.normalizer = normalizer
+ self.acoustic_model = model
+
+ def forward(self, text, spk_id=None, spk_emb=None):
+ out = self.acoustic_model.inference(
+ text, spk_id=spk_id, spk_emb=spk_emb)
+ normalized_mel = out["feat_gen"]
+ logmel = self.normalizer.inverse(normalized_mel)
+ return logmel
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
new file mode 100644
index 00000000..09e6827d
--- /dev/null
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pathlib import Path
+
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+
+from paddlespeech.t2s.modules.losses import GuidedAttentionLoss
+from paddlespeech.t2s.modules.losses import Tacotron2Loss
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+logging.basicConfig(
+ format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+ datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class Tacotron2Updater(StandardUpdater):
+ def __init__(self,
+ model: Layer,
+ optimizer: Optimizer,
+ dataloader: DataLoader,
+ init_state=None,
+ use_masking: bool=True,
+ use_weighted_masking: bool=False,
+ bce_pos_weight: float=5.0,
+ loss_type: str="L1+L2",
+ use_guided_attn_loss: bool=True,
+ guided_attn_loss_sigma: float=0.4,
+ guided_attn_loss_lambda: float=1.0,
+ output_dir: Path=None):
+ super().__init__(model, optimizer, dataloader, init_state=None)
+
+ self.loss_type = loss_type
+ self.use_guided_attn_loss = use_guided_attn_loss
+
+ self.taco2_loss = Tacotron2Loss(
+ use_masking=use_masking,
+ use_weighted_masking=use_weighted_masking,
+ bce_pos_weight=bce_pos_weight, )
+ if self.use_guided_attn_loss:
+ self.attn_loss = GuidedAttentionLoss(
+ sigma=guided_attn_loss_sigma,
+ alpha=guided_attn_loss_lambda, )
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def update_core(self, batch):
+ self.msg = "Rank: {}, ".format(dist.get_rank())
+ losses_dict = {}
+ # spk_id!=None in multiple spk fastspeech2
+ spk_id = batch["spk_id"] if "spk_id" in batch else None
+ spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
+ if spk_emb is not None:
+ spk_id = None
+
+ after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
+ text=batch["text"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ spk_id=spk_id,
+ spk_emb=spk_emb)
+
+ # calculate taco2 loss
+ l1_loss, mse_loss, bce_loss = self.taco2_loss(
+ after_outs=after_outs,
+ before_outs=before_outs,
+ logits=logits,
+ ys=ys,
+ stop_labels=stop_labels,
+ olens=olens)
+
+ if self.loss_type == "L1+L2":
+ loss = l1_loss + mse_loss + bce_loss
+ elif self.loss_type == "L1":
+ loss = l1_loss + bce_loss
+ elif self.loss_type == "L2":
+ loss = mse_loss + bce_loss
+ else:
+ raise ValueError(f"unknown --loss-type {self.loss_type}")
+
+ # calculate attention loss
+ if self.use_guided_attn_loss:
+ # NOTE: length of output for auto-regressive
+ # input will be changed when r > 1
+ attn_loss = self.attn_loss(
+ att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
+ loss = loss + attn_loss
+
+ optimizer = self.optimizer
+ optimizer.clear_grad()
+ loss.backward()
+ optimizer.step()
+
+ report("train/l1_loss", float(l1_loss))
+ report("train/mse_loss", float(mse_loss))
+ report("train/bce_loss", float(bce_loss))
+ report("train/attn_loss", float(attn_loss))
+ report("train/loss", float(loss))
+
+ losses_dict["l1_loss"] = float(l1_loss)
+ losses_dict["mse_loss"] = float(mse_loss)
+ losses_dict["bce_loss"] = float(bce_loss)
+ losses_dict["attn_loss"] = float(attn_loss)
+ losses_dict["loss"] = float(loss)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+
+class Tacotron2Evaluator(StandardEvaluator):
+ def __init__(self,
+ model: Layer,
+ dataloader: DataLoader,
+ use_masking: bool=True,
+ use_weighted_masking: bool=False,
+ bce_pos_weight: float=5.0,
+ loss_type: str="L1+L2",
+ use_guided_attn_loss: bool=True,
+ guided_attn_loss_sigma: float=0.4,
+ guided_attn_loss_lambda: float=1.0,
+ output_dir=None):
+ super().__init__(model, dataloader)
+
+ self.loss_type = loss_type
+ self.use_guided_attn_loss = use_guided_attn_loss
+
+ self.taco2_loss = Tacotron2Loss(
+ use_masking=use_masking,
+ use_weighted_masking=use_weighted_masking,
+ bce_pos_weight=bce_pos_weight, )
+ if self.use_guided_attn_loss:
+ self.attn_loss = GuidedAttentionLoss(
+ sigma=guided_attn_loss_sigma,
+ alpha=guided_attn_loss_lambda, )
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def evaluate_core(self, batch):
+ self.msg = "Evaluate: "
+ losses_dict = {}
+ # spk_id!=None in multiple spk fastspeech2
+ spk_id = batch["spk_id"] if "spk_id" in batch else None
+ spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
+ if spk_emb is not None:
+ spk_id = None
+
+ after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
+ text=batch["text"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ spk_id=spk_id,
+ spk_emb=spk_emb)
+
+ # calculate taco2 loss
+ l1_loss, mse_loss, bce_loss = self.taco2_loss(
+ after_outs=after_outs,
+ before_outs=before_outs,
+ logits=logits,
+ ys=ys,
+ stop_labels=stop_labels,
+ olens=olens)
+
+ if self.loss_type == "L1+L2":
+ loss = l1_loss + mse_loss + bce_loss
+ elif self.loss_type == "L1":
+ loss = l1_loss + bce_loss
+ elif self.loss_type == "L2":
+ loss = mse_loss + bce_loss
+ else:
+ raise ValueError(f"unknown --loss-type {self.loss_type}")
+
+ # calculate attention loss
+ if self.use_guided_attn_loss:
+ # NOTE: length of output for auto-regressive
+ # input will be changed when r > 1
+ attn_loss = self.attn_loss(
+ att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
+ loss = loss + attn_loss
+
+ report("eval/l1_loss", float(l1_loss))
+ report("eval/mse_loss", float(mse_loss))
+ report("eval/bce_loss", float(bce_loss))
+ report("eval/attn_loss", float(attn_loss))
+ report("eval/loss", float(loss))
+
+ losses_dict["l1_loss"] = float(l1_loss)
+ losses_dict["mse_loss"] = float(mse_loss)
+ losses_dict["bce_loss"] = float(bce_loss)
+ losses_dict["attn_loss"] = float(attn_loss)
+ losses_dict["loss"] = float(loss)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+ self.logger.info(self.msg)
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
index ae6d7365..92754c30 100644
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer):
.. _`Neural Speech Synthesis with Transformer Network`:
https://arxiv.org/pdf/1809.08895.pdf
- Parameters
- ----------
- idim : int
- Dimension of the inputs.
- odim : int
- Dimension of the outputs.
- embed_dim : int, optional
- Dimension of character embedding.
- eprenet_conv_layers : int, optional
- Number of encoder prenet convolution layers.
- eprenet_conv_chans : int, optional
- Number of encoder prenet convolution channels.
- eprenet_conv_filts : int, optional
- Filter size of encoder prenet convolution.
- dprenet_layers : int, optional
- Number of decoder prenet layers.
- dprenet_units : int, optional
- Number of decoder prenet hidden units.
- elayers : int, optional
- Number of encoder layers.
- eunits : int, optional
- Number of encoder hidden units.
- adim : int, optional
- Number of attention transformation dimensions.
- aheads : int, optional
- Number of heads for multi head attention.
- dlayers : int, optional
- Number of decoder layers.
- dunits : int, optional
- Number of decoder hidden units.
- postnet_layers : int, optional
- Number of postnet layers.
- postnet_chans : int, optional
- Number of postnet channels.
- postnet_filts : int, optional
- Filter size of postnet.
- use_scaled_pos_enc : pool, optional
- Whether to use trainable scaled positional encoding.
- use_batch_norm : bool, optional
- Whether to use batch normalization in encoder prenet.
- encoder_normalize_before : bool, optional
- Whether to perform layer normalization before encoder block.
- decoder_normalize_before : bool, optional
- Whether to perform layer normalization before decoder block.
- encoder_concat_after : bool, optional
- Whether to concatenate attention layer's input and output in encoder.
- decoder_concat_after : bool, optional
- Whether to concatenate attention layer's input and output in decoder.
- positionwise_layer_type : str, optional
- Position-wise operation type.
- positionwise_conv_kernel_size : int, optional
- Kernel size in position wise conv 1d.
- reduction_factor : int, optional
- Reduction factor.
- spk_embed_dim : int, optional
- Number of speaker embedding dimenstions.
- spk_embed_integration_type : str, optional
- How to integrate speaker embedding.
- use_gst : str, optional
- Whether to use global style token.
- gst_tokens : int, optional
- The number of GST embeddings.
- gst_heads : int, optional
- The number of heads in GST multihead attention.
- gst_conv_layers : int, optional
- The number of conv layers in GST.
- gst_conv_chans_list : Sequence[int], optional
- List of the number of channels of conv layers in GST.
- gst_conv_kernel_size : int, optional
- Kernal size of conv layers in GST.
- gst_conv_stride : int, optional
- Stride size of conv layers in GST.
- gst_gru_layers : int, optional
- The number of GRU layers in GST.
- gst_gru_units : int, optional
- The number of GRU units in GST.
- transformer_lr : float, optional
- Initial value of learning rate.
- transformer_warmup_steps : int, optional
- Optimizer warmup steps.
- transformer_enc_dropout_rate : float, optional
- Dropout rate in encoder except attention and positional encoding.
- transformer_enc_positional_dropout_rate : float, optional
- Dropout rate after encoder positional encoding.
- transformer_enc_attn_dropout_rate : float, optional
- Dropout rate in encoder self-attention module.
- transformer_dec_dropout_rate : float, optional
- Dropout rate in decoder except attention & positional encoding.
- transformer_dec_positional_dropout_rate : float, optional
- Dropout rate after decoder positional encoding.
- transformer_dec_attn_dropout_rate : float, optional
- Dropout rate in deocoder self-attention module.
- transformer_enc_dec_attn_dropout_rate : float, optional
- Dropout rate in encoder-deocoder attention module.
- init_type : str, optional
- How to initialize transformer parameters.
- init_enc_alpha : float, optional
- Initial value of alpha in scaled pos encoding of the encoder.
- init_dec_alpha : float, optional
- Initial value of alpha in scaled pos encoding of the decoder.
- eprenet_dropout_rate : float, optional
- Dropout rate in encoder prenet.
- dprenet_dropout_rate : float, optional
- Dropout rate in decoder prenet.
- postnet_dropout_rate : float, optional
- Dropout rate in postnet.
- use_masking : bool, optional
- Whether to apply masking for padded part in loss calculation.
- use_weighted_masking : bool, optional
- Whether to apply weighted masking in loss calculation.
- bce_pos_weight : float, optional
- Positive sample weight in bce calculation (only for use_masking=true).
- loss_type : str, optional
- How to calculate loss.
- use_guided_attn_loss : bool, optional
- Whether to use guided attention loss.
- num_heads_applied_guided_attn : int, optional
- Number of heads in each layer to apply guided attention loss.
- num_layers_applied_guided_attn : int, optional
- Number of layers to apply guided attention loss.
- List of module names to apply guided attention loss.
+ Args:
+ idim (int): Dimension of the inputs.
+ odim (int): Dimension of the outputs.
+ embed_dim (int, optional): Dimension of character embedding.
+ eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
+ eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
+ eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
+ dprenet_layers (int, optional): Number of decoder prenet layers.
+ dprenet_units (int, optional): Number of decoder prenet hidden units.
+ elayers (int, optional): Number of encoder layers.
+ eunits (int, optional): Number of encoder hidden units.
+ adim (int, optional): Number of attention transformation dimensions.
+ aheads (int, optional): Number of heads for multi head attention.
+ dlayers (int, optional): Number of decoder layers.
+ dunits (int, optional): Number of decoder hidden units.
+ postnet_layers (int, optional): Number of postnet layers.
+ postnet_chans (int, optional): Number of postnet channels.
+ postnet_filts (int, optional): Filter size of postnet.
+        use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
+ use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
+ encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
+ decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
+ encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
+ decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
+ positionwise_layer_type (str, optional): Position-wise operation type.
+ positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
+ reduction_factor (int, optional): Reduction factor.
+        spk_embed_dim (int, optional): Number of speaker embedding dimensions.
+ spk_embed_integration_type (str, optional): How to integrate speaker embedding.
+ use_gst (str, optional): Whether to use global style token.
+ gst_tokens (int, optional): The number of GST embeddings.
+ gst_heads (int, optional): The number of heads in GST multihead attention.
+ gst_conv_layers (int, optional): The number of conv layers in GST.
+ gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
+        gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
+ gst_conv_stride (int, optional): Stride size of conv layers in GST.
+ gst_gru_layers (int, optional): The number of GRU layers in GST.
+ gst_gru_units (int, optional): The number of GRU units in GST.
+ transformer_lr (float, optional): Initial value of learning rate.
+ transformer_warmup_steps (int, optional): Optimizer warmup steps.
+ transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
+ transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
+ transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
+ transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
+ transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
+        transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
+        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
+ init_type (str, optional): How to initialize transformer parameters.
+ init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder.
+ init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
+ eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
+ dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
+ postnet_dropout_rate (float, optional): Dropout rate in postnet.
+ use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
+ bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
+ loss_type (str, optional): How to calculate loss.
+ use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
+ num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
+ num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
+        modules_applied_guided_attn (Sequence[str], optional): List of module names to apply guided attention loss.
"""
def __init__(
@@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
- Parameters
- ----------
- text : Tensor(int64)
- Batch of padded character ids (B, Tmax).
- text_lengths : Tensor(int64)
- Batch of lengths of each input batch (B,).
- speech : Tensor
- Batch of padded target features (B, Lmax, odim).
- speech_lengths : Tensor(int64)
- Batch of the lengths of each target (B,).
- spk_emb : Tensor, optional
- Batch of speaker embeddings (B, spk_embed_dim).
-
- Returns
- ----------
- Tensor
- Loss scalar value.
- Dict
- Statistics to be monitored.
+ Args:
+ text(Tensor(int64)): Batch of padded character ids (B, Tmax).
+ text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,).
+ speech(Tensor): Batch of padded target features (B, Lmax, odim).
+ speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+ spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+
+ Returns:
+ Tensor: Loss scalar value.
+ Dict: Statistics to be monitored.
"""
# input of embedding must be int64
@@ -433,12 +364,10 @@ class TransformerTTS(nn.Layer):
olens = paddle.cast(speech_lengths, 'int64')
# make labels for stop prediction
- labels = make_pad_mask(olens - 1)
- labels = numpy.pad(
- labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0)
- labels = paddle.to_tensor(labels)
- labels = paddle.cast(labels, dtype="float32")
- # labels = F.pad(labels, [0, 1], "constant", 1.0)
+ stop_labels = make_pad_mask(olens - 1)
+ # bool tensors cannot be sliced, so cast to float32 first
+ stop_labels = paddle.cast(stop_labels, dtype='float32')
+ stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
@@ -447,12 +376,15 @@ class TransformerTTS(nn.Layer):
# modifiy mod part of groundtruth
if self.reduction_factor > 1:
- olens = paddle.to_tensor(
- [olen - olen % self.reduction_factor for olen in olens.numpy()])
+ olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
- labels = labels[:, :max_olen]
- labels[:, -1] = 1.0 # make sure at least one frame has 1
+ stop_labels = stop_labels[:, :max_olen]
+ stop_labels[:, -1] = 1.0 # make sure at least one frame has 1
+ olens_in = olens // self.reduction_factor
+ else:
+ olens_in = olens
+
need_dict = {}
need_dict['encoder'] = self.encoder
need_dict['decoder'] = self.decoder
@@ -462,7 +394,7 @@ class TransformerTTS(nn.Layer):
'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc
- return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
+ return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict
def _forward(
self,
@@ -488,8 +420,7 @@ class TransformerTTS(nn.Layer):
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
- olens_in = olens.new(
- [olen // self.reduction_factor for olen in olens])
+ olens_in = olens // self.reduction_factor
else:
ys_in, olens_in = ys, olens
@@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer):
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
- Parameters
- ----------
- text : Tensor(int64)
- Input sequence of characters (T,).
- speech : Tensor, optional
- Feature sequence to extract style (N, idim).
- spk_emb : Tensor, optional
- Speaker embedding vector (spk_embed_dim,).
- threshold : float, optional
- Threshold in inference.
- minlenratio : float, optional
- Minimum length ratio in inference.
- maxlenratio : float, optional
- Maximum length ratio in inference.
- use_teacher_forcing : bool, optional
- Whether to use teacher forcing.
-
- Returns
- ----------
- Tensor
- Output sequence of features (L, odim).
- Tensor
- Output sequence of stop probabilities (L,).
- Tensor
- Encoder-decoder (source) attention weights (#layers, #heads, L, T).
+ Args:
+ text(Tensor(int64)): Input sequence of characters (T,).
+ speech(Tensor, optional): Feature sequence to extract style (N, idim).
+ spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,).
+ threshold(float, optional): Threshold in inference.
+ minlenratio(float, optional): Minimum length ratio in inference.
+ maxlenratio(float, optional): Maximum length ratio in inference.
+ use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+
+ Returns:
+ Tensor: Output sequence of features (L, odim).
+ Tensor: Output sequence of stop probabilities (L,).
+ Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).
"""
# input of embedding must be int64
@@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer):
def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for self-attention.
- Parameters
- ----------
- ilens : Tensor
- Batch of lengths (B,).
+ Args:
+ ilens(Tensor): Batch of lengths (B,).
- Returns
- -------
- Tensor
- Mask tensor for self-attention.
- dtype=paddle.bool
+ Returns:
+ Tensor: Mask tensor for self-attention. dtype=paddle.bool
- Examples
- -------
- >>> ilens = [5, 3]
- >>> self._source_mask(ilens)
- tensor([[[1, 1, 1, 1, 1],
- [1, 1, 1, 0, 0]]]) bool
+ Examples:
+ >>> ilens = [5, 3]
+ >>> self._source_mask(ilens)
+ tensor([[[1, 1, 1, 1, 1],
+ [1, 1, 1, 0, 0]]]) bool
"""
x_masks = make_non_pad_mask(ilens)
@@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer):
def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor:
"""Make masks for masked self-attention.
- Parameters
- ----------
- olens : LongTensor
- Batch of lengths (B,).
-
- Returns
- ----------
- Tensor
- Mask tensor for masked self-attention.
-
- Examples
- ----------
- >>> olens = [5, 3]
- >>> self._target_mask(olens)
- tensor([[[1, 0, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [1, 1, 1, 0, 0],
- [1, 1, 1, 1, 0],
- [1, 1, 1, 1, 1]],
- [[1, 0, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [1, 1, 1, 0, 0],
- [1, 1, 1, 0, 0],
- [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
+ Args:
+ olens (Tensor(int64)): Batch of lengths (B,).
+
+ Returns:
+ Tensor: Mask tensor for masked self-attention.
+
+ Examples:
+ >>> olens = [5, 3]
+ >>> self._target_mask(olens)
+ tensor([[[1, 0, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [1, 1, 1, 0, 0],
+ [1, 1, 1, 1, 0],
+ [1, 1, 1, 1, 1]],
+ [[1, 0, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [1, 1, 1, 0, 0],
+ [1, 1, 1, 0, 0],
+ [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
"""
y_masks = make_non_pad_mask(olens)
@@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer):
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
- Parameters
- ----------
- hs : Tensor
- Batch of hidden state sequences (B, Tmax, adim).
- spk_emb : Tensor
- Batch of speaker embeddings (B, spk_embed_dim).
+ Args:
+ hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+ spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
- Returns
- ----------
- Tensor
- Batch of integrated hidden state sequences (B, Tmax, adim).
+ Returns:
+ Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
"""
if self.spk_embed_integration_type == "add":
@@ -769,318 +672,3 @@ class TransformerTTSInference(nn.Layer):
normalized_mel = self.acoustic_model.inference(text)[0]
logmel = self.normalizer.inverse(normalized_mel)
return logmel
-
-
-class TransformerTTSLoss(nn.Layer):
- """Loss function module for Tacotron2."""
-
- def __init__(self,
- use_masking=True,
- use_weighted_masking=False,
- bce_pos_weight=5.0):
- """Initialize Tactoron2 loss module.
-
- Parameters
- ----------
- use_masking : bool
- Whether to apply masking for padded part in loss calculation.
- use_weighted_masking : bool
- Whether to apply weighted masking in loss calculation.
- bce_pos_weight : float
- Weight of positive sample of stop token.
-
- """
- super().__init__()
- assert (use_masking != use_weighted_masking) or not use_masking
- self.use_masking = use_masking
- self.use_weighted_masking = use_weighted_masking
-
- # define criterions
- reduction = "none" if self.use_weighted_masking else "mean"
- self.l1_criterion = nn.L1Loss(reduction=reduction)
- self.mse_criterion = nn.MSELoss(reduction=reduction)
- self.bce_criterion = nn.BCEWithLogitsLoss(
- reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
-
- def forward(self, after_outs, before_outs, logits, ys, labels, olens):
- """Calculate forward propagation.
-
- Parameters
- ----------
- after_outs : Tensor
- Batch of outputs after postnets (B, Lmax, odim).
- before_outs : Tensor
- Batch of outputs before postnets (B, Lmax, odim).
- logits : Tensor
- Batch of stop logits (B, Lmax).
- ys : Tensor
- Batch of padded target features (B, Lmax, odim).
- labels : LongTensor
- Batch of the sequences of stop token labels (B, Lmax).
- olens : LongTensor
- Batch of the lengths of each target (B,).
-
- Returns
- ----------
- Tensor
- L1 loss value.
- Tensor
- Mean square error loss value.
- Tensor
- Binary cross entropy loss value.
-
- """
- # make mask and apply it
- if self.use_masking:
- masks = make_non_pad_mask(olens).unsqueeze(-1)
- ys = ys.masked_select(masks.broadcast_to(ys.shape))
- after_outs = after_outs.masked_select(
- masks.broadcast_to(after_outs.shape))
- before_outs = before_outs.masked_select(
- masks.broadcast_to(before_outs.shape))
- # Operator slice does not have kernel for data_type[bool]
- tmp_masks = paddle.cast(masks, dtype='int64')
- tmp_masks = tmp_masks[:, :, 0]
- tmp_masks = paddle.cast(tmp_masks, dtype='bool')
- labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
- logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
-
- # calculate loss
- l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
- before_outs, ys)
- mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
- before_outs, ys)
- bce_loss = self.bce_criterion(logits, labels)
-
- # make weighted mask and apply it
- if self.use_weighted_masking:
- masks = make_non_pad_mask(olens).unsqueeze(-1)
- weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
- out_weights = weights.div(ys.shape[0] * ys.shape[2])
- logit_weights = weights.div(ys.shape[0])
-
- # apply weight
- l1_loss = l1_loss.multiply(out_weights)
- l1_loss = l1_loss.masked_select(
- masks.broadcast_to(l1_loss.shape)).sum()
-
- mse_loss = mse_loss.multiply(out_weights)
- mse_loss = mse_loss.masked_select(
- masks.broadcast_to(mse_loss.shape)).sum()
-
- bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
- bce_loss = bce_loss.masked_select(
- masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
-
- return l1_loss, mse_loss, bce_loss
-
-
-class GuidedAttentionLoss(nn.Layer):
- """Guided attention loss function module.
-
- This module calculates the guided attention loss described
- in `Efficiently Trainable Text-to-Speech System Based
- on Deep Convolutional Networks with Guided Attention`_,
- which forces the attention to be diagonal.
-
- .. _`Efficiently Trainable Text-to-Speech System
- Based on Deep Convolutional Networks with Guided Attention`:
- https://arxiv.org/abs/1710.08969
-
- """
-
- def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
- """Initialize guided attention loss module.
-
- Parameters
- ----------
- sigma : float, optional
- Standard deviation to control how close attention to a diagonal.
- alpha : float, optional
- Scaling coefficient (lambda).
- reset_always : bool, optional
- Whether to always reset masks.
-
- """
- super(GuidedAttentionLoss, self).__init__()
- self.sigma = sigma
- self.alpha = alpha
- self.reset_always = reset_always
- self.guided_attn_masks = None
- self.masks = None
-
- def _reset_masks(self):
- self.guided_attn_masks = None
- self.masks = None
-
- def forward(self, att_ws, ilens, olens):
- """Calculate forward propagation.
-
- Parameters
- ----------
- att_ws : Tensor
- Batch of attention weights (B, T_max_out, T_max_in).
- ilens : LongTensor
- Batch of input lenghts (B,).
- olens : LongTensor
- Batch of output lenghts (B,).
-
- Returns
- ----------
- Tensor
- Guided attention loss value.
-
- """
- if self.guided_attn_masks is None:
- self.guided_attn_masks = self._make_guided_attention_masks(ilens,
- olens)
- if self.masks is None:
- self.masks = self._make_masks(ilens, olens)
- losses = self.guided_attn_masks * att_ws
- loss = paddle.mean(
- losses.masked_select(self.masks.broadcast_to(losses.shape)))
- if self.reset_always:
- self._reset_masks()
- return self.alpha * loss
-
- def _make_guided_attention_masks(self, ilens, olens):
- n_batches = len(ilens)
- max_ilen = max(ilens)
- max_olen = max(olens)
- guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
-
- for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
-
- ilen = int(ilen)
- olen = int(olen)
- guided_attn_masks[idx, :olen, :
- ilen] = self._make_guided_attention_mask(
- ilen, olen, self.sigma)
- return guided_attn_masks
-
- @staticmethod
- def _make_guided_attention_mask(ilen, olen, sigma):
- """Make guided attention mask.
-
- Examples
- ----------
- >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
- >>> guided_attn_mask.shape
- [5, 5]
- >>> guided_attn_mask
- tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
- [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
- [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
- [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
- [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
- >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
- >>> guided_attn_mask.shape
- [6, 3]
- >>> guided_attn_mask
- tensor([[0.0000, 0.2934, 0.7506],
- [0.0831, 0.0831, 0.5422],
- [0.2934, 0.0000, 0.2934],
- [0.5422, 0.0831, 0.0831],
- [0.7506, 0.2934, 0.0000],
- [0.8858, 0.5422, 0.0831]])
-
- """
- grid_x, grid_y = paddle.meshgrid(
- paddle.arange(olen), paddle.arange(ilen))
- grid_x = grid_x.cast(dtype=paddle.float32)
- grid_y = grid_y.cast(dtype=paddle.float32)
- return 1.0 - paddle.exp(-(
- (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
-
- @staticmethod
- def _make_masks(ilens, olens):
- """Make masks indicating non-padded part.
-
- Parameters
- ----------
- ilens (LongTensor or List): Batch of lengths (B,).
- olens (LongTensor or List): Batch of lengths (B,).
-
- Returns
- ----------
- Tensor
- Mask tensor indicating non-padded part.
-
- Examples
- ----------
- >>> ilens, olens = [5, 2], [8, 5]
- >>> _make_mask(ilens, olens)
- tensor([[[1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1],
- [1, 1, 1, 1, 1]],
-
- [[1, 1, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [1, 1, 0, 0, 0],
- [0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
-
- """
- # (B, T_in)
- in_masks = make_non_pad_mask(ilens)
- # (B, T_out)
- out_masks = make_non_pad_mask(olens)
- # (B, T_out, T_in)
-
- return paddle.logical_and(
- out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
-
-
-class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
- """Guided attention loss function module for multi head attention.
-
- Parameters
- ----------
- sigma : float, optional
- Standard deviation to controlGuidedAttentionLoss
- how close attention to a diagonal.
- alpha : float, optional
- Scaling coefficient (lambda).
- reset_always : bool, optional
- Whether to always reset masks.
-
- """
-
- def forward(self, att_ws, ilens, olens):
- """Calculate forward propagation.
-
- Parameters
- ----------
- att_ws : Tensor
- Batch of multi head attention weights (B, H, T_max_out, T_max_in).
- ilens : Tensor
- Batch of input lenghts (B,).
- olens : Tensor
- Batch of output lenghts (B,).
-
- Returns
- ----------
- Tensor
- Guided attention loss value.
-
- """
- if self.guided_attn_masks is None:
- self.guided_attn_masks = (
- self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
- if self.masks is None:
- self.masks = self._make_masks(ilens, olens).unsqueeze(1)
- losses = self.guided_attn_masks * att_ws
- loss = paddle.mean(
- losses.masked_select(self.masks.broadcast_to(losses.shape)))
- if self.reset_always:
- self._reset_masks()
-
- return self.alpha * loss
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
index f16cf4dd..dff908e0 100644
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
@@ -12,13 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+from pathlib import Path
from typing import Sequence
import paddle
from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
-from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
-from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
+from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
+from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
@@ -32,38 +36,34 @@ logger.setLevel(logging.INFO)
class TransformerTTSUpdater(StandardUpdater):
def __init__(
self,
- model,
- optimizer,
- dataloader,
+ model: Layer,
+ optimizer: Optimizer,
+ dataloader: DataLoader,
init_state=None,
- use_masking=False,
- use_weighted_masking=False,
- output_dir=None,
- bce_pos_weight=5.0,
+ use_masking: bool=False,
+ use_weighted_masking: bool=False,
+ output_dir: Path=None,
+ bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, optimizer, dataloader, init_state=None)
- self.use_masking = use_masking
- self.use_weighted_masking = use_weighted_masking
- self.bce_pos_weight = bce_pos_weight
+
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
- self.guided_attn_loss_sigma = guided_attn_loss_sigma
- self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
- use_masking=self.use_masking,
- use_weighted_masking=self.use_weighted_masking,
- bce_pos_weight=self.bce_pos_weight)
+ use_masking=use_masking,
+ use_weighted_masking=use_weighted_masking,
+ bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
- sigma=self.guided_attn_loss_sigma,
- alpha=self.guided_attn_loss_lambda, )
+ sigma=guided_attn_loss_sigma,
+ alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
- after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+ after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
before_outs=before_outs,
logits=logits,
ys=ys,
- labels=labels,
+ stop_labels=stop_labels,
olens=olens)
report("train/bce_loss", float(bce_loss))
@@ -120,7 +120,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
- enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+ enc_attn_loss = self.attn_criterion(
+ att_ws=att_ws,
+ ilens=batch["text_lengths"] + 1,
+ olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@@ -137,7 +140,8 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
- dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+ dec_attn_loss = self.attn_criterion(
+ att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("train/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@@ -154,7 +158,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
- enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+ enc_dec_attn_loss = self.attn_criterion(
+ att_ws=att_ws,
+ ilens=batch["text_lengths"] + 1,
+ olens=olens_in)
report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss
@@ -182,37 +189,33 @@ class TransformerTTSUpdater(StandardUpdater):
class TransformerTTSEvaluator(StandardEvaluator):
def __init__(
self,
- model,
- dataloader,
+ model: Layer,
+ dataloader: DataLoader,
init_state=None,
- use_masking=False,
- use_weighted_masking=False,
- output_dir=None,
- bce_pos_weight=5.0,
+ use_masking: bool=False,
+ use_weighted_masking: bool=False,
+ output_dir: Path=None,
+ bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, dataloader)
- self.use_masking = use_masking
- self.use_weighted_masking = use_weighted_masking
- self.bce_pos_weight = bce_pos_weight
+
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
- self.guided_attn_loss_sigma = guided_attn_loss_sigma
- self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
- use_masking=self.use_masking,
- use_weighted_masking=self.use_weighted_masking,
- bce_pos_weight=self.bce_pos_weight)
+ use_masking=use_masking,
+ use_weighted_masking=use_weighted_masking,
+ bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
- sigma=self.guided_attn_loss_sigma,
- alpha=self.guided_attn_loss_lambda, )
+ sigma=guided_attn_loss_sigma,
+ alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@@ -223,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
- after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+ after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@@ -234,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
before_outs=before_outs,
logits=logits,
ys=ys,
- labels=labels,
+ stop_labels=stop_labels,
olens=olens)
report("eval/bce_loss", float(bce_loss))
@@ -268,7 +271,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
- enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+ enc_attn_loss = self.attn_criterion(
+ att_ws=att_ws,
+ ilens=batch["text_lengths"] + 1,
+ olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@@ -285,7 +291,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
- dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+ dec_attn_loss = self.attn_criterion(
+ att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("eval/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@@ -303,7 +310,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
- enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+ enc_dec_attn_loss = self.attn_criterion(
+ att_ws=att_ws,
+ ilens=batch["text_lengths"] + 1,
+ olens=olens_in)
report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss
diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py
index e519e0c5..52e6005b 100644
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
def fold(x, n_group):
- r"""Fold audio or spectrogram's temporal dimension in to groups.
+ """Fold audio or spectrogram's temporal dimension in to groups.
- Parameters
- ----------
- x : Tensor [shape=(\*, time_steps)
- The input tensor.
+ Args:
+ x(Tensor): The input tensor. shape=(*, time_steps)
+ n_group(int): The size of a group.
- n_group : int
- The size of a group.
-
- Returns
- ---------
- Tensor : [shape=(\*, time_steps // n_group, group)]
- Folded tensor.
+ Returns:
+ Tensor: Folded tensor. shape=(*, time_steps // n_group, group)
"""
spatial_shape = list(x.shape[:-1])
time_steps = paddle.shape(x)[-1]
@@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList):
It consists of several conv2dtranspose layers which perform deconvolution
on mel and time dimension.
- Parameters
- ----------
- upscale_factors : List[int], optional
- Time upsampling factors for each Conv2DTranspose Layer.
-
- The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
- Layers. Each upscale_factor is used as the ``stride`` for the
- corresponding Conv2DTranspose. Defaults to [16, 16], this the default
- upsampling factor is 256.
+ Args:
+ upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
+ The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
+ Layers. Each upscale_factor is used as the ``stride`` for the
+ corresponding Conv2DTranspose. Defaults to [16, 16], thus the default
+ upsampling factor is 256.
- Notes
- ------
- ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft
- transformation used to extract spectrogram features from audio.
+ Notes:
+ ``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
+ transformation used to extract spectrogram features from audio.
- For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
- transformation whose ``hop_length`` equals 256 is suitable.
+ For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
+ transformation whose ``hop_length`` equals 256 is suitable.
- See Also
- ---------
- ``librosa.core.stft``
+ See Also
+
+ ``librosa.core.stft``
"""
def __init__(self, upsample_factors):
@@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList):
self.upsample_factors = upsample_factors
def forward(self, x, trim_conv_artifact=False):
- r"""Forward pass of the ``UpsampleNet``.
+ """Forward pass of the ``UpsampleNet``
- Parameters
- -----------
- x : Tensor [shape=(batch_size, input_channels, time_steps)]
- The input spectrogram.
+ Args:
+ x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
+ trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
- trim_conv_artifact : bool, optional
- Trim deconvolution artifact at each layer. Defaults to False.
+ Returns:
+ Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor)
- Returns
- --------
- Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
- The upsampled spectrogram.
-
- Notes
- --------
- If trim_conv_artifact is ``True``, the output time steps is less
- than ``time_steps \* upsample_factors``.
+ Notes:
+ If trim_conv_artifact is ``True``, the output time steps is less
+ than ``time_steps * upsample_factors``.
"""
x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T)
for layer in self:
@@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer):
same paddign in width dimension. It also has projection for the condition
and output.
- Parameters
- ----------
- channels : int
- Feature size of the input.
-
- cond_channels : int
- Featuer size of the condition.
-
- kernel_size : Tuple[int]
- Kernel size of the Convolution2d applied to the input.
-
- dilations : int
- Dilations of the Convolution2d applied to the input.
+ Args:
+ channels (int): Feature size of the input.
+ cond_channels (int): Feature size of the condition.
+ kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
+ dilations (int): Dilations of the Convolution2d applied to the input.
"""
def __init__(self, channels, cond_channels, kernel_size, dilations):
@@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer):
def forward(self, x, condition):
"""Compute output for a whole folded sequence.
- Parameters
- ----------
- x : Tensor [shape=(batch_size, channel, height, width)]
- The input.
-
- condition : Tensor [shape=(batch_size, condition_channel, height, width)]
- The local condition.
+ Args:
+ x (Tensor): The input. [shape=(batch_size, channel, height, width)]
+ condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition.
- Returns
- -------
- res : Tensor [shape=(batch_size, channel, height, width)]
- The residual output.
-
- skip : Tensor [shape=(batch_size, channel, height, width)]
- The skip output.
+ Returns:
+ res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
+ skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
"""
x_in = x
x = self.conv(x)
@@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer):
def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffer.
- Parameters
- ----------
- x_row : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the input.
-
- condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
- A row of the condition.
+ Args:
+ x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+ condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
- Returns
- -------
- res : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the the residual output.
+ Returns:
+ res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
+ skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
- skip : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the skip output.
"""
x_row_in = x_row
if len(paddle.shape(self._conv_buffer)) == 1:
@@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer):
class ResidualNet(nn.LayerList):
"""A stack of several ResidualBlocks. It merges condition at each layer.
- Parameters
- ----------
- n_layer : int
- Number of ResidualBlocks in the ResidualNet.
-
- residual_channels : int
- Feature size of each ResidualBlocks.
-
- condition_channels : int
- Feature size of the condition.
+ Args:
+ n_layer (int): Number of ResidualBlocks in the ResidualNet.
+ residual_channels (int): Feature size of each ResidualBlocks.
+ condition_channels (int): Feature size of the condition.
+ kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
+ dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
- kernel_size : Tuple[int]
- Kernel size of each ResidualBlock.
-
- dilations_h : List[int]
- Dilation in height dimension of every ResidualBlock.
-
- Raises
- ------
- ValueError
- If the length of dilations_h does not equals n_layers.
+ Raises:
+ ValueError: If the length of dilations_h does not equals n_layers.
"""
def __init__(self,
@@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList):
def forward(self, x, condition):
"""Comput the output of given the input and the condition.
- Parameters
- -----------
- x : Tensor [shape=(batch_size, channel, height, width)]
- The input.
-
- condition : Tensor [shape=(batch_size, condition_channel, height, width)]
- The local condition.
-
- Returns
- --------
- Tensor : [shape=(batch_size, channel, height, width)]
- The output, which is an aggregation of all the skip outputs.
+ Args:
+ x (Tensor): The input. shape=(batch_size, channel, height, width)
+ condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
+
+ Returns:
+ Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
+
"""
skip_connections = []
for layer in self:
@@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList):
def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffers.
- Parameters
- ----------
- x_row : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the input.
-
- condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
- A row of the condition.
-
- Returns
- -------
- res : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the the residual output.
-
- skip : Tensor [shape=(batch_size, channel, 1, width)]
- A row of the skip output.
+ Args:
+ x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+ condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
+
+ Returns:
+ res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
+ skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
+
"""
skip_connections = []
for layer in self:
@@ -400,22 +336,12 @@ class Flow(nn.Layer):
probability density estimation. The ``inverse`` method implements the
sampling.
- Parameters
- ----------
- n_layers : int
- Number of ResidualBlocks in the Flow.
-
- channels : int
- Feature size of the ResidualBlocks.
-
- mel_bands : int
- Feature size of the mel spectrogram (mel bands).
-
- kernel_size : Tuple[int]
- Kernel size of each ResisualBlocks in the Flow.
-
- n_group : int
- Number of timesteps to the folded into a group.
+ Args:
+ n_layers (int): Number of ResidualBlocks in the Flow.
+ channels (int): Feature size of the ResidualBlocks.
+ mel_bands (int): Feature size of the mel spectrogram (mel bands).
+ kernel_size (Tuple[int]): Kernel size of each ResidualBlock in the Flow.
+ n_group (int): Number of timesteps to be folded into a group.
"""
dilations_dict = {
8: [1, 1, 1, 1, 1, 1, 1, 1],
@@ -466,26 +392,16 @@ class Flow(nn.Layer):
"""Probability density estimation. It is done by inversely transform
a sample from p(X) into a sample from p(Z).
- Parameters
- -----------
- x : Tensor [shape=(batch, 1, height, width)]
- A input sample of the distribution p(X).
-
- condition : Tensor [shape=(batch, condition_channel, height, width)]
- The local condition.
-
- Returns
- --------
- z (Tensor): shape(batch, 1, height, width), the transformed sample.
-
- Tuple[Tensor, Tensor]
- The parameter of the transformation.
-
- logs (Tensor): shape(batch, 1, height - 1, width), the log scale
- of the transformation from x to z.
-
- b (Tensor): shape(batch, 1, height - 1, width), the shift of the
- transformation from x to z.
+ Args:
+ x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width)
+ condition (Tensor): The local condition. shape=(batch, condition_channel, height, width)
+
+ Returns:
+ z (Tensor): shape(batch, 1, height, width), the transformed sample.
+ Tuple[Tensor, Tensor]:
+ The parameter of the transformation.
+ logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z.
+ b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z.
"""
# (B, C, H-1, W)
logs, b = self._predict_parameters(x[:, :, :-1, :],
@@ -516,27 +432,12 @@ class Flow(nn.Layer):
"""Sampling from the the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
- Parameters
- -----------
- z : Tensor [shape=(batch, 1, height, width)]
- A sample of the distribution p(Z).
-
- condition : Tensor [shape=(batch, condition_channel, height, width)]
- The local condition.
-
- Returns
- ---------
- x : Tensor [shape=(batch, 1, height, width)]
- The transformed sample.
-
- Tuple[Tensor, Tensor]
- The parameter of the transformation.
-
- logs (Tensor): shape(batch, 1, height - 1, width), the log scale
- of the transformation from x to z.
-
- b (Tensor): shape(batch, 1, height - 1, width), the shift of the
- transformation from x to z.
+ Args:
+ z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, height, width)
+ condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps)
+ Returns:
+ Tensor:
+ The transformed sample. shape=(batch, 1, height, width)
"""
z_0 = z[:, :, :1, :]
x = paddle.zeros_like(z)
@@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList):
"""An Deep Reversible layer that is composed of severel auto regressive
flows.
- Parameters
- -----------
- n_flows : int
- Number of flows in the WaveFlow model.
-
- n_layers : int
- Number of ResidualBlocks in each Flow.
-
- n_group : int
- Number of timesteps to fold as a group.
-
- channels : int
- Feature size of each ResidualBlock.
-
- mel_bands : int
- Feature size of mel spectrogram (mel bands).
-
- kernel_size : Union[int, List[int]]
- Kernel size of the convolution layer in each ResidualBlock.
+ Args:
+ n_flows (int): Number of flows in the WaveFlow model.
+ n_layers (int): Number of ResidualBlocks in each Flow.
+ n_group (int): Number of timesteps to fold as a group.
+ channels (int): Feature size of each ResidualBlock.
+ mel_bands (int): Feature size of mel spectrogram (mel bands).
+ kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
@@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList):
"""Probability density estimation of random variable x given the
condition.
- Parameters
- -----------
- x : Tensor [shape=(batch_size, time_steps)]
- The audio.
-
- condition : Tensor [shape=(batch_size, condition channel, time_steps)]
- The local condition (mel spectrogram here).
-
- Returns
- --------
- z : Tensor [shape=(batch_size, time_steps)]
- The transformed random variable.
-
- log_det_jacobian: Tensor [shape=(1,)]
- The log determinant of the jacobian of the transformation from x
- to z.
+ Args:
+ x (Tensor): The audio. shape=(batch_size, time_steps)
+ condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
+
+ Returns:
+ Tensor: The transformed random variable. shape=(batch_size, time_steps)
+ Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
# x: (B, T)
# condition: (B, C, T) upsampled condition
@@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList):
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
autoregressive manner.
- Parameters
- ----------
- z : Tensor [shape=(batch, 1, time_steps]
- A sample of the distribution p(Z).
-
- condition : Tensor [shape=(batch, condition_channel, time_steps)]
- The local condition.
+ Args:
+ z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps)
+ condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)
- Returns
- --------
- x : Tensor [shape=(batch_size, time_steps)]
- The transformed sample (audio here).
+ Returns:
+ Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
+
"""
z, condition = self._trim(z, condition)
@@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList):
class ConditionalWaveFlow(nn.LayerList):
"""ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
- Parameters
- ----------
- upsample_factors : List[int]
- Upsample factors for the upsample net.
-
- n_flows : int
- Number of flows in the WaveFlow model.
-
- n_layers : int
- Number of ResidualBlocks in each Flow.
-
- n_group : int
- Number of timesteps to fold as a group.
-
- channels : int
- Feature size of each ResidualBlock.
-
- n_mels : int
- Feature size of mel spectrogram (mel bands).
-
- kernel_size : Union[int, List[int]]
- Kernel size of the convolution layer in each ResidualBlock.
- """
+ Args:
+ upsample_factors (List[int]): Upsample factors for the upsample net.
+ n_flows (int): Number of flows in the WaveFlow model.
+ n_layers (int): Number of ResidualBlocks in each Flow.
+ n_group (int): Number of timesteps to fold as a group.
+ channels (int): Feature size of each ResidualBlock.
+ n_mels (int): Feature size of mel spectrogram (mel bands).
+ kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
+ """
def __init__(self,
upsample_factors: List[int],
@@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList):
"""Compute the transformed random variable z (x to z) and the log of
the determinant of the jacobian of the transformation from x to z.
- Parameters
- ----------
- audio : Tensor [shape=(B, T)]
- The audio.
+ Args:
+ audio(Tensor): The audio. shape=(B, T)
+ mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
- mel : Tensor [shape=(B, C_mel, T_mel)]
- The mel spectrogram.
-
- Returns
- -------
- z : Tensor [shape=(B, T)]
- The inversely transformed random variable z (x to z)
-
- log_det_jacobian: Tensor [shape=(1,)]
- the log of the determinant of the jacobian of the transformation
- from x to z.
+ Returns:
+ Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
+ Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
condition = self.encoder(mel)
z, log_det_jacobian = self.decoder(audio, condition)
@@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList):
@paddle.no_grad()
def infer(self, mel):
- r"""Generate raw audio given mel spectrogram.
+ """Generate raw audio given mel spectrogram.
- Parameters
- ----------
- mel : Tensor [shape=(B, C_mel, T_mel)]
- Mel spectrogram (in log-magnitude).
+ Args:
+ mel (Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)
- Returns
- -------
- Tensor : [shape=(B, T)]
- The synthesized audio, where``T <= T_mel \* upsample_factors``.
+ Returns:
+ Tensor: The synthesized audio, where ``T <= T_mel * upsample_factors``. shape=(B, T)
"""
start = time.time()
condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T)
@@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList):
def predict(self, mel):
"""Generate raw audio given mel spectrogram.
- Parameters
- ----------
- mel : np.ndarray [shape=(C_mel, T_mel)]
- Mel spectrogram of an utterance(in log-magnitude).
+ Args:
+ mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
- Returns
- -------
- np.ndarray [shape=(T,)]
- The synthesized audio.
+ Returns:
+ np.ndarray: The synthesized audio. shape=(T,)
"""
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
@@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList):
def from_pretrained(cls, config, checkpoint_path):
"""Build a ConditionalWaveFlow model from a pretrained model.
- Parameters
- ----------
- config: yacs.config.CfgNode
- model configs
+ Args:
+ config(yacs.config.CfgNode): model configs
+ checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name
- checkpoint_path: Path or str
- the path of pretrained model checkpoint, without extension name
-
- Returns
- -------
- ConditionalWaveFlow
- The model built from pretrained result.
+ Returns:
+ ConditionalWaveFlow: The model built from pretrained result.
"""
model = cls(upsample_factors=config.model.upsample_factors,
n_flows=config.model.n_flows,
@@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList):
class WaveFlowLoss(nn.Layer):
"""Criterion of a WaveFlow model.
- Parameters
- ----------
- sigma : float
- The standard deviation of the gaussian noise used in WaveFlow, by
- default 1.0.
+ Args:
+ sigma (float): The standard deviation of the gaussian noise used in WaveFlow,
+ by default 1.0.
"""
def __init__(self, sigma=1.0):
@@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer):
"""Compute the loss given the transformed random variable z and the
log_det_jacobian of transformation from x to z.
- Parameters
- ----------
- z : Tensor [shape=(B, T)]
- The transformed random variable (x to z).
-
- log_det_jacobian : Tensor [shape=(1,)]
- The log of the determinant of the jacobian matrix of the
- transformation from x to z.
+ Args:
+ z(Tensor): The transformed random variable (x to z). shape=(B, T)
+ log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
+ transformation from x to z. shape=(1,)
- Returns
- -------
- Tensor [shape=(1,)]
- The loss.
+ Returns:
+ Tensor: The loss. shape=(1,)
"""
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
) - log_det_jacobian
@@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
def forward(self, mel):
"""Generate raw audio given mel spectrogram.
- Parameters
- ----------
- mel : np.ndarray [shape=(C_mel, T_mel)]
- Mel spectrogram of an utterance(in log-magnitude).
-
- Returns
- -------
- np.ndarray [shape=(T,)]
- The synthesized audio.
+ Args:
+ mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
+
+ Returns:
+ np.ndarray: The synthesized audio. shape=(T,)
+
"""
audio = self.predict(mel)
return audio
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py b/paddlespeech/t2s/models/wavernn/__init__.py
similarity index 91%
rename from paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py
rename to paddlespeech/t2s/models/wavernn/__init__.py
index abf198b9..80ffd068 100644
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/__init__.py
+++ b/paddlespeech/t2s/models/wavernn/__init__.py
@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from .wavernn import *
+from .wavernn_updater import *
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
new file mode 100644
index 00000000..1320ffa3
--- /dev/null
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -0,0 +1,577 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import time
+from typing import List
+
+import numpy as np
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from paddlespeech.t2s.audio.codec import decode_mu_law
+from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.upsample import Stretch2D
+
+
+class ResBlock(nn.Layer):
+ def __init__(self, dims):
+ super().__init__()
+ self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
+ self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
+ self.batch_norm1 = nn.BatchNorm1D(dims)
+ self.batch_norm2 = nn.BatchNorm1D(dims)
+
+ def forward(self, x):
+ '''
+ conv -> bn -> relu -> conv -> bn + residual connection
+ '''
+ residual = x
+ x = self.conv1(x)
+ x = self.batch_norm1(x)
+ x = F.relu(x)
+ x = self.conv2(x)
+ x = self.batch_norm2(x)
+ return x + residual
+
+
+class MelResNet(nn.Layer):
+ def __init__(self,
+ res_blocks: int=10,
+ compute_dims: int=128,
+ res_out_dims: int=128,
+ aux_channels: int=80,
+ aux_context_window: int=0):
+ super().__init__()
+ k_size = aux_context_window * 2 + 1
+ # pay attention here, the dim reduces aux_context_window * 2
+ self.conv_in = nn.Conv1D(
+ aux_channels, compute_dims, kernel_size=k_size, bias_attr=False)
+ self.batch_norm = nn.BatchNorm1D(compute_dims)
+ self.layers = nn.LayerList()
+ for _ in range(res_blocks):
+ self.layers.append(ResBlock(compute_dims))
+ self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1)
+
+ def forward(self, x):
+ '''
+ Args:
+ x (Tensor): Input tensor (B, in_dims, T).
+ Returns:
+ Tensor: Output tensor (B, res_out_dims, T).
+ '''
+
+ x = self.conv_in(x)
+ x = self.batch_norm(x)
+ x = F.relu(x)
+ for f in self.layers:
+ x = f(x)
+ x = self.conv_out(x)
+ return x
+
+
+class UpsampleNetwork(nn.Layer):
+ def __init__(self,
+ aux_channels: int=80,
+ upsample_scales: List[int]=[4, 5, 3, 5],
+ compute_dims: int=128,
+ res_blocks: int=10,
+ res_out_dims: int=128,
+ aux_context_window: int=2):
+ super().__init__()
+ # total_scale is the total Up sampling multiple
+ total_scale = np.prod(upsample_scales)
+ # TODO pad*total_scale is numpy.int64
+ self.indent = int(aux_context_window * total_scale)
+ self.resnet = MelResNet(
+ res_blocks=res_blocks,
+ aux_channels=aux_channels,
+ compute_dims=compute_dims,
+ res_out_dims=res_out_dims,
+ aux_context_window=aux_context_window)
+ self.resnet_stretch = Stretch2D(total_scale, 1)
+ self.up_layers = nn.LayerList()
+ for scale in upsample_scales:
+ k_size = (1, scale * 2 + 1)
+ padding = (0, scale)
+ stretch = Stretch2D(scale, 1)
+
+ conv = nn.Conv2D(
+ 1, 1, kernel_size=k_size, padding=padding, bias_attr=False)
+ weight_ = paddle.full_like(conv.weight, 1. / k_size[1])
+ conv.weight.set_value(weight_)
+ self.up_layers.append(stretch)
+ self.up_layers.append(conv)
+
+ def forward(self, m):
+ '''
+ Args:
+ m (Tensor): Input tensor (B, C_aux, T).
+ Returns:
+ Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
+ Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
+ '''
+ # aux: [B, C_aux, T]
+ # -> [B, res_out_dims, T - 2 * aux_context_window]
+ # -> [B, 1, res_out_dims, T - 2 * aux_context_window]
+ aux = self.resnet(m).unsqueeze(1)
+ # aux: [B, 1, res_out_dims, T - 2 * aux_context_window]
+ # -> [B, 1, res_out_dims, (T - 2 * pad) * prob(upsample_scales)]
+ aux = self.resnet_stretch(aux)
+ # aux: [B, 1, res_out_dims, T * prob(upsample_scales)]
+ # -> [B, res_out_dims, T * prob(upsample_scales)]
+ aux = aux.squeeze(1)
+ # m: [B, C_aux, T] -> [B, 1, C_aux, T]
+ m = m.unsqueeze(1)
+ for f in self.up_layers:
+ m = f(m)
+ # m: [B, 1, C_aux, T*prob(upsample_scales)]
+ # -> [B, C_aux, T * prob(upsample_scales)]
+ # -> [B, C_aux, (T - 2 * pad) * prob(upsample_scales)]
+ m = m.squeeze(1)[:, :, self.indent:-self.indent]
+ # m: [B, (T - 2 * pad) * prob(upsample_scales), C_aux]
+ # aux: [B, (T - 2 * pad) * prob(upsample_scales), res_out_dims]
+ return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1])
+
+
+class WaveRNN(nn.Layer):
+ def __init__(
+ self,
+ rnn_dims: int=512,
+ fc_dims: int=512,
+ bits: int=9,
+ aux_context_window: int=2,
+ upsample_scales: List[int]=[4, 5, 3, 5],
+ aux_channels: int=80,
+ compute_dims: int=128,
+ res_out_dims: int=128,
+ res_blocks: int=10,
+ hop_length: int=300,
+ sample_rate: int=24000,
+ mode='RAW',
+ init_type: str="xavier_uniform", ):
+ '''
+ Args:
+ rnn_dims (int, optional): Hidden dims of RNN Layers.
+ fc_dims (int, optional): Dims of FC Layers.
+ bits (int, optional): bit depth of signal.
+ aux_context_window (int, optional): The context window size of the first convolution applied to the
+ auxiliary input, by default 2
+ upsample_scales (List[int], optional): Upsample scales of the upsample network.
+ aux_channels (int, optional): Auxiliary channel of the residual blocks.
+ compute_dims (int, optional): Dims of Conv1D in MelResNet.
+ res_out_dims (int, optional): Dims of output in MelResNet.
+ res_blocks (int, optional): Number of residual blocks.
+ mode (str, optional): Output mode of the WaveRNN vocoder.
+ `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
+ init_type (str): How to initialize parameters.
+ '''
+ super().__init__()
+ self.mode = mode
+ self.aux_context_window = aux_context_window
+ if self.mode == 'RAW':
+ self.n_classes = 2**bits
+ elif self.mode == 'MOL':
+ self.n_classes = 10 * 3
+ else:
+ RuntimeError('Unknown model mode value - ', self.mode)
+
+ # List of rnns to call 'flatten_parameters()' on
+ self._to_flatten = []
+
+ self.rnn_dims = rnn_dims
+ self.aux_dims = res_out_dims // 4
+ self.hop_length = hop_length
+ self.sample_rate = sample_rate
+
+ # initialize parameters
+ initialize(self, init_type)
+
+ self.upsample = UpsampleNetwork(
+ aux_channels=aux_channels,
+ upsample_scales=upsample_scales,
+ compute_dims=compute_dims,
+ res_blocks=res_blocks,
+ res_out_dims=res_out_dims,
+ aux_context_window=aux_context_window)
+ self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims)
+
+ self.rnn1 = nn.GRU(rnn_dims, rnn_dims)
+ self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims)
+
+ self._to_flatten += [self.rnn1, self.rnn2]
+
+ self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
+ self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
+ self.fc3 = nn.Linear(fc_dims, self.n_classes)
+
+ # Avoid fragmentation of RNN parameters and associated warning
+ self._flatten_parameters()
+
+ nn.initializer.set_global_initializer(None)
+
+ def forward(self, x, c):
+ '''
+ Args:
+ x (Tensor): wav sequence, [B, T]
+ c (Tensor): mel spectrogram [B, C_aux, T']
+
+ T = (T' - 2 * aux_context_window ) * hop_length
+ Returns:
+ Tensor: [B, T, n_classes]
+ '''
+ # Although we `_flatten_parameters()` on init, when using DataParallel
+ # the model gets replicated, making it no longer guaranteed that the
+ # weights are contiguous in GPU memory. Hence, we must call it again
+ self._flatten_parameters()
+
+ bsize = paddle.shape(x)[0]
+ h1 = paddle.zeros([1, bsize, self.rnn_dims])
+ h2 = paddle.zeros([1, bsize, self.rnn_dims])
+ # c: [B, T, C_aux]
+ # aux: [B, T, res_out_dims]
+ c, aux = self.upsample(c)
+
+ aux_idx = [self.aux_dims * i for i in range(5)]
+ a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
+ a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
+ a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
+ a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
+
+ x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2)
+ x = self.I(x)
+ res = x
+ x, _ = self.rnn1(x, h1)
+
+ x = x + res
+ res = x
+ x = paddle.concat([x, a2], axis=2)
+ x, _ = self.rnn2(x, h2)
+
+ x = x + res
+ x = paddle.concat([x, a3], axis=2)
+ x = F.relu(self.fc1(x))
+
+ x = paddle.concat([x, a4], axis=2)
+ x = F.relu(self.fc2(x))
+
+ return self.fc3(x)
+
+ @paddle.no_grad()
+ def generate(self,
+ c,
+ batched: bool=True,
+ target: int=12000,
+ overlap: int=600,
+ mu_law: bool=True,
+ gen_display: bool=False):
+ """
+ Args:
+ c(Tensor): input mels, (T', C_aux)
+ batched(bool): generate in batch or not
+ target(int): target number of samples to be generated in each batch entry
+ overlap(int): number of samples for crossfading between batches
+ mu_law(bool)
+ Returns:
+ wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
+ """
+
+ self.eval()
+
+ mu_law = mu_law if self.mode == 'RAW' else False
+
+ output = []
+ start = time.time()
+
+ # pseudo batch
+ # (T, C_aux) -> (1, C_aux, T)
+ c = paddle.transpose(c, [1, 0]).unsqueeze(0)
+ T = paddle.shape(c)[-1]
+ wave_len = T * self.hop_length
+ # TODO remove two transpose op by modifying function pad_tensor
+ c = self.pad_tensor(
+ c.transpose([0, 2, 1]), pad=self.aux_context_window,
+ side='both').transpose([0, 2, 1])
+
+ c, aux = self.upsample(c)
+
+ if batched:
+ # (num_folds, target + 2 * overlap, features)
+ c = self.fold_with_overlap(c, target, overlap)
+ aux = self.fold_with_overlap(aux, target, overlap)
+
+ # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for
+ # will not get TensorArray
+ # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray
+ # b_size, seq_len, _ = paddle.shape(c)
+ b_size = paddle.shape(c)[0]
+ seq_len = paddle.shape(c)[1]
+
+ h1 = paddle.zeros([b_size, self.rnn_dims])
+ h2 = paddle.zeros([b_size, self.rnn_dims])
+ x = paddle.zeros([b_size, 1])
+
+ d = self.aux_dims
+ aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
+
+ for i in range(seq_len):
+ m_t = c[:, i, :]
+ # for dygraph to static graph
+ # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
+ a1_t = aux_split[0][:, i, :]
+ a2_t = aux_split[1][:, i, :]
+ a3_t = aux_split[2][:, i, :]
+ a4_t = aux_split[3][:, i, :]
+ x = paddle.concat([x, m_t, a1_t], axis=1)
+ x = self.I(x)
+ # use GRUCell here
+ h1, _ = self.rnn1[0].cell(x, h1)
+ x = x + h1
+ inp = paddle.concat([x, a2_t], axis=1)
+ # use GRUCell here
+ h2, _ = self.rnn2[0].cell(inp, h2)
+
+ x = x + h2
+ x = paddle.concat([x, a3_t], axis=1)
+ x = F.relu(self.fc1(x))
+
+ x = paddle.concat([x, a4_t], axis=1)
+ x = F.relu(self.fc2(x))
+
+ logits = self.fc3(x)
+
+ if self.mode == 'MOL':
+ sample = sample_from_discretized_mix_logistic(
+ logits.unsqueeze(0).transpose([0, 2, 1]))
+ output.append(sample.reshape([-1]))
+ x = sample.transpose([1, 0, 2])
+
+ elif self.mode == 'RAW':
+ posterior = F.softmax(logits, axis=1)
+ distrib = paddle.distribution.Categorical(posterior)
+ # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in encode_mu_law
+ # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
+ # sample: [-1, 1]
+ sample = 2 * distrib.sample([1])[0].cast('float32') / (
+ self.n_classes - 1.) - 1.
+ output.append(sample)
+ x = sample.unsqueeze(-1)
+ else:
+ raise RuntimeError('Unknown model mode value - ', self.mode)
+
+ if gen_display:
+ if i % 1000 == 0:
+ self.gen_display(i, int(seq_len), int(b_size), start)
+
+ output = paddle.stack(output).transpose([1, 0])
+
+ if mu_law:
+ output = decode_mu_law(output, self.n_classes, False)
+
+ if batched:
+ output = self.xfade_and_unfold(output, target, overlap)
+ else:
+ output = output[0]
+
+ # Fade-out at the end to avoid signal cutting out suddenly
+ fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
+ output = output[:wave_len]
+ output[-10 * self.hop_length:] *= fade_out
+
+ self.train()
+
+ # 增加 C_out 维度
+ return output.unsqueeze(-1)
+
+ def _flatten_parameters(self):
+ [m.flatten_parameters() for m in self._to_flatten]
+
+ def pad_tensor(self, x, pad, side='both'):
+ '''
+ Args:
+ x(Tensor): mel, [1, n_frames, 80]
+ pad(int):
+ side(str, optional): (Default value = 'both')
+
+ Returns:
+ Tensor
+ '''
+ b, t, _ = paddle.shape(x)
+ # for dygraph to static graph
+ c = x.shape[-1]
+ total = t + 2 * pad if side == 'both' else t + pad
+ padded = paddle.zeros([b, total, c])
+ if side == 'before' or side == 'both':
+ padded[:, pad:pad + t, :] = x
+ elif side == 'after':
+ padded[:, :t, :] = x
+ return padded
+
+ def fold_with_overlap(self, x, target, overlap):
+ '''
+ Fold the tensor with overlap for quick batched inference.
+ Overlap will be used for crossfading in xfade_and_unfold()
+
+ Args:
+ x(Tensor): Upsampled conditioning features. mels or aux
+ shape=(1, T, features)
+ mels: [1, T, 80]
+ aux: [1, T, 128]
+ target(int): Target timesteps for each index of batch
+ overlap(int): Timesteps for both xfade and rnn warmup
+
+ Returns:
+ Tensor:
+ shape=(num_folds, target + 2 * overlap, features)
+ num_folds = (time_seq - overlap) // (target + overlap)
+ mel: [num_folds, target + 2 * overlap, 80]
+ aux: [num_folds, target + 2 * overlap, 128]
+
+ Details:
+ x = [[h1, h2, ... hn]]
+ Where each h is a vector of conditioning features
+ Eg: target=2, overlap=1 with x.size(1)=10
+
+ folded = [[h1, h2, h3, h4],
+ [h4, h5, h6, h7],
+ [h7, h8, h9, h10]]
+ '''
+
+ _, total_len, features = paddle.shape(x)
+
+ # Calculate variables needed
+ num_folds = (total_len - overlap) // (target + overlap)
+ extended_len = num_folds * (overlap + target) + overlap
+ remaining = total_len - extended_len
+
+ # Pad if some time steps poking out
+ if remaining != 0:
+ num_folds += 1
+ padding = target + 2 * overlap - remaining
+ x = self.pad_tensor(x, padding, side='after')
+
+ folded = paddle.zeros([num_folds, target + 2 * overlap, features])
+
+ # Get the values for the folded tensor
+ for i in range(num_folds):
+ start = i * (target + overlap)
+ end = start + target + 2 * overlap
+ folded[i] = x[0][start:end, :]
+ return folded
+
+ def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
+ ''' Applies a crossfade and unfolds into a 1d array.
+
+ Args:
+ y (Tensor):
+ Batched sequences of audio samples
+ shape=(num_folds, target + 2 * overlap)
+ dtype=paddle.float32
+ overlap (int): Timesteps for both xfade and rnn warmup
+
+ Returns:
+ Tensor
+ audio samples in a 1d array
+ shape=(total_len)
+ dtype=paddle.float32
+
+ Details:
+ y = [[seq1],
+ [seq2],
+ [seq3]]
+
+ Apply a gain envelope at both ends of the sequences
+
+ y = [[seq1_in, seq1_target, seq1_out],
+ [seq2_in, seq2_target, seq2_out],
+ [seq3_in, seq3_target, seq3_out]]
+
+ Stagger and add up the groups of samples:
+
+ [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
+
+ '''
+ # num_folds = (total_len - overlap) // (target + overlap)
+ num_folds, length = paddle.shape(y)
+ target = length - 2 * overlap
+ total_len = num_folds * (target + overlap) + overlap
+
+ # Need some silence for the run warmup
+ slience_len = overlap // 2
+ fade_len = overlap - slience_len
+ slience = paddle.zeros([slience_len], dtype=paddle.float32)
+ linear = paddle.ones([fade_len], dtype=paddle.float32)
+
+ # Equal power crossfade
+ # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
+ t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
+ fade_in = paddle.sqrt(0.5 * (1 + t))
+ fade_out = paddle.sqrt(0.5 * (1 - t))
+ # Concat the silence to the fades
+ fade_out = paddle.concat([linear, fade_out])
+ fade_in = paddle.concat([slience, fade_in])
+
+ # Apply the gain to the overlap samples
+ y[:, :overlap] *= fade_in
+ y[:, -overlap:] *= fade_out
+
+ unfolded = paddle.zeros([total_len], dtype=paddle.float32)
+
+ # Loop to add up all the samples
+ for i in range(num_folds):
+ start = i * (target + overlap)
+ end = start + target + 2 * overlap
+ unfolded[start:end] += y[i]
+
+ return unfolded
+
+ def gen_display(self, i, seq_len, b_size, start):
+ gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
+ pbar = self.progbar(i, seq_len)
+ msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
+ sys.stdout.write(f"\r{msg}")
+
+ def progbar(self, i, n, size=16):
+ done = int(i * size) // n
+ bar = ''
+ for i in range(size):
+ bar += '█' if i <= done else '░'
+ return bar
+
+
+class WaveRNNInference(nn.Layer):
+ def __init__(self, normalizer, wavernn):
+ super().__init__()
+ self.normalizer = normalizer
+ self.wavernn = wavernn
+
+ def forward(self,
+ logmel,
+ batched: bool=True,
+ target: int=12000,
+ overlap: int=600,
+ mu_law: bool=True,
+ gen_display: bool=False):
+ normalized_mel = self.normalizer(logmel)
+
+ wav = self.wavernn.generate(
+ normalized_mel, )
+ # batched=batched,
+ # target=target,
+ # overlap=overlap,
+ # mu_law=mu_law,
+ # gen_display=gen_display)
+
+ return wav
diff --git a/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/paddlespeech/t2s/models/wavernn/wavernn_updater.py
new file mode 100644
index 00000000..b2756d00
--- /dev/null
+++ b/paddlespeech/t2s/models/wavernn/wavernn_updater.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pathlib import Path
+
+import paddle
+import soundfile as sf
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+logging.basicConfig(
+ format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+ datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+def calculate_grad_norm(parameters, norm_type: str=2):
+ '''
+ calculate grad norm of model's parameters
+ parameters:
+ model's parameters
+ norm_type: str
+ Returns
+ ------------
+ Tensor
+ grad_norm
+ '''
+
+ grad_list = [
+ paddle.to_tensor(p.grad) for p in parameters if p.grad is not None
+ ]
+ norm_list = paddle.stack(
+ [paddle.norm(grad, norm_type) for grad in grad_list])
+ total_norm = paddle.norm(norm_list)
+ return total_norm
+
+
+# for save name in gen_valid_samples()
+ITERATION = 0
+
+
+class WaveRNNUpdater(StandardUpdater):
+ def __init__(self,
+ model: Layer,
+ optimizer: Optimizer,
+ criterion: Layer,
+ dataloader: DataLoader,
+ init_state=None,
+ output_dir: Path=None,
+ mode='RAW'):
+ super().__init__(model, optimizer, dataloader, init_state=None)
+
+ self.criterion = criterion
+ # self.scheduler = scheduler
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+ self.mode = mode
+
+ def update_core(self, batch):
+
+ self.msg = "Rank: {}, ".format(dist.get_rank())
+ losses_dict = {}
+ # parse batch
+ self.model.train()
+ self.optimizer.clear_grad()
+
+ wav, y, mel = batch
+
+ y_hat = self.model(wav, mel)
+ if self.mode == 'RAW':
+ y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
+ elif self.mode == 'MOL':
+ y = paddle.cast(y, dtype='float32')
+
+ y = y.unsqueeze(-1)
+ loss = self.criterion(y_hat, y)
+ loss.backward()
+ grad_norm = float(
+ calculate_grad_norm(self.model.parameters(), norm_type=2))
+
+ self.optimizer.step()
+
+ report("train/loss", float(loss))
+ report("train/grad_norm", float(grad_norm))
+
+ losses_dict["loss"] = float(loss)
+ losses_dict["grad_norm"] = float(grad_norm)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+ global ITERATION
+ ITERATION = self.state.iteration + 1
+
+
+class WaveRNNEvaluator(StandardEvaluator):
+ def __init__(self,
+ model: Layer,
+ criterion: Layer,
+ dataloader: Optimizer,
+ output_dir: Path=None,
+ valid_generate_loader=None,
+ config=None):
+ super().__init__(model, dataloader)
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ self.criterion = criterion
+ self.valid_generate_loader = valid_generate_loader
+ self.config = config
+ self.mode = config.model.mode
+
+ self.valid_samples_dir = output_dir / "valid_samples"
+ self.valid_samples_dir.mkdir(parents=True, exist_ok=True)
+
+ def evaluate_core(self, batch):
+ self.msg = "Evaluate: "
+ losses_dict = {}
+ # parse batch
+ wav, y, mel = batch
+ y_hat = self.model(wav, mel)
+
+ if self.mode == 'RAW':
+ y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
+ elif self.mode == 'MOL':
+ y = paddle.cast(y, dtype='float32')
+
+ y = y.unsqueeze(-1)
+ loss = self.criterion(y_hat, y)
+ report("eval/loss", float(loss))
+
+ losses_dict["loss"] = float(loss)
+
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+ self.logger.info(self.msg)
+
+ def gen_valid_samples(self):
+
+ for i, item in enumerate(self.valid_generate_loader):
+ if i >= self.config.generate_num:
+ break
+ print(
+ '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num))
+
+ mel = item['feats']
+ wav = item['wave']
+ wav = wav.squeeze(0)
+
+ origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format(
+ self.iteration, i)
+ sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs)
+
+ if self.config.inference.gen_batched:
+ batch_str = 'gen_batched_target{}_overlap{}'.format(
+ self.config.inference.target, self.config.inference.overlap)
+ else:
+ batch_str = 'gen_not_batched'
+ gen_save_path = str(self.valid_samples_dir /
+ '{}_steps_{}_{}.wav'.format(self.iteration, i,
+ batch_str))
+ # (1, T, C_aux) -> (T, C_aux)
+ mel = mel.squeeze(0)
+ gen_sample = self.model.generate(
+ mel, self.config.inference.gen_batched,
+ self.config.inference.target, self.config.inference.overlap,
+ self.config.mu_law)
+ sf.write(
+ gen_save_path, gen_sample.numpy(), samplerate=self.config.fs)
+
+ def __call__(self, trainer=None):
+ summary = self.evaluate()
+ for k, v in summary.items():
+ report(k, v)
+ # gen samples at the end of evaluate
+ self.iteration = ITERATION
+ if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
+ self.gen_valid_samples()
diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py
index c0d4f955..3abccc15 100644
--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, in_channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, out_channels, T).
+ Args:
+ x (Tensor): Input tensor (B, in_channels, T).
+ Returns:
+ Tensor: Output tensor (B, out_channels, T).
"""
return self.conv(self.pad(x))[:, :, :x.shape[2]]
@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, in_channels, T_in).
- Returns
- ----------
- Tensor
- Output tensor (B, out_channels, T_out).
+ Args:
+ x (Tensor): Input tensor (B, in_channels, T_in).
+ Returns:
+ Tensor: Output tensor (B, out_channels, T_out).
"""
return self.deconv(x)[:, :, :-self.stride]
diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py
index e4a6c8c6..185c62fb 100644
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@@ -18,12 +18,10 @@ from paddle import nn
class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model.
- Parameters
- ----------
- channels : int
- The number of channels of conv layers.
- kernel_size : int
- Kernerl size of conv layers.
+
+ Args:
+ channels (int): The number of channels of conv layers.
+ kernel_size (int): Kernerl size of conv layers.
"""
def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
@@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer):
def forward(self, x):
"""Compute convolution module.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (#batch, time, channels).
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, channels).
+
+ Args:
+ x (Tensor): Input tensor (#batch, time, channels).
+ Returns:
+ Tensor: Output tensor (#batch, time, channels).
"""
# exchange the temporal dimension and the feature dimension
x = x.transpose([0, 2, 1])
diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py
index 2949dc37..61c32612 100644
--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
class EncoderLayer(nn.Layer):
"""Encoder layer module.
- Parameters
- ----------
- size : int
- Input dimension.
- self_attn : nn.Layer
- Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
- can be used as the argument.
- feed_forward : nn.Layer
- Feed-forward module instance.
- `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
- can be used as the argument.
- feed_forward_macaron : nn.Layer
- Additional feed-forward module instance.
- `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
- can be used as the argument.
- conv_module : nn.Layer
- Convolution module instance.
- `ConvlutionModule` instance can be used as the argument.
- dropout_rate : float
- Dropout rate.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
- stochastic_depth_rate : float
- Proability to skip this layer.
- During training, the layer may skip residual computation and return input
- as-is with given probability.
+
+ Args:
+ size (int): Input dimension.
+ self_attn (nn.Layer): Self-attention module instance.
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+ can be used as the argument.
+ feed_forward (nn.Layer): Feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+ can be used as the argument.
+ feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+ can be used as the argument.
+ conv_module (nn.Layer): Convolution module instance.
+ `ConvlutionModule` instance can be used as the argument.
+ dropout_rate (float): Dropout rate.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ stochastic_depth_rate (float): Proability to skip this layer.
+ During training, the layer may skip residual computation and return input
+ as-is with given probability.
"""
def __init__(
@@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer):
def forward(self, x_input, mask, cache=None):
"""Compute encoded features.
- Parameters
- ----------
- x_input : Union[Tuple, paddle.Tensor]
- Input tensor w/ or w/o pos emb.
- - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
- - w/o pos emb: Tensor (#batch, time, size).
- mask : paddle.Tensor
- Mask tensor for the input (#batch, time).
- cache paddle.Tensor
- Cache tensor of the input (#batch, time - 1, size).
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, size).
- paddle.Tensor
- Mask tensor (#batch, time).
+
+ Args:
+ x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
+ - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+ - w/o pos emb: Tensor (#batch, time, size).
+ mask(Tensor): Mask tensor for the input (#batch, time).
+ cache (Tensor):
+
+ Returns:
+ Tensor: Output tensor (#batch, time, size).
+ Tensor: Mask tensor (#batch, time).
"""
if isinstance(x_input, tuple):
x, pos_emb = x_input[0], x_input[1]
diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py
index 68766d5e..aa875bd5 100644
--- a/paddlespeech/t2s/modules/conv.py
+++ b/paddlespeech/t2s/modules/conv.py
@@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D):
2. padding must be a causal padding (recpetive_field - 1, 0).
Thus, these arguments are removed from the ``__init__`` method of this
class.
-
- Parameters
- ----------
- in_channels: int
- The feature size of the input.
- out_channels: int
- The feature size of the output.
- kernel_size: int or Tuple[int]
- The size of the kernel.
- dilation: int or Tuple[int]
- The dilation of the convolution, by default 1
- weight_attr: ParamAttr, Initializer, str or bool, optional
- The parameter attribute of the convolution kernel, by default None.
- bias_attr: ParamAttr, Initializer, str or bool, optional
- The parameter attribute of the bias. If ``False``, this layer does not
- have a bias, by default None.
-
- Examples
- --------
- >>> cell = Conv1dCell(3, 4, kernel_size=5)
- >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
- >>> outputs = []
- >>> cell.eval()
- >>> cell.start_sequence()
- >>> for xt in inputs:
- >>> outputs.append(cell.add_input(xt))
- >>> len(outputs))
- 16
- >>> outputs[0].shape
- [4, 4]
+
+ Args:
+ in_channels (int): The feature size of the input.
+ out_channels (int): The feature size of the output.
+ kernel_size (int or Tuple[int]): The size of the kernel.
+ dilation (int or Tuple[int]): The dilation of the convolution, by default 1
+ weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel,
+ by default None.
+ bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias.
+ If ``False``, this layer does not have a bias, by default None.
+
+ Examples:
+ >>> cell = Conv1dCell(3, 4, kernel_size=5)
+ >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
+ >>> outputs = []
+ >>> cell.eval()
+ >>> cell.start_sequence()
+ >>> for xt in inputs:
+ >>> outputs.append(cell.add_input(xt))
+ >>> len(outputs)
+ 16
+ >>> outputs[0].shape
+ [4, 4]
"""
def __init__(self,
@@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D):
def start_sequence(self):
"""Prepare the layer for a series of incremental forward.
- Warnings
- ---------
- This method should be called before a sequence of calls to
- ``add_input``.
+ Warnings:
+ This method should be called before a sequence of calls to
+ ``add_input``.
- Raises
- ------
- Exception
- If this method is called when the layer is in training mode.
+ Raises:
+ Exception
+ If this method is called when the layer is in training mode.
"""
if self.training:
raise Exception("only use start_sequence in evaluation")
@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
def initialize_buffer(self, x_t):
"""Initialize the buffer for the step input.
- Parameters
- ----------
- x_t : Tensor [shape=(batch_size, in_channels)]
- The step input.
+ Args:
+ x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
"""
batch_size, _ = x_t.shape
self._buffer = paddle.zeros(
@@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D):
def update_buffer(self, x_t):
"""Shift the buffer by one step.
- Parameters
- ----------
- x_t : Tensor [shape=(batch_size, in_channels)]
- The step input.
+ Args:
+ x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
"""
self._buffer = paddle.concat(
[self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
def add_input(self, x_t):
"""Add step input and compute step output.
-
- Parameters
- -----------
- x_t : Tensor [shape=(batch_size, in_channels)]
- The step input.
-
- Returns
- -------
- y_t :Tensor [shape=(batch_size, out_channels)]
- The step output.
+
+ Args:
+ x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
+ Returns:
+ y_t (Tensor): The step output. shape=(batch_size, out_channels)
+
"""
batch_size = x_t.shape[0]
if self.receptive_field > 1:
@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
class Conv1dBatchNorm(nn.Layer):
"""A Conv1D Layer followed by a BatchNorm1D.
- Parameters
- ----------
- in_channels : int
- The feature size of the input.
- out_channels : int
- The feature size of the output.
- kernel_size : int
- The size of the convolution kernel.
- stride : int, optional
- The stride of the convolution, by default 1.
- padding : int, str or Tuple[int], optional
- The padding of the convolution.
- If int, a symmetrical padding is applied before convolution;
- If str, it should be "same" or "valid";
- If Tuple[int], its length should be 2, meaning
- ``(pad_before, pad_after)``, by default 0.
- weight_attr : ParamAttr, Initializer, str or bool, optional
- The parameter attribute of the convolution kernel, by default None.
- bias_attr : ParamAttr, Initializer, str or bool, optional
- The parameter attribute of the bias of the convolution, by default
- None.
- data_format : str ["NCL" or "NLC"], optional
- The data layout of the input, by default "NCL"
- momentum : float, optional
- The momentum of the BatchNorm1D layer, by default 0.9
- epsilon : [type], optional
- The epsilon of the BatchNorm1D layer, by default 1e-05
+ Args:
+ in_channels (int): The feature size of the input.
+ out_channels (int): The feature size of the output.
+ kernel_size (int): The size of the convolution kernel.
+ stride (int, optional): The stride of the convolution, by default 1.
+ padding (int, str or Tuple[int], optional):
+ The padding of the convolution.
+ If int, a symmetrical padding is applied before convolution;
+ If str, it should be "same" or "valid";
+ If Tuple[int], its length should be 2, meaning
+ ``(pad_before, pad_after)``, by default 0.
+ weight_attr (ParamAttr, Initializer, str or bool, optional):
+ The parameter attribute of the convolution kernel,
+ by default None.
+ bias_attr (ParamAttr, Initializer, str or bool, optional):
+ The parameter attribute of the bias of the convolution,
+ by default None.
+ data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
+ momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
+ epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
"""
def __init__(self,
@@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer):
def forward(self, x):
"""Forward pass of the Conv1dBatchNorm layer.
-
- Parameters
- ----------
- x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
- The input tensor. Its data layout depends on ``data_format``.
-
- Returns
- -------
- Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
- The output tensor.
+
+ Args:
+ x (Tensor): The input tensor. Its data layout depends on ``data_format``.
+ shape=(B, C_in, T_in) or (B, T_in, C_in)
+
+ Returns:
+ Tensor: The output tensor.
+ shape=(B, C_out, T_out) or (B, T_out, C_out)
+
"""
x = self.conv(x)
x = self.bn(x)
diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py
index a3d56f7d..01eb5ad0 100644
--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@@ -17,24 +17,18 @@ import paddle
def shuffle_dim(x, axis, perm=None):
"""Permute input tensor along aixs given the permutation or randomly.
+
+ Args:
+ x (Tensor): The input tensor.
+ axis (int): The axis to shuffle.
+ perm (List[int], ndarray, optional):
+ The order to reorder the tensor along the ``axis``-th dimension.
+ It is a permutation of ``[0, d)``, where d is the size of the
+ ``axis``-th dimension of the input tensor. If not provided,
+ a random permutation is used. Defaults to None.
- Parameters
- ----------
- x : Tensor
- The input tensor.
- axis : int
- The axis to shuffle.
- perm : List[int], ndarray, optional
- The order to reorder the tensor along the ``axis``-th dimension.
-
- It is a permutation of ``[0, d)``, where d is the size of the
- ``axis``-th dimension of the input tensor. If not provided,
- a random permutation is used. Defaults to None.
-
- Returns
- ---------
- Tensor
- The shuffled tensor, which has the same shape as x does.
+ Returns:
+ Tensor: The shuffled tensor, which has the same shape as x does.
"""
size = x.shape[axis]
if perm is not None and len(perm) != size:
diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py
index 4edd22c9..088b98e0 100644
--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@@ -18,13 +18,9 @@ from paddle import nn
class LayerNorm(nn.LayerNorm):
"""Layer normalization module.
-
- Parameters
- ----------
- nout : int
- Output dim size.
- dim : int
- Dimension to be normalized.
+ Args:
+ nout (int): Output dim size.
+ dim (int): Dimension to be normalized.
"""
def __init__(self, nout, dim=-1):
@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
def forward(self, x):
"""Apply layer normalization.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor.
+ Args:
+ x (Tensor):Input tensor.
- Returns
- ----------
- paddle.Tensor
- Normalized tensor.
+ Returns:
+ Tensor: Normalized tensor.
"""
if self.dim == -1:
diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py
index 569e96ad..93644e24 100644
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@@ -14,12 +14,419 @@
import math
import librosa
+import numpy as np
import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from scipy import signal
+from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+
+
+# Losses for WaveRNN
+def log_sum_exp(x):
+ """ numerically stable log_sum_exp implementation that prevents overflow """
+ # TF ordering
+ axis = len(x.shape) - 1
+ m = paddle.max(x, axis=axis)
+ m2 = paddle.max(x, axis=axis, keepdim=True)
+ return m + paddle.log(paddle.sum(paddle.exp(x - m2), axis=axis))
+
+
+# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
+def discretized_mix_logistic_loss(y_hat,
+ y,
+ num_classes=65536,
+ log_scale_min=None,
+ reduce=True):
+ if log_scale_min is None:
+ log_scale_min = float(np.log(1e-14))
+ y_hat = y_hat.transpose([0, 2, 1])
+ assert y_hat.dim() == 3
+ assert y_hat.shape[1] % 3 == 0
+ nr_mix = y_hat.shape[1] // 3
+
+ # (B x T x C)
+ y_hat = y_hat.transpose([0, 2, 1])
+
+ # unpack parameters. (B, T, num_mixtures) x 3
+ logit_probs = y_hat[:, :, :nr_mix]
+ means = y_hat[:, :, nr_mix:2 * nr_mix]
+ log_scales = paddle.clip(
+ y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)
+
+ # B x T x 1 -> B x T x num_mixtures
+ y = y.expand_as(means)
+ centered_y = paddle.cast(y, dtype=paddle.get_default_dtype()) - means
+ inv_stdv = paddle.exp(-log_scales)
+ plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
+ cdf_plus = F.sigmoid(plus_in)
+ min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
+ cdf_min = F.sigmoid(min_in)
+
+ # log probability for edge case of 0 (before scaling)
+ # equivalent: torch.log(F.sigmoid(plus_in))
+ # softplus: log(1+ e^{-x})
+ log_cdf_plus = plus_in - F.softplus(plus_in)
+
+ # log probability for edge case of 255 (before scaling)
+ # equivalent: (1 - F.sigmoid(min_in)).log()
+ log_one_minus_cdf_min = -F.softplus(min_in)
+
+ # probability for all other cases
+ cdf_delta = cdf_plus - cdf_min
+
+ mid_in = inv_stdv * centered_y
+ # log probability in the center of the bin, to be used in extreme cases
+ # (not actually used in our code)
+ log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
+
+ # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
+ # for num_classes=65536 case? 1e-7? not sure..
+ inner_inner_cond = cdf_delta > 1e-5
+
+ inner_inner_cond = paddle.cast(
+ inner_inner_cond, dtype=paddle.get_default_dtype())
+
+ # inner_inner_out = inner_inner_cond * \
+ # paddle.log(paddle.clip(cdf_delta, min=1e-12)) + \
+ # (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
+
+ inner_inner_out = inner_inner_cond * paddle.log(
+ paddle.clip(cdf_delta, min=1e-12)) + (1. - inner_inner_cond) * (
+ log_pdf_mid - np.log((num_classes - 1) / 2))
+
+ inner_cond = y > 0.999
+
+ inner_cond = paddle.cast(inner_cond, dtype=paddle.get_default_dtype())
+
+ inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond
+ ) * inner_inner_out
+ cond = y < -0.999
+ cond = paddle.cast(cond, dtype=paddle.get_default_dtype())
+
+ log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
+
+ if reduce:
+ return -paddle.mean(log_sum_exp(log_probs))
+ else:
+ return -log_sum_exp(log_probs).unsqueeze(-1)
+
+
+def sample_from_discretized_mix_logistic(y, log_scale_min=None):
+ """
+ Sample from discretized mixture of logistic distributions
+
+ Args:
+ y(Tensor): (B, C, T)
+ log_scale_min(float, optional): (Default value = None)
+
+ Returns:
+ Tensor: sample in range of [-1, 1].
+ """
+ if log_scale_min is None:
+ log_scale_min = float(np.log(1e-14))
+
+ assert y.shape[1] % 3 == 0
+ nr_mix = y.shape[1] // 3
+
+ # (B, T, C)
+ y = y.transpose([0, 2, 1])
+ logit_probs = y[:, :, :nr_mix]
+
+ # sample mixture indicator from softmax
+ temp = paddle.uniform(
+ logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5)
+ temp = logit_probs - paddle.log(-paddle.log(temp))
+ argmax = paddle.argmax(temp, axis=-1)
+
+ # (B, T) -> (B, T, nr_mix)
+ one_hot = F.one_hot(argmax, nr_mix)
+ one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype())
+
+ # select logistic parameters
+ means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1)
+ log_scales = paddle.clip(
+ paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1),
+ min=log_scale_min)
+ # sample from logistic & clip to interval
+ # we don't actually round to the nearest 8bit value when sampling
+ u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5)
+ x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u))
+ x = paddle.clip(x, min=-1., max=1.)
+
+ return x
+
+
+# Loss for new Tacotron2
+class GuidedAttentionLoss(nn.Layer):
+ """Guided attention loss function module.
+
+ This module calculates the guided attention loss described
+ in `Efficiently Trainable Text-to-Speech System Based
+ on Deep Convolutional Networks with Guided Attention`_,
+ which forces the attention to be diagonal.
+
+ .. _`Efficiently Trainable Text-to-Speech System
+ Based on Deep Convolutional Networks with Guided Attention`:
+ https://arxiv.org/abs/1710.08969
+
+ """
+
+ def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
+ """Initialize guided attention loss module.
+
+ Args:
+ sigma (float, optional): Standard deviation to control how close attention to a diagonal.
+ alpha (float, optional): Scaling coefficient (lambda).
+ reset_always (bool, optional): Whether to always reset masks.
+
+ """
+ super().__init__()
+ self.sigma = sigma
+ self.alpha = alpha
+ self.reset_always = reset_always
+ self.guided_attn_masks = None
+ self.masks = None
+
+ def _reset_masks(self):
+ self.guided_attn_masks = None
+ self.masks = None
+
+ def forward(self, att_ws, ilens, olens):
+ """Calculate forward propagation.
+
+ Args:
+ att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in).
+ ilens(Tensor(int64)): Batch of input lenghts (B,).
+ olens(Tensor(int64)): Batch of output lenghts (B,).
+
+ Returns:
+ Tensor: Guided attention loss value.
+
+ """
+ if self.guided_attn_masks is None:
+ self.guided_attn_masks = self._make_guided_attention_masks(ilens,
+ olens)
+ if self.masks is None:
+ self.masks = self._make_masks(ilens, olens)
+ losses = self.guided_attn_masks * att_ws
+ loss = paddle.mean(
+ losses.masked_select(self.masks.broadcast_to(losses.shape)))
+ if self.reset_always:
+ self._reset_masks()
+ return self.alpha * loss
+
+ def _make_guided_attention_masks(self, ilens, olens):
+ n_batches = len(ilens)
+ max_ilen = max(ilens)
+ max_olen = max(olens)
+ guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
+
+ for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
+ guided_attn_masks[idx, :olen, :
+ ilen] = self._make_guided_attention_mask(
+ ilen, olen, self.sigma)
+ return guided_attn_masks
+
+ @staticmethod
+ def _make_guided_attention_mask(ilen, olen, sigma):
+ """Make guided attention mask.
+
+ Examples
+ ----------
+ >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
+ >>> guided_attn_mask.shape
+ [5, 5]
+ >>> guided_attn_mask
+ tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
+ [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
+ [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
+ [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
+ [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
+ >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
+ >>> guided_attn_mask.shape
+ [6, 3]
+ >>> guided_attn_mask
+ tensor([[0.0000, 0.2934, 0.7506],
+ [0.0831, 0.0831, 0.5422],
+ [0.2934, 0.0000, 0.2934],
+ [0.5422, 0.0831, 0.0831],
+ [0.7506, 0.2934, 0.0000],
+ [0.8858, 0.5422, 0.0831]])
+
+ """
+ grid_x, grid_y = paddle.meshgrid(
+ paddle.arange(olen), paddle.arange(ilen))
+ grid_x = grid_x.cast(dtype=paddle.float32)
+ grid_y = grid_y.cast(dtype=paddle.float32)
+ return 1.0 - paddle.exp(-(
+ (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
+
+ @staticmethod
+ def _make_masks(ilens, olens):
+ """Make masks indicating non-padded part.
+
+ Args:
+ ilens(Tensor(int64) or List): Batch of lengths (B,).
+ olens(Tensor(int64) or List): Batch of lengths (B,).
+
+ Returns:
+ Tensor: Mask tensor indicating non-padded part.
+
+ Examples:
+ >>> ilens, olens = [5, 2], [8, 5]
+ >>> _make_mask(ilens, olens)
+ tensor([[[1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1],
+ [1, 1, 1, 1, 1]],
+
+ [[1, 1, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [1, 1, 0, 0, 0],
+ [0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0],
+ [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
+
+ """
+ # (B, T_in)
+ in_masks = make_non_pad_mask(ilens)
+ # (B, T_out)
+ out_masks = make_non_pad_mask(olens)
+ # (B, T_out, T_in)
+
+ return paddle.logical_and(
+ out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
+
+
+class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
+ """Guided attention loss function module for multi head attention.
+
+ Args:
+ sigma (float, optional): Standard deviation to control
+ how close attention to a diagonal.
+ alpha (float, optional): Scaling coefficient (lambda).
+ reset_always (bool, optional): Whether to always reset masks.
+
+ """
+
+ def forward(self, att_ws, ilens, olens):
+ """Calculate forward propagation.
+
+ Args:
+ att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+ ilens(Tensor): Batch of input lenghts (B,).
+ olens(Tensor): Batch of output lenghts (B,).
+
+ Returns:
+ Tensor: Guided attention loss value.
+
+ """
+ if self.guided_attn_masks is None:
+ self.guided_attn_masks = (
+ self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
+ if self.masks is None:
+ self.masks = self._make_masks(ilens, olens).unsqueeze(1)
+ losses = self.guided_attn_masks * att_ws
+ loss = paddle.mean(
+ losses.masked_select(self.masks.broadcast_to(losses.shape)))
+ if self.reset_always:
+ self._reset_masks()
+
+ return self.alpha * loss
+
+
+class Tacotron2Loss(nn.Layer):
+ """Loss function module for Tacotron2."""
+
+ def __init__(self,
+ use_masking=True,
+ use_weighted_masking=False,
+ bce_pos_weight=20.0):
+ """Initialize Tactoron2 loss module.
+
+ Args:
+ use_masking (bool): Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
+ bce_pos_weight (float): Weight of positive sample of stop token.
+ """
+ super().__init__()
+ assert (use_masking != use_weighted_masking) or not use_masking
+ self.use_masking = use_masking
+ self.use_weighted_masking = use_weighted_masking
+
+ # define criterions
+ reduction = "none" if self.use_weighted_masking else "mean"
+ self.l1_criterion = nn.L1Loss(reduction=reduction)
+ self.mse_criterion = nn.MSELoss(reduction=reduction)
+ self.bce_criterion = nn.BCEWithLogitsLoss(
+ reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
+
+ def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
+ """Calculate forward propagation.
+
+ Args:
+ after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+ before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+ logits(Tensor): Batch of stop logits (B, Lmax).
+ ys(Tensor): Batch of padded target features (B, Lmax, odim).
+ stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
+ olens(Tensor(int64)):
+
+ Returns:
+ Tensor: L1 loss value.
+ Tensor: Mean square error loss value.
+ Tensor: Binary cross entropy loss value.
+ """
+ # make mask and apply it
+ if self.use_masking:
+ masks = make_non_pad_mask(olens).unsqueeze(-1)
+ ys = ys.masked_select(masks.broadcast_to(ys.shape))
+ after_outs = after_outs.masked_select(
+ masks.broadcast_to(after_outs.shape))
+ before_outs = before_outs.masked_select(
+ masks.broadcast_to(before_outs.shape))
+ stop_labels = stop_labels.masked_select(
+ masks[:, :, 0].broadcast_to(stop_labels.shape))
+ logits = logits.masked_select(
+ masks[:, :, 0].broadcast_to(logits.shape))
+
+ # calculate loss
+ l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
+ before_outs, ys)
+ mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
+ before_outs, ys)
+ bce_loss = self.bce_criterion(logits, stop_labels)
+
+ # make weighted mask and apply it
+ if self.use_weighted_masking:
+ masks = make_non_pad_mask(olens).unsqueeze(-1)
+ weights = masks.float() / masks.sum(axis=1, keepdim=True).float()
+ out_weights = weights.divide(
+ paddle.shape(ys)[0] * paddle.shape(ys)[2])
+ logit_weights = weights.divide(paddle.shape(ys)[0])
+
+ # apply weight
+ l1_loss = l1_loss.multiply(out_weights)
+ l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum()
+ mse_loss = mse_loss.multiply(out_weights)
+ mse_loss = mse_loss.masked_select(
+ masks.broadcast_to(mse_loss)).sum()
+ bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
+ bce_loss = bce_loss.masked_select(
+ masks.squeeze(-1).broadcast_to(bce_loss)).sum()
+
+ return l1_loss, mse_loss, bce_loss
+
# Loss for Tacotron2
def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
@@ -65,28 +472,20 @@ def stft(x,
center=True,
pad_mode='reflect'):
"""Perform STFT and convert to magnitude spectrogram.
- Parameters
- ----------
- x : Tensor
- Input signal tensor (B, T).
- fft_size : int
- FFT size.
- hop_size : int
- Hop size.
- win_length : int
- window : str, optional
- window : str
- Name of window function, see `scipy.signal.get_window` for more
- details. Defaults to "hann".
- center : bool, optional
- center (bool, optional): Whether to pad `x` to make that the
- :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`.
- pad_mode : str, optional
- Choose padding pattern when `center` is `True`.
- Returns
- ----------
- Tensor:
- Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+ Args:
+ x(Tensor): Input signal tensor (B, T).
+ fft_size(int): FFT size.
+ hop_size(int): Hop size.
+        win_length(int, optional): Window length. (Default value = None)
+ window(str, optional): Name of window function, see `scipy.signal.get_window` for more
+ details. Defaults to "hann".
+        center(bool, optional): Whether to pad `x` to make that the
+ :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
+        pad_mode(str, optional): Choose padding pattern when `center` is `True`. (Default value = 'reflect')
+        hop_length(int, optional): Hop length. (Default value = None)
+
+ Returns:
+ Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
"""
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
@@ -116,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
- Parameters
- ----------
- x_mag : Tensor
- Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
- y_mag : Tensor)
- Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
- Returns
- ----------
- Tensor
- Spectral convergence loss value.
+ Args:
+ x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+ y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+ Returns:
+ Tensor: Spectral convergence loss value.
"""
return paddle.norm(
y_mag - x_mag, p="fro") / paddle.clip(
@@ -142,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
- Parameters
- ----------
- x_mag : Tensor
- Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
- y_mag : Tensor
- Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
- Returns
- ----------
- Tensor
- Log STFT magnitude loss value.
+ Args:
+ x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+ y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+ Returns:
+ Tensor: Log STFT magnitude loss value.
"""
return F.l1_loss(
paddle.log(paddle.clip(y_mag, min=self.epsilon)),
@@ -177,18 +566,12 @@ class STFTLoss(nn.Layer):
def forward(self, x, y):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Predicted signal (B, T).
- y : Tensor
- Groundtruth signal (B, T).
- Returns
- ----------
- Tensor
- Spectral convergence loss value.
- Tensor
- Log STFT magnitude loss value.
+ Args:
+ x (Tensor): Predicted signal (B, T).
+ y (Tensor): Groundtruth signal (B, T).
+ Returns:
+ Tensor: Spectral convergence loss value.
+ Tensor: Log STFT magnitude loss value.
"""
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
self.window)
@@ -210,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer):
win_lengths=[600, 1200, 240],
window="hann", ):
"""Initialize Multi resolution STFT loss module.
- Parameters
- ----------
- fft_sizes : list
- List of FFT sizes.
- hop_sizes : list
- List of hop sizes.
- win_lengths : list
- List of window lengths.
- window : str
- Window function type.
+ Args:
+ fft_sizes (list): List of FFT sizes.
+ hop_sizes (list): List of hop sizes.
+ win_lengths (list): List of window lengths.
+ window (str): Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
@@ -229,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer):
def forward(self, x, y):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Predicted signal (B, T) or (B, #subband, T).
- y : Tensor
- Groundtruth signal (B, T) or (B, #subband, T).
- Returns
- ----------
- Tensor
- Multi resolution spectral convergence loss value.
- Tensor
- Multi resolution log STFT magnitude loss value.
+
+ Args:
+ x (Tensor): Predicted signal (B, T) or (B, #subband, T).
+ y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).
+ Returns:
+ Tensor: Multi resolution spectral convergence loss value.
+ Tensor: Multi resolution log STFT magnitude loss value.
"""
if len(x.shape) == 3:
# (B, C, T) -> (B x C, T)
@@ -277,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer):
def forward(self, outputs):
"""Calcualate generator adversarial loss.
- Parameters
- ----------
- outputs: Tensor or List
- Discriminator outputs or list of discriminator outputs.
- Returns
- ----------
- Tensor
- Generator adversarial loss value.
+ Args:
+ outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.
+ Returns:
+ Tensor: Generator adversarial loss value.
"""
if isinstance(outputs, (tuple, list)):
adv_loss = 0.0
@@ -324,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
def forward(self, outputs_hat, outputs):
"""Calcualate discriminator adversarial loss.
- Parameters
- ----------
- outputs_hat : Tensor or list
- Discriminator outputs or list of
- discriminator outputs calculated from generator outputs.
- outputs : Tensor or list
- Discriminator outputs or list of
- discriminator outputs calculated from groundtruth.
- Returns
- ----------
- Tensor
- Discriminator real loss value.
- Tensor
- Discriminator fake loss value.
+
+ Args:
+ outputs_hat (Tensor or list): Discriminator outputs or list of
+ discriminator outputs calculated from generator outputs.
+ outputs (Tensor or list): Discriminator outputs or list of
+ discriminator outputs calculated from groundtruth.
+ Returns:
+ Tensor: Discriminator real loss value.
+ Tensor: Discriminator fake loss value.
"""
if isinstance(outputs, (tuple, list)):
real_loss = 0.0
@@ -420,40 +784,32 @@ def ssim(img1, img2, window_size=11, size_average=True):
def weighted_mean(input, weight):
"""Weighted mean. It can also be used as masked mean.
- Parameters
- -----------
- input : Tensor
- The input tensor.
- weight : Tensor
- The weight tensor with broadcastable shape with the input.
-
- Returns
- ----------
- Tensor [shape=(1,)]
- Weighted mean tensor with the same dtype as input.
+ Args:
+ input(Tensor): The input tensor.
+ weight(Tensor): The weight tensor with broadcastable shape with the input.
+
+ Returns:
+ Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
+
"""
weight = paddle.cast(weight, input.dtype)
- broadcast_ratio = input.size / weight.size
+ # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__
+ broadcast_ratio = input.numel() / weight.numel()
return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio)
def masked_l1_loss(prediction, target, mask):
"""Compute maksed L1 loss.
- Parameters
- ----------
- prediction : Tensor
- The prediction.
- target : Tensor
- The target. The shape should be broadcastable to ``prediction``.
- mask : Tensor
- The mask. The shape should be broadcatable to the broadcasted shape of
- ``prediction`` and ``target``.
-
- Returns
- -------
- Tensor [shape=(1,)]
- The masked L1 loss.
+ Args:
+ prediction(Tensor): The prediction.
+ target(Tensor): The target. The shape should be broadcastable to ``prediction``.
+            mask(Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
+ ``prediction`` and ``target``.
+
+ Returns:
+ Tensor: The masked L1 loss. shape=(1,)
+
"""
abs_error = F.l1_loss(prediction, target, reduction='none')
loss = weighted_mean(abs_error, mask)
@@ -526,14 +882,11 @@ class MelSpectrogram(nn.Layer):
def forward(self, x):
"""Calculate Mel-spectrogram.
- Parameters
- ----------
- x : Tensor
- Input waveform tensor (B, T) or (B, 1, T).
- Returns
- ----------
- Tensor
- Mel-spectrogram (B, #mels, #frames).
+ Args:
+
+ x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
+ Returns:
+ Tensor: Mel-spectrogram (B, #mels, #frames).
"""
if len(x.shape) == 3:
# (B, C, T) -> (B*C, T)
@@ -598,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
def forward(self, y_hat, y):
"""Calculate Mel-spectrogram loss.
- Parameters
- ----------
- y_hat : Tensor
- Generated single tensor (B, 1, T).
- y : Tensor
- Groundtruth single tensor (B, 1, T).
- Returns
- ----------
- Tensor
- Mel-spectrogram loss value.
+ Args:
+ y_hat(Tensor): Generated single tensor (B, 1, T).
+ y(Tensor): Groundtruth single tensor (B, 1, T).
+
+ Returns:
+ Tensor: Mel-spectrogram loss value.
"""
mel_hat = self.mel_spectrogram(y_hat)
mel = self.mel_spectrogram(y)
@@ -632,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
def forward(self, feats_hat, feats):
"""Calcualate feature matching loss.
- Parameters
- ----------
- feats_hat : list
- List of list of discriminator outputs
- calcuated from generater outputs.
- feats : list
- List of list of discriminator outputs
- calcuated from groundtruth.
- Returns
- ----------
- Tensor
- Feature matching loss value.
+
+ Args:
+            feats_hat(list): List of list of discriminator outputs
+                calculated from generator outputs.
+            feats(list): List of list of discriminator outputs
+                calculated from groundtruth.
+ Returns:
+ Tensor: Feature matching loss value.
"""
feat_match_loss = 0.0
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 3822b33d..4207d316 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -20,27 +20,21 @@ from typeguard import check_argument_types
def pad_list(xs, pad_value):
"""Perform padding for the list of tensors.
- Parameters
- ----------
- xs : List[Tensor]
- List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
- pad_value : float)
- Value for padding.
-
- Returns
- ----------
- Tensor
- Padded tensor (B, Tmax, `*`).
-
- Examples
- ----------
- >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
- >>> x
- [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
- >>> pad_list(x, 0)
- tensor([[1., 1., 1., 1.],
- [1., 1., 0., 0.],
- [1., 0., 0., 0.]])
+ Args:
+ xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+ pad_value (float): Value for padding.
+
+ Returns:
+ Tensor: Padded tensor (B, Tmax, `*`).
+
+ Examples:
+ >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
+ >>> x
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
+ >>> pad_list(x, 0)
+ tensor([[1., 1., 1., 1.],
+ [1., 1., 0., 0.],
+ [1., 0., 0., 0.]])
"""
n_batch = len(xs)
max_len = max(x.shape[0] for x in xs)
@@ -55,25 +49,20 @@ def pad_list(xs, pad_value):
def make_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of padded part.
- Parameters
- ----------
- lengths : LongTensor
- Batch of lengths (B,).
-
- Returns
- ----------
- Tensor(bool)
- Mask tensor containing indices of padded part bool.
-
- Examples
- ----------
- With only lengths.
-
- >>> lengths = [5, 3, 2]
- >>> make_non_pad_mask(lengths)
- masks = [[0, 0, 0, 0 ,0],
- [0, 0, 0, 1, 1],
- [0, 0, 1, 1, 1]]
+ Args:
+ lengths (Tensor(int64)): Batch of lengths (B,).
+
+ Returns:
+ Tensor(bool): Mask tensor containing indices of padded part bool.
+
+ Examples:
+ With only lengths.
+
+ >>> lengths = [5, 3, 2]
+        >>> make_pad_mask(lengths)
+ masks = [[0, 0, 0, 0 ,0],
+ [0, 0, 0, 1, 1],
+ [0, 0, 1, 1, 1]]
"""
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1):
def make_non_pad_mask(lengths, length_dim=-1):
"""Make mask tensor containing indices of non-padded part.
- Parameters
- ----------
- lengths : LongTensor or List
- Batch of lengths (B,).
- xs : Tensor, optional
- The reference tensor.
- If set, masks will be the same shape as this tensor.
- length_dim : int, optional
- Dimension indicator of the above tensor.
- See the example.
-
- Returns
- ----------
- Tensor(bool)
- mask tensor containing indices of padded part bool.
-
- Examples
- ----------
- With only lengths.
-
- >>> lengths = [5, 3, 2]
- >>> make_non_pad_mask(lengths)
- masks = [[1, 1, 1, 1 ,1],
- [1, 1, 1, 0, 0],
- [1, 1, 0, 0, 0]]
+ Args:
+ lengths (Tensor(int64) or List): Batch of lengths (B,).
+ xs (Tensor, optional): The reference tensor.
+ If set, masks will be the same shape as this tensor.
+ length_dim (int, optional): Dimension indicator of the above tensor.
+ See the example.
+
+ Returns:
+ Tensor(bool): mask tensor containing indices of padded part bool.
+
+ Examples:
+ With only lengths.
+
+ >>> lengths = [5, 3, 2]
+ >>> make_non_pad_mask(lengths)
+ masks = [[1, 1, 1, 1 ,1],
+ [1, 1, 1, 0, 0],
+ [1, 1, 0, 0, 0]]
"""
return paddle.logical_not(make_pad_mask(lengths, length_dim))
@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):
Custom initialization routines can be implemented into submodules
- Parameters
- ----------
- model : nn.Layer
- Target.
- init : str
- Method of initialization.
+ Args:
+ model (nn.Layer): Target.
+ init (str): Method of initialization.
"""
assert check_argument_types()
diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py
index fb850a4d..9860da90 100644
--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
"""Design prototype filter for PQMF.
This method is based on `A Kaiser window approach for the design of prototype
filters of cosine modulated filterbanks`_.
- Parameters
- ----------
- taps : int
- The number of filter taps.
- cutoff_ratio : float
- Cut-off frequency ratio.
- beta : float
- Beta coefficient for kaiser window.
- Returns
- ----------
- ndarray
- Impluse response of prototype filter (taps + 1,).
- .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
- https://ieeexplore.ieee.org/abstract/document/681427
+
+ Args:
+ taps (int): The number of filter taps.
+ cutoff_ratio (float): Cut-off frequency ratio.
+ beta (float): Beta coefficient for kaiser window.
+ Returns:
+ ndarray:
+            Impulse response of prototype filter (taps + 1,).
+ .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+ https://ieeexplore.ieee.org/abstract/document/681427
"""
# check the arguments are valid
assert taps % 2 == 0, "The number of taps mush be even number."
@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
"""Initilize PQMF module.
The cutoff_ratio and beta parameters are optimized for #subbands = 4.
See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
- Parameters
- ----------
- subbands : int
- The number of subbands.
- taps : int
- The number of filter taps.
- cutoff_ratio : float
- Cut-off frequency ratio.
- beta : float
- Beta coefficient for kaiser window.
+
+ Args:
+ subbands (int): The number of subbands.
+ taps (int): The number of filter taps.
+ cutoff_ratio (float): Cut-off frequency ratio.
+ beta (float): Beta coefficient for kaiser window.
"""
super().__init__()
@@ -110,28 +102,20 @@ class PQMF(nn.Layer):
def analysis(self, x):
"""Analysis with PQMF.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, 1, T).
- Returns
- ----------
- Tensor
- Output tensor (B, subbands, T // subbands).
+ Args:
+ x (Tensor): Input tensor (B, 1, T).
+ Returns:
+ Tensor: Output tensor (B, subbands, T // subbands).
"""
x = F.conv1d(self.pad_fn(x), self.analysis_filter)
return F.conv1d(x, self.updown_filter, stride=self.subbands)
def synthesis(self, x):
"""Synthesis with PQMF.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, subbands, T // subbands).
- Returns
- ----------
- Tensor
- Output tensor (B, 1, T).
+ Args:
+ x (Tensor): Input tensor (B, subbands, T // subbands).
+ Returns:
+ Tensor: Output tensor (B, 1, T).
"""
x = F.conv1d_transpose(
x, self.updown_filter * self.subbands, stride=self.subbands)
diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py
index 6b7c6a6b..33ed575b 100644
--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
offset=1.0):
"""Initilize duration predictor module.
- Parameters
- ----------
- idim : int
- Input dimension.
- n_layers : int, optional
- Number of convolutional layers.
- n_chans : int, optional
- Number of channels of convolutional layers.
- kernel_size : int, optional
- Kernel size of convolutional layers.
- dropout_rate : float, optional
- Dropout rate.
- offset : float, optional
- Offset value to avoid nan in log domain.
+ Args:
+            idim (int): Input dimension.
+ n_layers (int, optional): Number of convolutional layers.
+ n_chans (int, optional): Number of channels of convolutional layers.
+ kernel_size (int, optional): Kernel size of convolutional layers.
+ dropout_rate (float, optional): Dropout rate.
+ offset (float, optional): Offset value to avoid nan in log domain.
"""
super().__init__()
@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
def forward(self, xs, x_masks=None):
"""Calculate forward propagation.
+ Args:
+ xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
- Parameters
- ----------
- xs : Tensor
- Batch of input sequences (B, Tmax, idim).
- x_masks : ByteTensor, optional
- Batch of masks indicating padded part (B, Tmax).
-
- Returns
- ----------
- Tensor
- Batch of predicted durations in log domain (B, Tmax).
+ Returns:
+ Tensor: Batch of predicted durations in log domain (B, Tmax).
"""
return self._forward(xs, x_masks, False)
def inference(self, xs, x_masks=None):
"""Inference duration.
+ Args:
+ xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
- Parameters
- ----------
- xs : Tensor
- Batch of input sequences (B, Tmax, idim).
- x_masks : Tensor(bool), optional
- Batch of masks indicating padded part (B, Tmax).
-
- Returns
- ----------
- Tensor
- Batch of predicted durations in linear domain int64 (B, Tmax).
+ Returns:
+ Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
"""
return self._forward(xs, x_masks, True)
@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
def __init__(self, offset=1.0, reduction="mean"):
"""Initilize duration predictor loss module.
-
- Parameters
- ----------
- offset : float, optional
- Offset value to avoid nan in log domain.
- reduction : str
- Reduction type in loss calculation.
+ Args:
+ offset (float, optional): Offset value to avoid nan in log domain.
+ reduction (str): Reduction type in loss calculation.
"""
super().__init__()
self.criterion = nn.MSELoss(reduction=reduction)
@@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer):
def forward(self, outputs, targets):
"""Calculate forward propagation.
- Parameters
- ----------
- outputs : Tensor
- Batch of prediction durations in log domain (B, T)
- targets : Tensor
- Batch of groundtruth durations in linear domain (B, T)
-
- Returns
- ----------
- Tensor
- Mean squared error loss value.
-
- Note
- ----------
- `outputs` is in log domain but `targets` is in linear domain.
+ Args:
+ outputs(Tensor): Batch of prediction durations in log domain (B, T)
+ targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
+
+ Returns:
+ Tensor: Mean squared error loss value.
+
+ Note:
+ `outputs` is in log domain but `targets` is in linear domain.
"""
# NOTE: outputs is in log domain while targets in linear
targets = paddle.log(targets.cast(dtype='float32') + self.offset)
diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py
index f1ecfb7c..62d707d2 100644
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Length regulator related modules."""
+import numpy as np
import paddle
from paddle import nn
@@ -34,15 +35,35 @@ class LengthRegulator(nn.Layer):
def __init__(self, pad_value=0.0):
"""Initilize length regulator module.
- Parameters
- ----------
- pad_value : float, optional
- Value used for padding.
+ Args:
+ pad_value (float, optional): Value used for padding.
"""
super().__init__()
self.pad_value = pad_value
+ # expand_numpy is faster than expand
+ def expand_numpy(self, encodings: paddle.Tensor,
+ durations: paddle.Tensor) -> paddle.Tensor:
+ """
+ encodings: (B, T, C)
+ durations: (B, T)
+ """
+ batch_size, t_enc = durations.shape
+ durations = durations.numpy()
+ slens = np.sum(durations, -1)
+ t_dec = np.max(slens)
+ M = np.zeros([batch_size, t_dec, t_enc])
+ for i in range(batch_size):
+ k = 0
+ for j in range(t_enc):
+ d = durations[i, j]
+ M[i, k:k + d, j] = 1
+ k += d
+ M = paddle.to_tensor(M, dtype=encodings.dtype)
+ encodings = paddle.matmul(M, encodings)
+ return encodings
+
def expand(self, encodings: paddle.Tensor,
durations: paddle.Tensor) -> paddle.Tensor:
"""
@@ -50,39 +71,37 @@ class LengthRegulator(nn.Layer):
durations: (B, T)
"""
batch_size, t_enc = paddle.shape(durations)
- slens = durations.sum(-1)
- t_dec = slens.max()
+ slens = paddle.sum(durations, -1)
+ t_dec = paddle.max(slens)
M = paddle.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
k = 0
for j in range(t_enc):
d = durations[i, j]
+ # If the d == 0, slice action is meaningless and not supported in paddle
if d >= 1:
M[i, k:k + d, j] = 1
k += d
encodings = paddle.matmul(M, encodings)
return encodings
- def forward(self, xs, ds, alpha=1.0):
+ def forward(self, xs, ds, alpha=1.0, is_inference=False):
"""Calculate forward propagation.
- Parameters
- ----------
- xs : Tensor
- Batch of sequences of char or phoneme embeddings (B, Tmax, D).
- ds : Tensor(int64)
- Batch of durations of each frame (B, T).
- alpha : float, optional
- Alpha value to control speed of speech.
+ Args:
+ xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
+ ds (Tensor(int64)): Batch of durations of each frame (B, T).
+ alpha (float, optional): Alpha value to control speed of speech.
- Returns
- ----------
- Tensor
- replicated input tensor based on durations (B, T*, D).
+ Returns:
+ Tensor: replicated input tensor based on durations (B, T*, D).
"""
if alpha != 1.0:
assert alpha > 0
ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
ds = ds.cast(dtype=paddle.int64)
- return self.expand(xs, ds)
+ if is_inference:
+ return self.expand(xs, ds)
+ else:
+ return self.expand_numpy(xs, ds)
diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py
index 417fca82..8afbf257 100644
--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer):
dropout_rate: float=0.5, ):
"""Initilize duration predictor module.
- Parameters
- ----------
- idim : int
- Input dimension.
- n_layers : int, optional
- Number of convolutional layers.
- n_chans : int, optional
- Number of channels of convolutional layers.
- kernel_size : int, optional
- Kernel size of convolutional layers.
- dropout_rate : float, optional
- Dropout rate.
+ Args:
+ idim (int): Input dimension.
+ n_layers (int, optional): Number of convolutional layers.
+ n_chans (int, optional): Number of channels of convolutional layers.
+ kernel_size (int, optional): Kernel size of convolutional layers.
+ dropout_rate (float, optional): Dropout rate.
"""
assert check_argument_types()
super().__init__()
@@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer):
x_masks: paddle.Tensor=None) -> paddle.Tensor:
"""Calculate forward propagation.
- Parameters
- ----------
- xs : Tensor
- Batch of input sequences (B, Tmax, idim).
- x_masks : Tensor(bool), optional
- Batch of masks indicating padded part (B, Tmax, 1).
+ Args:
+ xs (Tensor): Batch of input sequences (B, Tmax, idim).
+ x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1).
- Returns
- ----------
- Tensor
- Batch of predicted sequences (B, Tmax, 1).
+ Returns:
+ Tensor: Batch of predicted sequences (B, Tmax, 1).
"""
# (B, idim, Tmax)
xs = xs.transpose([0, 2, 1])
diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py
index a96a8946..efbfce27 100644
--- a/paddlespeech/t2s/modules/residual_block.py
+++ b/paddlespeech/t2s/modules/residual_block.py
@@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer):
unit and parametric redidual and skip connections. For more details,
refer to `WaveNet: A Generative Model for Raw Audio `_.
- Parameters
- ----------
- kernel_size : int, optional
- Kernel size of the 1D convolution, by default 3
- residual_channels : int, optional
- Feature size of the resiaudl output(and also the input), by default 64
- gate_channels : int, optional
- Output feature size of the 1D convolution, by default 128
- skip_channels : int, optional
- Feature size of the skip output, by default 64
- aux_channels : int, optional
- Feature size of the auxiliary input (e.g. spectrogram), by default 80
- dropout : float, optional
- Probability of the dropout before the 1D convolution, by default 0.
- dilation : int, optional
- Dilation of the 1D convolution, by default 1
- bias : bool, optional
- Whether to use bias in the 1D convolution, by default True
- use_causal_conv : bool, optional
- Whether to use causal padding for the 1D convolution, by default False
+ Args:
+ kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
+        residual_channels (int, optional): Feature size of the residual output (and also the input), by default 64
+ gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
+ skip_channels (int, optional): Feature size of the skip output, by default 64
+ aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
+ dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
+ dilation (int, optional): Dilation of the 1D convolution, by default 1
+ bias (bool, optional): Whether to use bias in the 1D convolution, by default True
+ use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
"""
def __init__(self,
@@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer):
def forward(self, x, c):
"""
- Parameters
- ----------
- x : Tensor
- Shape (N, C_res, T), the input features.
- c : Tensor
- Shape (N, C_aux, T), the auxiliary input.
-
- Returns
- -------
- res : Tensor
- Shape (N, C_res, T), the residual output, which is used as the
- input of the next ResidualBlock in a stack of ResidualBlocks.
- skip : Tensor
- Shape (N, C_skip, T), the skip output, which is collected among
- each layer in a stack of ResidualBlocks.
+ Args:
+ x (Tensor): the input features. Shape (N, C_res, T)
+ c (Tensor): the auxiliary input. Shape (N, C_aux, T)
+
+ Returns:
+ res (Tensor): Shape (N, C_res, T), the residual output, which is used as the
+ input of the next ResidualBlock in a stack of ResidualBlocks.
+ skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
+ each layer in a stack of ResidualBlocks.
"""
x_input = x
x = F.dropout(x, self.dropout, training=self.training)
@@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer):
nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1},
):
"""Initialize HiFiGANResidualBlock module.
- Parameters
- ----------
- kernel_size : int
- Kernel size of dilation convolution layer.
- channels : int
- Number of channels for convolution layer.
- dilations : List[int]
- List of dilation factors.
- use_additional_convs : bool
- Whether to use additional convolution layers.
- bias : bool
- Whether to add bias parameter in convolution layers.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : dict
- Hyperparameters for activation function.
+ Args:
+ kernel_size (int): Kernel size of dilation convolution layer.
+ channels (int): Number of channels for convolution layer.
+ dilations (List[int]): List of dilation factors.
+ use_additional_convs (bool): Whether to use additional convolution layers.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
"""
super().__init__()
@@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, channels, T).
+ Args:
+ x (Tensor): Input tensor (B, channels, T).
+ Returns:
+ Tensor: Output tensor (B, channels, T).
"""
for idx in range(len(self.convs1)):
xt = self.convs1[idx](x)
diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py
index c885dfe9..0d949b56 100644
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
@@ -37,26 +37,17 @@ class ResidualStack(nn.Layer):
pad_params: Dict[str, Any]={"mode": "reflect"},
use_causal_conv: bool=False, ):
"""Initialize ResidualStack module.
- Parameters
- ----------
- kernel_size : int
- Kernel size of dilation convolution layer.
- channels : int
- Number of channels of convolution layers.
- dilation : int
- Dilation factor.
- bias : bool
- Whether to add bias parameter in convolution layers.
- nonlinear_activation : str
- Activation function module name.
- nonlinear_activation_params : Dict[str,Any]
- Hyperparameters for activation function.
- pad : str
- Padding function module name before dilated convolution layer.
- pad_params : Dict[str, Any]
- Hyperparameters for padding function.
- use_causal_conv : bool
- Whether to use causal convolution.
+
+ Args:
+ kernel_size (int): Kernel size of dilation convolution layer.
+ channels (int): Number of channels of convolution layers.
+ dilation (int): Dilation factor.
+ bias (bool): Whether to add bias parameter in convolution layers.
+ nonlinear_activation (str): Activation function module name.
+ nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
+ pad (str): Padding function module name before dilated convolution layer.
+ pad_params (Dict[str, Any]): Hyperparameters for padding function.
+ use_causal_conv (bool): Whether to use causal convolution.
"""
super().__init__()
# for compatibility
@@ -102,13 +93,10 @@ class ResidualStack(nn.Layer):
def forward(self, c):
"""Calculate forward propagation.
- Parameters
- ----------
- c : Tensor
- Input tensor (B, channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, chennels, T).
+
+ Args:
+ c (Tensor): Input tensor (B, channels, T).
+ Returns:
+        Tensor: Output tensor (B, channels, T).
"""
return self.stack(c) + self.skip_layer(c)
diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py
index 9d4b83a2..49091eac 100644
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer):
.. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
- Parameters
- ----------
- idim : int, optional
- Dimension of the input mel-spectrogram.
- gst_tokens : int, optional
- The number of GST embeddings.
- gst_token_dim : int, optional
- Dimension of each GST embedding.
- gst_heads : int, optional
- The number of heads in GST multihead attention.
- conv_layers : int, optional
- The number of conv layers in the reference encoder.
- conv_chans_list : Sequence[int], optional
- List of the number of channels of conv layers in the referece encoder.
- conv_kernel_size : int, optional
- Kernal size of conv layers in the reference encoder.
- conv_stride : int, optional
- Stride size of conv layers in the reference encoder.
- gru_layers : int, optional
- The number of GRU layers in the reference encoder.
- gru_units : int, optional
- The number of GRU units in the reference encoder.
-
- Todo
- ----------
- * Support manual weight specification in inference.
+
+ Args:
+ idim (int, optional): Dimension of the input mel-spectrogram.
+ gst_tokens (int, optional): The number of GST embeddings.
+ gst_token_dim (int, optional): Dimension of each GST embedding.
+ gst_heads (int, optional): The number of heads in GST multihead attention.
+ conv_layers (int, optional): The number of conv layers in the reference encoder.
+ conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+ conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+ conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+ gru_layers (int, optional): The number of GRU layers in the reference encoder.
+ gru_units (int, optional): The number of GRU units in the reference encoder.
+
+ Todo:
+ * Support manual weight specification in inference.
"""
@@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer):
def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
"""Calculate forward propagation.
- Parameters
- ----------
- speech : Tensor
- Batch of padded target features (B, Lmax, odim).
+ Args:
+ speech (Tensor): Batch of padded target features (B, Lmax, odim).
- Returns
- ----------
- Tensor:
- Style token embeddings (B, token_dim).
+ Returns:
+ Tensor: Style token embeddings (B, token_dim).
"""
ref_embs = self.ref_enc(speech)
@@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer):
.. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
- Parameters
- ----------
- idim : int, optional
- Dimension of the input mel-spectrogram.
- conv_layers : int, optional
- The number of conv layers in the reference encoder.
- conv_chans_list: : Sequence[int], optional
- List of the number of channels of conv layers in the referece encoder.
- conv_kernel_size : int, optional
- Kernal size of conv layers in the reference encoder.
- conv_stride : int, optional
- Stride size of conv layers in the reference encoder.
- gru_layers : int, optional
- The number of GRU layers in the reference encoder.
- gru_units : int, optional
- The number of GRU units in the reference encoder.
+
+ Args:
+ idim (int, optional): Dimension of the input mel-spectrogram.
+ conv_layers (int, optional): The number of conv layers in the reference encoder.
+ conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+ conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+ conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+ gru_layers (int, optional): The number of GRU layers in the reference encoder.
+ gru_units (int, optional): The number of GRU units in the reference encoder.
"""
@@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer):
def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
"""Calculate forward propagation.
+ Args:
+ speech (Tensor): Batch of padded target features (B, Lmax, idim).
- Parameters
- ----------
- speech : Tensor
- Batch of padded target features (B, Lmax, idim).
-
- Return
- ----------
- Tensor
- Reference embedding (B, gru_units)
+ Returns:
+ Tensor: Reference embedding (B, gru_units)
"""
batch_size = speech.shape[0]
@@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer):
.. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
- Parameters
- ----------
- ref_embed_dim : int, optional
- Dimension of the input reference embedding.
- gst_tokens : int, optional
- The number of GST embeddings.
- gst_token_dim : int, optional
- Dimension of each GST embedding.
- gst_heads : int, optional
- The number of heads in GST multihead attention.
- dropout_rate : float, optional
- Dropout rate in multi-head attention.
+ Args:
+ ref_embed_dim (int, optional): Dimension of the input reference embedding.
+ gst_tokens (int, optional): The number of GST embeddings.
+ gst_token_dim (int, optional): Dimension of each GST embedding.
+ gst_heads (int, optional): The number of heads in GST multihead attention.
+ dropout_rate (float, optional): Dropout rate in multi-head attention.
"""
@@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer):
def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor:
"""Calculate forward propagation.
- Parameters
- ----------
- ref_embs : Tensor
- Reference embeddings (B, ref_embed_dim).
+ Args:
+ ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).
- Returns
- ----------
- Tensor
- Style token embeddings (B, gst_token_dim).
+ Returns:
+ Tensor: Style token embeddings (B, gst_token_dim).
"""
batch_size = ref_embs.shape[0]
diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py
new file mode 100644
index 00000000..a6fde742
--- /dev/null
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@@ -0,0 +1,454 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Attention modules for RNN."""
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+
+
+def _apply_attention_constraint(e,
+ last_attended_idx,
+ backward_window=1,
+ forward_window=3):
+ """Apply monotonic attention constraint.
+
+ This function apply the monotonic attention constraint
+ introduced in `Deep Voice 3: Scaling
+ Text-to-Speech with Convolutional Sequence Learning`_.
+
+ Args:
+ e(Tensor): Attention energy before applying softmax (1, T).
+ last_attended_idx(int): The index of the inputs of the last attended [0, T].
+ backward_window(int, optional): Backward window size in attention constraint. (Default value = 1)
+ forward_window(int, optional): Forward window size in attention constraint. (Default value = 3)
+
+ Returns:
+ Tensor: Monotonic constrained attention energy (1, T).
+
+ .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
+ https://arxiv.org/abs/1710.07654
+
+ """
+ if paddle.shape(e)[0] != 1:
+ raise NotImplementedError(
+ "Batch attention constraining is not yet supported.")
+ backward_idx = last_attended_idx - backward_window
+ forward_idx = last_attended_idx + forward_window
+ if backward_idx > 0:
+ e[:, :backward_idx] = -float("inf")
+ if forward_idx < paddle.shape(e)[1]:
+ e[:, forward_idx:] = -float("inf")
+ return e
+
+
+class AttLoc(nn.Layer):
+ """location-aware attention module.
+
+ Reference: Attention-Based Models for Speech Recognition
+ (https://arxiv.org/pdf/1506.07503.pdf)
+
+ Args:
+ eprojs (int): projection-units of encoder
+ dunits (int): units of decoder
+ att_dim (int): attention dimension
+ aconv_chans (int): channels of attention convolution
+ aconv_filts (int): filter size of attention convolution
+ han_mode (bool): flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
+ """
+
+ def __init__(self,
+ eprojs,
+ dunits,
+ att_dim,
+ aconv_chans,
+ aconv_filts,
+ han_mode=False):
+ super().__init__()
+ self.mlp_enc = nn.Linear(eprojs, att_dim)
+ self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
+ self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
+ self.loc_conv = nn.Conv2D(
+ 1,
+ aconv_chans,
+ (1, 2 * aconv_filts + 1),
+ padding=(0, aconv_filts),
+ bias_attr=False, )
+ self.gvec = nn.Linear(att_dim, 1)
+
+ self.dunits = dunits
+ self.eprojs = eprojs
+ self.att_dim = att_dim
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+ self.han_mode = han_mode
+
+ def reset(self):
+ """reset states"""
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+
+ def forward(
+ self,
+ enc_hs_pad,
+ enc_hs_len,
+ dec_z,
+ att_prev,
+ scaling=2.0,
+ last_attended_idx=None,
+ backward_window=1,
+ forward_window=3, ):
+ """Calculate AttLoc forward propagation.
+ Args:
+ enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+ enc_hs_len(Tensor): padded encoder hidden state length (B)
+ dec_z(Tensor): decoder hidden state (B, D_dec)
+ att_prev(Tensor): previous attention weight (B, T_max)
+ scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
+ forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3)
+ last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+ backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+ forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+ Returns:
+ Tensor: attention weighted encoder state (B, D_enc)
+ Tensor: previous attention weights (B, T_max)
+ """
+ batch = paddle.shape(enc_hs_pad)[0]
+ # pre-compute all h outside the decoder loop
+ if self.pre_compute_enc_h is None or self.han_mode:
+ # (utt, frame, hdim)
+ self.enc_h = enc_hs_pad
+ self.h_length = paddle.shape(self.enc_h)[1]
+ # (utt, frame, att_dim)
+ self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
+
+ if dec_z is None:
+ dec_z = paddle.zeros([batch, self.dunits])
+ else:
+ dec_z = dec_z.reshape([batch, self.dunits])
+
+ # initialize attention weight with uniform dist.
+ if paddle.sum(att_prev) == 0:
+ # if no bias, 0 0-pad goes 0
+ att_prev = 1.0 - make_pad_mask(enc_hs_len)
+ att_prev = att_prev / enc_hs_len.unsqueeze(-1)
+
+ # att_prev: (utt, frame) -> (utt, 1, 1, frame)
+ # -> (utt, att_conv_chans, 1, frame)
+ att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
+ # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans)
+ att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
+ # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim)
+ att_conv = self.mlp_att(att_conv)
+ # dec_z_tiled: (utt, frame, att_dim)
+ dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
+
+ # dot with gvec
+ # (utt, frame, att_dim) -> (utt, frame)
+ e = paddle.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)
+ e = self.gvec(e).squeeze(2)
+
+ # NOTE: consider zero padding when compute w.
+ if self.mask is None:
+ self.mask = make_pad_mask(enc_hs_len)
+
+ e = masked_fill(e, self.mask, -float("inf"))
+ # apply monotonic attention constraint (mainly for TTS)
+ if last_attended_idx is not None:
+ e = _apply_attention_constraint(e, last_attended_idx,
+ backward_window, forward_window)
+
+ w = F.softmax(scaling * e, axis=1)
+
+ # weighted sum over frames
+ # utt x hdim
+ c = paddle.sum(
+ self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
+ return c, w
+
+
+class AttForward(nn.Layer):
+ """Forward attention module.
+ Reference:
+
+ Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
+ (https://arxiv.org/pdf/1807.06736.pdf)
+
+ Args:
+ eprojs (int): projection-units of encoder
+ dunits (int): units of decoder
+ att_dim (int): attention dimension
+ aconv_chans (int): channels of attention convolution
+ aconv_filts (int): filter size of attention convolution
+ """
+
+ def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
+ super().__init__()
+ self.mlp_enc = nn.Linear(eprojs, att_dim)
+ self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
+ self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
+ self.loc_conv = nn.Conv2D(
+ 1,
+ aconv_chans,
+ (1, 2 * aconv_filts + 1),
+ padding=(0, aconv_filts),
+ bias_attr=False, )
+ self.gvec = nn.Linear(att_dim, 1)
+ self.dunits = dunits
+ self.eprojs = eprojs
+ self.att_dim = att_dim
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+
+ def reset(self):
+ """reset states"""
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+
+ def forward(
+ self,
+ enc_hs_pad,
+ enc_hs_len,
+ dec_z,
+ att_prev,
+ scaling=1.0,
+ last_attended_idx=None,
+ backward_window=1,
+ forward_window=3, ):
+ """Calculate AttForward forward propagation.
+
+ Args:
+ enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+ enc_hs_len(list): padded encoder hidden state length (B,)
+ dec_z(Tensor): decoder hidden state (B, D_dec)
+ att_prev(Tensor): attention weights of previous step (B, T_max)
+ scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+ last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+ backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+ forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+ Returns:
+ Tensor: attention weighted encoder state (B, D_enc)
+ Tensor: previous attention weights (B, T_max)
+ """
+ batch = len(enc_hs_pad)
+ # pre-compute all h outside the decoder loop
+ if self.pre_compute_enc_h is None:
+ self.enc_h = enc_hs_pad # utt x frame x hdim
+ self.h_length = paddle.shape(self.enc_h)[1]
+ # utt x frame x att_dim
+ self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
+
+ if dec_z is None:
+ dec_z = paddle.zeros([batch, self.dunits])
+ else:
+ dec_z = dec_z.reshape([batch, self.dunits])
+
+ if att_prev is None:
+ # initial attention will be [1, 0, 0, ...]
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
+ att_prev[:, 0] = 1.0
+
+ # att_prev: utt x frame -> utt x 1 x 1 x frame
+ # -> utt x att_conv_chans x 1 x frame
+ att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
+ # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
+ att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
+ # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
+ att_conv = self.mlp_att(att_conv)
+
+ # dec_z_tiled: utt x frame x att_dim
+ dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1)
+
+ # dot with gvec
+ # utt x frame x att_dim -> utt x frame
+ e = self.gvec(
+ paddle.tanh(self.pre_compute_enc_h + dec_z_tiled +
+ att_conv)).squeeze(2)
+
+ # NOTE: consider zero padding when compute w.
+ if self.mask is None:
+ self.mask = make_pad_mask(enc_hs_len)
+ e = masked_fill(e, self.mask, -float("inf"))
+
+ # apply monotonic attention constraint (mainly for TTS)
+ if last_attended_idx is not None:
+ e = _apply_attention_constraint(e, last_attended_idx,
+ backward_window, forward_window)
+
+ w = F.softmax(scaling * e, axis=1)
+
+ # forward attention
+ att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
+
+ w = (att_prev + att_prev_shift) * w
+ # NOTE: clip is needed to avoid nan gradient
+ w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
+
+ # weighted sum over frames
+ # utt x hdim
+ # NOTE use bmm instead of sum(*)
+ c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1)
+
+ return c, w
+
+
+class AttForwardTA(nn.Layer):
+ """Forward attention with transition agent module.
+ Reference:
+ Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
+ (https://arxiv.org/pdf/1807.06736.pdf)
+
+ Args:
+ eunits (int): units of encoder
+ dunits (int): units of decoder
+ att_dim (int): attention dimension
+ aconv_chans (int): channels of attention convolution
+ aconv_filts (int): filter size of attention convolution
+ odim (int): output dimension
+ """
+
+ def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
+ super().__init__()
+ self.mlp_enc = nn.Linear(eunits, att_dim)
+ self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
+ self.mlp_ta = nn.Linear(eunits + dunits + odim, 1)
+ self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
+ self.loc_conv = nn.Conv2D(
+ 1,
+ aconv_chans,
+ (1, 2 * aconv_filts + 1),
+ padding=(0, aconv_filts),
+ bias_attr=False, )
+ self.gvec = nn.Linear(att_dim, 1)
+ self.dunits = dunits
+ self.eunits = eunits
+ self.att_dim = att_dim
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+ self.trans_agent_prob = 0.5
+
+ def reset(self):
+ self.h_length = None
+ self.enc_h = None
+ self.pre_compute_enc_h = None
+ self.mask = None
+ self.trans_agent_prob = 0.5
+
+ def forward(
+ self,
+ enc_hs_pad,
+ enc_hs_len,
+ dec_z,
+ att_prev,
+ out_prev,
+ scaling=1.0,
+ last_attended_idx=None,
+ backward_window=1,
+ forward_window=3, ):
+ """Calculate AttForwardTA forward propagation.
+
+ Args:
+ enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
+ enc_hs_len(Tensor): padded encoder hidden state length (B,)
+ dec_z(Tensor): decoder hidden state (B, dunits)
+ att_prev(Tensor): attention weights of previous step (B, T_max)
+ out_prev(Tensor): decoder outputs of previous step (B, odim)
+ scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+ last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+ backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+ forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+ Returns:
+ Tensor: attention weighted encoder state (B, dunits)
+ Tensor: previous attention weights (B, Tmax)
+ """
+ batch = len(enc_hs_pad)
+ # pre-compute all h outside the decoder loop
+ if self.pre_compute_enc_h is None:
+ self.enc_h = enc_hs_pad # utt x frame x hdim
+ self.h_length = paddle.shape(self.enc_h)[1]
+ # utt x frame x att_dim
+ self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
+
+ if dec_z is None:
+ dec_z = paddle.zeros([batch, self.dunits])
+ else:
+ dec_z = dec_z.reshape([batch, self.dunits])
+
+ if att_prev is None:
+ # initial attention will be [1, 0, 0, ...]
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
+ att_prev[:, 0] = 1.0
+
+ # att_prev: utt x frame -> utt x 1 x 1 x frame
+ # -> utt x att_conv_chans x 1 x frame
+ att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
+ # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
+ att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
+ # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
+ att_conv = self.mlp_att(att_conv)
+
+ # dec_z_tiled: utt x frame x att_dim
+ dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
+
+ # dot with gvec
+ # utt x frame x att_dim -> utt x frame
+ e = self.gvec(
+ paddle.tanh(att_conv + self.pre_compute_enc_h +
+ dec_z_tiled)).squeeze(2)
+
+ # NOTE consider zero padding when compute w.
+ if self.mask is None:
+ self.mask = make_pad_mask(enc_hs_len)
+ e = masked_fill(e, self.mask, -float("inf"))
+
+ # apply monotonic attention constraint (mainly for TTS)
+ if last_attended_idx is not None:
+ e = _apply_attention_constraint(e, last_attended_idx,
+ backward_window, forward_window)
+
+ w = F.softmax(scaling * e, axis=1)
+
+ # forward attention
+ # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1]
+ att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
+ w = (self.trans_agent_prob * att_prev +
+ (1 - self.trans_agent_prob) * att_prev_shift) * w
+ # NOTE: clip is needed to avoid nan gradient
+ w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
+
+ # weighted sum over frames
+ # utt x hdim
+ # NOTE use bmm instead of sum(*)
+ c = paddle.sum(
+ self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
+
+ # update transition agent prob
+ self.trans_agent_prob = F.sigmoid(
+ self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1)))
+
+ return c, w
diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py
index 691bb3ee..ebdfa387 100644
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@@ -13,10 +13,12 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron2 decoder related modules."""
+import paddle
import paddle.nn.functional as F
-import six
from paddle import nn
+from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
+
class Prenet(nn.Layer):
"""Prenet module for decoder of Spectrogram prediction network.
@@ -42,21 +44,16 @@ class Prenet(nn.Layer):
def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
"""Initialize prenet module.
- Parameters
- ----------
- idim : int
- Dimension of the inputs.
- odim : int
- Dimension of the outputs.
- n_layers : int, optional
- The number of prenet layers.
- n_units : int, optional
- The number of prenet units.
+ Args:
+ idim (int): Dimension of the inputs.
+ n_layers (int, optional): The number of prenet layers.
+ n_units (int, optional): The number of prenet units.
+ dropout_rate (float, optional): Dropout rate.
"""
super().__init__()
self.dropout_rate = dropout_rate
self.prenet = nn.LayerList()
- for layer in six.moves.range(n_layers):
+ for layer in range(n_layers):
n_inputs = idim if layer == 0 else n_units
self.prenet.append(
nn.Sequential(nn.Linear(n_inputs, n_units), nn.ReLU()))
@@ -64,18 +61,14 @@ class Prenet(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Batch of input tensors (B, ..., idim).
+ Args:
+ x (Tensor): Batch of input tensors (B, ..., idim).
- Returns
- ----------
- Tensor
- Batch of output tensors (B, ..., odim).
+ Returns:
+ Tensor: Batch of output tensors (B, ..., odim).
"""
- for i in six.moves.range(len(self.prenet)):
+ for i in range(len(self.prenet)):
# F.dropout 引入了随机, tacotron2 的 dropout 是不能去掉的
x = F.dropout(self.prenet[i](x))
return x
@@ -107,26 +100,18 @@ class Postnet(nn.Layer):
use_batch_norm=True, ):
"""Initialize postnet module.
- Parameters
- ----------
- idim : int
- Dimension of the inputs.
- odim : int
- Dimension of the outputs.
- n_layers : int, optional
- The number of layers.
- n_filts : int, optional
- The number of filter size.
- n_units : int, optional
- The number of filter channels.
- use_batch_norm : bool, optional
- Whether to use batch normalization..
- dropout_rate : float, optional
- Dropout rate..
+ Args:
+ idim (int): Dimension of the inputs.
+ odim (int): Dimension of the outputs.
+ n_layers (int, optional): The number of layers.
+ n_filts (int, optional): The filter size.
+ n_chans (int, optional): The number of filter channels.
+ use_batch_norm (bool, optional): Whether to use batch normalization.
+ dropout_rate (float, optional): Dropout rate.
"""
super().__init__()
self.postnet = nn.LayerList()
- for layer in six.moves.range(n_layers - 1):
+ for layer in range(n_layers - 1):
ichans = odim if layer == 0 else n_chans
ochans = odim if layer == n_layers - 1 else n_chans
if use_batch_norm:
@@ -182,17 +167,520 @@ class Postnet(nn.Layer):
def forward(self, xs):
"""Calculate forward propagation.
- Parameters
- ----------
- xs : Tensor
- Batch of the sequences of padded input tensors (B, idim, Tmax).
-
- Returns
- ----------
- Tensor
- Batch of padded output tensor. (B, odim, Tmax).
-
+ Args:
+ xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
+ Returns:
+ Tensor: Batch of padded output tensor (B, odim, Tmax).
"""
- for i in six.moves.range(len(self.postnet)):
+ for i in range(len(self.postnet)):
xs = self.postnet[i](xs)
return xs
+
+
+class ZoneOutCell(nn.Layer):
+ """ZoneOut Cell module.
+ This is a module of zoneout described in
+ `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_.
+ This code is modified from `eladhoffer/seq2seq.pytorch`_.
+ Examples:
+
+ >>> lstm = paddle.nn.LSTMCell(16, 32)
+ >>> lstm = ZoneOutCell(lstm, 0.5)
+ .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`:
+ https://arxiv.org/abs/1606.01305
+ .. _`eladhoffer/seq2seq.pytorch`:
+ https://github.com/eladhoffer/seq2seq.pytorch
+ """
+
+ def __init__(self, cell, zoneout_rate=0.1):
+ """Initialize zone out cell module.
+
+ Args:
+ cell (nn.Layer): Paddle recurrent cell module
+ e.g. `paddle.nn.LSTMCell`.
+ zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.
+ """
+ super().__init__()
+ self.cell = cell
+ self.hidden_size = cell.hidden_size
+ self.zoneout_rate = zoneout_rate
+ if zoneout_rate > 1.0 or zoneout_rate < 0.0:
+ raise ValueError(
+ "zoneout probability must be in the range from 0.0 to 1.0.")
+
+ def forward(self, inputs, hidden):
+ """Calculate forward propagation.
+
+ Args:
+ inputs (Tensor): Batch of input tensor (B, input_size).
+ hidden (tuple):
+ - Tensor: Batch of initial hidden states (B, hidden_size).
+ - Tensor: Batch of initial cell states (B, hidden_size).
+ Returns:
+ Tensor:
+ Batch of next hidden states (B, hidden_size).
+ tuple:
+ - Tensor: Batch of next hidden states (B, hidden_size).
+ - Tensor: Batch of next cell states (B, hidden_size).
+ """
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.cell(inputs, hidden)
+ next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate)
+ # to have the same output format with LSTMCell in paddle
+ return next_hidden[0], next_hidden
+
+ def _zoneout(self, h, next_h, prob):
+ # apply recursively
+ if isinstance(h, tuple):
+ num_h = len(h)
+ if not isinstance(prob, tuple):
+ prob = tuple([prob] * num_h)
+ return tuple(
+ [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)])
+ if self.training:
+ mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob)
+ return mask * h + (1 - mask) * next_h
+ else:
+ return prob * h + (1 - prob) * next_h
+
+
+class Decoder(nn.Layer):
+ """Decoder module of Spectrogram prediction network.
+ This is a module of decoder of Spectrogram prediction network in Tacotron2,
+ which described in `Natural TTS
+ Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
+ The decoder generates the sequence of
+ features from the sequence of the hidden states.
+ .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
+ https://arxiv.org/abs/1712.05884
+ """
+
+ def __init__(
+ self,
+ idim,
+ odim,
+ att,
+ dlayers=2,
+ dunits=1024,
+ prenet_layers=2,
+ prenet_units=256,
+ postnet_layers=5,
+ postnet_chans=512,
+ postnet_filts=5,
+ output_activation_fn=None,
+ cumulate_att_w=True,
+ use_batch_norm=True,
+ use_concate=True,
+ dropout_rate=0.5,
+ zoneout_rate=0.1,
+ reduction_factor=1, ):
+ """Initialize Tacotron2 decoder module.
+
+ Args:
+ idim (int): Dimension of the inputs.
+ odim (int): Dimension of the outputs.
+ att (nn.Layer): Instance of attention class.
+ dlayers (int, optional): The number of decoder lstm layers.
+ dunits (int, optional): The number of decoder lstm units.
+ prenet_layers (int, optional): The number of prenet layers.
+ prenet_units (int, optional): The number of prenet units.
+ postnet_layers (int, optional): The number of postnet layers.
+ postnet_filts (int, optional): The number of postnet filter size.
+ postnet_chans (int, optional): The number of postnet filter channels.
+ output_activation_fn (nn.Layer, optional): Activation function for outputs.
+ cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
+ use_batch_norm (bool, optional): Whether to use batch normalization.
+ use_concate (bool, optional):
+ Whether to concatenate encoder embedding with decoder lstm outputs.
+ dropout_rate (float, optional):
+ Dropout rate.
+ zoneout_rate (float, optional):
+ Zoneout rate.
+ reduction_factor (int, optional):
+ Reduction factor.
+ """
+ super().__init__()
+
+ # store the hyperparameters
+ self.idim = idim
+ self.odim = odim
+ self.att = att
+ self.output_activation_fn = output_activation_fn
+ self.cumulate_att_w = cumulate_att_w
+ self.use_concate = use_concate
+ self.reduction_factor = reduction_factor
+
+ # check attention type
+ if isinstance(self.att, AttForwardTA):
+ self.use_att_extra_inputs = True
+ else:
+ self.use_att_extra_inputs = False
+
+ # define lstm network
+ prenet_units = prenet_units if prenet_layers != 0 else odim
+ self.lstm = nn.LayerList()
+ for layer in range(dlayers):
+ iunits = idim + prenet_units if layer == 0 else dunits
+ lstm = nn.LSTMCell(iunits, dunits)
+ if zoneout_rate > 0.0:
+ lstm = ZoneOutCell(lstm, zoneout_rate)
+ self.lstm.append(lstm)
+
+ # define prenet
+ if prenet_layers > 0:
+ self.prenet = Prenet(
+ idim=odim,
+ n_layers=prenet_layers,
+ n_units=prenet_units,
+ dropout_rate=dropout_rate, )
+ else:
+ self.prenet = None
+
+ # define postnet
+ if postnet_layers > 0:
+ self.postnet = Postnet(
+ idim=idim,
+ odim=odim,
+ n_layers=postnet_layers,
+ n_chans=postnet_chans,
+ n_filts=postnet_filts,
+ use_batch_norm=use_batch_norm,
+ dropout_rate=dropout_rate, )
+ else:
+ self.postnet = None
+
+ # define projection layers
+ iunits = idim + dunits if use_concate else dunits
+ self.feat_out = nn.Linear(
+ iunits, odim * reduction_factor, bias_attr=False)
+ self.prob_out = nn.Linear(iunits, reduction_factor)
+
+ def _zero_state(self, hs):
+ init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
+ return init_hs
+
+ def forward(self, hs, hlens, ys):
+ """Calculate forward propagation.
+
+ Args:
+ hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+ hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,).
+ ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+ Returns:
+ Tensor: Batch of output tensors after postnet (B, Lmax, odim).
+ Tensor: Batch of output tensors before postnet (B, Lmax, odim).
+ Tensor: Batch of logits of stop prediction (B, Lmax).
+ Tensor: Batch of attention weights (B, Lmax, Tmax).
+
+ Note:
+ This computation is performed in teacher-forcing manner.
+ """
+ # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
+ if self.reduction_factor > 1:
+ ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
+
+ # length list should be list of int
+ # hlens = list(map(int, hlens))
+
+ # initialize hidden states of decoder
+ c_list = [self._zero_state(hs)]
+ z_list = [self._zero_state(hs)]
+ for _ in range(1, len(self.lstm)):
+ c_list.append(self._zero_state(hs))
+ z_list.append(self._zero_state(hs))
+ prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
+
+ # initialize attention
+ prev_att_ws = []
+ prev_att_w = paddle.zeros(paddle.shape(hlens))
+ prev_att_ws.append(prev_att_w)
+ self.att.reset()
+
+ # loop for an output sequence
+ outs, logits, att_ws = [], [], []
+ for y in ys.transpose([1, 0, 2]):
+ if self.use_att_extra_inputs:
+ att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1],
+ prev_out)
+ else:
+ att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_ws[-1])
+ prenet_out = self.prenet(
+ prev_out) if self.prenet is not None else prev_out
+ xs = paddle.concat([att_c, prenet_out], axis=1)
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
+ z_list[0], c_list[0] = next_hidden
+ for i in range(1, len(self.lstm)):
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[i](z_list[i - 1],
+ (z_list[i], c_list[i]))
+ z_list[i], c_list[i] = next_hidden
+ zcs = (paddle.concat([z_list[-1], att_c], axis=1)
+ if self.use_concate else z_list[-1])
+ outs.append(
+ self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1
+ ]))
+ logits.append(self.prob_out(zcs))
+ att_ws.append(att_w)
+ # teacher forcing
+ prev_out = y
+ if self.cumulate_att_w and paddle.sum(prev_att_w) != 0:
+ prev_att_w = prev_att_w + att_w # Note: error when use +=
+ else:
+ prev_att_w = att_w
+ prev_att_ws.append(prev_att_w)
+ # (B, Lmax)
+ logits = paddle.concat(logits, axis=1)
+ # (B, odim, Lmax)
+ before_outs = paddle.concat(outs, axis=2)
+ # (B, Lmax, Tmax)
+ att_ws = paddle.stack(att_ws, axis=1)
+
+ if self.reduction_factor > 1:
+ # (B, odim, Lmax)
+ before_outs = before_outs.reshape(
+ [paddle.shape(before_outs)[0], self.odim, -1])
+
+ if self.postnet is not None:
+ # (B, odim, Lmax)
+ after_outs = before_outs + self.postnet(before_outs)
+ else:
+ after_outs = before_outs
+ # (B, Lmax, odim)
+ before_outs = before_outs.transpose([0, 2, 1])
+ # (B, Lmax, odim)
+ after_outs = after_outs.transpose([0, 2, 1])
+ logits = logits
+
+ # apply activation function for scaling
+ if self.output_activation_fn is not None:
+ before_outs = self.output_activation_fn(before_outs)
+ after_outs = self.output_activation_fn(after_outs)
+
+ return after_outs, before_outs, logits, att_ws
+
+ def inference(
+ self,
+ h,
+ threshold=0.5,
+ minlenratio=0.0,
+ maxlenratio=10.0,
+ use_att_constraint=False,
+ backward_window=None,
+ forward_window=None, ):
+ """Generate the sequence of features given the sequences of characters.
+ Args:
+ h(Tensor): Input sequence of encoder hidden states (T, C).
+ threshold(float, optional): Threshold to stop generation. (Default value = 0.5)
+ minlenratio(float, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
+ the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
+ maxlenratio(float, optional): Maximum length ratio. If set to 10 and the length of input is 10,
+ the maximum length of outputs will be 10 * 10 = 100. (Default value = 10.0)
+ use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+ backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+ forward_window(int, optional): (Default value = None)
+
+ Returns:
+ Tensor: Output sequence of features (L, odim).
+ Tensor: Output sequence of stop probabilities (L,).
+ Tensor: Attention weights (L, T).
+
+ Note:
+ This computation is performed in auto-regressive manner.
+ .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
+ """
+ # setup
+
+ assert len(paddle.shape(h)) == 2
+ hs = h.unsqueeze(0)
+ ilens = paddle.shape(h)[0]
+ # 本来 maxlen 和 minlen 外面有 int(),防止动转静的问题此处删除
+ maxlen = paddle.shape(h)[0] * maxlenratio
+ minlen = paddle.shape(h)[0] * minlenratio
+ # 本来是直接使用 threshold 的,此处为了防止动转静的问题把 threshold 转成 tensor
+ threshold = paddle.ones([1]) * threshold
+
+ # initialize hidden states of decoder
+ c_list = [self._zero_state(hs)]
+ z_list = [self._zero_state(hs)]
+ for _ in range(1, len(self.lstm)):
+ c_list.append(self._zero_state(hs))
+ z_list.append(self._zero_state(hs))
+ prev_out = paddle.zeros([1, self.odim])
+
+ # initialize attention
+ prev_att_ws = []
+ prev_att_w = paddle.zeros([ilens])
+ prev_att_ws.append(prev_att_w)
+
+ self.att.reset()
+
+ # setup for attention constraint
+ if use_att_constraint:
+ last_attended_idx = 0
+ else:
+ last_attended_idx = None
+
+ # loop for an output sequence
+ idx = 0
+ outs, att_ws, probs = [], [], []
+ prob = paddle.zeros([1])
+ while True:
+ # updated index
+ idx += self.reduction_factor
+
+ # decoder calculation
+ if self.use_att_extra_inputs:
+ att_c, att_w = self.att(
+ hs,
+ ilens,
+ z_list[0],
+ prev_att_ws[-1],
+ prev_out,
+ last_attended_idx=last_attended_idx,
+ backward_window=backward_window,
+ forward_window=forward_window, )
+ else:
+ att_c, att_w = self.att(
+ hs,
+ ilens,
+ z_list[0],
+ prev_att_ws[-1],
+ last_attended_idx=last_attended_idx,
+ backward_window=backward_window,
+ forward_window=forward_window, )
+
+ att_ws.append(att_w)
+ prenet_out = self.prenet(
+ prev_out) if self.prenet is not None else prev_out
+ xs = paddle.concat([att_c, prenet_out], axis=1)
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
+
+ z_list[0], c_list[0] = next_hidden
+ for i in range(1, len(self.lstm)):
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[i](z_list[i - 1],
+ (z_list[i], c_list[i]))
+ z_list[i], c_list[i] = next_hidden
+ zcs = (paddle.concat([z_list[-1], att_c], axis=1)
+ if self.use_concate else z_list[-1])
+ # [(1, odim, r), ...]
+ outs.append(self.feat_out(zcs).reshape([1, self.odim, -1]))
+
+ prob = F.sigmoid(self.prob_out(zcs))[0]
+ probs.append(prob)
+
+ if self.output_activation_fn is not None:
+ prev_out = self.output_activation_fn(
+ outs[-1][:, :, -1]) # (1, odim)
+ else:
+ prev_out = outs[-1][:, :, -1] # (1, odim)
+ if self.cumulate_att_w and paddle.sum(prev_att_w) != 0:
+ prev_att_w = prev_att_w + att_w # Note: error when use +=
+ else:
+ prev_att_w = att_w
+ prev_att_ws.append(prev_att_w)
+ if use_att_constraint:
+ last_attended_idx = int(att_w.argmax())
+
+ # tacotron2 ljspeech 动转静的问题应该是这里没有正确判断 prob >= threshold 导致的
+ if prob >= threshold or idx >= maxlen:
+ # check minimum length
+ if idx < minlen:
+ continue
+ break
+ """
+ 仅解开 665~667 行的代码块,动转静时会卡死,但是动态图时可以正确生成音频,证明模型没问题
+ 同时解开 665~667 行 和 668 ~ 670 行的代码块,动转静时不会卡死,但是生成的音频末尾有多余的噪声
+ 证明动转静没有进入 prob >= threshold 的判断,但是静态图可以进入 prob >= threshold 并退出循环
+ 动转静时是通过 idx >= maxlen 退出循环(所以没有这个逻辑的时候会一直循环,也就是卡死),
+ 没有在模型判断该结束的时候结束,而是在超出最大长度时结束,所以合成的音频末尾有很长的额外预测的噪声
+ 动转静用 prob <= threshold 的条件可以退出循环(虽然结果不正确),证明条件参数的类型本身没问题,可能是 prob 有问题
+ """
+ # if prob >= threshold:
+ # print("prob >= threshold")
+ # break
+ # elif idx >= maxlen:
+ # print("idx >= maxlen")
+ # break
+
+ # (1, odim, L)
+ outs = paddle.concat(outs, axis=2)
+ if self.postnet is not None:
+ # (1, odim, L)
+ outs = outs + self.postnet(outs)
+ # (L, odim)
+ outs = outs.transpose([0, 2, 1]).squeeze(0)
+ probs = paddle.concat(probs, axis=0)
+ att_ws = paddle.concat(att_ws, axis=0)
+
+ if self.output_activation_fn is not None:
+ outs = self.output_activation_fn(outs)
+
+ return outs, probs, att_ws
+
+ def calculate_all_attentions(self, hs, hlens, ys):
+ """Calculate all of the attention weights.
+
+ Args:
+ hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+ hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+ ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+ Returns:
+ numpy.ndarray:
+ Batch of attention weights (B, Lmax, Tmax).
+
+ Note:
+ This computation is performed in teacher-forcing manner.
+ """
+ # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
+ if self.reduction_factor > 1:
+ ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
+
+ # length list should be list of int
+ hlens = list(map(int, hlens))
+
+ # initialize hidden states of decoder
+ c_list = [self._zero_state(hs)]
+ z_list = [self._zero_state(hs)]
+ for _ in range(1, len(self.lstm)):
+ c_list.append(self._zero_state(hs))
+ z_list.append(self._zero_state(hs))
+ prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
+
+ # initialize attention
+ prev_att_w = None
+ self.att.reset()
+
+ # loop for an output sequence
+ att_ws = []
+ for y in ys.transpose([1, 0, 2]):
+ if self.use_att_extra_inputs:
+ att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w,
+ prev_out)
+ else:
+ att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w)
+ att_ws.append(att_w)
+ prenet_out = self.prenet(
+ prev_out) if self.prenet is not None else prev_out
+ xs = paddle.concat([att_c, prenet_out], axis=1)
+ # we only use the second output of LSTMCell in paddle
+ _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
+ z_list[0], c_list[0] = next_hidden
+ for i in range(1, len(self.lstm)):
+ z_list[i], c_list[i] = self.lstm[i](z_list[i - 1],
+ (z_list[i], c_list[i]))
+ # teacher forcing
+ prev_out = y
+ if self.cumulate_att_w and prev_att_w is not None:
+ # Note: error when use +=
+ prev_att_w = prev_att_w + att_w
+ else:
+ prev_att_w = att_w
+ # (B, Lmax, Tmax)
+ att_ws = paddle.stack(att_ws, axis=1)
+
+ return att_ws
diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py
index f1889061..db102a11 100644
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@@ -14,7 +14,6 @@
# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron2 encoder related modules."""
import paddle
-import six
from paddle import nn
@@ -46,31 +45,18 @@ class Encoder(nn.Layer):
dropout_rate=0.5,
padding_idx=0, ):
"""Initialize Tacotron2 encoder module.
-
- Parameters
- ----------
- idim : int
- Dimension of the inputs.
- input_layer : str
- Input layer type.
- embed_dim : int, optional
- Dimension of character embedding.
- elayers : int, optional
- The number of encoder blstm layers.
- eunits : int, optional
- The number of encoder blstm units.
- econv_layers : int, optional
- The number of encoder conv layers.
- econv_filts : int, optional
- The number of encoder conv filter size.
- econv_chans : int, optional
- The number of encoder conv filter channels.
- use_batch_norm : bool, optional
- Whether to use batch normalization.
- use_residual : bool, optional
- Whether to use residual connection.
- dropout_rate : float, optional
- Dropout rate.
+ Args:
+ idim (int): Dimension of the inputs.
+ input_layer (str): Input layer type.
+ embed_dim (int, optional): Dimension of character embedding.
+ elayers (int, optional): The number of encoder blstm layers.
+ eunits (int, optional): The number of encoder blstm units.
+ econv_layers (int, optional): The number of encoder conv layers.
+ econv_filts (int, optional): The number of encoder conv filter size.
+ econv_chans (int, optional): The number of encoder conv filter channels.
+ use_batch_norm (bool, optional): Whether to use batch normalization.
+ use_residual (bool, optional): Whether to use residual connection.
+ dropout_rate (float, optional): Dropout rate.
"""
super().__init__()
@@ -88,7 +74,7 @@ class Encoder(nn.Layer):
if econv_layers > 0:
self.convs = nn.LayerList()
- for layer in six.moves.range(econv_layers):
+ for layer in range(econv_layers):
ichans = (embed_dim if layer == 0 and input_layer == "embed"
else econv_chans)
if use_batch_norm:
@@ -130,6 +116,7 @@ class Encoder(nn.Layer):
direction='bidirectional',
bias_ih_attr=True,
bias_hh_attr=True)
+ self.blstm.flatten_parameters()
else:
self.blstm = None
@@ -139,26 +126,19 @@ class Encoder(nn.Layer):
def forward(self, xs, ilens=None):
"""Calculate forward propagation.
- Parameters
- ----------
- xs : Tensor
- Batch of the padded sequence. Either character ids (B, Tmax)
- or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
- Padded value should be 0.
- ilens : LongTensor
- Batch of lengths of each input batch (B,).
-
- Returns
- ----------
- Tensor
- Batch of the sequences of encoder states(B, Tmax, eunits).
- LongTensor
- Batch of lengths of each sequence (B,)
+ Args:
+ xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
+ or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
+ Padded value should be 0.
+ ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
+ Returns:
+ Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
+ Tensor(int64): Batch of lengths of each sequence (B,)
"""
xs = self.embed(xs).transpose([0, 2, 1])
if self.convs is not None:
- for i in six.moves.range(len(self.convs)):
+ for i in range(len(self.convs)):
if self.use_residual:
xs += self.convs[i](xs)
else:
@@ -168,10 +148,11 @@ class Encoder(nn.Layer):
if not isinstance(ilens, paddle.Tensor):
ilens = paddle.to_tensor(ilens)
xs = xs.transpose([0, 2, 1])
- self.blstm.flatten_parameters()
+ # for dygraph to static graph
+ # self.blstm.flatten_parameters()
# (B, Tmax, C)
- xs, _ = self.blstm(xs)
- # hlens 是什么
+ # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi
+ xs, _ = self.blstm(xs, sequence_length=ilens)
hlens = ilens
return xs, hlens
@@ -179,19 +160,15 @@ class Encoder(nn.Layer):
def inference(self, x):
"""Inference.
- Parameters
- ----------
- x : Tensor
- The sequeunce of character ids (T,)
- or acoustic feature (T, idim * encoder_reduction_factor).
+ Args:
+ x (Tensor): The sequence of character ids (T,)
+ or acoustic feature (T, idim * encoder_reduction_factor).
- Returns
- ----------
- Tensor
- The sequences of encoder states(T, eunits).
+ Returns:
+ Tensor: The sequences of encoder states(T, eunits).
"""
xs = x.unsqueeze(0)
- ilens = paddle.to_tensor([x.shape[0]])
+ ilens = paddle.shape(x)[0]
return self.forward(xs, ilens)[0][0]
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py
index 1ca4e6d8..b2275e23 100644
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@@ -59,18 +59,12 @@ class TADELayer(nn.Layer):
def forward(self, x, c):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, in_channels, T).
- c : Tensor
- Auxiliary input tensor (B, aux_channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, in_channels, T * upsample_factor).
- Tensor
- Upsampled aux tensor (B, in_channels, T * upsample_factor).
+ Args:
+ x (Tensor): Input tensor (B, in_channels, T).
+ c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+ Returns:
+ Tensor: Output tensor (B, in_channels, T * upsample_factor).
+ Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
"""
x = self.norm(x)
@@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer):
def forward(self, x, c):
"""Calculate forward propagation.
- Parameters
- ----------
- x : Tensor
- Input tensor (B, in_channels, T).
- c : Tensor
- Auxiliary input tensor (B, aux_channels, T).
- Returns
- ----------
- Tensor
- Output tensor (B, in_channels, T * upsample_factor).
- Tensor
- Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
+ Args:
+
+ x (Tensor): Input tensor (B, in_channels, T).
+ c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+ Returns:
+ Tensor: Output tensor (B, in_channels, T * upsample_factor).
+ Tensor: Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
"""
residual = x
x, c = self.tade1(x, c)
diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
index 34386f2a..cdb95b21 100644
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
class MultiHeadedAttention(nn.Layer):
"""Multi-Head Attention layer.
-
- Parameters
- ----------
- n_head : int
- The number of heads.
- n_feat : int
- The number of features.
- dropout_rate : float
- Dropout rate.
+ Args:
+ n_head (int): The number of heads.
+ n_feat (int): The number of features.
+ dropout_rate (float): Dropout rate.
"""
def __init__(self, n_head, n_feat, dropout_rate):
@@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer):
def forward_qkv(self, query, key, value):
"""Transform query, key and value.
- Parameters
- ----------
- query : paddle.Tensor
- query tensor (#batch, time1, size).
- key : paddle.Tensor
- Key tensor (#batch, time2, size).
- value : paddle.Tensor
- Value tensor (#batch, time2, size).
-
- Returns
- ----------
- paddle.Tensor
- Transformed query tensor (#batch, n_head, time1, d_k).
- paddle.Tensor
- Transformed key tensor (#batch, n_head, time2, d_k).
- paddle.Tensor
- Transformed value tensor (#batch, n_head, time2, d_k).
+ Args:
+ query(Tensor): query tensor (#batch, time1, size).
+ key(Tensor): Key tensor (#batch, time2, size).
+ value(Tensor): Value tensor (#batch, time2, size).
+
+ Returns:
+ Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+ Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+ Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = paddle.shape(query)[0]
@@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer):
def forward_attention(self, value, scores, mask=None):
"""Compute attention context vector.
- Parameters
- ----------
- value : paddle.Tensor
- Transformed value (#batch, n_head, time2, d_k).
- scores : paddle.Tensor
- Attention score (#batch, n_head, time1, time2).
- mask : paddle.Tensor
- Mask (#batch, 1, time2) or (#batch, time1, time2).
-
- Returns
- ----------
- paddle.Tensor:
- Transformed value (#batch, time1, d_model)
- weighted by the attention score (#batch, time1, time2).
+ Args:
+ value(Tensor): Transformed value (#batch, n_head, time2, d_k).
+ scores(Tensor): Attention score (#batch, n_head, time1, time2).
+ mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+ Returns:
+ Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
"""
n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1)
@@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer):
def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention.
- Parameters
- ----------
- query : paddle.Tensor
- Query tensor (#batch, time1, size).
- key : paddle.Tensor
- Key tensor (#batch, time2, size).
- value : paddle.Tensor
- Value tensor (#batch, time2, size).
- mask : paddle.Tensor
- Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
-
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time1, d_model).
+ Args:
+ query(Tensor): Query tensor (#batch, time1, size).
+ key(Tensor): Key tensor (#batch, time2, size).
+ value(Tensor): Value tensor (#batch, time2, size).
+ mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+ Returns:
+ Tensor: Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
scores = paddle.matmul(q, k.transpose(
@@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
- Parameters
- ----------
- n_head : int
- The number of heads.
- n_feat : int
- The number of features.
- dropout_rate : float
- Dropout rate.
- zero_triu : bool
- Whether to zero the upper triangular part of attention matrix.
+
+ Args:
+ n_head (int): The number of heads.
+ n_feat (int): The number of features.
+ dropout_rate (float): Dropout rate.
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
def rel_shift(self, x):
"""Compute relative positional encoding.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (batch, head, time1, 2*time1-1).
- time1 means the length of query vector.
- Returns
- ----------
- paddle.Tensor
- Output tensor.
+ Args:
+ x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
+
+ Returns:
+ Tensor: Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
@@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
- Parameters
- ----------
- query : paddle.Tensor
- Query tensor (#batch, time1, size).
- key : paddle.Tensor
- Key tensor (#batch, time2, size).
- value : paddle.Tensor
- Value tensor (#batch, time2, size).
- pos_emb : paddle.Tensor
- Positional embedding tensor
- (#batch, 2*time1-1, size).
- mask : paddle.Tensor
- Mask tensor (#batch, 1, time2) or
- (#batch, time1, time2).
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time1, d_model).
+
+ Args:
+ query(Tensor): Query tensor (#batch, time1, size).
+ key(Tensor): Key tensor (#batch, time2, size).
+ value(Tensor): Value tensor (#batch, time2, size).
+ pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
+ mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
+
+ Returns:
+ Tensor: Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py
index fe2949f4..a8db7345 100644
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat
class Decoder(nn.Layer):
"""Transfomer decoder module.
- Parameters
- ----------
- odim : int
- Output diminsion.
- self_attention_layer_type : str
- Self-attention layer type.
- attention_dim : int
- Dimention of attention.
- attention_heads : int
- The number of heads of multi head attention.
- conv_wshare : int
- The number of kernel of convolution. Only used in
- self_attention_layer_type == "lightconv*" or "dynamiconv*".
- conv_kernel_length : Union[int, str])
- Kernel size str of convolution
- (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
- conv_usebias : bool
- Whether to use bias in convolution. Only used in
- self_attention_layer_type == "lightconv*" or "dynamiconv*".
- linear_units : int
- The number of units of position-wise feed forward.
- num_blocks : int
- The number of decoder blocks.
- dropout_rate : float
- Dropout rate.
- positional_dropout_rate : float
- Dropout rate after adding positional encoding.
- self_attention_dropout_rate : float
- Dropout rate in self-attention.
- src_attention_dropout_rate : float
- Dropout rate in source-attention.
- input_layer : (Union[str, nn.Layer])
- Input layer type.
- use_output_layer : bool
- Whether to use output layer.
- pos_enc_class : nn.Layer
- Positional encoding module class.
- `PositionalEncoding `or `ScaledPositionalEncoding`
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
+ Args:
+ odim (int): Output dimension.
+ self_attention_layer_type (str): Self-attention layer type.
+ attention_dim (int): Dimension of attention.
+ attention_heads (int): The number of heads of multi head attention.
+ conv_wshare (int): The number of kernel of convolution. Only used in
+ self_attention_layer_type == "lightconv*" or "dynamiconv*".
+ conv_kernel_length (Union[int, str]): Kernel size str of convolution
+ (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
+ conv_usebias (bool): Whether to use bias in convolution. Only used in
+ self_attention_layer_type == "lightconv*" or "dynamiconv*".
+ linear_units(int): The number of units of position-wise feed forward.
+ num_blocks (int): The number of decoder blocks.
+ dropout_rate (float): Dropout rate.
+ positional_dropout_rate (float): Dropout rate after adding positional encoding.
+ self_attention_dropout_rate (float): Dropout rate in self-attention.
+ src_attention_dropout_rate (float): Dropout rate in source-attention.
+ input_layer (Union[str, nn.Layer]): Input layer type.
+ use_output_layer (bool): Whether to use output layer.
+ pos_enc_class (nn.Layer): Positional encoding module class.
+ `PositionalEncoding `or `ScaledPositionalEncoding`
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
"""
@@ -161,27 +142,18 @@ class Decoder(nn.Layer):
def forward(self, tgt, tgt_mask, memory, memory_mask):
"""Forward decoder.
-
- Parameters
- ----------
- tgt : paddle.Tensor
- Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
- In the other case, input tensor (#batch, maxlen_out, odim).
- tgt_mask : paddle.Tensor
- Input token mask (#batch, maxlen_out).
- memory : paddle.Tensor
- Encoded memory, float32 (#batch, maxlen_in, feat).
- memory_mask : paddle.Tensor
- Encoded memory mask (#batch, maxlen_in).
-
- Returns
- ----------
- paddle.Tensor
- Decoded token score before softmax (#batch, maxlen_out, odim)
- if use_output_layer is True. In the other case,final block outputs
- (#batch, maxlen_out, attention_dim).
- paddle.Tensor
- Score mask before softmax (#batch, maxlen_out).
+ Args:
+ tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
+ In the other case, input tensor (#batch, maxlen_out, odim).
+ tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+ memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+ memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+
+ Returns:
+ Tensor:
+ Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True.
+ In the other case, final block outputs (#batch, maxlen_out, attention_dim).
+ Tensor: Score mask before softmax (#batch, maxlen_out).
"""
x = self.embed(tgt)
@@ -196,23 +168,15 @@ class Decoder(nn.Layer):
def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
"""Forward one step.
- Parameters
- ----------
- tgt : paddle.Tensor
- Input token ids, int64 (#batch, maxlen_out).
- tgt_mask : paddle.Tensor
- Input token mask (#batch, maxlen_out).
- memory : paddle.Tensor
- Encoded memory, float32 (#batch, maxlen_in, feat).
- cache : (List[paddle.Tensor])
- List of cached tensors.
- Each tensor shape should be (#batch, maxlen_out - 1, size).
- Returns
- ----------
- paddle.Tensor
- Output tensor (batch, maxlen_out, odim).
- List[paddle.Tensor]
- List of cache tensors of each decoder layer.
+ Args:
+ tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
+ tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+ memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+ cache((List[Tensor]), optional): List of cached tensors. (Default value = None)
+
+ Returns:
+ Tensor: Output tensor (batch, maxlen_out, odim).
+ List[Tensor]: List of cache tensors of each decoder layer.
"""
x = self.embed(tgt)
@@ -254,20 +218,14 @@ class Decoder(nn.Layer):
xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
"""Score new token batch (required).
- Parameters
- ----------
- ys : paddle.Tensor
- paddle.int64 prefix tokens (n_batch, ylen).
- states : List[Any]
- Scorer states for prefix tokens.
- xs : paddle.Tensor
- The encoder feature that generates ys (n_batch, xlen, n_feat).
+ Args:
+ ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+ states(List[Any]): Scorer states for prefix tokens.
+ xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).
- Returns
- ----------
- tuple[paddle.Tensor, List[Any]]
- Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)`
- and next state list for ys.
+ Returns:
+ tuple[Tensor, List[Any]]:
+ Tuple of batchified scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys.
"""
# merge states
diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py
index 44978f1e..9a13cd79 100644
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
@@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
class DecoderLayer(nn.Layer):
"""Single decoder layer module.
- Parameters
- ----------
- size : int
- Input dimension.
- self_attn : nn.Layer
- Self-attention module instance.
- `MultiHeadedAttention` instance can be used as the argument.
- src_attn : nn.Layer
- Self-attention module instance.
- `MultiHeadedAttention` instance can be used as the argument.
- feed_forward : nn.Layer
- Feed-forward module instance.
- `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
- dropout_rate : float
- Dropout rate.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
+
+ Args:
+ size (int): Input dimension.
+ self_attn (nn.Layer): Self-attention module instance.
+ `MultiHeadedAttention` instance can be used as the argument.
+ src_attn (nn.Layer): Self-attention module instance.
+ `MultiHeadedAttention` instance can be used as the argument.
+ feed_forward (nn.Layer): Feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+ dropout_rate (float): Dropout rate.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
"""
@@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer):
def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
"""Compute decoded features.
- Parameters
- ----------
- tgt : paddle.Tensor
- Input tensor (#batch, maxlen_out, size).
- tgt_mask : paddle.Tensor
- Mask for input tensor (#batch, maxlen_out).
- memory : paddle.Tensor
- Encoded memory, float32 (#batch, maxlen_in, size).
- memory_mask : paddle.Tensor
- Encoded memory mask (#batch, maxlen_in).
- cache : List[paddle.Tensor]
- List of cached tensors.
- Each tensor shape should be (#batch, maxlen_out - 1, size).
-
- Returns
- ----------
- paddle.Tensor
- Output tensor(#batch, maxlen_out, size).
- paddle.Tensor
- Mask for output tensor (#batch, maxlen_out).
- paddle.Tensor
- Encoded memory (#batch, maxlen_in, size).
- paddle.Tensor
- Encoded memory mask (#batch, maxlen_in).
+ Args:
+ tgt(Tensor): Input tensor (#batch, maxlen_out, size).
+ tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
+ memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+ memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+ cache(List[Tensor], optional): List of cached tensors.
+ Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
+ Returns:
+ Tensor:
+ Output tensor (#batch, maxlen_out, size).
+ Tensor:
+ Mask for output tensor (#batch, maxlen_out).
+ Tensor:
+ Encoded memory (#batch, maxlen_in, size).
+ Tensor:
+ Encoded memory mask (#batch, maxlen_in).
"""
residual = tgt
diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
index 40ab03ee..d9339d20 100644
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@@ -22,18 +22,12 @@ from paddle import nn
class PositionalEncoding(nn.Layer):
"""Positional encoding.
- Parameters
- ----------
- d_model : int
- Embedding dimension.
- dropout_rate : float
- Dropout rate.
- max_len : int
- Maximum input length.
- reverse : bool
- Whether to reverse the input position.
- type : str
- dtype of param
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int): Maximum input length.
+ reverse (bool): Whether to reverse the input position.
+ type (str): dtype of param
"""
def __init__(self,
@@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer):
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (batch, time, `*`).
+ Args:
+ x (Tensor): Input tensor (batch, time, `*`).
- Returns
- ----------
- paddle.Tensor
- Encoded tensor (batch, time, `*`).
+ Returns:
+ Tensor: Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
T = paddle.shape(x)[1]
@@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer):
class ScaledPositionalEncoding(PositionalEncoding):
"""Scaled positional encoding module.
-
See Sec. 3.2 https://arxiv.org/abs/1809.08895
- Parameters
- ----------
- d_model : int
- Embedding dimension.
- dropout_rate : float
- Dropout rate.
- max_len : int
- Maximum input length.
- dtype : str
- dtype of param
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int): Maximum input length.
+ dtype (str): dtype of param
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding):
def forward(self, x):
"""Add positional encoding.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (batch, time, `*`).
- Returns
- ----------
- paddle.Tensor
- Encoded tensor (batch, time, `*`).
+ Args:
+ x (Tensor): Input tensor (batch, time, `*`).
+ Returns:
+ Tensor: Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
T = paddle.shape(x)[1]
@@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
- Parameters
- ----------
- d_model : int
- Embedding dimension.
- dropout_rate : float
- Dropout rate.
- max_len : int
- Maximum input length.
+
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int): Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer):
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (batch, time, `*`).
- Returns
- ----------
- paddle.Tensor
- Encoded tensor (batch, time, `*`).
+ Args:
+ x (Tensor): Input tensor (batch, time, `*`).
+ Returns:
+ Tensor: Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
index 8bf71b41..2b3ee788 100644
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class BaseEncoder(nn.Layer):
"""Base Encoder module.
- Parameters
- ----------
- idim : int
- Input dimension.
- attention_dim : int
- Dimention of attention.
- attention_heads : int
- The number of heads of multi head attention.
- linear_units : int
- The number of units of position-wise feed forward.
- num_blocks : int
- The number of decoder blocks.
- dropout_rate : float
- Dropout rate.
- positional_dropout_rate : float
- Dropout rate after adding positional encoding.
- attention_dropout_rate : float
- Dropout rate in attention.
- input_layer : Union[str, nn.Layer]
- Input layer type.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
- positionwise_layer_type : str
- "linear", "conv1d", or "conv1d-linear".
- positionwise_conv_kernel_size : int
- Kernel size of positionwise conv1d layer.
- macaron_style : bool
- Whether to use macaron style for positionwise layer.
- pos_enc_layer_type : str
- Encoder positional encoding layer type.
- selfattention_layer_type : str
- Encoder attention layer type.
- activation_type : str
- Encoder activation function type.
- use_cnn_module : bool
- Whether to use convolution module.
- zero_triu : bool
- Whether to zero the upper triangular part of attention matrix.
- cnn_module_kernel : int
- Kernerl size of convolution module.
- padding_idx : int
- Padding idx for input_layer=embed.
- stochastic_depth_rate : float
- Maximum probability to skip the encoder layer.
- intermediate_layers : Union[List[int], None]
- indices of intermediate CTC layer.
- indices start from 1.
- if not None, intermediate outputs are returned (which changes return type
- signature.)
- encoder_type: str
- "transformer", or "conformer".
+ Args:
+ idim (int): Input dimension.
+ attention_dim (int): Dimension of attention.
+ attention_heads (int): The number of heads of multi head attention.
+ linear_units (int): The number of units of position-wise feed forward.
+ num_blocks (int): The number of decoder blocks.
+ dropout_rate (float): Dropout rate.
+ positional_dropout_rate (float): Dropout rate after adding positional encoding.
+ attention_dropout_rate (float): Dropout rate in attention.
+ input_layer (Union[str, nn.Layer]): Input layer type.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+ positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+ macaron_style (bool): Whether to use macaron style for positionwise layer.
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
+ selfattention_layer_type (str): Encoder attention layer type.
+ activation_type (str): Encoder activation function type.
+ use_cnn_module (bool): Whether to use convolution module.
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+ cnn_module_kernel (int): Kernel size of convolution module.
+ padding_idx (int): Padding idx for input_layer=embed.
+ stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+ intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
+ indices start from 1.
+ if not None, intermediate outputs are returned (which changes return type
+ signature.)
+ encoder_type (str): "transformer", or "conformer".
"""
def __init__(self,
@@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer):
def forward(self, xs, masks):
"""Encode input sequence.
- Parameters
- ----------
- xs : paddle.Tensor
- Input tensor (#batch, time, idim).
- masks : paddle.Tensor
- Mask tensor (#batch, 1, time).
-
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, attention_dim).
- paddle.Tensor
- Mask tensor (#batch, 1, time).
+ Args:
+ xs (Tensor): Input tensor (#batch, time, idim).
+ masks (Tensor): Mask tensor (#batch, 1, time).
+
+ Returns:
+ Tensor: Output tensor (#batch, time, attention_dim).
+ Tensor: Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
@@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer):
class TransformerEncoder(BaseEncoder):
"""Transformer encoder module.
- Parameters
- ----------
- idim : int
- Input dimension.
- attention_dim : int
- Dimention of attention.
- attention_heads : int
- The number of heads of multi head attention.
- linear_units : int
- The number of units of position-wise feed forward.
- num_blocks : int
- The number of decoder blocks.
- dropout_rate : float
- Dropout rate.
- positional_dropout_rate : float
- Dropout rate after adding positional encoding.
- attention_dropout_rate : float
- Dropout rate in attention.
- input_layer : Union[str, paddle.nn.Layer]
- Input layer type.
- pos_enc_layer_type : str
- Encoder positional encoding layer type.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
- positionwise_layer_type : str
- "linear", "conv1d", or "conv1d-linear".
- positionwise_conv_kernel_size : int
- Kernel size of positionwise conv1d layer.
- selfattention_layer_type : str
- Encoder attention layer type.
- activation_type : str
- Encoder activation function type.
- padding_idx : int
- Padding idx for input_layer=embed.
+
+ Args:
+ idim (int): Input dimension.
+ attention_dim (int): Dimension of attention.
+ attention_heads (int): The number of heads of multi head attention.
+ linear_units (int): The number of units of position-wise feed forward.
+ num_blocks (int): The number of decoder blocks.
+ dropout_rate (float): Dropout rate.
+ positional_dropout_rate (float): Dropout rate after adding positional encoding.
+ attention_dropout_rate (float): Dropout rate in attention.
+ input_layer (Union[str, paddle.nn.Layer]): Input layer type.
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+ positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+ selfattention_layer_type (str): Encoder attention layer type.
+ activation_type (str): Encoder activation function type.
+ padding_idx (int): Padding idx for input_layer=embed.
"""
def __init__(
@@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder):
def forward(self, xs, masks):
"""Encode input sequence.
- Parameters
- ----------
- xs : paddle.Tensor
- Input tensor (#batch, time, idim).
- masks : paddle.Tensor
- Mask tensor (#batch, 1, time).
-
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, attention_dim).
- paddle.Tensor
- Mask tensor (#batch, 1, time).
+ Args:
+ xs(Tensor): Input tensor (#batch, time, idim).
+ masks(Tensor): Mask tensor (#batch, 1, time).
+
+ Returns:
+ Tensor: Output tensor (#batch, time, attention_dim).
+ Tensor: Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
@@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder):
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.
- Parameters
- ----------
- xs : paddle.Tensor
- Input tensor.
- masks : paddle.Tensor
- Mask tensor.
- cache : List[paddle.Tensor]
- List of cache tensors.
-
- Returns
- ----------
- paddle.Tensor
- Output tensor.
- paddle.Tensor
- Mask tensor.
- List[paddle.Tensor]
- List of new cache tensors.
+ Args:
+ xs (Tensor): Input tensor.
+ masks (Tensor): Mask tensor.
+ cache (List[Tensor]): List of cache tensors.
+
+ Returns:
+ Tensor: Output tensor.
+ Tensor: Mask tensor.
+ List[Tensor]: List of new cache tensors.
"""
xs = self.embed(xs)
@@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder):
class ConformerEncoder(BaseEncoder):
"""Conformer encoder module.
- Parameters
- ----------
- idim : int
- Input dimension.
- attention_dim : int
- Dimention of attention.
- attention_heads : int
- The number of heads of multi head attention.
- linear_units : int
- The number of units of position-wise feed forward.
- num_blocks : int
- The number of decoder blocks.
- dropout_rate : float
- Dropout rate.
- positional_dropout_rate : float
- Dropout rate after adding positional encoding.
- attention_dropout_rate : float
- Dropout rate in attention.
- input_layer : Union[str, nn.Layer]
- Input layer type.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
- positionwise_layer_type : str
- "linear", "conv1d", or "conv1d-linear".
- positionwise_conv_kernel_size : int
- Kernel size of positionwise conv1d layer.
- macaron_style : bool
- Whether to use macaron style for positionwise layer.
- pos_enc_layer_type : str
- Encoder positional encoding layer type.
- selfattention_layer_type : str
- Encoder attention layer type.
- activation_type : str
- Encoder activation function type.
- use_cnn_module : bool
- Whether to use convolution module.
- zero_triu : bool
- Whether to zero the upper triangular part of attention matrix.
- cnn_module_kernel : int
- Kernerl size of convolution module.
- padding_idx : int
- Padding idx for input_layer=embed.
- stochastic_depth_rate : float
- Maximum probability to skip the encoder layer.
- intermediate_layers : Union[List[int], None]
- indices of intermediate CTC layer.
- indices start from 1.
- if not None, intermediate outputs are returned (which changes return type
- signature.)
+
+ Args:
+ idim (int): Input dimension.
+ attention_dim (int): Dimension of attention.
+ attention_heads (int): The number of heads of multi head attention.
+ linear_units (int): The number of units of position-wise feed forward.
+ num_blocks (int): The number of decoder blocks.
+ dropout_rate (float): Dropout rate.
+ positional_dropout_rate (float): Dropout rate after adding positional encoding.
+ attention_dropout_rate (float): Dropout rate in attention.
+ input_layer (Union[str, nn.Layer]): Input layer type.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+ positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+ macaron_style (bool): Whether to use macaron style for positionwise layer.
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
+ selfattention_layer_type (str): Encoder attention layer type.
+ activation_type (str): Encoder activation function type.
+ use_cnn_module (bool): Whether to use convolution module.
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+ cnn_module_kernel (int): Kernel size of convolution module.
+ padding_idx (int): Padding idx for input_layer=embed.
+ stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+ intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. indices start from 1.
+ if not None, intermediate outputs are returned (which changes return type signature.)
"""
def __init__(
@@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder):
def forward(self, xs, masks):
"""Encode input sequence.
- Parameters
- ----------
- xs : paddle.Tensor
- Input tensor (#batch, time, idim).
- masks : paddle.Tensor
- Mask tensor (#batch, 1, time).
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, attention_dim).
- paddle.Tensor
- Mask tensor (#batch, 1, time).
+
+ Args:
+ xs (Tensor): Input tensor (#batch, time, idim).
+ masks (Tensor): Mask tensor (#batch, 1, time).
+ Returns:
+ Tensor: Output tensor (#batch, time, attention_dim).
+ Tensor: Mask tensor (#batch, 1, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py
index f55ded3d..72372b69 100644
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
@@ -20,25 +20,18 @@ from paddle import nn
class EncoderLayer(nn.Layer):
"""Encoder layer module.
- Parameters
- ----------
- size : int
- Input dimension.
- self_attn : nn.Layer
- Self-attention module instance.
- `MultiHeadedAttention` instance can be used as the argument.
- feed_forward : nn.Layer
- Feed-forward module instance.
- `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
- dropout_rate : float
- Dropout rate.
- normalize_before : bool
- Whether to use layer_norm before the first block.
- concat_after : bool
- Whether to concat attention layer's input and output.
- if True, additional linear will be applied.
- i.e. x -> x + linear(concat(x, att(x)))
- if False, no additional linear will be applied. i.e. x -> x + att(x)
+ Args:
+ size (int): Input dimension.
+ self_attn (nn.Layer): Self-attention module instance.
+ `MultiHeadedAttention` instance can be used as the argument.
+ feed_forward (nn.Layer): Feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+ dropout_rate (float): Dropout rate.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
"""
def __init__(
@@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer):
def forward(self, x, mask, cache=None):
"""Compute encoded features.
- Parameters
- ----------
- x_input : paddle.Tensor
- Input tensor (#batch, time, size).
- mask : paddle.Tensor
- Mask tensor for the input (#batch, time).
- cache : paddle.Tensor
- Cache tensor of the input (#batch, time - 1, size).
+ Args:
+ x(Tensor): Input tensor (#batch, time, size).
+ mask(Tensor): Mask tensor for the input (#batch, time).
+ cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size).
- Returns
- ----------
- paddle.Tensor
- Output tensor (#batch, time, size).
- paddle.Tensor
- Mask tensor (#batch, time).
+ Returns:
+ Tensor: Output tensor (#batch, time, size).
+ Tensor: Mask tensor (#batch, time).
"""
residual = x
if self.normalize_before:
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index ccf84c8a..9bcc1acf 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer):
This implementation is based on
https://github.com/pytorch/fairseq/tree/master/fairseq
- Parameters
- ----------
- wshare : int
- the number of kernel of convolution
- n_feat : int
- the number of features
- dropout_rate : float
- dropout_rate
- kernel_size : int
- kernel size (length)
- use_kernel_mask : bool
- Use causal mask or not for convolution kernel
- use_bias : bool
- Use bias term or not.
+ Args:
+ wshare (int): the number of kernel of convolution
+ n_feat (int): the number of features
+ dropout_rate (float): dropout_rate
+ kernel_size (int): kernel size (length)
+ use_kernel_mask (bool): Use causal mask or not for convolution kernel
+ use_bias (bool): Use bias term or not.
"""
@@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer):
This function takes query, key and value but uses only query.
This is just for compatibility with self-attention layer (attention.py)
- Parameters
- ----------
- query : paddle.Tensor
- (batch, time1, d_model) input tensor
- key : paddle.Tensor
- (batch, time2, d_model) NOT USED
- value : paddle.Tensor
- (batch, time2, d_model) NOT USED
- mask : paddle.Tensor
- (batch, time1, time2) mask
-
- Return
- ----------
- x : paddle.Tensor
- (batch, time1, d_model) ouput
+ Args:
+ query (Tensor): input tensor. (batch, time1, d_model)
+ key (Tensor): NOT USED. (batch, time2, d_model)
+ value (Tensor): NOT USED. (batch, time2, d_model)
+ mask (Tensor): (batch, time1, time2) mask
+
+ Return:
+ Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
index fd97b004..c10e6add 100644
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
@@ -17,19 +17,16 @@ import paddle
def subsequent_mask(size, dtype=paddle.bool):
"""Create mask for subsequent steps (size, size).
- Parameters
- ----------
- size : int
- size of mask
- dtype : paddle.dtype
- result dtype
- Return
- ----------
- paddle.Tensor
- >>> subsequent_mask(3)
- [[1, 0, 0],
- [1, 1, 0],
- [1, 1, 1]]
+
+ Args:
+ size (int): size of mask
+ dtype (paddle.dtype): result dtype
+ Return:
+ Tensor:
+ >>> subsequent_mask(3)
+ [[1, 0, 0],
+ [1, 1, 0],
+ [1, 1, 1]]
"""
ret = paddle.ones([size, size], dtype=dtype)
return paddle.tril(ret)
@@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool):
def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
"""Create mask for decoder self-attention.
- Parameters
- ----------
- ys_pad : paddle.Tensor
- batch of padded target sequences (B, Lmax)
- ignore_id : int
- index of padding
- dtype : torch.dtype
- result dtype
- Return
- ----------
- paddle.Tensor
- (B, Lmax, Lmax)
+ Args:
+ ys_pad (Tensor): batch of padded target sequences (B, Lmax)
+ ignore_id (int): index of padding
+ dtype (paddle.dtype): result dtype
+ Return:
+ Tensor: (B, Lmax, Lmax)
"""
ys_mask = ys_in_pad != ignore_id
m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
index df8929e3..d3285b65 100644
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
@@ -31,16 +31,11 @@ class MultiLayeredConv1d(nn.Layer):
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
"""Initialize MultiLayeredConv1d module.
- Parameters
- ----------
- in_chans : int
- Number of input channels.
- hidden_chans : int
- Number of hidden channels.
- kernel_size : int
- Kernel size of conv1d.
- dropout_rate : float
- Dropout rate.
+ Args:
+ in_chans (int): Number of input channels.
+ hidden_chans (int): Number of hidden channels.
+ kernel_size (int): Kernel size of conv1d.
+ dropout_rate (float): Dropout rate.
"""
super().__init__()
@@ -62,15 +57,11 @@ class MultiLayeredConv1d(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : paddle.Tensor
- Batch of input tensors (B, T, in_chans).
+ Args:
+ x (Tensor): Batch of input tensors (B, T, in_chans).
- Returns
- ----------
- paddle.Tensor
- Batch of output tensors (B, T, in_chans).
+ Returns:
+ Tensor: Batch of output tensors (B, T, in_chans).
"""
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
@@ -87,16 +78,11 @@ class Conv1dLinear(nn.Layer):
def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
"""Initialize Conv1dLinear module.
- Parameters
- ----------
- in_chans : int
- Number of input channels.
- hidden_chans : int
- Number of hidden channels.
- kernel_size : int
- Kernel size of conv1d.
- dropout_rate : float
- Dropout rate.
+ Args:
+ in_chans (int): Number of input channels.
+ hidden_chans (int): Number of hidden channels.
+ kernel_size (int): Kernel size of conv1d.
+ dropout_rate (float): Dropout rate.
"""
super().__init__()
self.w_1 = nn.Conv1D(
@@ -112,15 +98,11 @@ class Conv1dLinear(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
- Parameters
- ----------
- x : paddle.Tensor
- Batch of input tensors (B, T, in_chans).
+ Args:
+ x (Tensor): Batch of input tensors (B, T, in_chans).
- Returns
- ----------
- paddle.Tensor
- Batch of output tensors (B, T, in_chans).
+ Returns:
+ Tensor: Batch of output tensors (B, T, in_chans).
"""
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
index 28ed1c31..92af6851 100644
--- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
+++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
@@ -20,14 +20,10 @@ from paddle import nn
class PositionwiseFeedForward(nn.Layer):
"""Positionwise feed forward layer.
- Parameters
- ----------
- idim : int
- Input dimenstion.
- hidden_units : int
- The number of hidden units.
- dropout_rate : float
- Dropout rate.
+ Args:
+ idim (int): Input dimension.
+ hidden_units (int): The number of hidden units.
+ dropout_rate (float): Dropout rate.
"""
def __init__(self,
diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
index 0325a638..2073a78b 100644
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential):
def repeat(N, fn):
"""Repeat module N times.
- Parameters
- ----------
- N : int
- Number of repeat time.
- fn : Callable
- Function to generate module.
+ Args:
+ N (int): Number of repeat time.
+ fn (Callable): Function to generate module.
- Returns
- ----------
- MultiSequential
- Repeated model instance.
+ Returns:
+ MultiSequential: Repeated model instance.
"""
- return MultiSequential(* [fn(n) for n in range(N)])
+ return MultiSequential(*[fn(n) for n in range(N)])
diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py
index cf0fca8a..07439705 100644
--- a/paddlespeech/t2s/modules/transformer/subsampling.py
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class Conv2dSubsampling(nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
- Parameters
- ----------
- idim : int
- Input dimension.
- odim : int
- Output dimension.
- dropout_rate : float
- Dropout rate.
- pos_enc : nn.Layer
- Custom position encoding layer.
+
+ Args:
+ idim (int): Input dimension.
+ odim (int): Output dimension.
+ dropout_rate (float): Dropout rate.
+ pos_enc (nn.Layer): Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
@@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer):
def forward(self, x, x_mask):
"""Subsample x.
- Parameters
- ----------
- x : paddle.Tensor
- Input tensor (#batch, time, idim).
- x_mask : paddle.Tensor
- Input mask (#batch, 1, time).
- Returns
- ----------
- paddle.Tensor
- Subsampled tensor (#batch, time', odim),
- where time' = time // 4.
- paddle.Tensor
- Subsampled mask (#batch, 1, time'),
- where time' = time // 4.
+ Args:
+ x (Tensor): Input tensor (#batch, time, idim).
+ x_mask (Tensor): Input mask (#batch, 1, time).
+ Returns:
+ Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4.
+ Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py
index 82e30414..65e78a89 100644
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
@@ -27,17 +27,12 @@ class Stretch2D(nn.Layer):
def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"):
"""Strech an image (or image-like object) with some interpolation.
- Parameters
- ----------
- w_scale : int
- Scalar of width.
- h_scale : int
- Scalar of the height.
- mode : str, optional
- Interpolation mode, modes suppored are "nearest", "bilinear",
- "trilinear", "bicubic", "linear" and "area",by default "nearest"
-
- For more details about interpolation, see
+ Args:
+ w_scale (int): Scalar of width.
+ h_scale (int): Scalar of the height.
+ mode (str, optional): Interpolation mode, modes supported are "nearest", "bilinear",
+ "trilinear", "bicubic", "linear" and "area", by default "nearest"
+ For more details about interpolation, see
`paddle.nn.functional.interpolate `_.
"""
super().__init__()
@@ -47,16 +42,14 @@ class Stretch2D(nn.Layer):
def forward(self, x):
"""
- Parameters
- ----------
- x : Tensor
- Shape (N, C, H, W)
-
- Returns
- -------
- Tensor
- Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
- The stretched image.
+
+ Args:
+ x (Tensor): Shape (N, C, H, W)
+
+ Returns:
+ Tensor: The stretched image.
+ Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
+
"""
out = F.interpolate(
x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode)
@@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer):
"""A Layer to upsample spectrogram by applying consecutive stretch and
convolutions.
- Parameters
- ----------
- upsample_scales : List[int]
- Upsampling factors for each strech.
- nonlinear_activation : Optional[str], optional
- Activation after each convolution, by default None
- nonlinear_activation_params : Dict[str, Any], optional
- Parameters passed to construct the activation, by default {}
- interpolate_mode : str, optional
- Interpolation mode of the strech, by default "nearest"
- freq_axis_kernel_size : int, optional
- Convolution kernel size along the frequency axis, by default 1
- use_causal_conv : bool, optional
- Whether to use causal padding before convolution, by default False
-
- If True, Causal padding is used along the time axis, i.e. padding
- amount is ``receptive field - 1`` and 0 for before and after,
- respectively.
-
- If False, "same" padding is used along the time axis.
+ Args:
+ upsample_scales (List[int]): Upsampling factors for each stretch.
+ nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+ nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+ interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+ freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+ use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+ If True, Causal padding is used along the time axis,
+ i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
+ If False, "same" padding is used along the time axis.
"""
def __init__(self,
@@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer):
def forward(self, c):
"""
- Parameters
- ----------
- c : Tensor
- Shape (N, F, T), spectrogram
-
- Returns
- -------
- Tensor
- Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
- spectrogram
+ Args:
+ c (Tensor): spectrogram. Shape (N, F, T)
+
+ Returns:
+ Tensor: upsampled spectrogram.
+ Shape (N, F, T'), where ``T' = upsample_factor * T``,
"""
c = c.unsqueeze(1)
for f in self.up_layers:
@@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer):
class ConvInUpsampleNet(nn.Layer):
"""A Layer to upsample spectrogram composed of a convolution and an
UpsampleNet.
-
- Parameters
- ----------
- upsample_scales : List[int]
- Upsampling factors for each strech.
- nonlinear_activation : Optional[str], optional
- Activation after each convolution, by default None
- nonlinear_activation_params : Dict[str, Any], optional
- Parameters passed to construct the activation, by default {}
- interpolate_mode : str, optional
- Interpolation mode of the strech, by default "nearest"
- freq_axis_kernel_size : int, optional
- Convolution kernel size along the frequency axis, by default 1
- aux_channels : int, optional
- Feature size of the input, by default 80
- aux_context_window : int, optional
- Context window of the first 1D convolution applied to the input. It
- related to the kernel size of the convolution, by default 0
-
- If use causal convolution, the kernel size is ``window + 1``, else
- the kernel size is ``2 * window + 1``.
- use_causal_conv : bool, optional
- Whether to use causal padding before convolution, by default False
-
- If True, Causal padding is used along the time axis, i.e. padding
- amount is ``receptive field - 1`` and 0 for before and after,
- respectively.
-
- If False, "same" padding is used along the time axis.
+
+ Args:
+ upsample_scales (List[int]): Upsampling factors for each stretch.
+ nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+ nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+ interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+ freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+ aux_channels (int, optional): Feature size of the input, by default 80
+ aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It
+ related to the kernel size of the convolution, by default 0
+ If use causal convolution, the kernel size is ``window + 1``,
+ else the kernel size is ``2 * window + 1``.
+ use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+ If True, Causal padding is used along the time axis, i.e. padding
+ amount is ``receptive field - 1`` and 0 for before and after, respectively.
+ If False, "same" padding is used along the time axis.
"""
def __init__(self,
@@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer):
def forward(self, c):
"""
- Parameters
- ----------
- c : Tensor
- Shape (N, F, T), spectrogram
-
- Returns
- -------
- Tensors
- Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
- spectrogram
+ Args:
+ c (Tensor): spectrogram. Shape (N, F, T)
+
+ Returns:
+ Tensor: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``.
"""
c_ = self.conv_in(c)
c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
index de36db24..05a363ff 100644
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@@ -57,35 +57,30 @@ class ExperimentBase(object):
Feel free to add/overwrite other methods and standalone functions if you
need.
- Parameters
- ----------
- config: yacs.config.CfgNode
- The configuration used for the experiment.
-
- args: argparse.Namespace
- The parsed command line arguments.
-
- Examples
- --------
- >>> def main_sp(config, args):
- >>> exp = Experiment(config, args)
- >>> exp.setup()
- >>> exe.resume_or_load()
- >>> exp.run()
- >>>
- >>> config = get_cfg_defaults()
- >>> parser = default_argument_parser()
- >>> args = parser.parse_args()
- >>> if args.config:
- >>> config.merge_from_file(args.config)
- >>> if args.opts:
- >>> config.merge_from_list(args.opts)
- >>> config.freeze()
- >>>
- >>> if args.ngpu > 1:
- >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
- >>> else:
- >>> main_sp(config, args)
+ Args:
+ config (yacs.config.CfgNode): The configuration used for the experiment.
+ args (argparse.Namespace): The parsed command line arguments.
+
+ Examples:
+ >>> def main_sp(config, args):
+ >>> exp = Experiment(config, args)
+ >>> exp.setup()
+ >>> exe.resume_or_load()
+ >>> exp.run()
+ >>>
+ >>> config = get_cfg_defaults()
+ >>> parser = default_argument_parser()
+ >>> args = parser.parse_args()
+ >>> if args.config:
+ >>> config.merge_from_file(args.config)
+ >>> if args.opts:
+ >>> config.merge_from_list(args.opts)
+ >>> config.freeze()
+ >>>
+ >>> if args.ngpu > 1:
+ >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
+ >>> else:
+ >>> main_sp(config, args)
"""
def __init__(self, config, args):
diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py
index 3a86556b..5f8d3c45 100644
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
@@ -43,10 +43,8 @@ class Snapshot(extension.Extension):
parameters and optimizer states. If the updater inside the trainer
subclasses StandardUpdater, everything is good to go.
- Parameters
- ----------
- checkpoint_dir : Union[str, Path]
- The directory to save checkpoints into.
+ Args:
+ checkpoint_dir (Union[str, Path]): The directory to save checkpoints into.
"""
trigger = (1, 'epoch')
diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py
index 907e3daf..64274d53 100644
--- a/paddlespeech/t2s/training/optimizer.py
+++ b/paddlespeech/t2s/training/optimizer.py
@@ -26,10 +26,13 @@ optim_classes = dict(
sgd=paddle.optimizer.SGD, )
-def build_optimizers(model: nn.Layer,
- optim='adadelta',
- max_grad_norm=None,
- learning_rate=0.01) -> paddle.optimizer:
+def build_optimizers(
+ model: nn.Layer,
+ optim='adadelta',
+ max_grad_norm=None,
+ learning_rate=0.01,
+ weight_decay=None,
+ epsilon=1.0e-6, ) -> paddle.optimizer:
optim_class = optim_classes.get(optim)
if optim_class is None:
raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
@@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer,
grad_clip = None
if max_grad_norm:
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
- optim = optim_class(
- parameters=model.parameters(),
- learning_rate=learning_rate,
- grad_clip=grad_clip)
+ optim_dict = {}
+ optim_dict['parameters'] = model.parameters()
+ optim_dict['learning_rate'] = learning_rate
+ optim_dict['grad_clip'] = grad_clip
+ optim_dict['weight_decay'] = weight_decay
+ if optim not in {'momentum', 'sgd'}:
+ optim_dict['epsilon'] = epsilon
+ optimizers = optim_class(**optim_dict)
- optimizers = optim
return optimizers
diff --git a/paddlespeech/t2s/utils/__init__.py b/paddlespeech/t2s/utils/__init__.py
index ce3a4ef6..520c81a2 100644
--- a/paddlespeech/t2s/utils/__init__.py
+++ b/paddlespeech/t2s/utils/__init__.py
@@ -16,3 +16,7 @@ from . import display
from . import layer_tools
from . import mp_tools
from . import scheduler
+
+
+def str2bool(str):
+ return True if str.lower() == 'true' else False
diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py
index 7a9fe5ad..41b13b75 100644
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
@@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
"""Compute the levenshtein distance between reference sequence and
hypothesis sequence in word-level.
- Parameters
- ----------
- reference : str
- The reference sentence.
- hypothesis : str
- The hypothesis sentence.
- ignore_case : bool
- Whether case-sensitive or not.
- delimiter : char(str)
- Delimiter of input sentences.
-
- Returns
- ----------
- list
- Levenshtein distance and word number of reference sentence.
+ Args:
+ reference (str): The reference sentence.
+ hypothesis (str): The hypothesis sentence.
+ ignore_case (bool): Whether case-sensitive or not.
+ delimiter (char(str)): Delimiter of input sentences.
+
+ Returns:
+ list: Levenshtein distance and word number of reference sentence.
"""
if ignore_case:
reference = reference.lower()
@@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
"""Compute the levenshtein distance between reference sequence and
hypothesis sequence in char-level.
- Parameters
- ----------
- reference: str
- The reference sentence.
- hypothesis: str
- The hypothesis sentence.
- ignore_case: bool
- Whether case-sensitive or not.
- remove_space: bool
- Whether remove internal space characters
-
- Returns
- ----------
- list
- Levenshtein distance and length of reference sentence.
+ Args:
+ reference (str): The reference sentence.
+ hypothesis (str): The hypothesis sentence.
+ ignore_case (bool): Whether case-sensitive or not.
+ remove_space (bool): Whether remove internal space characters
+
+ Returns:
+ list: Levenshtein distance and length of reference sentence.
"""
if ignore_case:
reference = reference.lower()
@@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
We can use levenshtein distance to calculate WER. Please draw an attention
that empty items will be removed when splitting sentences by delimiter.
- Parameters
- ----------
- reference: str
- The reference sentence.
-
- hypothesis: str
- The hypothesis sentence.
- ignore_case: bool
- Whether case-sensitive or not.
- delimiter: char
- Delimiter of input sentences.
-
- Returns
- ----------
- float
- Word error rate.
-
- Raises
- ----------
- ValueError
- If word number of reference is zero.
+ Args:
+ reference (str): The reference sentence.
+ hypothesis (str): The hypothesis sentence.
+ ignore_case (bool): Whether case-sensitive or not.
+ delimiter (char): Delimiter of input sentences.
+
+ Returns:
+ float: Word error rate.
+
+ Raises:
+ ValueError: If word number of reference is zero.
"""
edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
delimiter)
@@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False):
space characters will be truncated and multiple consecutive space
characters in a sentence will be replaced by one space character.
- Parameters
- ----------
- reference: str
- The reference sentence.
- hypothesis: str
- The hypothesis sentence.
- ignore_case: bool
- Whether case-sensitive or not.
- remove_space: bool
- Whether remove internal space characters
-
- Returns
- ----------
- float
- Character error rate.
-
- Raises
- ----------
- ValueError
- If the reference length is zero.
+ Args:
+ reference (str): The reference sentence.
+ hypothesis (str): The hypothesis sentence.
+ ignore_case (bool): Whether case-sensitive or not.
+ remove_space (bool): Whether remove internal space characters
+
+ Returns:
+ float: Character error rate.
+
+ Raises:
+ ValueError: If the reference length is zero.
"""
edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
remove_space)
diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py
index d0e277db..75c2e448 100644
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py
@@ -23,18 +23,12 @@ import numpy as np
def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
"""Read a dataset from a HDF5 file.
+ Args:
+ filename (Union[Path, str]): Path of the HDF5 file.
+ dataset_name (str): Name of the dataset to read.
- Parameters
- ----------
- filename : Union[Path, str]
- Path of the HDF5 file.
- dataset_name : str
- Name of the dataset to read.
-
- Returns
- -------
- Any
- The retrieved dataset.
+ Returns:
+ Any: The retrieved dataset.
"""
filename = Path(filename)
@@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str],
write_data: np.ndarray,
is_overwrite: bool=True) -> None:
"""Write dataset to HDF5 file.
-
- Parameters
- ----------
- filename : Union[Path, str]
- Path of the HDF5 file.
- dataset_name : str
- Name of the dataset to write to.
- write_data : np.ndarrays
- The data to write.
- is_overwrite : bool, optional
- Whether to overwrite, by default True
+ Args:
+ filename (Union[Path, str]): Path of the HDF5 file.
+ dataset_name (str): Name of the dataset to write to.
+ write_data (np.ndarrays): The data to write.
+ is_overwrite (bool, optional): Whether to overwrite, by default True
"""
# convert to numpy array
filename = Path(filename)
diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py
index 0d730d66..22c25e17 100644
--- a/paddlespeech/text/exps/ernie_linear/train.py
+++ b/paddlespeech/text/exps/ernie_linear/train.py
@@ -135,9 +135,8 @@ def train_sp(args, config):
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
- trainer.extend(
- Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
- # print(trainer.extensions)
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
trainer.run()
diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py
index 2d6bbe34..1ab0419e 100644
--- a/paddlespeech/vector/exps/ge2e/audio_processor.py
+++ b/paddlespeech/vector/exps/ge2e/audio_processor.py
@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance.
- min_pad_coverage : int
+ min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames.
If at least of are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns
----------
- the waveform slices and mel spectrogram slices as lists of array slices.
+ the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances.
"""
assert 0 <= overlap < 1
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match
if source_sr is not None and source_sr != self.sampling_rate:
- wav = librosa.resample(wav, source_sr, self.sampling_rate)
+ wav = librosa.resample(
+ wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization
wav = normalize_volume(
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram(
- wav,
+ y=wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,
diff --git a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
index 194eb7f2..ae6f6ad9 100644
--- a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
+++ b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py
@@ -123,9 +123,3 @@ class Collate(object):
frame_clips = [self.random_crop(mel) for mel in examples]
batced_clips = np.stack(frame_clips)
return batced_clips
-
-
-if __name__ == "__main__":
- mydataset = MultiSpeakerMelDataset(
- Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
- print(mydataset.get_example_by_index(0, 10))
diff --git a/setup.py b/setup.py
index cdb899e4..71b6d528 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,8 @@ from setuptools.command.install import install
HERE = Path(os.path.abspath(os.path.dirname(__file__)))
+VERSION = '0.1.1'
+
requirements = {
"install": [
"editdistance",
@@ -85,6 +87,24 @@ requirements = {
}
+def write_version_py(filename='paddlespeech/__init__.py'):
+ import paddlespeech
+ if hasattr(paddlespeech,
+ "__version__") and paddlespeech.__version__ == VERSION:
+ return
+ with open(filename, "a") as f:
+ f.write(f"\n__version__ = '{VERSION}'\n")
+
+
+def remove_version_py(filename='paddlespeech/__init__.py'):
+ with open(filename, "r") as f:
+ lines = f.readlines()
+ with open(filename, "w") as f:
+ for line in lines:
+ if "__version__" not in line:
+ f.write(line)
+
+
@contextlib.contextmanager
def pushd(new_dir):
old_dir = os.getcwd()
@@ -172,10 +192,12 @@ class UploadCommand(Command):
sys.exit()
+write_version_py()
+
setup_info = dict(
# Metadata
name='paddlespeech',
- version='0.1.1',
+ version=VERSION,
author='PaddlePaddle Speech and Language Team',
author_email='paddlesl@baidu.com',
url='https://github.com/PaddlePaddle/PaddleSpeech',
@@ -238,3 +260,5 @@ setup_info = dict(
})
setup(**setup_info)
+
+remove_version_py()
diff --git a/setup_audio.py b/setup_audio.py
index 5f014065..21204998 100644
--- a/setup_audio.py
+++ b/setup_audio.py
@@ -13,14 +13,33 @@
# limitations under the License.
import setuptools
-import paddleaudio
-
# set the version here
-version = paddleaudio.__version__
+VERSION = '0.1.0'
+
+
+def write_version_py(filename='paddleaudio/__init__.py'):
+ import paddleaudio
+ if hasattr(paddleaudio,
+ "__version__") and paddleaudio.__version__ == VERSION:
+ return
+ with open(filename, "a") as f:
+ f.write(f"\n__version__ = '{VERSION}'\n")
+
+
+def remove_version_py(filename='paddleaudio/__init__.py'):
+ with open(filename, "r") as f:
+ lines = f.readlines()
+ with open(filename, "w") as f:
+ for line in lines:
+ if "__version__" not in line:
+ f.write(line)
+
+
+write_version_py()
setuptools.setup(
name="paddleaudio",
- version=version,
+ version=VERSION,
author="",
author_email="",
description="PaddleAudio, in development",
@@ -41,3 +60,5 @@ setuptools.setup(
'soundfile >= 0.9.0',
'colorlog',
], )
+
+remove_version_py()
diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt
new file mode 100644
index 00000000..e003136a
--- /dev/null
+++ b/speechx/CMakeLists.txt
@@ -0,0 +1,124 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(paddlespeech VERSION 0.1)
+
+set(CMAKE_VERBOSE_MAKEFILE on)
+# set std-14
+set(CMAKE_CXX_STANDARD 14)
+
+# include file
+include(FetchContent)
+include(ExternalProject)
+# fc_patch dir
+set(FETCHCONTENT_QUIET off)
+get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
+set(FETCHCONTENT_BASE_DIR ${fc_patch})
+
+
+###############################################################################
+# Option Configurations
+###############################################################################
+# option configurations
+option(TEST_DEBUG "option for debug" OFF)
+
+
+###############################################################################
+# Include third party
+###############################################################################
+# #example for include third party
+# FetchContent_Declare()
+# # FetchContent_MakeAvailable was not added until CMake 3.14
+# FetchContent_MakeAvailable()
+# include_directories()
+
+# ABSEIL-CPP
+include(FetchContent)
+FetchContent_Declare(
+ absl
+ GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
+ GIT_TAG "20210324.1"
+)
+FetchContent_MakeAvailable(absl)
+
+# libsndfile
+include(FetchContent)
+FetchContent_Declare(
+ libsndfile
+ GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
+ GIT_TAG "1.0.31"
+)
+FetchContent_MakeAvailable(libsndfile)
+
+# gflags
+FetchContent_Declare(
+ gflags
+ URL https://github.com/gflags/gflags/archive/v2.2.1.zip
+ URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a
+)
+FetchContent_MakeAvailable(gflags)
+include_directories(${gflags_BINARY_DIR}/include)
+
+# glog
+FetchContent_Declare(
+ glog
+ URL https://github.com/google/glog/archive/v0.4.0.zip
+ URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
+)
+FetchContent_MakeAvailable(glog)
+include_directories(${glog_BINARY_DIR})
+
+# gtest
+FetchContent_Declare(googletest
+ URL https://github.com/google/googletest/archive/release-1.10.0.zip
+ URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
+)
+FetchContent_MakeAvailable(googletest)
+
+# openfst
+set(openfst_SOURCE_DIR ${fc_patch}/openfst-src)
+set(openfst_BINARY_DIR ${fc_patch}/openfst-build)
+set(openfst_PREFIX_DIR ${fc_patch}/openfst-subbuild/openfst-populate-prefix)
+ExternalProject_Add(openfst
+ URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip
+ URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
+ SOURCE_DIR ${openfst_SOURCE_DIR}
+ BINARY_DIR ${openfst_BINARY_DIR}
+ CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR}
+ "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
+ "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
+ "LIBS=-lgflags_nothreads -lglog -lpthread"
+ BUILD_COMMAND make -j 4
+)
+add_dependencies(openfst gflags glog)
+link_directories(${openfst_PREFIX_DIR}/lib)
+include_directories(${openfst_PREFIX_DIR}/include)
+
+add_subdirectory(speechx)
+
+#openblas
+#set(OpenBLAS_INSTALL_PREFIX ${fc_patch}/OpenBLAS)
+#set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src)
+#ExternalProject_Add(
+# OpenBLAS
+# GIT_REPOSITORY https://github.com/xianyi/OpenBLAS
+# GIT_TAG v0.3.13
+# GIT_SHALLOW TRUE
+# GIT_PROGRESS TRUE
+# CONFIGURE_COMMAND ""
+# BUILD_IN_SOURCE TRUE
+# BUILD_COMMAND make USE_LOCKING=1 USE_THREAD=0
+# INSTALL_COMMAND make PREFIX=${OpenBLAS_INSTALL_PREFIX} install
+# UPDATE_DISCONNECTED TRUE
+#)
+
+###############################################################################
+# Add local library
+###############################################################################
+# system lib
+#find_package()
+# if dir have CmakeLists.txt
+#add_subdirectory(speechx)
+# if dir do not have CmakeLists.txt
+#add_library(lib_name STATIC file.cc)
+#target_link_libraries(lib_name item0 item1)
+#add_dependencies(lib_name depend-target)
\ No newline at end of file
diff --git a/speechx/docker/.gitkeep b/speechx/docker/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/examples/.gitkeep b/speechx/examples/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
new file mode 100644
index 00000000..71c7eb7c
--- /dev/null
+++ b/speechx/speechx/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(speechx LANGUAGES CXX)
+
+link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
+
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/kaldi
+)
+add_subdirectory(kaldi)
+
+add_executable(mfcc-test codelab/feat_test/feature-mfcc-test.cc)
+target_link_libraries(mfcc-test kaldi-mfcc)
diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h
new file mode 100644
index 00000000..1966c021
--- /dev/null
+++ b/speechx/speechx/base/basic_types.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "kaldi/base/kaldi-types.h"
+
+#include
+
+typedef float BaseFloat;
+typedef double double64;
+
+typedef signed char int8;
+typedef short int16;
+typedef int int32;
+
+#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
+typedef long int64;
+#else
+typedef long long int64;
+#endif
+
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+
+#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
+typedef unsigned long uint64;
+#else
+typedef unsigned long long uint64;
+#endif
+
+typedef signed int char32;
+
+const uint8 kuint8max = (( uint8) 0xFF);
+const uint16 kuint16max = ((uint16) 0xFFFF);
+const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
+const uint64 kuint64max = ((uint64) (0xFFFFFFFFFFFFFFFFLL));
+const int8 kint8min = (( int8) 0x80);
+const int8 kint8max = (( int8) 0x7F);
+const int16 kint16min = (( int16) 0x8000);
+const int16 kint16max = (( int16) 0x7FFF);
+const int32 kint32min = (( int32) 0x80000000);
+const int32 kint32max = (( int32) 0x7FFFFFFF);
+const int64 kint64min = (( int64) (0x8000000000000000LL));
+const int64 kint64max = (( int64) (0x7FFFFFFFFFFFFFFFLL));
+
+const BaseFloat kBaseFloatMax = std::numeric_limits::max();
+const BaseFloat kBaseFloatMin = std::numeric_limits::min();
diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h
new file mode 100644
index 00000000..c8d254d6
--- /dev/null
+++ b/speechx/speechx/base/macros.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace ppspeech {
+
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName&) = delete; \
+ void operator=(const TypeName&) = delete
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/codelab/README.md b/speechx/speechx/codelab/README.md
new file mode 100644
index 00000000..95c95db1
--- /dev/null
+++ b/speechx/speechx/codelab/README.md
@@ -0,0 +1,4 @@
+# codelab
+
+This directory is here for testing some functions temporarily.
+
diff --git a/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc b/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc
new file mode 100644
index 00000000..c4367139
--- /dev/null
+++ b/speechx/speechx/codelab/feat_test/feature-mfcc-test.cc
@@ -0,0 +1,686 @@
+// feat/feature-mfcc-test.cc
+
+// Copyright 2009-2011 Karel Vesely; Petr Motlicek
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include
+
+#include "feat/feature-mfcc.h"
+#include "base/kaldi-math.h"
+#include "matrix/kaldi-matrix-inl.h"
+#include "feat/wave-reader.h"
+
+using namespace kaldi;
+
+
+
+static void UnitTestReadWave() {
+
+ std::cout << "=== UnitTestReadWave() ===\n";
+
+ Vector v, v2;
+
+ std::cout << "<<<=== Reading waveform\n";
+
+ {
+ std::ifstream is("test_data/test.wav", std::ios_base::binary);
+ WaveData wave;
+ wave.Read(is);
+ const Matrix data(wave.Data());
+ KALDI_ASSERT(data.NumRows() == 1);
+ v.Resize(data.NumCols());
+ v.CopyFromVec(data.Row(0));
+ }
+
+ std::cout << "<<<=== Reading Vector waveform, prepared by matlab\n";
+ std::ifstream input(
+ "test_data/test_matlab.ascii"
+ );
+ KALDI_ASSERT(input.good());
+ v2.Read(input, false);
+ input.close();
+
+ std::cout << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n";
+ KALDI_ASSERT(v.Dim() == v2.Dim());
+ for (int32 i = 0; i < v.Dim(); i++) {
+ KALDI_ASSERT(v(i) == v2(i));
+ }
+ std::cout << "<<<=== Comparing done\n";
+
+ // std::cout << "== The Waveform Samples == \n";
+ // std::cout << v;
+
+ std::cout << "Test passed :)\n\n";
+
+}
+
+
+
+/**
+ */
+static void UnitTestSimple() {
+ std::cout << "=== UnitTestSimple() ===\n";
+
+ Vector v(100000);
+ Matrix m;
+
+ // init with noise
+ for (int32 i = 0; i < v.Dim(); i++) {
+ v(i) = (abs( i * 433024253 ) % 65535) - (65535 / 2);
+ }
+
+ std::cout << "<<<=== Just make sure it runs... Nothing is compared\n";
+ // the parametrization object
+ MfccOptions op;
+ // trying to have same opts as baseline.
+ op.frame_opts.dither = 0.0;
+ op.frame_opts.preemph_coeff = 0.0;
+ op.frame_opts.window_type = "rectangular";
+ op.frame_opts.remove_dc_offset = false;
+ op.frame_opts.round_to_power_of_two = true;
+ op.mel_opts.low_freq = 0.0;
+ op.mel_opts.htk_mode = true;
+ op.htk_compat = true;
+
+ Mfcc mfcc(op);
+ // use default parameters
+
+ // compute mfccs.
+ mfcc.Compute(v, 1.0, &m);
+
+ // possibly dump
+ // std::cout << "== Output features == \n" << m;
+ std::cout << "Test passed :)\n\n";
+}
+
+
+static void UnitTestHTKCompare1() {
+ std::cout << "=== UnitTestHTKCompare1() ===\n";
+
+ std::ifstream is("test_data/test.wav", std::ios_base::binary);
+ WaveData wave;
+ wave.Read(is);
+ KALDI_ASSERT(wave.Data().NumRows() == 1);
+ SubVector waveform(wave.Data(), 0);
+
+ // read the HTK features
+ Matrix htk_features;
+ {
+ std::ifstream is("test_data/test.wav.fea_htk.1",
+ std::ios::in | std::ios_base::binary);
+ bool ans = ReadHtk(is, &htk_features, 0);
+ KALDI_ASSERT(ans);
+ }
+
+ // use mfcc with default configuration...
+ MfccOptions op;
+ op.frame_opts.dither = 0.0;
+ op.frame_opts.preemph_coeff = 0.0;
+ op.frame_opts.window_type = "hamming";
+ op.frame_opts.remove_dc_offset = false;
+ op.frame_opts.round_to_power_of_two = true;
+ op.mel_opts.low_freq = 0.0;
+ op.mel_opts.htk_mode = true;
+ op.htk_compat = true;
+ op.use_energy = false; // C0 not energy.
+
+ Mfcc mfcc(op);
+
+ // calculate kaldi features
+ Matrix kaldi_raw_features;
+ mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
+
+ DeltaFeaturesOptions delta_opts;
+ Matrix kaldi_features;
+ ComputeDeltas(delta_opts,
+ kaldi_raw_features,
+ &kaldi_features);
+
+ // compare the results
+ bool passed = true;
+ int32 i_old = -1;
+ KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+ KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+ // Ignore ends-- we make slightly different choices than
+ // HTK about how to treat the deltas at the ends.
+ for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+ for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+ BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+ if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!!
+ // print the non-matching data only once per-line
+ if (i_old != i) {
+ std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+ std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+ i_old = i;
+ }
+ // print indices of non-matching cells
+ std::cout << "[" << i << ", " << j << "]";
+ passed = false;
+ }}}
+ if (!passed) KALDI_ERR << "Test failed";
+
+ // write the htk features for later inspection
+ HtkHeader header = {
+ kaldi_features.NumRows(),
+ 100000, // 10ms
+ static_cast(sizeof(float)*kaldi_features.NumCols()),
+ 021406 // MFCC_D_A_0
+ };
+ {
+ std::ofstream os("tmp.test.wav.fea_kaldi.1",
+ std::ios::out|std::ios::binary);
+ WriteHtk(os, kaldi_features, header);
+ }
+
+ std::cout << "Test passed :)\n\n";
+
+ unlink("tmp.test.wav.fea_kaldi.1");
+}
+
+
+static void UnitTestHTKCompare2() {
+  std::cout << "=== UnitTestHTKCompare2() ===\n";
+
+  std::ifstream is("test_data/test.wav", std::ios_base::binary);
+  WaveData wave;
+  wave.Read(is);
+  KALDI_ASSERT(wave.Data().NumRows() == 1);
+  SubVector<BaseFloat> waveform(wave.Data(), 0);
+
+  // read the HTK features
+  Matrix<BaseFloat> htk_features;
+  {
+    std::ifstream is("test_data/test.wav.fea_htk.2",
+                     std::ios::in | std::ios_base::binary);
+    bool ans = ReadHtk(is, &htk_features, 0);
+    KALDI_ASSERT(ans);
+  }
+
+  // use mfcc with default configuration...
+  MfccOptions op;
+  op.frame_opts.dither = 0.0;
+  op.frame_opts.preemph_coeff = 0.0;
+  op.frame_opts.window_type = "hamming";
+  op.frame_opts.remove_dc_offset = false;
+  op.frame_opts.round_to_power_of_two = true;
+  op.mel_opts.low_freq = 0.0;
+  op.mel_opts.htk_mode = true;
+  op.htk_compat = true;
+  op.use_energy = true;  // Use energy.
+
+  Mfcc mfcc(op);
+
+  // calculate kaldi features
+  Matrix<BaseFloat> kaldi_raw_features;
+  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
+
+  DeltaFeaturesOptions delta_opts;
+  Matrix<BaseFloat> kaldi_features;
+  ComputeDeltas(delta_opts,
+                kaldi_raw_features,
+                &kaldi_features);
+
+  // compare the results
+  bool passed = true;
+  int32 i_old = -1;
+  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+  // Ignore ends-- we make slightly different choices than
+  // HTK about how to treat the deltas at the ends.
+  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
+        // print the non-matching data only once per-line
+        if (i_old != i) {
+          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+          i_old = i;
+        }
+        // print indices of non-matching cells
+        std::cout << "[" << i << ", " << j << "]";
+        passed = false;
+      }}}
+  if (!passed) KALDI_ERR << "Test failed";
+
+  // write the htk features for later inspection
+  HtkHeader header = {
+    kaldi_features.NumRows(),
+    100000,  // 10ms
+    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
+    021406  // MFCC_D_A_0
+  };
+  {
+    std::ofstream os("tmp.test.wav.fea_kaldi.2",
+                     std::ios::out|std::ios::binary);
+    WriteHtk(os, kaldi_features, header);
+  }
+
+  std::cout << "Test passed :)\n\n";
+
+  unlink("tmp.test.wav.fea_kaldi.2");
+}
+
+
+static void UnitTestHTKCompare3() {
+  std::cout << "=== UnitTestHTKCompare3() ===\n";
+
+  std::ifstream is("test_data/test.wav", std::ios_base::binary);
+  WaveData wave;
+  wave.Read(is);
+  KALDI_ASSERT(wave.Data().NumRows() == 1);
+  SubVector<BaseFloat> waveform(wave.Data(), 0);
+
+  // read the HTK features
+  Matrix<BaseFloat> htk_features;
+  {
+    std::ifstream is("test_data/test.wav.fea_htk.3",
+                     std::ios::in | std::ios_base::binary);
+    bool ans = ReadHtk(is, &htk_features, 0);
+    KALDI_ASSERT(ans);
+  }
+
+  // use mfcc with default configuration...
+  MfccOptions op;
+  op.frame_opts.dither = 0.0;
+  op.frame_opts.preemph_coeff = 0.0;
+  op.frame_opts.window_type = "hamming";
+  op.frame_opts.remove_dc_offset = false;
+  op.frame_opts.round_to_power_of_two = true;
+  op.htk_compat = true;
+  op.use_energy = true;  // Use energy.
+  op.mel_opts.low_freq = 20.0;
+  //op.mel_opts.debug_mel = true;
+  op.mel_opts.htk_mode = true;
+
+  Mfcc mfcc(op);
+
+  // calculate kaldi features
+  Matrix<BaseFloat> kaldi_raw_features;
+  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
+
+  DeltaFeaturesOptions delta_opts;
+  Matrix<BaseFloat> kaldi_features;
+  ComputeDeltas(delta_opts,
+                kaldi_raw_features,
+                &kaldi_features);
+
+  // compare the results
+  bool passed = true;
+  int32 i_old = -1;
+  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+  // Ignore ends-- we make slightly different choices than
+  // HTK about how to treat the deltas at the ends.
+  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
+        // print the non-matching data only once per-line
+        if (static_cast<int32>(i_old) != i) {
+          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+          i_old = i;
+        }
+        // print indices of non-matching cells
+        std::cout << "[" << i << ", " << j << "]";
+        passed = false;
+      }}}
+  if (!passed) KALDI_ERR << "Test failed";
+
+  // write the htk features for later inspection
+  HtkHeader header = {
+    kaldi_features.NumRows(),
+    100000,  // 10ms
+    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
+    021406  // MFCC_D_A_0
+  };
+  {
+    std::ofstream os("tmp.test.wav.fea_kaldi.3",
+                     std::ios::out|std::ios::binary);
+    WriteHtk(os, kaldi_features, header);
+  }
+
+  std::cout << "Test passed :)\n\n";
+
+  unlink("tmp.test.wav.fea_kaldi.3");
+}
+
+
+static void UnitTestHTKCompare4() {
+  std::cout << "=== UnitTestHTKCompare4() ===\n";
+
+  std::ifstream is("test_data/test.wav", std::ios_base::binary);
+  WaveData wave;
+  wave.Read(is);
+  KALDI_ASSERT(wave.Data().NumRows() == 1);
+  SubVector<BaseFloat> waveform(wave.Data(), 0);
+
+  // read the HTK features
+  Matrix<BaseFloat> htk_features;
+  {
+    std::ifstream is("test_data/test.wav.fea_htk.4",
+                     std::ios::in | std::ios_base::binary);
+    bool ans = ReadHtk(is, &htk_features, 0);
+    KALDI_ASSERT(ans);
+  }
+
+  // use mfcc with default configuration...
+  MfccOptions op;
+  op.frame_opts.dither = 0.0;
+  op.frame_opts.window_type = "hamming";
+  op.frame_opts.remove_dc_offset = false;
+  op.frame_opts.round_to_power_of_two = true;
+  op.mel_opts.low_freq = 0.0;
+  op.htk_compat = true;
+  op.use_energy = true;  // Use energy.
+  op.mel_opts.htk_mode = true;
+
+  Mfcc mfcc(op);
+
+  // calculate kaldi features
+  Matrix<BaseFloat> kaldi_raw_features;
+  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
+
+  DeltaFeaturesOptions delta_opts;
+  Matrix<BaseFloat> kaldi_features;
+  ComputeDeltas(delta_opts,
+                kaldi_raw_features,
+                &kaldi_features);
+
+  // compare the results
+  bool passed = true;
+  int32 i_old = -1;
+  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+  // Ignore ends-- we make slightly different choices than
+  // HTK about how to treat the deltas at the ends.
+  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
+        // print the non-matching data only once per-line
+        if (static_cast<int32>(i_old) != i) {
+          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+          i_old = i;
+        }
+        // print indices of non-matching cells
+        std::cout << "[" << i << ", " << j << "]";
+        passed = false;
+      }}}
+  if (!passed) KALDI_ERR << "Test failed";
+
+  // write the htk features for later inspection
+  HtkHeader header = {
+    kaldi_features.NumRows(),
+    100000,  // 10ms
+    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
+    021406  // MFCC_D_A_0
+  };
+  {
+    std::ofstream os("tmp.test.wav.fea_kaldi.4",
+                     std::ios::out|std::ios::binary);
+    WriteHtk(os, kaldi_features, header);
+  }
+
+  std::cout << "Test passed :)\n\n";
+
+  unlink("tmp.test.wav.fea_kaldi.4");
+}
+
+
+static void UnitTestHTKCompare5() {
+  std::cout << "=== UnitTestHTKCompare5() ===\n";
+
+  std::ifstream is("test_data/test.wav", std::ios_base::binary);
+  WaveData wave;
+  wave.Read(is);
+  KALDI_ASSERT(wave.Data().NumRows() == 1);
+  SubVector<BaseFloat> waveform(wave.Data(), 0);
+
+  // read the HTK features
+  Matrix<BaseFloat> htk_features;
+  {
+    std::ifstream is("test_data/test.wav.fea_htk.5",
+                     std::ios::in | std::ios_base::binary);
+    bool ans = ReadHtk(is, &htk_features, 0);
+    KALDI_ASSERT(ans);
+  }
+
+  // use mfcc with default configuration...
+  MfccOptions op;
+  op.frame_opts.dither = 0.0;
+  op.frame_opts.window_type = "hamming";
+  op.frame_opts.remove_dc_offset = false;
+  op.frame_opts.round_to_power_of_two = true;
+  op.htk_compat = true;
+  op.use_energy = true;  // Use energy.
+  op.mel_opts.low_freq = 0.0;
+  op.mel_opts.vtln_low = 100.0;
+  op.mel_opts.vtln_high = 7500.0;
+  op.mel_opts.htk_mode = true;
+
+  BaseFloat vtln_warp = 1.1;  // our approach identical to htk for warp factor >1,
+                              // differs slightly for higher mel bins if warp_factor <0.9
+
+  Mfcc mfcc(op);
+
+  // calculate kaldi features
+  Matrix<BaseFloat> kaldi_raw_features;
+  mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features);
+
+  DeltaFeaturesOptions delta_opts;
+  Matrix<BaseFloat> kaldi_features;
+  ComputeDeltas(delta_opts,
+                kaldi_raw_features,
+                &kaldi_features);
+
+  // compare the results
+  bool passed = true;
+  int32 i_old = -1;
+  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+  // Ignore ends-- we make slightly different choices than
+  // HTK about how to treat the deltas at the ends.
+  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
+        // print the non-matching data only once per-line
+        if (static_cast<int32>(i_old) != i) {
+          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+          i_old = i;
+        }
+        // print indices of non-matching cells
+        std::cout << "[" << i << ", " << j << "]";
+        passed = false;
+      }}}
+  if (!passed) KALDI_ERR << "Test failed";
+
+  // write the htk features for later inspection
+  HtkHeader header = {
+    kaldi_features.NumRows(),
+    100000,  // 10ms
+    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
+    021406  // MFCC_D_A_0
+  };
+  {
+    std::ofstream os("tmp.test.wav.fea_kaldi.5",
+                     std::ios::out|std::ios::binary);
+    WriteHtk(os, kaldi_features, header);
+  }
+
+  std::cout << "Test passed :)\n\n";
+
+  unlink("tmp.test.wav.fea_kaldi.5");
+}
+
+static void UnitTestHTKCompare6() {
+  std::cout << "=== UnitTestHTKCompare6() ===\n";
+
+
+  std::ifstream is("test_data/test.wav", std::ios_base::binary);
+  WaveData wave;
+  wave.Read(is);
+  KALDI_ASSERT(wave.Data().NumRows() == 1);
+  SubVector<BaseFloat> waveform(wave.Data(), 0);
+
+  // read the HTK features
+  Matrix<BaseFloat> htk_features;
+  {
+    std::ifstream is("test_data/test.wav.fea_htk.6",
+                     std::ios::in | std::ios_base::binary);
+    bool ans = ReadHtk(is, &htk_features, 0);
+    KALDI_ASSERT(ans);
+  }
+
+  // use mfcc with default configuration...
+  MfccOptions op;
+  op.frame_opts.dither = 0.0;
+  op.frame_opts.preemph_coeff = 0.97;
+  op.frame_opts.window_type = "hamming";
+  op.frame_opts.remove_dc_offset = false;
+  op.frame_opts.round_to_power_of_two = true;
+  op.mel_opts.num_bins = 24;
+  op.mel_opts.low_freq = 125.0;
+  op.mel_opts.high_freq = 7800.0;
+  op.htk_compat = true;
+  op.use_energy = false;  // C0 not energy.
+
+  Mfcc mfcc(op);
+
+  // calculate kaldi features
+  Matrix<BaseFloat> kaldi_raw_features;
+  mfcc.Compute(waveform, 1.0, &kaldi_raw_features);
+
+  DeltaFeaturesOptions delta_opts;
+  Matrix<BaseFloat> kaldi_features;
+  ComputeDeltas(delta_opts,
+                kaldi_raw_features,
+                &kaldi_features);
+
+  // compare the results
+  bool passed = true;
+  int32 i_old = -1;
+  KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows());
+  KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols());
+  // Ignore ends-- we make slightly different choices than
+  // HTK about how to treat the deltas at the ends.
+  for (int32 i = 10; i+10 < kaldi_features.NumRows(); i++) {
+    for (int32 j = 0; j < kaldi_features.NumCols(); j++) {
+      BaseFloat a = kaldi_features(i, j), b = htk_features(i, j);
+      if ((std::abs(b - a)) > 1.0) {  //<< TOLERANCE TO DIFFERENCES!!!!!
+        // print the non-matching data only once per-line
+        if (static_cast<int32>(i_old) != i) {
+          std::cout << "\n\n\n[HTK-row: " << i << "] " << htk_features.Row(i) << "\n";
+          std::cout << "[Kaldi-row: " << i << "] " << kaldi_features.Row(i) << "\n\n\n";
+          i_old = i;
+        }
+        // print indices of non-matching cells
+        std::cout << "[" << i << ", " << j << "]";
+        passed = false;
+      }}}
+  if (!passed) KALDI_ERR << "Test failed";
+
+  // write the htk features for later inspection
+  HtkHeader header = {
+    kaldi_features.NumRows(),
+    100000,  // 10ms
+    static_cast<int16>(sizeof(float)*kaldi_features.NumCols()),
+    021406  // MFCC_D_A_0
+  };
+  {
+    std::ofstream os("tmp.test.wav.fea_kaldi.6",
+                     std::ios::out|std::ios::binary);
+    WriteHtk(os, kaldi_features, header);
+  }
+
+  std::cout << "Test passed :)\n\n";
+
+  unlink("tmp.test.wav.fea_kaldi.6");
+}
+
+void UnitTestVtln() {
+  // Test the function VtlnWarpFreq.
+  BaseFloat low_freq = 10, high_freq = 7800,
+      vtln_low_cutoff = 20, vtln_high_cutoff = 7400;
+
+  for (size_t i = 0; i < 100; i++) {
+    BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2;  // warp_factor in [0.9, 1.1)
+    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
+                                       low_freq, high_freq, warp_factor,
+                                       freq),
+                freq / warp_factor);  // middle of the range: pure scaling by 1/warp_factor.
+
+    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
+                                       low_freq, high_freq, warp_factor,
+                                       low_freq),
+                low_freq);  // endpoints must map to themselves...
+    AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
+                                       low_freq, high_freq, warp_factor,
+                                       high_freq),
+                high_freq);  // ... at both ends of the frequency range.
+    BaseFloat freq2 = low_freq + (high_freq-low_freq) * RandUniform(),
+        freq3 = freq2 + (high_freq-freq2) * RandUniform();  // freq3>=freq2
+    BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
+                                          low_freq, high_freq, warp_factor,
+                                          freq2);
+    BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
+                                          low_freq, high_freq, warp_factor,
+                                          freq3);
+    KALDI_ASSERT(w3 >= w2);  // increasing function.
+    BaseFloat w3dash = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,  // warp factor 1.0:
+                                              low_freq, high_freq, 1.0,
+                                              freq3);
+    AssertEqual(w3dash, freq3);  // identity warp must leave the frequency unchanged.
+  }
+}
+
+static void UnitTestFeat() {
+  UnitTestVtln();
+  UnitTestReadWave();
+  UnitTestSimple();
+  UnitTestHTKCompare1();
+  UnitTestHTKCompare2();
+  // commenting out this one as it doesn't compare right now I normalized
+  // the way the FFT bins are treated (removed offset of 0.5)... this seems
+  // to relate to the way frequency zero behaves.
+  UnitTestHTKCompare3();  // NOTE(review): comment above suggests this call was meant to be disabled -- confirm.
+  UnitTestHTKCompare4();
+  UnitTestHTKCompare5();
+  UnitTestHTKCompare6();
+  std::cout << "Tests succeeded.\n";
+}
+
+
+
+int main() {
+  try {
+    for (int i = 0; i < 5; i++)
+      UnitTestFeat();  // run the whole suite five times (UnitTestVtln uses RandUniform()).
+    std::cout << "Tests succeeded.\n";
+    return 0;
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return 1;  // non-zero exit on any test failure (KALDI_ERR throws).
+  }
+}
+
+
diff --git a/speechx/speechx/common/CMakeLists.txt b/speechx/speechx/common/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
new file mode 100644
index 00000000..259261bd
--- /dev/null
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -0,0 +1,2 @@
+aux_source_directory(. DIR_LIB_SRCS)
+add_library(decoder STATIC ${DIR_LIB_SRCS})
diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/kaldi/.gitkeep b/speechx/speechx/kaldi/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/kaldi/CMakeLists.txt b/speechx/speechx/kaldi/CMakeLists.txt
new file mode 100644
index 00000000..414a6fa0
--- /dev/null
+++ b/speechx/speechx/kaldi/CMakeLists.txt
@@ -0,0 +1,6 @@
+project(kaldi)
+
+add_subdirectory(base)
+add_subdirectory(util)
+add_subdirectory(feat)
+add_subdirectory(matrix)
diff --git a/speechx/speechx/kaldi/base/CMakeLists.txt b/speechx/speechx/kaldi/base/CMakeLists.txt
new file mode 100644
index 00000000..f738bf2d
--- /dev/null
+++ b/speechx/speechx/kaldi/base/CMakeLists.txt
@@ -0,0 +1,7 @@
+
+add_library(kaldi-base
+ io-funcs.cc
+ kaldi-error.cc
+ kaldi-math.cc
+ kaldi-utils.cc
+ timer.cc)
\ No newline at end of file
diff --git a/speechx/speechx/kaldi/base/io-funcs-inl.h b/speechx/speechx/kaldi/base/io-funcs-inl.h
new file mode 100644
index 00000000..b703ef5a
--- /dev/null
+++ b/speechx/speechx/kaldi/base/io-funcs-inl.h
@@ -0,0 +1,327 @@
+// base/io-funcs-inl.h
+
+// Copyright 2009-2011 Microsoft Corporation; Saarland University;
+// Jan Silovsky; Yanmin Qian;
+// Johns Hopkins University (Author: Daniel Povey)
+// 2016 Xiaohui Zhang
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_IO_FUNCS_INL_H_
+#define KALDI_BASE_IO_FUNCS_INL_H_ 1
+
+// Do not include this file directly. It is included by base/io-funcs.h
+
+#include <limits>
+#include <vector>
+
+namespace kaldi {
+
+// Template that covers integers.
+template<class T> void WriteBasicType(std::ostream &os,
+                                      bool binary, T t) {
+  // Compile time assertion that this is not called with a wrong type.
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  if (binary) {
+    char len_c = (std::numeric_limits<T>::is_signed ? 1 :  -1)
+        * static_cast<char>(sizeof(t));
+    os.put(len_c);
+    os.write(reinterpret_cast<const char *>(&t), sizeof(t));
+  } else {
+    if (sizeof(t) == 1)
+      os << static_cast<int16>(t) << " ";
+    else
+      os << t << " ";
+  }
+  if (os.fail()) {
+    KALDI_ERR << "Write failure in WriteBasicType.";
+  }
+}
+
+// Template that covers integers.
+template<class T> inline void ReadBasicType(std::istream &is,
+                                            bool binary, T *t) {
+  KALDI_PARANOID_ASSERT(t != NULL);
+  // Compile time assertion that this is not called with a wrong type.
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  if (binary) {
+    int len_c_in = is.get();
+    if (len_c_in == -1)
+      KALDI_ERR << "ReadBasicType: encountered end of stream.";
+    char len_c = static_cast<char>(len_c_in), len_c_expected
+        = (std::numeric_limits<T>::is_signed ? 1 :  -1)
+        * static_cast<char>(sizeof(*t));
+    if (len_c != len_c_expected) {
+      KALDI_ERR << "ReadBasicType: did not get expected integer type, "
+                << static_cast<int>(len_c)
+                << " vs. " << static_cast<int>(len_c_expected)
+                << ".  You can change this code to successfully"
+                << " read it later, if needed.";
+      // insert code here to read "wrong" type.  Might have a switch statement.
+    }
+    is.read(reinterpret_cast<char *>(t), sizeof(*t));
+  } else {
+    if (sizeof(*t) == 1) {
+      int16 i;
+      is >> i;
+      *t = i;
+    } else {
+      is >> *t;
+    }
+  }
+  if (is.fail()) {
+    KALDI_ERR << "Read failure in ReadBasicType, file position is "
+              << is.tellg() << ", next char is " << is.peek();
+  }
+}
+
+// Template that covers integers.
+template<class T>
+inline void WriteIntegerPairVector(std::ostream &os, bool binary,
+                                   const std::vector<std::pair<T, T> > &v) {
+  // Compile time assertion that this is not called with a wrong type.
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  if (binary) {
+    char sz = sizeof(T);  // this is currently just a check.
+    os.write(&sz, 1);
+    int32 vecsz = static_cast<int32>(v.size());
+    KALDI_ASSERT((size_t)vecsz == v.size());
+    os.write(reinterpret_cast<const char *>(&vecsz), sizeof(vecsz));
+    if (vecsz != 0) {
+      os.write(reinterpret_cast<const char *>(&(v[0])), sizeof(T) * vecsz * 2);
+    }
+  } else {
+    // focus here is on prettiness of text form rather than
+    // efficiency of reading-in.
+    // reading-in is dominated by low-level operations anyway:
+    // for efficiency use binary.
+    os << "[ ";
+    typename std::vector<std::pair<T, T> >::const_iterator iter = v.begin(),
+        end = v.end();
+    for (; iter != end; ++iter) {
+      if (sizeof(T) == 1)
+        os << static_cast<int16>(iter->first) << ','
+           << static_cast<int16>(iter->second) << ' ';
+      else
+        os << iter->first << ','
+           << iter->second << ' ';
+    }
+    os << "]\n";
+  }
+  if (os.fail()) {
+    KALDI_ERR << "Write failure in WriteIntegerPairVector.";
+  }
+}
+
+// Template that covers integers.
+template<class T>
+inline void ReadIntegerPairVector(std::istream &is, bool binary,
+                                  std::vector<std::pair<T, T> > *v) {
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  KALDI_ASSERT(v != NULL);
+  if (binary) {
+    int sz = is.peek();
+    if (sz == sizeof(T)) {
+      is.get();
+    } else {  // this is currently just a check.
+      KALDI_ERR << "ReadIntegerPairVector: expected to see type of size "
+                << sizeof(T) << ", saw instead " << sz << ", at file position "
+                << is.tellg();
+    }
+    int32 vecsz;
+    is.read(reinterpret_cast<char *>(&vecsz), sizeof(vecsz));
+    if (is.fail() || vecsz < 0) goto bad;
+    v->resize(vecsz);
+    if (vecsz > 0) {
+      is.read(reinterpret_cast<char *>(&((*v)[0])), sizeof(T)*vecsz*2);
+    }
+  } else {
+    std::vector<std::pair<T, T> > tmp_v;  // use temporary so v doesn't use extra memory
+                                          // due to resizing.
+    is >> std::ws;
+    if (is.peek() != static_cast<int>('[')) {
+      KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw "
+                << is.peek() << ", at file position " << is.tellg();
+    }
+    is.get();  // consume the '['.
+    is >> std::ws;  // consume whitespace.
+    while (is.peek() != static_cast<int>(']')) {
+      if (sizeof(T) == 1) {  // read/write chars as numbers.
+        int16 next_t1, next_t2;
+        is >> next_t1;
+        if (is.fail()) goto bad;
+        if (is.peek() != static_cast<int>(','))
+          KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw "
+                    << is.peek() << ", at file position " << is.tellg();
+        is.get();  // consume the ','.
+        is >> next_t2 >> std::ws;
+        if (is.fail()) goto bad;
+        else
+          tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2));
+      } else {
+        T next_t1, next_t2;
+        is >> next_t1;
+        if (is.fail()) goto bad;
+        if (is.peek() != static_cast<int>(','))
+          KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw "
+                    << is.peek() << ", at file position " << is.tellg();
+        is.get();  // consume the ','.
+        is >> next_t2 >> std::ws;
+        if (is.fail()) goto bad;
+        else
+          tmp_v.push_back(std::pair<T, T>(next_t1, next_t2));
+      }
+    }
+    is.get();  // get the final ']'.
+    *v = tmp_v;  // could use std::swap to use less temporary memory, but this
+                 // uses less permanent memory.
+  }
+  if (!is.fail()) return;
+bad:
+  KALDI_ERR << "ReadIntegerPairVector: read failure at file position "
+            << is.tellg();
+}
+
+template<class T> inline void WriteIntegerVector(std::ostream &os, bool binary,
+                                                 const std::vector<T> &v) {
+  // Compile time assertion that this is not called with a wrong type.
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  if (binary) {
+    char sz = sizeof(T);  // this is currently just a check.
+    os.write(&sz, 1);
+    int32 vecsz = static_cast<int32>(v.size());
+    KALDI_ASSERT((size_t)vecsz == v.size());
+    os.write(reinterpret_cast<const char *>(&vecsz), sizeof(vecsz));
+    if (vecsz != 0) {
+      os.write(reinterpret_cast<const char *>(&(v[0])), sizeof(T)*vecsz);
+    }
+  } else {
+    // focus here is on prettiness of text form rather than
+    // efficiency of reading-in.
+    // reading-in is dominated by low-level operations anyway:
+    // for efficiency use binary.
+    os << "[ ";
+    typename std::vector<T>::const_iterator iter = v.begin(), end = v.end();
+    for (; iter != end; ++iter) {
+      if (sizeof(T) == 1)
+        os << static_cast<int16>(*iter) << " ";
+      else
+        os << *iter << " ";
+    }
+    os << "]\n";
+  }
+  if (os.fail()) {
+    KALDI_ERR << "Write failure in WriteIntegerVector.";
+  }
+}
+
+
+template<class T> inline void ReadIntegerVector(std::istream &is,
+                                                bool binary,
+                                                std::vector<T> *v) {
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  KALDI_ASSERT(v != NULL);
+  if (binary) {
+    int sz = is.peek();
+    if (sz == sizeof(T)) {
+      is.get();
+    } else {  // this is currently just a check.
+      KALDI_ERR << "ReadIntegerVector: expected to see type of size "
+                << sizeof(T) << ", saw instead " << sz << ", at file position "
+                << is.tellg();
+    }
+    int32 vecsz;
+    is.read(reinterpret_cast<char *>(&vecsz), sizeof(vecsz));
+    if (is.fail() || vecsz < 0) goto bad;
+    v->resize(vecsz);
+    if (vecsz > 0) {
+      is.read(reinterpret_cast<char *>(&((*v)[0])), sizeof(T)*vecsz);
+    }
+  } else {
+    std::vector<T> tmp_v;  // use temporary so v doesn't use extra memory
+                           // due to resizing.
+    is >> std::ws;
+    if (is.peek() != static_cast<int>('[')) {
+      KALDI_ERR << "ReadIntegerVector: expected to see [, saw "
+                << is.peek() << ", at file position " << is.tellg();
+    }
+    is.get();  // consume the '['.
+    is >> std::ws;  // consume whitespace.
+    while (is.peek() != static_cast<int>(']')) {
+      if (sizeof(T) == 1) {  // read/write chars as numbers.
+        int16 next_t;
+        is >> next_t >> std::ws;
+        if (is.fail()) goto bad;
+        else
+          tmp_v.push_back((T)next_t);
+      } else {
+        T next_t;
+        is >> next_t >> std::ws;
+        if (is.fail()) goto bad;
+        else
+          tmp_v.push_back(next_t);
+      }
+    }
+    is.get();  // get the final ']'.
+    *v = tmp_v;  // could use std::swap to use less temporary memory, but this
+                 // uses less permanent memory.
+  }
+  if (!is.fail()) return;
+bad:
+  KALDI_ERR << "ReadIntegerVector: read failure at file position "
+            << is.tellg();
+}
+
+
+// Initialize an opened stream for writing by writing an optional binary
+// header and modifying the floating-point precision.
+inline void InitKaldiOutputStream(std::ostream &os, bool binary) {
+  // This does not throw exceptions (does not check for errors).
+  if (binary) {
+    os.put('\0');  // binary mode is marked by the two-byte header "\0B".
+    os.put('B');
+  }
+  // Note, in non-binary mode we may at some point want to mess with
+  // the precision a bit.
+  // 7 is a bit more than the precision of float..
+  if (os.precision() < 7)
+    os.precision(7);
+}
+
+/// Initialize an opened stream for reading by detecting the binary header and
+/// setting the "binary" value appropriately.
+inline bool InitKaldiInputStream(std::istream &is, bool *binary) {
+  // Sets the 'binary' variable.
+  // Throws exception in the very unusual situation that stream
+  // starts with '\0' but not then 'B'.
+
+  if (is.peek() == '\0') {  // seems to be binary
+    is.get();
+    if (is.peek() != 'B') {
+      return false;  // '\0' not followed by 'B': not a valid Kaldi header.
+    }
+    is.get();
+    *binary = true;
+    return true;
+  } else {
+    *binary = false;  // no "\0B" header: treat the stream as text.
+    return true;
+  }
+}
+
+} // end namespace kaldi.
+
+#endif // KALDI_BASE_IO_FUNCS_INL_H_
diff --git a/speechx/speechx/kaldi/base/io-funcs.cc b/speechx/speechx/kaldi/base/io-funcs.cc
new file mode 100644
index 00000000..150f7409
--- /dev/null
+++ b/speechx/speechx/kaldi/base/io-funcs.cc
@@ -0,0 +1,218 @@
+// base/io-funcs.cc
+
+// Copyright 2009-2011 Microsoft Corporation; Saarland University
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/io-funcs.h"
+#include "base/kaldi-math.h"
+
+namespace kaldi {
+
+template<>
+void WriteBasicType<bool>(std::ostream &os, bool binary, bool b) {
+  os << (b ? "T":"F");
+  if (!binary) os << " ";
+  if (os.fail())
+    KALDI_ERR << "Write failure in WriteBasicType<bool>";
+}
+
+template<>
+void ReadBasicType<bool>(std::istream &is, bool binary, bool *b) {
+  KALDI_PARANOID_ASSERT(b != NULL);
+  if (!binary) is >> std::ws;  // eat up whitespace.
+  char c = is.peek();
+  if (c == 'T') {
+    *b = true;
+    is.get();
+  } else if (c == 'F') {
+    *b = false;
+    is.get();
+  } else {
+    KALDI_ERR << "Read failure in ReadBasicType<bool>, file position is "
+              << is.tellg() << ", next char is " << CharToString(c);
+  }
+}
+
+template<>
+void WriteBasicType<float>(std::ostream &os, bool binary, float f) {
+  if (binary) {
+    char c = sizeof(f);
+    os.put(c);
+    os.write(reinterpret_cast<const char *>(&f), sizeof(f));
+  } else {
+    os << f << " ";
+  }
+}
+
+template<>
+void WriteBasicType<double>(std::ostream &os, bool binary, double f) {
+  if (binary) {
+    char c = sizeof(f);
+    os.put(c);
+    os.write(reinterpret_cast<const char *>(&f), sizeof(f));
+  } else {
+    os << f << " ";
+  }
+}
+
+template<>
+void ReadBasicType<float>(std::istream &is, bool binary, float *f) {
+  KALDI_PARANOID_ASSERT(f != NULL);
+  if (binary) {
+    double d;
+    int c = is.peek();
+    if (c == sizeof(*f)) {
+      is.get();
+      is.read(reinterpret_cast<char *>(f), sizeof(*f));
+    } else if (c == sizeof(d)) {
+      ReadBasicType(is, binary, &d);
+      *f = d;
+    } else {
+      KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek()
+                << ", at file position " << is.tellg();
+    }
+  } else {
+    is >> *f;
+  }
+  if (is.fail()) {
+    KALDI_ERR << "ReadBasicType: failed to read, at file position "
+              << is.tellg();
+  }
+}
+
+template<>
+void ReadBasicType<double>(std::istream &is, bool binary, double *d) {
+  KALDI_PARANOID_ASSERT(d != NULL);
+  if (binary) {
+    float f;
+    int c = is.peek();
+    if (c == sizeof(*d)) {
+      is.get();
+      is.read(reinterpret_cast<char *>(d), sizeof(*d));
+    } else if (c == sizeof(f)) {
+      ReadBasicType(is, binary, &f);
+      *d = f;
+    } else {
+      KALDI_ERR << "ReadBasicType: expected float, saw " << is.peek()
+                << ", at file position " << is.tellg();
+    }
+  } else {
+    is >> *d;
+  }
+  if (is.fail()) {
+    KALDI_ERR << "ReadBasicType: failed to read, at file position "
+              << is.tellg();
+  }
+}
+
+void CheckToken(const char *token) {
+  if (*token == '\0')
+    KALDI_ERR << "Token is empty (not a valid token)";
+  const char *orig_token = token;  // kept so the error below can print the whole token.
+  while (*token != '\0') {
+    if (::isspace(*token))
+      KALDI_ERR << "Token is not a valid token (contains space): '"
+                << orig_token << "'";
+    token++;
+  }
+}
+
+void WriteToken(std::ostream &os, bool binary, const char *token) {
+  // binary mode is ignored;
+  // we use space as termination character in either case.
+  KALDI_ASSERT(token != NULL);
+  CheckToken(token);  // make sure it's valid (can be read back)
+  os << token << " ";  // trailing space is the token terminator for ReadToken.
+  if (os.fail()) {
+    KALDI_ERR << "Write failure in WriteToken.";
+  }
+}
+
+int Peek(std::istream &is, bool binary) {
+  if (!binary) is >> std::ws;  // eat up whitespace.
+  return is.peek();  // next character without consuming it; EOF at end of stream.
+}
+
+void WriteToken(std::ostream &os, bool binary, const std::string & token) {
+  WriteToken(os, binary, token.c_str());  // forward to the const char * overload.
+}
+
+void ReadToken(std::istream &is, bool binary, std::string *str) {
+  KALDI_ASSERT(str != NULL);
+  if (!binary) is >> std::ws;  // consume whitespace.
+  is >> *str;
+  if (is.fail()) {
+    KALDI_ERR << "ReadToken, failed to read token at file position "
+              << is.tellg();
+  }
+  if (!isspace(is.peek())) {
+    KALDI_ERR << "ReadToken, expected space after token, saw instead "
+              << CharToString(static_cast<char>(is.peek()))
+              << ", at file position " << is.tellg();
+  }
+  is.get();  // consume the space.
+}
+
+int PeekToken(std::istream &is, bool binary) {
+  if (!binary) is >> std::ws;  // consume whitespace.
+  bool read_bracket;
+  if (static_cast<char>(is.peek()) == '<') {
+    read_bracket = true;
+    is.get();
+  } else {
+    read_bracket = false;
+  }
+  int ans = is.peek();
+  if (read_bracket) {
+    if (!is.unget()) {
+      // Clear the bad bit.  This code can be (and is in fact) reached, since the
+      // C++ standard does not guarantee that a call to unget() must succeed.
+      is.clear();
+    }
+  }
+  return ans;
+}
+
+
+void ExpectToken(std::istream &is, bool binary, const char *token) {
+  int pos_at_start = is.tellg();
+  KALDI_ASSERT(token != NULL);
+  CheckToken(token);  // make sure it's valid (can be read back)
+  if (!binary) is >> std::ws;  // consume whitespace.
+  std::string str;
+  is >> str;
+  is.get();  // consume the space.
+  if (is.fail()) {
+    KALDI_ERR << "Failed to read token [started at file position "
+              << pos_at_start << "], expected " << token;
+  }
+  // The second half of the '&&' expression below is so that if we're expecting
+  // "<Foo>", we will accept "Foo>" instead.  This is so that the model-reading
+  // code will tolerate errors in PeekToken where is.unget() failed; search for
+  // is.clear() in PeekToken() for an explanation.
+  if (strcmp(str.c_str(), token) != 0 &&
+      !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) {
+    KALDI_ERR << "Expected token \"" << token << "\", got instead \""
+              << str <<"\".";
+  }
+}
+
+void ExpectToken(std::istream &is, bool binary, const std::string &token) {
+  ExpectToken(is, binary, token.c_str());  // forward to the const char * overload.
+}
+
+} // end namespace kaldi
diff --git a/speechx/speechx/kaldi/base/io-funcs.h b/speechx/speechx/kaldi/base/io-funcs.h
new file mode 100644
index 00000000..895f661e
--- /dev/null
+++ b/speechx/speechx/kaldi/base/io-funcs.h
@@ -0,0 +1,245 @@
+// base/io-funcs.h
+
+// Copyright 2009-2011 Microsoft Corporation; Saarland University;
+// Jan Silovsky; Yanmin Qian
+// 2016 Xiaohui Zhang
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_IO_FUNCS_H_
+#define KALDI_BASE_IO_FUNCS_H_
+
+// This header only contains some relatively low-level I/O functions.
+// The full Kaldi I/O declarations are in ../util/kaldi-io.h
+// and ../util/kaldi-table.h
+// They were put in util/ in order to avoid making the Matrix library
+// dependent on them.
+
+#include <cctype>
+#include <vector>
+#include <string>
+
+#include "base/kaldi-common.h"
+#include "base/io-funcs-inl.h"
+
+namespace kaldi {
+
+
+
+/*
+ This comment describes the Kaldi approach to I/O. All objects can be written
+ and read in two modes: binary and text. In addition we want to make the I/O
+ work if we redefine the typedef "BaseFloat" between floats and doubles.
+ We also want to have control over whitespace in text mode without affecting
+ the meaning of the file, for pretty-printing purposes.
+
+ Errors are handled by throwing a KaldiFatalError exception.
+
+ For integer and floating-point types (and boolean values):
+
+ WriteBasicType(std::ostream &, bool binary, const T&);
+ ReadBasicType(std::istream &, bool binary, T*);
+
+ and we expect these functions to be defined in such a way that they work when
+ the type T changes between float and double, so you can read float into double
+ and vice versa]. Note that for efficiency and space-saving reasons, the Vector
+ and Matrix classes do not use these functions [but they preserve the type
+ interchangeability in their own way]
+
+ For a class (or struct) C:
+ class C {
+ ..
+ Write(std::ostream &, bool binary, [possibly extra optional args for specific classes]) const;
+ Read(std::istream &, bool binary, [possibly extra optional args for specific classes]);
+ ..
+ }
+ NOTE: The only actual optional args we used are the "add" arguments in
+ Vector/Matrix classes, which specify whether we should sum the data already
+ in the class with the data being read.
+
+ For types which are typedef's involving stl classes, I/O is as follows:
+ typedef std::vector<std::pair<int32, int32> > MyTypedefName;
+
+ The user should define something like:
+
+ WriteMyTypedefName(std::ostream &, bool binary, const MyTypedefName &t);
+ ReadMyTypedefName(std::ostream &, bool binary, MyTypedefName *t);
+
+ The user would have to write these functions.
+
+ For a type std::vector<int32>:
+
+ void WriteIntegerVector(std::ostream &os, bool binary, const std::vector<int32> &v);
+ void ReadIntegerVector(std::istream &is, bool binary, std::vector<int32> *v);
+
+ For other types, e.g. vectors of pairs, the user should create a routine of the
+ type WriteMyTypedefName. This is to avoid introducing confusing templated functions;
+ we could easily create templated functions to handle most of these cases but they
+ would have to share the same name.
+
+ It also often happens that the user needs to write/read special tokens as part
+ of a file. These might be class headers, or separators/identifiers in the class.
+ We provide special functions for manipulating these. These special tokens must
+ be nonempty and must not contain any whitespace.
+
+ void WriteToken(std::ostream &os, bool binary, const char*);
+ void WriteToken(std::ostream &os, bool binary, const std::string & token);
+ int Peek(std::istream &is, bool binary);
+ void ReadToken(std::istream &is, bool binary, std::string *str);
+ void PeekToken(std::istream &is, bool binary, std::string *str);
+
+ WriteToken writes the token and one space (whether in binary or text mode).
+
+ Peek returns the first character of the next token, by consuming whitespace
+ (in text mode) and then returning the peek() character. It returns -1 at EOF;
+ it doesn't throw. It's useful if a class can have various forms based on
+ typedefs and virtual classes, and wants to know which version to read.
+
+ ReadToken allows the caller to obtain the next token. PeekToken works just
+ like ReadToken, but seeks back to the beginning of the token. A subsequent
+ call to ReadToken will read the same token again. This is useful when
+ different object types are written to the same file; using PeekToken one can
+ decide which of the objects to read.
+
+ There is currently no special functionality for writing/reading strings (where the strings
+ contain data rather than "special tokens" that are whitespace-free and nonempty). This is
+ because Kaldi is structured in such a way that strings don't appear, except as OpenFst symbol
+ table entries (and these have their own format).
+
+
+ NOTE: you should not call ReadIntegerType and WriteIntegerType with types,
+ such as int and size_t, that are machine-independent -- at least not
+ if you want your file formats to port between machines. Use int32 and
+ int64 where necessary. There is no way to detect this using compile-time
+ assertions because C++ only keeps track of the internal representation of
+ the type.
+*/
+
+/// \addtogroup io_funcs_basic
+/// @{
+
+
+/// WriteBasicType is the name of the write function for bool, integer types,
+/// and floating-point types. They all throw on error.
+template<class T> void WriteBasicType(std::ostream &os, bool binary, T t);
+
+/// ReadBasicType is the name of the read function for bool, integer types,
+/// and floating-point types. They all throw on error.
+template<class T> void ReadBasicType(std::istream &is, bool binary, T *t);
+
+
+// Declare specialization for bool.
+template<>
+void WriteBasicType<bool>(std::ostream &os, bool binary, bool b);
+
+template <>
+void ReadBasicType<bool>(std::istream &is, bool binary, bool *b);
+
+// Declare specializations for float and double.
+template<>
+void WriteBasicType<float>(std::ostream &os, bool binary, float f);
+
+template<>
+void WriteBasicType<double>(std::ostream &os, bool binary, double f);
+
+template<>
+void ReadBasicType<float>(std::istream &is, bool binary, float *f);
+
+template<>
+void ReadBasicType<double>(std::istream &is, bool binary, double *f);
+
+// Define ReadBasicType that accepts an "add" parameter to add to
+// the destination. Caution: if used in Read functions, be careful
+// to initialize the parameters concerned to zero in the default
+// constructor.
+template<class T>
+inline void ReadBasicType(std::istream &is, bool binary, T *t, bool add) {
+ if (!add) {
+ ReadBasicType(is, binary, t);
+ } else {
+ T tmp = T(0);
+ ReadBasicType(is, binary, &tmp);
+ *t += tmp;
+ }
+}
+
+/// Function for writing STL vectors of integer types.
+template<class T> inline void WriteIntegerVector(std::ostream &os, bool binary,
+ const std::vector<T> &v);
+
+/// Function for reading STL vector of integer types.
+template<class T> inline void ReadIntegerVector(std::istream &is, bool binary,
+ std::vector<T> *v);
+
+/// Function for writing STL vectors of pairs of integer types.
+template<class T>
+inline void WriteIntegerPairVector(std::ostream &os, bool binary,
+ const std::vector<std::pair<T, T> > &v);
+
+/// Function for reading STL vector of pairs of integer types.
+template<class T>
+inline void ReadIntegerPairVector(std::istream &is, bool binary,
+ std::vector<std::pair<T, T> > *v);
+
+/// The WriteToken functions are for writing nonempty sequences of non-space
+/// characters. They are not for general strings.
+void WriteToken(std::ostream &os, bool binary, const char *token);
+void WriteToken(std::ostream &os, bool binary, const std::string & token);
+
+/// Peek consumes whitespace (if binary == false) and then returns the peek()
+/// value of the stream.
+int Peek(std::istream &is, bool binary);
+
+/// ReadToken gets the next token and puts it in str (exception on failure). If
+/// PeekToken() had been previously called, it is possible that the stream had
+/// failed to unget the starting '<' character. In this case ReadToken() returns
+/// the token string without the leading '<'. You must be prepared to handle
+/// this case. ExpectToken() handles this internally, and is not affected.
+void ReadToken(std::istream &is, bool binary, std::string *token);
+
+/// PeekToken will return the first character of the next token, or -1 if end of
+/// file. It's the same as Peek(), except if the first character is '<' it will
+/// skip over it and will return the next character. It will attempt to unget
+/// the '<' so the stream is where it was before you did PeekToken(), however,
+/// this is not guaranteed (see ReadToken()).
+int PeekToken(std::istream &is, bool binary);
+
+/// ExpectToken tries to read in the given token, and throws an exception
+/// on failure.
+void ExpectToken(std::istream &is, bool binary, const char *token);
+void ExpectToken(std::istream &is, bool binary, const std::string & token);
+
+/// ExpectPretty attempts to read the text in "token", but only in non-binary
+/// mode. Throws exception on failure. It expects an exact match except that
+/// arbitrary whitespace matches arbitrary whitespace.
+void ExpectPretty(std::istream &is, bool binary, const char *token);
+void ExpectPretty(std::istream &is, bool binary, const std::string & token);
+
+/// @} end "addtogroup io_funcs_basic"
+
+
+/// InitKaldiOutputStream initializes an opened stream for writing by writing an
+/// optional binary header and modifying the floating-point precision; it will
+/// typically not be called by users directly.
+inline void InitKaldiOutputStream(std::ostream &os, bool binary);
+
+/// InitKaldiInputStream initializes an opened stream for reading by detecting
+/// the binary header and setting the "binary" value appropriately;
+/// It will typically not be called by users directly.
+inline bool InitKaldiInputStream(std::istream &is, bool *binary);
+
+} // end namespace kaldi.
+#endif // KALDI_BASE_IO_FUNCS_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-common.h b/speechx/speechx/kaldi/base/kaldi-common.h
new file mode 100644
index 00000000..264565d1
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-common.h
@@ -0,0 +1,41 @@
+// base/kaldi-common.h
+
+// Copyright 2009-2011 Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_COMMON_H_
+#define KALDI_BASE_KALDI_COMMON_H_ 1
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring> // C string stuff like strcpy
+#include <string>
+#include <sstream>
+#include <stdexcept>
+#include <cassert>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+#include "base/kaldi-utils.h"
+#include "base/kaldi-error.h"
+#include "base/kaldi-types.h"
+#include "base/io-funcs.h"
+#include "base/kaldi-math.h"
+#include "base/timer.h"
+
+#endif // KALDI_BASE_KALDI_COMMON_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-error.cc b/speechx/speechx/kaldi/base/kaldi-error.cc
new file mode 100644
index 00000000..2dbc7318
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-error.cc
@@ -0,0 +1,245 @@
+// base/kaldi-error.cc
+
+// Copyright 2019 LAIX (Yi Sun)
+// Copyright 2019 SmartAction LLC (kkm)
+// Copyright 2016 Brno University of Technology (author: Karel Vesely)
+// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h> // To get stack trace in error messages.
+// If this #include fails there is an error in the Makefile, it does not
+// support your platform well. Make sure HAVE_EXECINFO_H is undefined,
+// and the code will compile.
+#ifdef HAVE_CXXABI_H
+#include <cxxabi.h> // For name demangling.
+// Useful to decode the stack trace, but only used if we have execinfo.h
+#endif // HAVE_CXXABI_H
+#endif // HAVE_EXECINFO_H
+
+#include "base/kaldi-common.h"
+#include "base/kaldi-error.h"
+#include "base/version.h"
+
+namespace kaldi {
+
+/***** GLOBAL VARIABLES FOR LOGGING *****/
+
+int32 g_kaldi_verbose_level = 0;
+static std::string program_name;
+static LogHandler log_handler = NULL;
+
+void SetProgramName(const char *basename) {
+ // Using the 'static std::string' for the program name is mostly harmless,
+ // because (a) Kaldi logging is undefined before main(), and (b) no stdc++
+ // string implementation has been found in the wild that would not be just
+ // an empty string when zero-initialized but not yet constructed.
+ program_name = basename;
+}
+
+/***** HELPER FUNCTIONS *****/
+
+// Trim filename to at most 1 trailing directory long. Given a filename like
+// "/a/b/c/d/e/f.cc", return "e/f.cc". Support both '/' and '\' as the path
+// separator.
+static const char *GetShortFileName(const char *path) {
+ if (path == nullptr)
+ return "";
+
+ const char *prev = path, *last = path;
+ while ((path = std::strpbrk(path, "\\/")) != nullptr) {
+ ++path;
+ prev = last;
+ last = path;
+ }
+ return prev;
+}
+
+/***** STACK TRACE *****/
+
+namespace internal {
+bool LocateSymbolRange(const std::string &trace_name, size_t *begin,
+ size_t *end) {
+ // Find the first '_' with leading ' ' or '('.
+ *begin = std::string::npos;
+ for (size_t i = 1; i < trace_name.size(); i++) {
+ if (trace_name[i] != '_') {
+ continue;
+ }
+ if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') {
+ *begin = i;
+ break;
+ }
+ }
+ if (*begin == std::string::npos) {
+ return false;
+ }
+ *end = trace_name.find_first_of(" +", *begin);
+ return *end != std::string::npos;
+}
+} // namespace internal
+
+#ifdef HAVE_EXECINFO_H
+static std::string Demangle(std::string trace_name) {
+#ifndef HAVE_CXXABI_H
+ return trace_name;
+#else // HAVE_CXXABI_H
+ // Try demangle the symbol. We are trying to support the following formats
+ // produced by different platforms:
+ //
+ // Linux:
+ // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
+ //
+ // Mac:
+ // 0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
+ //
+ // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and
+ // demangle it info a readable name like kaldi::UnitTextError.
+ size_t begin, end;
+ if (!internal::LocateSymbolRange(trace_name, &begin, &end)) {
+ return trace_name;
+ }
+ std::string symbol = trace_name.substr(begin, end - begin);
+ int status;
+ char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status);
+ if (status == 0 && demangled_name != nullptr) {
+ symbol = demangled_name;
+ free(demangled_name);
+ }
+ return trace_name.substr(0, begin) + symbol +
+ trace_name.substr(end, std::string::npos);
+#endif // HAVE_CXXABI_H
+}
+#endif // HAVE_EXECINFO_H
+
+static std::string KaldiGetStackTrace() {
+ std::string ans;
+#ifdef HAVE_EXECINFO_H
+ const size_t KALDI_MAX_TRACE_SIZE = 50;
+ const size_t KALDI_MAX_TRACE_PRINT = 50; // Must be even.
+ // Buffer for the trace.
+ void *trace[KALDI_MAX_TRACE_SIZE];
+ // Get the trace.
+ size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE);
+ // Get the trace symbols.
+ char **trace_symbol = backtrace_symbols(trace, size);
+ if (trace_symbol == NULL)
+ return ans;
+
+ // Compose a human-readable backtrace string.
+ ans += "[ Stack-Trace: ]\n";
+ if (size <= KALDI_MAX_TRACE_PRINT) {
+ for (size_t i = 0; i < size; i++) {
+ ans += Demangle(trace_symbol[i]) + "\n";
+ }
+ } else { // Print out first+last (e.g.) 5.
+ for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT / 2; i++) {
+ ans += Demangle(trace_symbol[i]) + "\n";
+ }
+ ans += ".\n.\n.\n";
+ for (size_t i = size - KALDI_MAX_TRACE_PRINT / 2; i < size; i++) {
+ ans += Demangle(trace_symbol[i]) + "\n";
+ }
+ if (size == KALDI_MAX_TRACE_SIZE)
+ ans += ".\n.\n.\n"; // Stack was too long, probably a bug.
+ }
+
+ // We must free the array of pointers allocated by backtrace_symbols(),
+ // but not the strings themselves.
+ free(trace_symbol);
+#endif // HAVE_EXECINFO_H
+ return ans;
+}
+
+/***** KALDI LOGGING *****/
+
+MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity,
+ const char *func, const char *file, int32 line) {
+ // Obviously, we assume the strings survive the destruction of this object.
+ envelope_.severity = severity;
+ envelope_.func = func;
+ envelope_.file = GetShortFileName(file); // Points inside 'file'.
+ envelope_.line = line;
+}
+
+void MessageLogger::LogMessage() const {
+ // Send to the logging handler if provided.
+ if (log_handler != NULL) {
+ log_handler(envelope_, GetMessage().c_str());
+ return;
+ }
+
+ // Otherwise, use the default Kaldi logging.
+ // Build the log-message header.
+ std::stringstream full_message;
+ if (envelope_.severity > LogMessageEnvelope::kInfo) {
+ full_message << "VLOG[" << envelope_.severity << "] (";
+ } else {
+ switch (envelope_.severity) {
+ case LogMessageEnvelope::kInfo:
+ full_message << "LOG (";
+ break;
+ case LogMessageEnvelope::kWarning:
+ full_message << "WARNING (";
+ break;
+ case LogMessageEnvelope::kAssertFailed:
+ full_message << "ASSERTION_FAILED (";
+ break;
+ case LogMessageEnvelope::kError:
+ default: // If not the ERROR, it still an error!
+ full_message << "ERROR (";
+ break;
+ }
+ }
+ // Add other info from the envelope and the message text.
+ full_message << program_name.c_str() << "[" KALDI_VERSION "]" << ':'
+ << envelope_.func << "():" << envelope_.file << ':'
+ << envelope_.line << ") " << GetMessage().c_str();
+
+ // Add stack trace for errors and assertion failures, if available.
+ if (envelope_.severity < LogMessageEnvelope::kWarning) {
+ const std::string &stack_trace = KaldiGetStackTrace();
+ if (!stack_trace.empty()) {
+ full_message << "\n\n" << stack_trace;
+ }
+ }
+
+ // Print the complete message to stderr.
+ full_message << "\n";
+ std::cerr << full_message.str();
+}
+
+/***** KALDI ASSERTS *****/
+
+void KaldiAssertFailure_(const char *func, const char *file, int32 line,
+ const char *cond_str) {
+ MessageLogger::Log() =
+ MessageLogger(LogMessageEnvelope::kAssertFailed, func, file, line)
+ << "Assertion failed: (" << cond_str << ")";
+ fflush(NULL); // Flush all pending buffers, abort() may not flush stderr.
+ std::abort();
+}
+
+/***** THIRD-PARTY LOG-HANDLER *****/
+
+LogHandler SetLogHandler(LogHandler handler) {
+ LogHandler old_handler = log_handler;
+ log_handler = handler;
+ return old_handler;
+}
+
+} // namespace kaldi
diff --git a/speechx/speechx/kaldi/base/kaldi-error.h b/speechx/speechx/kaldi/base/kaldi-error.h
new file mode 100644
index 00000000..a9904a75
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-error.h
@@ -0,0 +1,231 @@
+// base/kaldi-error.h
+
+// Copyright 2019 LAIX (Yi Sun)
+// Copyright 2019 SmartAction LLC (kkm)
+// Copyright 2016 Brno University of Technology (author: Karel Vesely)
+// Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget;
+// Saarland University
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_ERROR_H_
+#define KALDI_BASE_KALDI_ERROR_H_ 1
+
+#include <cstdio>
+#include <cstring>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "base/kaldi-types.h"
+#include "base/kaldi-utils.h"
+/* Important that this file does not depend on any other kaldi headers. */
+
+#ifdef _MSC_VER
+#define __func__ __FUNCTION__
+#endif
+
+namespace kaldi {
+
+/// \addtogroup error_group
+/// @{
+
+/***** PROGRAM NAME AND VERBOSITY LEVEL *****/
+
+/// Called by ParseOptions to set base name (no directory) of the executing
+/// program. The name is printed in logging code along with every message,
+/// because in our scripts, we often mix together the stderr of many programs.
+/// This function is very thread-unsafe.
+void SetProgramName(const char *basename);
+
+/// This is set by util/parse-options.{h,cc} if you set --verbose=? option.
+/// Do not use directly, prefer {Get,Set}VerboseLevel().
+extern int32 g_kaldi_verbose_level;
+
+/// Get verbosity level, usually set via command line '--verbose=' switch.
+inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; }
+
+/// This should be rarely used, except by programs using Kaldi as library;
+/// command-line programs set the verbose level automatically from ParseOptions.
+inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; }
+
+/***** KALDI LOGGING *****/
+
+/// Log message severity and source location info.
+struct LogMessageEnvelope {
+ /// Message severity. In addition to these levels, positive values (1 to 6)
+ /// specify verbose logging level. Verbose messages are produced only when
+ /// SetVerboseLevel() has been called to set logging level to at least the
+ /// corresponding value.
+ enum Severity {
+ kAssertFailed = -3, //!< Assertion failure. abort() will be called.
+ kError = -2, //!< Fatal error. KaldiFatalError will be thrown.
+ kWarning = -1, //!< Indicates a recoverable but abnormal condition.
+ kInfo = 0, //!< Informational message.
+ };
+ int severity; //!< A Severity value, or positive verbosity level.
+ const char *func; //!< Name of the function invoking the logging.
+ const char *file; //!< Source file name with up to 1 leading directory.
+ int32 line; //!< Line number.
+};
+
+/***** KALDI FATAL ERROR *****/
+
+/// Kaldi fatal runtime error exception. It is thrown from any use of the
+/// KALDI_ERR logging macro after the logging function, either set by
+/// SetLogHandler(), or the Kaldi's internal one, has returned.
+class KaldiFatalError : public std::runtime_error {
+public:
+ explicit KaldiFatalError(const std::string &message)
+ : std::runtime_error(message) {}
+ explicit KaldiFatalError(const char *message) : std::runtime_error(message) {}
+
+ /// Returns the Kaldi error message logged by KALDI_ERR macro.
+ const char *KaldiMessage() const { return what() + 9; }
+};
+
+/// The class MessageLogger is invoked from the KALDI_ASSERT, KALDI_ERR,
+/// KALDI_WARN, and KALDI_VLOG macros. It formats the message, then either
+/// prints it to stderr or passes to the custom logging handler if provided.
+class MessageLogger {
+public:
+ /// The constructor stores the message's "envelope", a set of data which
+ /// identifies the conditions upon the message creation.
+ MessageLogger(LogMessageEnvelope::Severity severity, const char *func,
+ const char *file, int32 line);
+
+ // The hidden messaging stream. Be careful not to call methods on it.
+ template <typename T> MessageLogger &operator<<(const T &val) {
+ ss_ << val;
+ return *this;
+ }
+
+ // When assigned a MessageLogger, log its contents.
+ struct Log final {
+ void operator=(const MessageLogger &logger) { logger.LogMessage(); }
+ };
+
+ // When assigned a MessageLogger, log its contents and then throw
+ // a KaldiFatalError.
+ struct LogAndThrow final {
+ [[noreturn]] void operator=(const MessageLogger &logger) {
+ logger.LogMessage();
+ throw KaldiFatalError(logger.GetMessage());
+ }
+ };
+
+private:
+ std::string GetMessage() const { return ss_.str(); }
+ void LogMessage() const;
+
+ LogMessageEnvelope envelope_;
+ std::ostringstream ss_;
+};
+
+// Logging macros.
+#define KALDI_ERR \
+ ::kaldi::MessageLogger::LogAndThrow() = ::kaldi::MessageLogger( \
+ ::kaldi::LogMessageEnvelope::kError, __func__, __FILE__, __LINE__)
+#define KALDI_WARN \
+ ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger( \
+ ::kaldi::LogMessageEnvelope::kWarning, __func__, __FILE__, __LINE__)
+#define KALDI_LOG \
+ ::kaldi::MessageLogger::Log() = ::kaldi::MessageLogger( \
+ ::kaldi::LogMessageEnvelope::kInfo, __func__, __FILE__, __LINE__)
+#define KALDI_VLOG(v) \
+ if ((v) <= ::kaldi::GetVerboseLevel()) \
+ ::kaldi::MessageLogger::Log() = \
+ ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \
+ __func__, __FILE__, __LINE__)
+
+/***** KALDI ASSERTS *****/
+
+[[noreturn]] void KaldiAssertFailure_(const char *func, const char *file,
+ int32 line, const char *cond_str);
+
+// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT:
+//
+// A single block {} around if /else does not work, because it causes
+// syntax error (unmatched else block) in the following code:
+//
+// if (condition)
+// KALDI_ASSERT(condition2);
+// else
+// SomethingElse();
+//
+// do {} while(0) -- note there is no semicolon at the end! -- works nicely,
+// and compilers will be able to optimize the loop away (as the condition
+// is always false).
+//
+// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, and
+// KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, also defined
+// there.
+#ifndef NDEBUG
+#define KALDI_ASSERT(cond) \
+ do { \
+ if (cond) \
+ (void)0; \
+ else \
+ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \
+ } while (0)
+#else
+#define KALDI_ASSERT(cond) (void)0
+#endif
+
+// Some more expensive asserts only checked if this defined.
+#ifdef KALDI_PARANOID
+#define KALDI_PARANOID_ASSERT(cond) \
+ do { \
+ if (cond) \
+ (void)0; \
+ else \
+ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \
+ } while (0)
+#else
+#define KALDI_PARANOID_ASSERT(cond) (void)0
+#endif
+
+/***** THIRD-PARTY LOG-HANDLER *****/
+
+/// Type of third-party logging function.
+typedef void (*LogHandler)(const LogMessageEnvelope &envelope,
+ const char *message);
+
+/// Set logging handler. If called with a non-NULL function pointer, the
+/// function pointed by it is called to send messages to a caller-provided log.
+/// If called with a NULL pointer, restores default Kaldi error logging to
+/// stderr. This function is obviously not thread safe; the log handler must be.
+/// Returns a previously set logging handler pointer, or NULL.
+LogHandler SetLogHandler(LogHandler);
+
+/// @} end "addtogroup error_group"
+
+// Functions within internal is exported for testing only, do not use.
+namespace internal {
+bool LocateSymbolRange(const std::string &trace_name, size_t *begin,
+ size_t *end);
+} // namespace internal
+} // namespace kaldi
+
+#endif // KALDI_BASE_KALDI_ERROR_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-math.cc b/speechx/speechx/kaldi/base/kaldi-math.cc
new file mode 100644
index 00000000..484c80d4
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-math.cc
@@ -0,0 +1,162 @@
+// base/kaldi-math.cc
+
+// Copyright 2009-2011 Microsoft Corporation; Yanmin Qian;
+// Saarland University; Jan Silovsky
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-math.h"
+#ifndef _MSC_VER
+#include <stdlib.h>
+#include <unistd.h>
+#endif
+#include <string>
+#include <mutex>
+
+namespace kaldi {
+// These routines are tested in matrix/matrix-test.cc
+
+int32 RoundUpToNearestPowerOfTwo(int32 n) {
+ KALDI_ASSERT(n > 0);
+ n--;
+ n |= n >> 1;
+ n |= n >> 2;
+ n |= n >> 4;
+ n |= n >> 8;
+ n |= n >> 16;
+ return n+1;
+}
+
+static std::mutex _RandMutex;
+
+int Rand(struct RandomState* state) {
+#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS)
+ // On Windows and Cygwin, just call Rand()
+ return rand();
+#else
+ if (state) {
+ return rand_r(&(state->seed));
+ } else {
+ std::lock_guard<std::mutex> lock(_RandMutex);
+ return rand();
+ }
+#endif
+}
+
+RandomState::RandomState() {
+ // we initialize it as Rand() + 27437 instead of just Rand(), because on some
+ // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be
+ // the case that rand_r when initialized with rand() will give you the exact
+ // same sequence of numbers that rand() will give if you keep calling rand()
+ // after that initial call. This can cause problems with repeated sequences.
+ // For example if you initialize two RandomState structs one after the other
+ // without calling rand() in between, they would give you the same sequence
+ // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just
+ // a randomly chosen prime number.
+ seed = Rand() + 27437;
+}
+
+bool WithProb(BaseFloat prob, struct RandomState* state) {
+ KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0,
+ // but we allow slightly larger values that could arise from roundoff in
+ // previous calculations.
+ KALDI_COMPILE_TIME_ASSERT(RAND_MAX > 128 * 128);
+ if (prob == 0) return false;
+ else if (prob == 1.0) return true;
+ else if (prob * RAND_MAX < 128.0) {
+ // prob is very small but nonzero, and the "main algorithm"
+ // wouldn't work that well. So: with probability 1/128, we
+ // return WithProb (prob * 128), else return false.
+ if (Rand(state) < RAND_MAX / 128) { // with probability 128...
+ // Note: we know that prob * 128.0 < 1.0, because
+ // we asserted RAND_MAX > 128 * 128.
+ return WithProb(prob * 128.0);
+ } else {
+ return false;
+ }
+ } else {
+ return (Rand(state) < ((RAND_MAX + static_cast<BaseFloat>(1.0)) * prob));
+ }
+}
+
+int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) {
+ // This is not exact.
+ KALDI_ASSERT(max_val >= min_val);
+ if (max_val == min_val) return min_val;
+
+#ifdef _MSC_VER
+ // RAND_MAX is quite small on Windows -> may need to handle larger numbers.
+ if (RAND_MAX > (max_val-min_val)*8) {
+ // *8 to avoid large inaccuracies in probability, from the modulus...
+ return min_val +
+ ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val));
+ } else {
+ if ((unsigned int)(RAND_MAX*RAND_MAX) >
+ (unsigned int)((max_val+1-min_val)*8)) {
+ // *8 to avoid inaccuracies in probability, from the modulus...
+ return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state)))
+ % (unsigned int)(max_val+1-min_val));
+ } else {
+ KALDI_ERR << "rand_int failed because we do not support such large "
+ "random numbers. (Extend this function).";
+ }
+ }
+#else
+ return min_val +
+ (static_cast<int32>(Rand(state)) % static_cast<int32>(max_val+1-min_val));
+#endif
+}
+
+// Returns poisson-distributed random number.
+// Take care: this takes time proportional
+// to lambda. Faster algorithms exist but are more complex.
+int32 RandPoisson(float lambda, struct RandomState* state) {
+ // Knuth's algorithm.
+ KALDI_ASSERT(lambda >= 0);
+ float L = expf(-lambda), p = 1.0;
+ int32 k = 0;
+ do {
+ k++;
+ float u = RandUniform(state);
+ p *= u;
+ } while (p > L);
+ return k-1;
+}
+
+void RandGauss2(float *a, float *b, RandomState *state) {
+ KALDI_ASSERT(a);
+ KALDI_ASSERT(b);
+ float u1 = RandUniform(state);
+ float u2 = RandUniform(state);
+ u1 = sqrtf(-2.0f * logf(u1));
+ u2 = 2.0f * M_PI * u2;
+ *a = u1 * cosf(u2);
+ *b = u1 * sinf(u2);
+}
+
+void RandGauss2(double *a, double *b, RandomState *state) {
+ KALDI_ASSERT(a);
+ KALDI_ASSERT(b);
+ float a_float, b_float;
+ // Just because we're using doubles doesn't mean we need super-high-quality
+ // random numbers, so we just use the floating-point version internally.
+ RandGauss2(&a_float, &b_float, state);
+ *a = a_float;
+ *b = b_float;
+}
+
+
+} // end namespace kaldi
diff --git a/speechx/speechx/kaldi/base/kaldi-math.h b/speechx/speechx/kaldi/base/kaldi-math.h
new file mode 100644
index 00000000..93c265ee
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-math.h
@@ -0,0 +1,363 @@
+// base/kaldi-math.h
+
+// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Yanmin Qian;
+// Jan Silovsky; Saarland University
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_MATH_H_
+#define KALDI_BASE_KALDI_MATH_H_ 1
+
+#ifdef _MSC_VER
+#include <float.h>
+#endif
+
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "base/kaldi-types.h"
+#include "base/kaldi-common.h"
+
+
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+#ifndef FLT_EPSILON
+#define FLT_EPSILON 1.19209290e-7f
+#endif
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
+#ifndef M_SQRT2
+#define M_SQRT2 1.4142135623730950488016887
+#endif
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+#ifndef M_SQRT1_2
+#define M_SQRT1_2 0.7071067811865475244008443621048490
+#endif
+
+#ifndef M_LOG_2PI
+#define M_LOG_2PI 1.8378770664093454835606594728112
+#endif
+
+#ifndef M_LN2
+#define M_LN2 0.693147180559945309417232121458
+#endif
+
+#ifndef M_LN10
+#define M_LN10 2.302585092994045684017991454684
+#endif
+
+
+#define KALDI_ISNAN std::isnan
+#define KALDI_ISINF std::isinf
+#define KALDI_ISFINITE(x) std::isfinite(x)
+
+#if !defined(KALDI_SQR)
+# define KALDI_SQR(x) ((x) * (x))
+#endif
+
+namespace kaldi {
+
+// Exp() overloads.  expf() is avoided where it is missing (KALDI_NO_EXPF)
+// or known-buggy (MSVC 18.0 x64); those paths compute in double instead.
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+inline double Exp(double x) { return exp(x); }
+#ifndef KALDI_NO_EXPF
+inline float Exp(float x) { return expf(x); }
+#else
+inline float Exp(float x) { return exp(static_cast<double>(x)); }
+#endif // KALDI_NO_EXPF
+#else
+inline double Exp(double x) { return exp(x); }
+#if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64)
+// Microsoft CL v18.0 buggy 64-bit implementation of
+// expf() incorrectly returns -inf for exp(-inf).
+inline float Exp(float x) { return exp(static_cast<double>(x)); }
+#else
+inline float Exp(float x) { return expf(x); }
+#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64)
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+inline double Log(double x) { return log(x); }
+inline float Log(float x) { return logf(x); }
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1700)
+inline double Log1p(double x) { return log1p(x); }
+inline float Log1p(float x) { return log1pf(x); }
+#else
+// Fallback for compilers without log1p(): for tiny x use the second-order
+// Taylor expansion of log(1+x), where forming 1.0 + x directly would lose
+// precision.
+inline double Log1p(double x) {
+  const double cutoff = 1.0e-08;
+  if (x < cutoff)
+    return x - 0.5 * x * x;
+  else
+    return Log(1.0 + x);
+}
+
+inline float Log1p(float x) {
+  const float cutoff = 1.0e-07;
+  if (x < cutoff)
+    return x - 0.5 * x * x;
+  else
+    return Log(1.0 + x);
+}
+#endif
+
+// Below this difference, exp(diff) underflows and LogAdd just returns the
+// larger argument.
+static const double kMinLogDiffDouble = Log(DBL_EPSILON); // negative!
+static const float kMinLogDiffFloat = Log(FLT_EPSILON); // negative!
+
+// -infinity
+const float kLogZeroFloat = -std::numeric_limits<float>::infinity();
+const double kLogZeroDouble = -std::numeric_limits<double>::infinity();
+const BaseFloat kLogZeroBaseFloat = -std::numeric_limits<BaseFloat>::infinity();
+
+// Returns a random integer between 0 and RAND_MAX, inclusive
+int Rand(struct RandomState* state = NULL);
+
+// State for thread-safe random number generator
+struct RandomState {
+  RandomState();
+  unsigned seed;
+};
+
+// Returns a random integer between first and last inclusive.
+int32 RandInt(int32 first, int32 last, struct RandomState* state = NULL);
+
+// Returns true with probability "prob",
+bool WithProb(BaseFloat prob, struct RandomState* state = NULL);
+// with 0 <= prob <= 1 [we check this].
+// Internally calls Rand(). This function is carefully implemented so
+// that it should work even if prob is very small.
+
+/// Returns a random number strictly between 0 and 1.
+/// The +1 / +2 offsets keep the result away from exactly 0 and 1, so
+/// callers may safely take its log.
+inline float RandUniform(struct RandomState* state = NULL) {
+  return static_cast<float>((Rand(state) + 1.0) / (RAND_MAX+2.0));
+}
+
+/// Returns a standard-normal sample via the Box-Muller transform.
+inline float RandGauss(struct RandomState* state = NULL) {
+  return static_cast<float>(sqrtf (-2 * Log(RandUniform(state)))
+                            * cosf(2*M_PI*RandUniform(state)));
+}
+
+// Also see Vector::RandCategorical().
+
+// This is a randomized pruning mechanism that preserves expectations,
+// that we typically use to prune posteriors.  Values with |post| below
+// prune_thresh are randomly either zeroed or promoted to +/-prune_thresh,
+// with probabilities chosen so that E[result] == post.
+template <class Float>
+inline Float RandPrune(Float post, BaseFloat prune_thresh,
+                       struct RandomState* state = NULL) {
+  KALDI_ASSERT(prune_thresh >= 0.0);
+  if (post == 0.0 || std::abs(post) >= prune_thresh)
+    return post;
+  return (post >= 0 ? 1.0 : -1.0) *
+      (RandUniform(state) <= fabs(post)/prune_thresh ? prune_thresh : 0.0);
+}
+
+// returns log(exp(x) + exp(y)).
+// Numerically stable log-sum-exp of two values: works entirely in the log
+// domain, so neither exp(x) nor exp(y) needs to be representable.
+inline double LogAdd(double x, double y) {
+  double diff;
+
+  if (x < y) {
+    diff = x - y;
+    x = y;
+  } else {
+    diff = y - x;
+  }
+  // diff is negative. x is now the larger one.
+
+  if (diff >= kMinLogDiffDouble) {
+    double res;
+    res = x + Log1p(Exp(diff));
+    return res;
+  } else {
+    // exp(diff) would underflow; the smaller term is negligible.
+    return x; // return the larger one.
+  }
+}
+
+
+// returns log(exp(x) + exp(y)).  Single-precision version of the above.
+inline float LogAdd(float x, float y) {
+  float diff;
+
+  if (x < y) {
+    diff = x - y;
+    x = y;
+  } else {
+    diff = y - x;
+  }
+  // diff is negative. x is now the larger one.
+
+  if (diff >= kMinLogDiffFloat) {
+    float res;
+    res = x + Log1p(Exp(diff));
+    return res;
+  } else {
+    return x; // return the larger one.
+  }
+}
+
+
+// returns log(exp(x) - exp(y)).
+// Requires x > y; x == y returns log(0) = -infinity, and x < y raises
+// an error via KALDI_ERR.
+inline double LogSub(double x, double y) {
+  if (y >= x) { // Throws exception if y>=x.
+    if (y == x)
+      return kLogZeroDouble;
+    else
+      KALDI_ERR << "Cannot subtract a larger from a smaller number.";
+  }
+
+  double diff = y - x; // Will be negative.
+  double res = x + Log(1.0 - Exp(diff));
+
+  // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision
+  if (KALDI_ISNAN(res))
+    return kLogZeroDouble;
+  return res;
+}
+
+
+// returns log(exp(x) - exp(y)).  Single-precision version: requires x > y;
+// x == y returns log(0) = -infinity, and x < y raises an error.
+inline float LogSub(float x, float y) {
+  if (y >= x) { // Throws exception if y>=x.
+    if (y == x)
+      // Use the float constant (was kLogZeroDouble): the value (-inf) is
+      // identical after conversion, but this matches the return type and
+      // the double overload's pattern.
+      return kLogZeroFloat;
+    else
+      KALDI_ERR << "Cannot subtract a larger from a smaller number.";
+  }
+
+  float diff = y - x; // Will be negative.
+  float res = x + Log(1.0f - Exp(diff));
+
+  // res might be NAN if diff ~0.0, and 1.0-exp(diff) == 0 to machine precision
+  if (KALDI_ISNAN(res))
+    return kLogZeroFloat;
+  return res;
+}
+
+/// return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
+static inline bool ApproxEqual(float a, float b,
+                               float relative_tolerance = 0.001) {
+  // a==b handles infinities.
+  if (a == b) return true;
+  float diff = std::abs(a-b);
+  if (diff == std::numeric_limits<float>::infinity()
+      || diff != diff) return false; // diff is +inf or nan.
+  return (diff <= relative_tolerance*(std::abs(a)+std::abs(b)));
+}
+
+/// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b));
+/// crashes (via KALDI_ASSERT) if the two values are not approximately equal.
+static inline void AssertEqual(float a, float b,
+                               float relative_tolerance = 0.001) {
+  // a==b handles infinities.
+  KALDI_ASSERT(ApproxEqual(a, b, relative_tolerance));
+}
+
+
+// RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0.
+int32 RoundUpToNearestPowerOfTwo(int32 n);
+
+/// Returns a / b, rounding towards negative infinity in all cases.
+static inline int32 DivideRoundingDown(int32 a, int32 b) {
+  KALDI_ASSERT(b != 0);
+  if (a * b >= 0)
+    // Same signs: quotient is non-negative, so C++ truncation toward
+    // zero already equals rounding toward -infinity.
+    return a / b;
+  else if (a < 0)
+    // a < 0, b > 0: bias the numerator downward before truncating.
+    return (a - b + 1) / b;
+  else
+    // a > 0, b < 0: symmetric case.
+    return (a - b - 1) / b;
+}
+
+/// Returns the greatest common divisor of m and n (Euclid's algorithm).
+/// Crashes if both are zero; otherwise returns a positive value.
+template <class I> I Gcd(I m, I n) {
+  if (m == 0 || n == 0) {
+    if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors.
+      KALDI_ERR << "Undefined GCD since m = 0, n = 0.";
+    }
+    return (m == 0 ? (n > 0 ? n : -n) : ( m > 0 ? m : -m));
+    // return absolute value of whichever is nonzero
+  }
+  // could use compile-time assertion
+  // but involves messing with complex template stuff.
+  KALDI_ASSERT(std::numeric_limits<I>::is_integer);
+  while (1) {
+    m %= n;
+    if (m == 0) return (n > 0 ? n : -n);
+    n %= m;
+    if (n == 0) return (m > 0 ? m : -m);
+  }
+}
+
+/// Returns the least common multiple of two integers. Will
+/// crash unless the inputs are positive.
+template <class I> I Lcm(I m, I n) {
+  KALDI_ASSERT(m > 0 && n > 0);
+  I gcd = Gcd(m, n);
+  // Divide before multiplying to reduce the risk of overflow.
+  return gcd * (m/gcd) * (n/gcd);
+}
+
+
+/// Splits m into its prime factors, in sorted order from least to
+/// greatest, with duplication.  A very inefficient algorithm, mainly
+/// intended for use in the mixed-radix FFT computation (where we assume
+/// most factors are small).
+template <class I> void Factorize(I m, std::vector<I> *factors) {
+  KALDI_ASSERT(factors != NULL);
+  KALDI_ASSERT(m >= 1); // Doesn't work for zero or negative numbers.
+  factors->clear();
+  I small_factors[10] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 };
+
+  // First try small factors.
+  for (I i = 0; i < 10; i++) {
+    if (m == 1) return; // We're done.
+    while (m % small_factors[i] == 0) {
+      m /= small_factors[i];
+      factors->push_back(small_factors[i]);
+    }
+  }
+  // Next try all odd numbers starting from 31.
+  for (I j = 31;; j += 2) {
+    if (m == 1) return;
+    while (m % j == 0) {
+      m /= j;
+      factors->push_back(j);
+    }
+  }
+}
+
+// sqrt(x*x + y*y), delegating to the C library hypot()/hypotf().
+inline double Hypot(double x, double y) { return hypot(x, y); }
+inline float Hypot(float x, float y) { return hypotf(x, y); }
+
+
+
+
+} // namespace kaldi
+
+
+#endif // KALDI_BASE_KALDI_MATH_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
new file mode 100644
index 00000000..4fa8f224
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -0,0 +1,76 @@
+// base/kaldi-types.h
+
+// Copyright 2009-2011 Microsoft Corporation; Saarland University;
+// Jan Silovsky; Yanmin Qian
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_TYPES_H_
+#define KALDI_BASE_KALDI_TYPES_H_ 1
+
+namespace kaldi {
+// TYPEDEFS ..................................................................
+#if (KALDI_DOUBLEPRECISION != 0)
+typedef double BaseFloat;
+#else
+typedef float BaseFloat;
+#endif
+}
+
+#ifdef _MSC_VER
+#include <basetsd.h>
+#define ssize_t SSIZE_T
+#endif
+
+// we can do this a different way if some platform
+// we find in the future lacks stdint.h
+#include <stdint.h>
+
+// for discussion on what to do if you need compile kaldi
+// without OpenFST, see the bottom of this file
+#include <fst/types.h>
+
+namespace kaldi {
+ using ::int16;
+ using ::int32;
+ using ::int64;
+ using ::uint16;
+ using ::uint32;
+ using ::uint64;
+ typedef float float32;
+ typedef double double64;
+} // end namespace kaldi
+
+// In a theoretical case you decide compile Kaldi without the OpenFST
+// comment the previous namespace statement and uncomment the following
+/*
+namespace kaldi {
+ typedef int8_t int8;
+ typedef int16_t int16;
+ typedef int32_t int32;
+ typedef int64_t int64;
+
+ typedef uint8_t uint8;
+ typedef uint16_t uint16;
+ typedef uint32_t uint32;
+ typedef uint64_t uint64;
+ typedef float float32;
+ typedef double double64;
+} // end namespace kaldi
+*/
+
+#endif // KALDI_BASE_KALDI_TYPES_H_
diff --git a/speechx/speechx/kaldi/base/kaldi-utils.cc b/speechx/speechx/kaldi/base/kaldi-utils.cc
new file mode 100644
index 00000000..432da426b
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-utils.cc
@@ -0,0 +1,55 @@
+// base/kaldi-utils.cc
+// Copyright 2009-2011 Karel Vesely; Yanmin Qian; Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _WIN32_WINNT_WIN8
+#include <Synchapi.h>
+#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW)
+#include <windows.h>
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define snprintf _snprintf
+#endif /* _MSC_VER < 1900 */
+#else
+#include <unistd.h>
+#endif
+
+#include <string>
+#include "base/kaldi-common.h"
+
+
+namespace kaldi {
+
+// Renders a single character in a human-readable form for debugging:
+// printable characters as 'c', others as "[character N]" with the
+// numeric code.
+std::string CharToString(const char &c) {
+  char buf[20];
+  if (std::isprint(c))
+    snprintf(buf, sizeof(buf), "\'%c\'", c);
+  else
+    snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
+  return (std::string) buf;
+}
+
+// Portable sleep for a possibly fractional number of seconds.
+void Sleep(float seconds) {
+#if defined(_MSC_VER) || defined(MINGW)
+  ::Sleep(static_cast<int>(seconds * 1000.0));  // Win32 Sleep() takes ms.
+#elif defined(__CYGWIN__)
+  sleep(static_cast<int>(seconds));  // whole seconds only on Cygwin.
+#else
+  usleep(static_cast<unsigned int>(seconds * 1000000.0));  // microseconds.
+#endif
+}
+
+} // end namespace kaldi
diff --git a/speechx/speechx/kaldi/base/kaldi-utils.h b/speechx/speechx/kaldi/base/kaldi-utils.h
new file mode 100644
index 00000000..c9d6fd95
--- /dev/null
+++ b/speechx/speechx/kaldi/base/kaldi-utils.h
@@ -0,0 +1,155 @@
+// base/kaldi-utils.h
+
+// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation;
+// Saarland University; Karel Vesely; Yanmin Qian
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_UTILS_H_
+#define KALDI_BASE_KALDI_UTILS_H_ 1
+
+#if defined(_MSC_VER)
+# define WIN32_LEAN_AND_MEAN
+# define NOMINMAX
+# include <windows.h>
+#endif
+
+#ifdef _MSC_VER
+#include <stdio.h>
+#define unlink _unlink
+#else
+#include <unistd.h>
+#endif
+
+#include <limits>
+#include <string>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661)
+#if _MSC_VER < 1400
+#define __restrict__
+#else
+#define __restrict__ __restrict
+#endif
+#endif
+
+#if defined(_MSC_VER)
+# define KALDI_MEMALIGN(align, size, pp_orig) \
+ (*(pp_orig) = _aligned_malloc(size, align))
+# define KALDI_MEMALIGN_FREE(x) _aligned_free(x)
+#elif defined(__CYGWIN__)
+# define KALDI_MEMALIGN(align, size, pp_orig) \
+ (*(pp_orig) = aligned_alloc(align, size))
+# define KALDI_MEMALIGN_FREE(x) free(x)
+#else
+# define KALDI_MEMALIGN(align, size, pp_orig) \
+ (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL)
+# define KALDI_MEMALIGN_FREE(x) free(x)
+#endif
+
+#ifdef __ICC
+#pragma warning(disable: 383) // ICPC remark we don't want.
+#pragma warning(disable: 810) // ICPC remark we don't want.
+#pragma warning(disable: 981) // ICPC remark we don't want.
+#pragma warning(disable: 1418) // ICPC remark we don't want.
+#pragma warning(disable: 444) // ICPC remark we don't want.
+#pragma warning(disable: 869) // ICPC remark we don't want.
+#pragma warning(disable: 1287) // ICPC remark we don't want.
+#pragma warning(disable: 279) // ICPC remark we don't want.
+#pragma warning(disable: 981) // ICPC remark we don't want.
+#endif
+
+
+namespace kaldi {
+
+
+// CharToString prints the character in a human-readable form, for debugging.
+std::string CharToString(const char &c);
+
+
+// Returns nonzero on little-endian machines, zero on big-endian ones,
+// decided at runtime by inspecting the lowest-addressed byte of an int
+// whose value is 1.
+inline int MachineIsLittleEndian() {
+  int check = 1;
+  return (*reinterpret_cast<char*>(&check) != 0);
+}
+
+// This function kaldi::Sleep() provides a portable way
+// to sleep for a possibly fractional
+// number of seconds. On Windows it's only accurate to microseconds.
+void Sleep(float seconds);
+}
+
+// Reverses the byte order of an 8-byte value in place (endianness swap),
+// exchanging bytes 0<->7, 1<->6, 2<->5, 3<->4.
+#define KALDI_SWAP8(a) { \
+  int t = (reinterpret_cast<char*>(&a))[0];\
+          (reinterpret_cast<char*>(&a))[0]=(reinterpret_cast<char*>(&a))[7];\
+          (reinterpret_cast<char*>(&a))[7]=t;\
+      t = (reinterpret_cast<char*>(&a))[1];\
+          (reinterpret_cast<char*>(&a))[1]=(reinterpret_cast<char*>(&a))[6];\
+          (reinterpret_cast<char*>(&a))[6]=t;\
+      t = (reinterpret_cast<char*>(&a))[2];\
+          (reinterpret_cast<char*>(&a))[2]=(reinterpret_cast<char*>(&a))[5];\
+          (reinterpret_cast<char*>(&a))[5]=t;\
+      t = (reinterpret_cast<char*>(&a))[3];\
+          (reinterpret_cast<char*>(&a))[3]=(reinterpret_cast<char*>(&a))[4];\
+          (reinterpret_cast<char*>(&a))[4]=t;}
+#define KALDI_SWAP4(a) { \
+ int t = (reinterpret_cast(&a))[0];\
+ (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\
+ (reinterpret_cast(&a))[3]=t;\
+ t = (reinterpret_cast(&a))[1];\
+ (reinterpret_cast(&a))[1]=(reinterpret_cast