From 30aba266930e84bc016f66457b3add4cee2054f4 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 19 May 2021 12:07:59 +0000
Subject: [PATCH 001/281] add align code

---
 deepspeech/exps/u2/model.py   |  63 +++++++++++++++++
 deepspeech/utils/ctc_utils.py |   6 +-
 deepspeech/utils/text_grid.py | 125 ++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 3 deletions(-)
 create mode 100644 deepspeech/utils/text_grid.py

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index f166a071e..6da0c3bd1 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -34,9 +34,11 @@ from deepspeech.models.u2 import U2Model
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.scheduler import WarmupLR
 from deepspeech.training.trainer import Trainer
+from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
+from deepspeech.utils import text_grid
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -483,6 +485,67 @@ class U2Tester(U2Trainer):
         except KeyboardInterrupt:
             sys.exit(-1)

+    @paddle.no_grad()
+    def align(self):
+        if self.config.decoding.batch_size > 1:
+            logger.fatal('alignment mode must be running with batch_size == 1')
+            sys.exit(1)
+
+        # xxx.align
+        assert self.args.result_file
+
+        self.model.eval()
+        logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}")
+
+        stride_ms = self.test_loader.dataset.stride_ms
+        token_dict = self.test_loader.dataset.vocab_list
+        with open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                key, feat, feats_length, target, target_length = batch
+                # 1. Encoder
+                encoder_out, encoder_mask = self.model._forward_encoder(
+                    feat, feats_length)  # (B, maxlen, encoder_dim)
+                maxlen = encoder_out.size(1)
+                ctc_probs = self.model.ctc.log_softmax(
+                    encoder_out)  # (1, maxlen, vocab_size)
+
+                # 2. alignment
+                ctc_probs = ctc_probs.squeeze(0)
+                target = target.squeeze(0)
+                alignment = ctc_utils.forced_align(ctc_probs, target)
+                logger.info(f"alignment: {alignment}")
+                fout.write('{} {}\n'.format(key[0], alignment))
+
+                # 3. gen praat
+                # segment alignment
+                align_segs = text_grid.segment_alignment(alignment)
+                logger.info(f"align_segs: {align_segs}")
+                # IntervalTier, List["start end token\n"]
+                subsample = get_subsample(self.config)
+                tierformat = text_grid.align_to_tierformat(
+                    align_segs, subsample, token_dict)
+                tier_path = os.path.join(
+                    os.path.dirname(self.args.result_file), key[0] + ".tier")
+                with open(tier_path, 'w') as f:
+                    f.writelines(tierformat)
+
+                textgrid_path = os.path.join(
+                    os.path.dirname(self.args.result_file),
+                    key[0] + ".TextGrid")
+                second_per_frame = 1. / (1000. / stride_ms
+                                         )  # 25ms window, 10ms stride
+                text_grid.generate_textgrid(
+                    maxtime=(len(alignment) + 1) * subsample * second_per_frame,
+                    intervals=tierformat,
+                    output=textgrid_path)
+
+    def run_align(self):
+        self.resume_or_scratch()
+        try:
+            self.align()
+        except KeyboardInterrupt:
+            sys.exit(-1)
+
     def load_inferspec(self):
         """infer model and input spec.
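For reference, the `<utt>.TextGrid` files this hunk writes can be inspected with the `textgrid` package (added to `requirements.txt` later in this series). A minimal sketch, assuming a hypothetical output file name:

```python
import textgrid

tg = textgrid.TextGrid()
tg.read("utt1.TextGrid")            # hypothetical output of align()
for interval in tg.tiers[0]:        # the single "ali" IntervalTier
    print(interval.minTime, interval.maxTime, interval.mark)
```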
diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py
index 73669fea6..76c1898be 100644
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -46,7 +46,7 @@ def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]:
     return new_hyp


-def insert_blank(label: np.ndarray, blank_id: int=0):
+def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray:
     """Insert blank token between every two label token.

     "abcdefg" -> "-a-b-c-d-e-f-g-"
@@ -67,7 +67,7 @@


 def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
-                 blank_id=0) -> list:
+                 blank_id=0) -> List[int]:
     """ctc forced alignment.

     https://distill.pub/2017/ctc/
@@ -77,7 +77,7 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
         y (paddle.Tensor): label id sequence tensor, 1d tensor (L)
         blank_id (int): blank symbol index
     Returns:
-        paddle.Tensor: best alignment result, (T).
+        List[int]: best alignment result, (T).
     """
     y_insert_blank = insert_blank(y, blank_id)
diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py
new file mode 100644
index 000000000..9afed89e0
--- /dev/null
+++ b/deepspeech/utils/text_grid.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict
+from typing import List
+from typing import Text
+
+import textgrid
+
+
+def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:
+    """Segment a ctc alignment id sequence at runs of blanks and at repeated labels.
+
+    Args:
+        alignment (List[int]): ctc alignment id sequence. e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]
+        blank_id (int, optional): blank id. Defaults to 0.
+
+    Returns:
+        List[List[int]]: segmented alignment id sequences. e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]
+    """
+    # Group the alignment into per-token segments; the segments are later
+    # exported to Praat ("doing phonetics by computer") TextGrid format,
+    # which is handy for inspecting alignments.
+    align_segs = []
+    # get frame-level duration for each token
+    start = 0
+    end = 0
+    while end < len(alignment):
+        while end < len(alignment) and alignment[end] == blank_id:  # blank
+            end += 1
+        if end == len(alignment):
+            align_segs[-1].extend(alignment[start:])
+            break
+        end += 1
+        while end < len(alignment) and alignment[end - 1] == alignment[
+                end]:  # repeat label
+            end += 1
+        align_segs.append(alignment[start:end])
+        start = end
+    return align_segs
+
+
+def align_to_tierformat(align_segs: List[List[int]],
+                        subsample: int,
+                        token_dict: Dict[int, Text],
+                        blank_id=0) -> List[Text]:
+    """Generate textgrid.Interval format from alignment segmentations.
+
+    Args:
+        align_segs (List[List[int]]): segmented ctc alignment ids.
+        subsample (int): encoder subsample rate relative to the feature frames (25ms frame_length, 10ms hop_length).
+        token_dict (Dict[int, Text]): int -> str map.
+
+    Returns:
+        List[Text]: list of textgrid.Interval.
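+
+    Example
+    -------
+    A rough sketch with a hypothetical token map and ``subsample=4``
+    (each alignment id then spans 4 * 10ms = 0.04s)::
+
+        align_segs = [[0, 0, 2], [3], [0, 5]]
+        token_dict = {2: "hello", 3: "world", 5: "<eos>"}
+        align_to_tierformat(align_segs, 4, token_dict)
+        # -> ["0.00 0.12 hello\n", "0.12 0.16 world\n", "0.16 0.24 <eos>\n"]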
+ """ + hop_length = 10 # ms + second_ms = 1000 # ms + frame_per_second = second_ms / hop_length # 25ms frame_length, 10ms hop_length + second_per_frame = 1.0 / frame_per_second + + begin = 0 + duration = 0 + tierformat = [] + + for idx, tokens in enumerate(align_segs): + token_len = len(tokens) + token = tokens[-1] + # time duration in second + duration = token_len * subsample * second_per_frame + if idx < len(align_segs) - 1: + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + else: + for i in tokens: + if i != blank_id: + token = i + break + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + begin = begin + duration + + return tierformat + + +def generate_textgrid(maxtime: float, + intervals: List[Text], + output: Text, + name: Text='ali') -> None: + """Create alignment textgrid file. + + Args: + maxtime (float): audio duartion. + intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item. + output (Text): textgrid filepath. + name (Text, optional): tier or layer name. Defaults to 'ali'. + """ + # Download Praat: https://www.fon.hum.uva.nl/praat/ + avg_interval = maxtime / (len(intervals) + 1) + print(f"average duration per {name}: {avg_interval}") + margin = 0.0001 + + tg = textgrid.TextGrid(maxTime=maxtime) + tier = textgrid.IntervalTier(name=name, maxTime=maxtime) + + i = 0 + for dur in intervals: + s, e, text = dur.split() + tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text) + + tg.append(tier) + + tg.write(output) + print("successfully generator textgrid {}.".format(output)) From 92381451fbdb7fdf56af531de9e7ca145d4df815 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 May 2021 12:08:06 +0000 Subject: [PATCH 002/281] format --- README.md | 2 +- deepspeech/frontend/normalizer.py | 3 ++- doc/src/asr_text_backend.md | 2 +- doc/src/benchmark.md | 1 - doc/src/chinese_syllable.md | 2 +- doc/src/dataset.md | 2 +- doc/src/feature_list.md | 2 +- doc/src/ngram_lm.md | 2 +- doc/src/praat_textgrid.md | 15 +++++++-------- doc/src/tools.md | 1 - doc/src/tts_text_front_end.md | 6 +++--- requirements.txt | 4 ++-- 12 files changed, 20 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index a2de1783a..424dc485e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ ## Features - See [feature list](doc/src/feature_list.md) for more information. + See [feature list](doc/src/feature_list.md) for more information. 
## Setup diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 6b224080b..287b51e58 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -179,7 +179,8 @@ class FeatureNormalizer(object): wav_number += batch_size if wav_number % 1000 == 0: - logger.info(f'process {wav_number} wavs,{all_number} frames.') + logger.info( + f'process {wav_number} wavs,{all_number} frames.') self.cmvn_info = { 'mean_stat': list(all_mean_stat.tolist()), diff --git a/doc/src/asr_text_backend.md b/doc/src/asr_text_backend.md index 879e56f8a..c3c9896c7 100644 --- a/doc/src/asr_text_backend.md +++ b/doc/src/asr_text_backend.md @@ -98,4 +98,4 @@ ## Text Filter -* 敏感词(黄暴、涉政、违法违禁等) \ No newline at end of file +* 敏感词(黄暴、涉政、违法违禁等) diff --git a/doc/src/benchmark.md b/doc/src/benchmark.md index f3af25552..9c1c86fd7 100644 --- a/doc/src/benchmark.md +++ b/doc/src/benchmark.md @@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L | 8 | 6.95 X | `utils/profile.sh` provides such a demo profiling tool, you can change it as need. - diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md index 676ecb531..fd5a6159a 100644 --- a/doc/src/chinese_syllable.md +++ b/doc/src/chinese_syllable.md @@ -48,4 +48,4 @@ ## Zhuyin * [Bopomofo](https://en.wikipedia.org/wiki/Bopomofo) -* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) \ No newline at end of file +* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) diff --git a/doc/src/dataset.md b/doc/src/dataset.md index d70d0e0d2..aaa805510 100644 --- a/doc/src/dataset.md +++ b/doc/src/dataset.md @@ -18,4 +18,4 @@ ### ASR Noise -* [asr-noises](https://github.com/speechio/asr-noises) \ No newline at end of file +* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/feature_list.md b/doc/src/feature_list.md index 57641d5ea..573669fa2 100644 --- a/doc/src/feature_list.md +++ b/doc/src/feature_list.md @@ -58,4 +58,4 @@ ### Grapheme To Phoneme * syallable -* phoneme \ No newline at end of file +* phoneme diff --git a/doc/src/ngram_lm.md b/doc/src/ngram_lm.md index 07aa5411c..119a3b21c 100644 --- a/doc/src/ngram_lm.md +++ b/doc/src/ngram_lm.md @@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified ``` build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm - ``` \ No newline at end of file + ``` diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md index c25c760ae..06c4f8791 100644 --- a/doc/src/praat_textgrid.md +++ b/doc/src/praat_textgrid.md @@ -76,7 +76,7 @@ pip3 install textgrid tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 ``` - tg.tiers属性: + tg.tiers属性: 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: ```text @@ -86,7 +86,7 @@ pip3 install textgrid Interval(1361.89250, 1362.01250, R), Interval(1362.01250, 1362.13250, AY1), Interval(1362.13250, 1362.16250, T), - + ... ] ) @@ -113,7 +113,7 @@ pip3 install textgrid Interval 可以理解为时长 ``` - + 2. 
textgrid库中的对象 **IntervalTier** 对象: @@ -148,7 +148,7 @@ pip3 install textgrid strict -- > 返回bool值, 表示是否严格TextGrid格式 ``` - ​ + ​ **PointTier** 对象: 方法 @@ -174,7 +174,7 @@ pip3 install textgrid name 返回name ``` - + **Point** 对象: 支持比较大小, 支持加减运算 @@ -185,7 +185,7 @@ pip3 install textgrid time: ``` - ​ + ​ **Interval** 对象: 支持比较大小, 支持加减运算 @@ -250,10 +250,9 @@ pip3 install textgrid grids: --> 返回读取的grids的列表 ``` - + ## Reference * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 * https://blog.csdn.net/duxin_csdn/article/details/88966295 - diff --git a/doc/src/tools.md b/doc/src/tools.md index 4ec09f6a2..5fcca9239 100644 --- a/doc/src/tools.md +++ b/doc/src/tools.md @@ -1,4 +1,3 @@ # Useful Tools * [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) - diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md index 6eb9ae5d9..9f2f91097 100644 --- a/doc/src/tts_text_front_end.md +++ b/doc/src/tts_text_front_end.md @@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese ### Tokenization -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. +**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary. @@ -31,7 +31,7 @@ These “tags” label the part of speech. There are 24 part of speech tags and ### Stop Words -In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. +In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data. 
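As a concrete illustration of the tokenization and stop-word passage above, a minimal sketch using `jieba` and `stopwordsiso` (both named in the surrounding text; the sample sentence is arbitrary):

```python
import jieba
import stopwordsiso

zh_stops = stopwordsiso.stopwords(["zh"])     # full Chinese stop-word list
tokens = jieba.lcut("我们今天一起去公园散步")  # tokenize into word strings
filtered = [tok for tok in tokens if tok not in zh_stops]
print(filtered)  # roughly the content words, e.g. 公园 / 散步
```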
@@ -188,4 +188,4 @@ TN: 基于规则的方法 ## Reference * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/) * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f) -* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb) \ No newline at end of file +* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb) diff --git a/requirements.txt b/requirements.txt index a6facb6cb..57a951bbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage pre-commit +pybind11 resampy==0.2.2 scipy==1.2.1 sentencepiece @@ -7,7 +8,6 @@ snakeviz SoundFile==0.9.0.post1 sox tensorboardX +textgrid typeguard yacs -pybind11 -textgrid From 3a76707062452d775330382ca1ad6e04b3483443 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 1 Jun 2021 08:32:41 +0000 Subject: [PATCH 003/281] rm useless --- doc/src/chinese_syllable.md | 51 ------- doc/src/dataset.md | 21 --- doc/src/praat_textgrid.md | 258 ---------------------------------- doc/src/tools.md | 3 - doc/src/tts_text_front_end.md | 191 ------------------------- 5 files changed, 524 deletions(-) delete mode 100644 doc/src/chinese_syllable.md delete mode 100644 doc/src/dataset.md delete mode 100644 doc/src/praat_textgrid.md delete mode 100644 doc/src/tools.md delete mode 100644 doc/src/tts_text_front_end.md diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md deleted file mode 100644 index fd5a6159a..000000000 --- a/doc/src/chinese_syllable.md +++ /dev/null @@ -1,51 +0,0 @@ -# chinese syllable - - - -## Syllable - -* [List of Syllables in Pinyin](https://resources.allsetlearning.com/chinese/pronunciation/syllable) - The word syllable is a term referring to the units of a word, composed on an (optional) initial, a final, and a tone. - - The word "syllable" is 音节 (yīnjié) in Chinese. - - Most spoken syllables in Mandarin Chinese correspond to one written Chinese character. - - There are a total of 410 common pinyin syllables. - -* [Rare syllable](https://resources.allsetlearning.com/chinese/pronunciation/Rare_syllable) - -* [Chinese Pronunciation: The Complete Guide for Beginner](https://www.digmandarin.com/chinese-pronunciation-guide.html) - -* [Mandarin Chinese Phonetics](http://www.zein.se/patrick/chinen8p.html) - -* [chinese phonetics](https://www.easymandarin.cn/online-chinese-lessons/chinese-phonetics/) - Chinese Characters, called “Hanzi”, are the writing symbols of the Chinese language. - Pinyin is the Romanization of a phonetic notation for Chinese Characters. - Each syllable is composed of three parts: initials, finals, and tones. - In the Pinyin system there are 23 initials, 24 finals, 4 tones and a neutral tone. 
- - - -## Pinyin -* [Pinyin](https://en.wikipedia.org/wiki/Pinyin) -* [Pinyin quick start guide](https://resources.allsetlearning.com/chinese/pronunciation/Pinyin_quick_start_guide) -* [Pinyin Table](https://en.wikipedia.org/wiki/Pinyin_table) -* [Piyin Chat](https://resources.allsetlearning.com/chinese/pronunciation/Pinyin_chart) -* [Mandarin Chinese Pinyin Table](https://www.archchinese.com/chinese_pinyin.html) -* [Chinese Pinyin Table ](http://www.quickmandarin.com/chinesepinyintable/) - - - -## Tones -* [Four tones](https://resources.allsetlearning.com/chinese/pronunciation/Four_tones) -* [Neutral tone](https://resources.allsetlearning.com/chinese/pronunciation/Neutral_tone) -* [Where do the tone marks go?](http://www.pinyin.info/rules/where.html) -* [声调符号标在哪儿?](http://www.hwjyw.com/resource/content/2010/06/04/8183.shtml) - - - -## Zhuyin - -* [Bopomofo](https://en.wikipedia.org/wiki/Bopomofo) -* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) diff --git a/doc/src/dataset.md b/doc/src/dataset.md deleted file mode 100644 index aaa805510..000000000 --- a/doc/src/dataset.md +++ /dev/null @@ -1,21 +0,0 @@ -# Dataset - -## Text - -* [Tatoeba](https://tatoeba.org/cmn) - - **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition. - - - -## Speech - -* [Tatoeba](https://tatoeba.org/cmn) - - **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition. - - - -### ASR Noise - -* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md deleted file mode 100644 index 06c4f8791..000000000 --- a/doc/src/praat_textgrid.md +++ /dev/null @@ -1,258 +0,0 @@ -# Praat and TextGrid - -* [**Praat: doing phonetics by computer**](https://www.fon.hum.uva.nl/praat/) -* [TextGrid](https://github.com/kylebgorman/textgrid) - -## Praat - -**Praat语音学软件**,原名**Praat: doing phonetics by computer**,通常简称**Praat**,是一款[跨平台](https://zh.wikipedia.org/wiki/跨平台)的多功能[语音学](https://zh.wikipedia.org/wiki/语音学)专业[软件](https://zh.wikipedia.org/wiki/软件),主要用于对[数字化](https://zh.wikipedia.org/wiki/数字化)的[语音](https://zh.wikipedia.org/wiki/语音)[信号](https://zh.wikipedia.org/wiki/信号)进行[分析](https://zh.wikipedia.org/w/index.php?title=语音分析&action=edit&redlink=1)、标注、[处理](https://zh.wikipedia.org/wiki/数字信号处理)及[合成](https://zh.wikipedia.org/wiki/语音合成)等实验,同时生成各种[语图](https://zh.wikipedia.org/w/index.php?title=语图&action=edit&redlink=1)和文字报表。 - - - - - - - -## TextGrid - -### TextGrid文件结构 - -```text -第一行是固定的:File type = "ooTextFile" -第二行也是固定的:Object class = "TextGrid" -空一行 -xmin = xxxx.xxxx  # 表示开始时间 -xmax = xxxx.xxxx  # 表示结束时间 -tiers?  # 这一行固定 -size = 4  # 表示这个文件有几个item, item也叫tiers, 可以翻译为'层', 这个值是几,就表示有几个item -item []: -    item [1]: -        class = "IntervalTier" -        name = "phone" -        xmin = 1358.8925 -        xmax = 1422.5525 -        intervals: size = 104 -        intervals [1]: -            xmin = 1358.8925 -            xmax = 1361.8925 -            text = "sil" -        intervals [2]: -            xmin = 1361.8925 -            xmax = 1362.0125 -            text = "R" -        intervals [3]: -            ... 
-        intervals [104]: -            xmin = 1422.2325 -            xmax = 1422.5525 -            text = "sil" -    item [2]: -        class = "IntervalTier" -        name = "word" -        xmin = 1358.8925 -        xmax = 1422.5525 -        intervals: size = 3 -        intervals [1]: -            xmin = 1358.8925 -            xmax = 1361.8925 -            text = "sp" -``` - -textgrid 文件中的 size 的值是几就表示有几个 item, 每个 item 下面包含 class, name, xmin, xmax, intervals 的键值对,item 中的 intervals: size 是几就表示这个 item 中有几个 intervals,每个 intervals 有 xmin, xmax, text 三个键值参数。所有 item 中的 xmax - xmin 的值是一样的。 - -### 安装 - -```python -pip3 install textgrid -``` - -### 使用 - -1. 读一个textgrid文件 - - ```python - import textgrid - tg = textgrid.TextGrid() - tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 - ``` - - tg.tiers属性: - 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: - - ```text - [IntervalTier( - phone, [ - Interval(1358.89250, 1361.89250, sil), - Interval(1361.89250, 1362.01250, R), - Interval(1362.01250, 1362.13250, AY1), - Interval(1362.13250, 1362.16250, T), - - ... - ] - ) - ] - ``` - - 此外, tg.tiers[0] 表示第一个 IntervalTier, 支持继续用中括号取序列, '.'来取属性. - 比如: - - ```text - tg.tiers[0][0].mark --> 'sil' - tg.tiers[0].name --> 'phone' - tg.tiers[0][0].minTime --> 1358.8925 - tg.tiers[0].intervals --> [Interval(1358.89250, 1361.89250, sil), ..., Interval(1422.23250, 1422.55250, sil)] - tg.tiers[0].maxTime --> 1422.55250 - ``` - - TextGrid 模块中包含四种对象 - - ``` - PointTier 可以理解为标记(点)的集合 - IntervalTier 可以理解为时长(区间)的集合 - Point 可以理解为标记 - Interval 可以理解为时长 - ``` - - - -2. textgrid库中的对象 - **IntervalTier** 对象: - 方法 - - ``` - add(minTime, maxTime, mark): 添加一个标记,需要同时传入起止时间, 和mark的名字. - addInterval(interval): 添加一个Interval对象, 该Interval对象中已经封装了起止时间. - remove(minTime, maxTime, mark): 删除一个Interval - removeInterval(interval): 删除一个Interval - indexContaining(time): 传入时间或Point对象, 返回包含该时间的Interval对象的下标 - 例如: - print(tg[0].indexContaining(1362)) --> 1 - 表示tg[0] 中包含1362时间点的是 下标为1的 Interval 对象 - intervalContaining(): 传入时间或Point对象, 返回包含该时间的Interval对象 - 例如 - print(tg[0].intervalContaining(1362)) --> Interval(1361.89250, 1362.01250, R) - read(f): f是文件对象, 读一个TextGrid文件 - write(f): f是文件对象, 写一个TextGrid文件 - fromFile(f_path): f_path是文件路径, 从一个文件读 - bounds(): 返回一个元组, (minTime, maxTime) - ``` - - - 属性 - - ``` - intervals --> 返回所有的 interval 的列表 - maxTime --> 返回 number(decimal.Decimal)类型, 表示结束时间 - minTime --> 返回 number(decimal.Decimal)类型, 表示开始时间 - name --> 返回字符串 - strict -- > 返回bool值, 表示是否严格TextGrid格式 - ``` - - ​ - - **PointTier** 对象: - 方法 - - ``` - add(minTime, maxTime, mark): 添加一个标记,需要同时传入起止时间, 和mark的名字. - addPoint(point): 添加一个Point对象, 该Point对象中已经封装了起止时间. 
- remove(time, mark): 删除一个 point, 传入时间和mark - removePoint(point): 删除一个 point, 传入point对象 - read(f): 读, f是文件对象 - write(f): 写, f是文件对象 - fromFile(f_path): f_path是文件路径, 从一个文件读 - bounds(): 返回一个元组, (minTime, maxTime) - ``` - - - 属性 - - ``` - points 返回所有的 point 的列表 - maxTime 和IntervalTier一样, 返回结束时间 - minTime 和IntervalTier一样, 返回开始时间 - name 返回name - ``` - - - - **Point** 对象: - 支持比较大小, 支持加减运算 - 属性: - - ``` - mark: - time: - ``` - - ​ - - **Interval** 对象: - 支持比较大小, 支持加减运算 - 支持 in, not in 的运算 - 方法: - - ``` - duration(): 返回number 类型, 表示这个Interval的持续时间 - bounds(): --> 返回元组, (minTime, maxTime) - overlaps(Interval): --> 返回bool值, 判断本Interval的时间和传入的的Interval的时间是否重叠, 是返回True - ``` - - 属性: - - ``` - mark - maxTime - minTime - strick: --> 返回bool值, 判断格式是否严格的TextGrid格式 - ``` - - **TextGrid** 对象: - 支持列表的取值,复制, 迭代, 求长度, append, extend, pop方法 - 方法: - - ``` - getFirst(tierName) 返回第一个名字为tierName的tier - getList(tierName) 返回名字为tierName的tier的列表 - getNames() 返回所有tier的名字的列表 - append(tier) 添加一个tier作为其中的元素 - extend(tiers) 添加多个tier作为其中的元素 - pop(tier) 删除一个tier - read(f) f是文件对象 - write(f) f是文件对象 - fromFile(f_path) f_path是文件路径 - ``` - - 属性: - - ``` - maxTime - minTime - name - strict - tiers 返回所有tiers的列表 - ``` - - **MLF** 对象 - MLF('xxx.mlf') - 'xxx.mlf'为mlf格式的文件, - 读取hvite-o sm生成的htk.mlf文件并将其转换为 TextGrid的列表 - 方法: - - ``` - read(f) f是文件对象 - write(prefix='') prefix是写出路径的前缀,可选 - ``` - - 属性: - - ``` - grids: --> 返回读取的grids的列表 - ``` - - - -## Reference - -* https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 -* https://blog.csdn.net/duxin_csdn/article/details/88966295 diff --git a/doc/src/tools.md b/doc/src/tools.md deleted file mode 100644 index 5fcca9239..000000000 --- a/doc/src/tools.md +++ /dev/null @@ -1,3 +0,0 @@ -# Useful Tools - -* [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md deleted file mode 100644 index 9f2f91097..000000000 --- a/doc/src/tts_text_front_end.md +++ /dev/null @@ -1,191 +0,0 @@ -# Text Front End - - - -## Text Segmentation - -There are various libraries including some of the most popular ones like NLTK, Spacy, Stanford CoreNLP that that provide excellent, easy to use functions for sentence segmentation. - -* https://github.com/bminixhofer/nnsplit -* [DeepSegment](https://github.com/notAI-tech/deepsegment) [blog](http://bpraneeth.com/projects/deepsegment) [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d) [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f) - - - -## Text Normalization(文本正则) - -The **basic preprocessing steps** that occur in English NLP, including data cleaning, stemming/lemmatization, tokenization and stop words. **not all of these steps are necessary for Chinese text data!** - -### Lexicon Normalization - -There’s a concept similar to stems in this language, and they’re called Radicals. **Radicals are basically the building blocks of Chinese characters.** All Chinese characters are made up of a finite number of components which are put together in different orders and combinations. Radicals are usually the leftmost part of the character. 
There are around 200 radicals in Chinese, and they are used to index and categorize characters. - -Therefore, procedures like stemming and lemmatization are not useful for Chinese text data because seperating the radicals would **change the word’s meaning entirely**. - -### Tokenization - -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. - -These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary. - - - -### Stop Words - -In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. - -Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data. - -```python -!pip install stopwordsiso -import stopwordsiso -from stopwordsiso import stopwords -stopwords(["zh"]) # Chinese -``` - - - -文本正则化 文本正则化主要是讲非标准词(NSW)进行转化,比如: - -数字、电话号码: 10086 -> 一千零八十六/幺零零八六 -时间,比分: 23:20 -> 二十三点二十分/二十三比二十 -分数、小数、百分比: 3/4 -> 四分之三,3.24 -> 三点一四, 15% -> 百分之十五 -符号、单位: ¥ -> 元, kg -> 千克 -网址、文件后缀: www. -> 三W点 - -* https://github.com/google/re2 - -* https://github.com/speechio/chinese_text_normalization - -* [vinorm](https://github.com/NoahDrisort/vinorm) [cpp_verion](https://github.com/NoahDrisort/vinorm_cpp_version) - - Python package for text normalization, use for frontend of Text-to-speech Reseach - -* https://github.com/candlewill/CNTN - - This is a ChiNese Text Normalization (CNTN) tool for Text-to-speech system, which is based on [sparrowhawk](https://github.com/google/sparrowhawk). 
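The tools listed above all follow the rule-plus-dictionary pattern that patch 004 later adds under `third_party/text_processing`. A minimal, self-contained sketch of that pattern; the digit-by-digit reading here is a deliberate simplification (real rules also handle place values like 十/百/千):

```python
import re

# Digit-by-digit verbalization table (simplified sketch).
DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

RE_PERCENT = re.compile(r"(\d+)%")

def replace_percent(match: re.Match) -> str:
    # "15%" -> "百分之一五" under this simplified reading
    return "百分之" + "".join(DIGITS[d] for d in match.group(1))

print(RE_PERCENT.sub(replace_percent, "完成了15%"))  # -> 完成了百分之一五
```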
- - - -## Word Segmentation(分词) - -分词之所以重要可以通过这个例子来说明: -广州市长隆马戏欢迎你 -> 广州市 长隆 马戏 欢迎你 -如果没有分词错误会导致句意完全不正确:  -广州 市长 隆马戏 欢迎你 - -分词常用方法分为最大前向匹配(基于字典)和基于CRF的分词方法。用CRF的方法相当于是把这个任务转换成了序列标注,相比于基于字典的方法好处是对于歧义或者未登录词有较强的识别能力,缺点是不能快速fix bug,并且性能略低于词典。 - - -中文分词的常见工具: -* https://github.com/lancopku/PKUSeg-python -* https://github.com/thunlp/THULAC-Python -* https://github.com/fxsjy/jieba -* CRF++ -* https://github.com/isnowfy/snownlp - -### MMSEG -* [MMSEG: A Word Identification System for Mandarin Chinese Text Based on Two Variants of the Maximum Matching Algorithm](http://technology.chtsai.org/mmseg/) -* [`中文分词`简单高效的MMSeg](https://www.cnblogs.com/en-heng/p/5872308.html) -* [mmseg分词算法及实现](https://blog.csdn.net/daniel_ustc/article/details/50488040) -* [Mmseg算法](https://www.jianshu.com/p/e4ae8d194487) -* [浅谈中文分词](http://www.isnowfy.com/introduction-to-chinese-segmentation/) - -* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git) -* [ustcdane/mmseg](https://github.com/ustcdane/mmseg) -* [jkom-cloud/mmseg](https://github.com/jkom-cloud/mmseg) - - -### CScanner -* [CScanner - A Chinese Lexical Scanner](http://technology.chtsai.org/cscanner/) - - - -## Part of Speech(词性预测) - -词性解释 -n/名词 np/人名 ns/地名 ni/机构名 nz/其它专名 -m/数词 q/量词 mq/数量词 t/时间词 f/方位词 s/处所词 -v/动词 a/形容词 d/副词 h/前接成分 k/后接成分 -i/习语 j/简称 r/代词 c/连词 p/介词 u/助词 y/语气助词 -e/叹词 o/拟声词 g/语素 w/标点 x/其它 - - - -## G2P(注音) - -注音是需要将词转换成对应的发音,对于中文是将其转换成拼音,比如 绿色->(lv4 se4) 这里的数字表示声调。 - -传统方法是使用字典,但是对于未登录词就很难解决。基于模型的方法是使用 [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus)。 论文可以参考 - WFST-based Grapheme-to-Phoneme Conversion: Open Source Tools for Alignment, Model-Building and Decoding - -当然这个问题也可以看做是序列标注用CRF或者基于神经网络的模型都可以做。 基于神经网络工具: [g2pM](https://github.com/kakaobrain/g2pM)。 - - - - -## Prosody(韵律预测) - -ToBI(an abbreviation of tones and break indices) is a set of conventions for transcribing and annotating the prosody of speech. 中文主要关注break。 - - -韵律等级结构: - -音素 -> 音节 -> 韵律词(Prosody Word, PW) -> 韵律短语(prosody phrase, PPH) -> 语调短句(intonational phrase, IPH) -> 子句子 -> 主句子 -> 段落 -> 篇章 -LP -> LO -> L1(#1) -> L2(#2) -> L3(#3) -> L4(#4) -> L5 -> L6 -> L7 -主要关注 PW, PPH, IPH - -| | 停顿时长 | 前后音高特征 | -| --- | ----------| --- | -| 韵律词边界 | 不停顿或从听感上察觉不到停顿 | 无 | -| 韵律短语边界 | 可以感知停顿,但无明显的静音段 | 音高不下倾或稍下倾,韵末不可做句末 | -| 语调短语边界 | 有较长停顿 | 音高下倾比较完全,韵末可以作为句末 | - -常用方法使用的是级联CRF,首先预测如果是PW,再继续预测是否是PPH,再预测是否是IPH - - - -论文: 2015 .Ding Et al. - Automatic Prosody Prediction For Chinese Speech Synthesis Using BLSTM-RNN and Embedding Features - - - -## Polyphone(多音字) - - - -## Linguistic Features(语言学特征) - - - -## 基于神经网络的前端文本分析模型 - -最近这两年基本都是基于 BERT,所以这里记录一下相关的论文: - -- g2p: 2019. Sevinj Et al. Transformer based Grapheme-to-Phoneme Conversion -- 分词: 2019 huang Et al. - Toward Fast and Accurate Neural Chinese Word Segmentation with Multi-Criteria Learning -- 韵律: 2020 Zhang Et al. 
- Chinese Prosodic Structure Prediction Based on a Pretrained Language Representation Model
-除此之外,BLSTM + CRF 也比较主流。
-
-
-
-## 总结
-
-总结一下,文本分析各个模块的方法:
-
-TN: 基于规则的方法
-
-分词: 字典/CRF/BLSTM+CRF/BERT
-
-注音: ngram/CRF/BLSTM/seq2seq
-
-韵律: CRF/BLSTM + CRF/ BERT
-
-
-
-考虑到分词,注音,韵律都是基于序列标注任务,所以理论上来说可以通过一个模型搞定。
-
-
-
-## Reference
-* [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
-* [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginner's Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)

From 7779f33e7410e7d132e9e5f61f7b2967f8c76154 Mon Sep 17 00:00:00 2001
From: Feiyu Chan
Date: Fri, 4 Jun 2021 20:35:54 +0800
Subject: [PATCH 004/281] add text normalization example

---
 examples/text_normalization/README.md         |   3 +
 .../text_normalization/data/sentences.txt     |  26 +++
 .../local/test_normalization.py               |  14 ++
 examples/text_normalization/path.sh           |   8 +
 examples/text_normalization/run.sh            |  27 +++
 third_party/text_processing/__init__.py       |   0
 .../text_processing/normalization/__init__.py |  42 +++++
 .../normalization/char_convert.py             |  14 ++
 .../normalization/chronology.py               |  63 +++++++
 .../normalization/constants.py                |  57 +++++++
 .../text_processing/normalization/num.py      | 154 ++++++++++++++++++
 .../text_processing/normalization/phone.py    |  30 ++++
 .../normalization/quantifier.py               |  17 ++
 .../normalization/sentence_split.py           |  22 +++
 14 files changed, 477 insertions(+)
 create mode 100644 examples/text_normalization/README.md
 create mode 100644 examples/text_normalization/data/sentences.txt
 create mode 100644 examples/text_normalization/local/test_normalization.py
 create mode 100644 examples/text_normalization/path.sh
 create mode 100755 examples/text_normalization/run.sh
 create mode 100644 third_party/text_processing/__init__.py
 create mode 100644 third_party/text_processing/normalization/__init__.py
 create mode 100644 third_party/text_processing/normalization/char_convert.py
 create mode 100644 third_party/text_processing/normalization/chronology.py
 create mode 100644 third_party/text_processing/normalization/constants.py
 create mode 100644 third_party/text_processing/normalization/num.py
 create mode 100644 third_party/text_processing/normalization/phone.py
 create mode 100644 third_party/text_processing/normalization/quantifier.py
 create mode 100644 third_party/text_processing/normalization/sentence_split.py

diff --git a/examples/text_normalization/README.md b/examples/text_normalization/README.md
new file mode 100644
index 000000000..108bbf107
--- /dev/null
+++ b/examples/text_normalization/README.md
@@ -0,0 +1,3 @@
+# Regular expression based text normalization for Chinese
+
+For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
\ No newline at end of file
diff --git a/examples/text_normalization/data/sentences.txt b/examples/text_normalization/data/sentences.txt
new file mode 100644
index 000000000..d15bfe46b
--- /dev/null
+++ b/examples/text_normalization/data/sentences.txt
@@ -0,0 +1,26 @@
+今天的最低气温达到-10°C.
+只要有33/4的人同意,就可以通过决议。
+1945年5月2日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。
+4月16日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。
+如果剩下的30.6%是过去,那么还有69.4%.
+事情发生在2020/03/31的上午8:00.
+警方正在找一支.22口径的手枪。 +欢迎致电中国联通,北京2022年冬奥会官方合作伙伴为您服务 +充值缴费请按1,查询话费及余量请按2,跳过本次提醒请按井号键。 +快速解除流量封顶请按星号键,腾讯王卡产品介绍、使用说明、特权及活动请按9,查询话费、套餐余量、积分及活动返款请按1,手机上网流量开通及取消请按2,查询本机号码及本号所使用套餐请按4,密码修改及重置请按5,紧急开机请按6,挂失请按7,查询充值记录请按8,其它自助服务及人工服务请按0 +智能客服助理快速查话费、查流量请按9,了解北京联通业务请按1,宽带IPTV新装、查询请按2,障碍报修请按3,充值缴费请按4,投诉建议请按5,政企业务请按7,人工服务请按0,for english severice press star key +您的帐户当前可用余额为63.89元,本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。 +您的帐户当前可用余额为负15.5元,本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。 +尊敬的客户,您目前的话费余额为负14.60元,已低于10元,为保证您的通信畅通,请及时缴纳费用。 +您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。 +您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您? +您的账户当前可用余额为负36.00元,本月消费36.00元。 +请问你是电话13985608526的机主吗? +如您对处理结果不满意,可拨打中国联通集团投诉电话10015进行投诉,按本地通话费收费,返回自助服务请按井号键 +“26314”号VIP客服代表为您服务。 +尊敬的5G用户,欢迎您致电中国联通 +首先是应用了M1芯片的iPad Pro,新款的iPad Pro支持5G,这也是苹果的第二款5G产品线。 +除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。 +屏幕方面,iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏,支持HDR10、杜比视界,还支持杜比全景声。 +iPad Pro的秒控键盘这次也推出白色版本。 +售价方面,11英寸版本售价799美元起,12.9英寸售价1099美元起。 diff --git a/examples/text_normalization/local/test_normalization.py b/examples/text_normalization/local/test_normalization.py new file mode 100644 index 000000000..38a38460e --- /dev/null +++ b/examples/text_normalization/local/test_normalization.py @@ -0,0 +1,14 @@ +import argparse +from text_processing import normalization + +parser = argparse.ArgumentParser(description="Normalize text in Chinese with some rules.") +parser.add_argument("input", type=str, help="the input sentences") +parser.add_argument("output", type=str, help="path to save the output file.") +args = parser.parse_args() + +with open(args.input, 'rt') as fin: + with open(args.output, 'wt') as fout: + for sent in fin: + sent = normalization.normalize_sentence(sent.strip()) + fout.write(sent) + fout.write('\n') diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh new file mode 100644 index 000000000..c8b1f1c2f --- /dev/null +++ b/examples/text_normalization/path.sh @@ -0,0 +1,8 @@ +export MAIN_ROOT=${PWD}/../../ + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}# diff --git a/examples/text_normalization/run.sh b/examples/text_normalization/run.sh new file mode 100755 index 000000000..b39de2a20 --- /dev/null +++ b/examples/text_normalization/run.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +source path.sh + +stage=-1 +stop_stage=100 + +exp_dir=exp +data_dir=data +filename="sentences.txt" + +source ${MAIN_ROOT}/utils/parse_options.sh || exit -1 + +mkdir -p ${exp_dir} + + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + echo "stage 1: Processing " + python3 local/test_normalization.py ${data_dir}/${filename} ${exp_dir}/normalized.txt + if [ -f "${exp_dir}/normalized.txt" ]; then + echo "Normalized text save at ${exp_dir}/normalized.txt" + fi + # TODO(chenfeiyu): compute edit distance against ground-truth +fi + +echo "done" +exit 0 diff --git a/third_party/text_processing/__init__.py b/third_party/text_processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/text_processing/normalization/__init__.py b/third_party/text_processing/normalization/__init__.py new file mode 100644 index 000000000..0b4f0e7f8 --- /dev/null +++ b/third_party/text_processing/normalization/__init__.py @@ -0,0 +1,42 @@ +from .sentence_split import split +from .num import RE_NUMBER, RE_FRAC, 
RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM +from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num + +from .chronology import RE_TIME, RE_DATE, RE_DATE2 +from .chronology import replace_time, replace_date, replace_date2 + +from .quantifier import RE_TEMPERATURE +from .quantifier import replace_temperature + +from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone + +from .char_convert import tranditional_to_simplified +from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE + + +def normalize_sentence(sentence): + # basic character conversions + sentence = tranditional_to_simplified(sentence) + sentence = sentence.translate(F2H_ASCII_LETTERS).translate( + F2H_DIGITS).translate(F2H_SPACE) + + # number related NSW verbalization + sentence = RE_DATE.sub(replace_date, sentence) + sentence = RE_DATE2.sub(replace_date2, sentence) + sentence = RE_TIME.sub(replace_time, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) + sentence = RE_RANGE.sub(replace_range, sentence) + sentence = RE_FRAC.sub(replace_frac, sentence) + sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) + sentence = RE_MOBILE_PHONE.sub(replace_phone, sentence) + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) + sentence = RE_NUMBER.sub(replace_number, sentence) + + return sentence + + +def normalize(text): + sentences = split(text) + sentences = [normalize_sentence(sent) for sent in sentences] + return sentences diff --git a/third_party/text_processing/normalization/char_convert.py b/third_party/text_processing/normalization/char_convert.py new file mode 100644 index 000000000..1c035a80e --- /dev/null +++ b/third_party/text_processing/normalization/char_convert.py @@ -0,0 +1,14 @@ +"""Traditional and simplified Chinese conversion with +`opencc `_. +""" + +import opencc + +_t2s_converter = opencc.OpenCC("t2s.json") +_s2t_converter = opencc.OpenCC('s2t.json') + +def tranditional_to_simplified(text: str) -> str: + return _t2s_converter.convert(text) + +def simplified_to_traditional(text: str) -> str: + return _s2t_converter.convert(text) \ No newline at end of file diff --git a/third_party/text_processing/normalization/chronology.py b/third_party/text_processing/normalization/chronology.py new file mode 100644 index 000000000..727bbd650 --- /dev/null +++ b/third_party/text_processing/normalization/chronology.py @@ -0,0 +1,63 @@ +import re +from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS + +def _time_num2str(num_string: str) -> str: + """A special case for verbalizing number in time.""" + result = num2str(num_string.lstrip('0')) + if num_string.startswith('0'): + result = DIGITS['0'] + result + return result + +# 时刻表达式 +RE_TIME = re.compile( + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' +) +def replace_time(match: re.Match) -> str: + hour = match.group(1) + minute = match.group(2) + second = match.group(4) + + result = f"{num2str(hour)}点" + if minute.lstrip('0'): + result += f"{_time_num2str(minute)}分" + if second and second.lstrip('0'): + result += f"{_time_num2str(second)}秒" + return result + + +RE_DATE = re.compile( + r'(\d{4}|\d{2})年' + r'((0?[1-9]|1[0-2])月)?' + r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?' 
+) +def replace_date(match: re.Match) -> str: + year = match.group(1) + month = match.group(3) + day = match.group(5) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}{match.group(9)}" + return result + +# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 +RE_DATE2 = re.compile( + r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])' +) +def replace_date2(match: re.Match) -> str: + year = match.group(1) + month = match.group(3) + day = match.group(4) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}日" + return result \ No newline at end of file diff --git a/third_party/text_processing/normalization/constants.py b/third_party/text_processing/normalization/constants.py new file mode 100644 index 000000000..bbfccb67b --- /dev/null +++ b/third_party/text_processing/normalization/constants.py @@ -0,0 +1,57 @@ +import string +import re +from pypinyin.constants import SUPPORT_UCS4 + +# 全角半角转换 +# 英文字符全角 -> 半角映射表 (num: 52) +F2H_ASCII_LETTERS = { + chr(ord(char) + 65248): char + for char in string.ascii_letters +} + +# 英文字符半角 -> 全角映射表 +H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} + +# 数字字符全角 -> 半角映射表 (num: 10) +F2H_DIGITS = { + chr(ord(char) + 65248): char + for char in string.digits +} +# 数字字符半角 -> 全角映射表 +H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} + +# 标点符号全角 -> 半角映射表 (num: 32) +F2H_PUNCTUATIONS = { + chr(ord(char) + 65248): char + for char in string.punctuation +} +# 标点符号半角 -> 全角映射表 +H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} + +# 空格 (num: 1) +F2H_SPACE = {'\u3000': ' '} +H2F_SPACE = {' ': '\u3000'} + +# 非"有拼音的汉字"的字符串,可用于NSW提取 +if SUPPORT_UCS4: + RE_NSW = re.compile( + r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] + r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] + r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] + r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] + r'])+' + ) +else: + RE_NSW = re.compile( # pragma: no cover + r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'])+' + ) diff --git a/third_party/text_processing/normalization/num.py b/third_party/text_processing/normalization/num.py new file mode 100644 index 000000000..9b8b0ab3a --- /dev/null +++ b/third_party/text_processing/normalization/num.py @@ -0,0 +1,154 @@ +""" +Rules to verbalize numbers into Chinese characters. 
https://zh.wikipedia.org/wiki/中文数字#現代中文
+"""
+import re
+from typing import List
+from collections import OrderedDict
+
+DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
+UNITS = OrderedDict({
+    1: '十',
+    2: '百',
+    3: '千',
+    4: '万',
+    8: '亿',
+})
+
+# 分数表达式
+RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
+def replace_frac(match: re.Match) -> str:
+    sign = match.group(1)
+    nominator = match.group(2)
+    denominator = match.group(3)
+    sign: str = "负" if sign else ""
+    nominator: str = num2str(nominator)
+    denominator: str = num2str(denominator)
+    result = f"{sign}{denominator}分之{nominator}"
+    return result
+
+
+# 百分数表达式
+RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
+def replace_percentage(match: re.Match) -> str:
+    sign = match.group(1)
+    percent = match.group(2)
+    sign: str = "负" if sign else ""
+    percent: str = num2str(percent)
+    result = f"{sign}百分之{percent}"
+    return result
+
+# 整数表达式
+# 带负号或者不带负号的整数 12, -10
+RE_INTEGER = re.compile(
+    r'(-?)'
+    r'(\d+)'
+)
+
+# 编号-无符号整形
+# 00078
+RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')
+def replace_default_num(match: re.Match):
+    number = match.group(0)
+    return verbalize_digit(number)
+
+# 数字表达式
+# 1. 整数: -10, 10;
+# 2. 浮点数: 10.2, -0.3
+# 3. 不带符号和整数部分的纯浮点数: .22, .38
+RE_NUMBER = re.compile(
+    r'(-?)((\d+)(\.\d+)?)'
+    r'|(\.(\d+))'
+)
+def replace_number(match: re.Match) -> str:
+    sign = match.group(1)
+    number = match.group(2)
+    pure_decimal = match.group(5)
+    if pure_decimal:
+        result = num2str(pure_decimal)
+    else:
+        sign: str = "负" if sign else ""
+        number: str = num2str(number)
+        result = f"{sign}{number}"
+    return result
+
+# 范围表达式
+# 12-23, 12~23
+RE_RANGE = re.compile(
+    r'(\d+)[-~](\d+)'
+)
+def replace_range(match: re.Match) -> str:
+    first, second = match.group(1), match.group(2)
+    first: str = num2str(first)
+    second: str = num2str(second)
+    result = f"{first}到{second}"
+    return result
+
+
+def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
+    stripped = value_string.lstrip('0')
+    if len(stripped) == 0:
+        return []
+    elif len(stripped) == 1:
+        if use_zero and len(stripped) < len(value_string):
+            return [DIGITS['0'], DIGITS[stripped]]
+        else:
+            return [DIGITS[stripped]]
+    else:
+        largest_unit = next(power for power in reversed(UNITS.keys())
+                            if power < len(stripped))
+        first_part = value_string[:-largest_unit]
+        second_part = value_string[-largest_unit:]
+        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
+            second_part)
+
+def verbalize_cardinal(value_string: str) -> str:
+    if not value_string:
+        return ''
+
+    # 000 -> '零' , 0 -> '零'
+    value_string = value_string.lstrip('0')
+    if len(value_string) == 0:
+        return DIGITS['0']
+
+    result_symbols = _get_value(value_string)
+    # verbalized number starting with '一十*' is abbreviated as `十*`
+    if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
+            '1'] and result_symbols[1] == UNITS[1]:
+        result_symbols = result_symbols[1:]
+    return ''.join(result_symbols)
+
+def verbalize_digit(value_string: str, alt_one=False) -> str:
+    result_symbols = [DIGITS[digit] for digit in value_string]
+    result = ''.join(result_symbols)
+    if alt_one:
+        result = result.replace("一", "幺")
+    return result
+
+def num2str(value_string: str) -> str:
+    integer_decimal = value_string.split('.')
+    if len(integer_decimal) == 1:
+        integer = integer_decimal[0]
+        decimal = ''
+    elif len(integer_decimal) == 2:
+        integer, decimal = integer_decimal
+    else:
+        raise ValueError(
+            f"The value string: '{value_string}' has more than one point in it."
+        )
+
+    result = verbalize_cardinal(integer)
+    decimal = decimal.rstrip('0')
+    if decimal:
+        # '.22' is verbalized as '点二二'
+        # '3.20' is verbalized as '三点二'
+        result += '点' + verbalize_digit(decimal)
+    return result
diff --git a/third_party/text_processing/normalization/phone.py b/third_party/text_processing/normalization/phone.py
new file mode 100644
index 000000000..e8bdecd75
--- /dev/null
+++ b/third_party/text_processing/normalization/phone.py
@@ -0,0 +1,30 @@
+import re
+from .num import verbalize_digit
+
+# 规范化固话/手机号码
+# 手机
+# http://www.jihaoba.com/news/show/13680
+# 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+# 联通:130、131、132、156、155、186、185、176
+# 电信:133、153、189、180、181、177
+# NOTE: the two patterns below were garbled in this copy of the patch; they
+# are reconstructed from the prefix list above and may differ from the
+# original source.
+RE_MOBILE_PHONE = re.compile(
+    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
+RE_TELEPHONE = re.compile(
+    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
+
+
+def phone2str(phone_string: str, mobile=True) -> str:
+    if mobile:
+        sp_parts = phone_string.strip('+').split()
+        result = ''.join(
+            [verbalize_digit(part, alt_one=True) for part in sp_parts])
+        return result
+    else:
+        sil_parts = phone_string.split('-')
+        result = ''.join(
+            [verbalize_digit(part, alt_one=True) for part in sil_parts])
+        return result
+
+
+def replace_phone(match: re.Match) -> str:
+    return phone2str(match.group(0))
diff --git a/third_party/text_processing/normalization/quantifier.py b/third_party/text_processing/normalization/quantifier.py
new file mode 100644
index 000000000..836fc88c2
--- /dev/null
+++ b/third_party/text_processing/normalization/quantifier.py
@@ -0,0 +1,17 @@
+import re
+from .num import num2str
+
+# 温度表达式,温度会影响负号的读法
+# -3°C 零下三度
+RE_TEMPERATURE = re.compile(
+    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
+)
+def replace_temperature(match: re.Match) -> str:
+    sign = match.group(1)
+    temperature = match.group(2)
+    unit = match.group(4)
+    sign: str = "零下" if sign else ""
+    temperature: str = num2str(temperature)
+    unit: str = "摄氏度" if unit == "摄氏度" else "度"
+    result = f"{sign}{temperature}{unit}"
+    return result
diff --git a/third_party/text_processing/normalization/sentence_split.py b/third_party/text_processing/normalization/sentence_split.py
new file mode 100644
index 000000000..451371da8
--- /dev/null
+++ b/third_party/text_processing/normalization/sentence_split.py
@@ -0,0 +1,22 @@
+import re
+from typing import List
+
+SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)')
+
+def split(text: str) -> List[str]:
+    """Split long text into sentences with sentence-splitting punctuations.
+
+    Parameters
+    ----------
+    text : str
+        The input text.
+
+    Returns
+    -------
+    List[str]
+        Sentences.
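+
+    Example
+    -------
+    A quick sketch::
+
+        split("你好!今天天气不错。出去走走?")
+        # -> ["你好!", "今天天气不错。", "出去走走?"]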
+ """ + text = SENTENCE_SPLITOR.sub(r'\1\n', text) + text = text.strip() + sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] + return sentences From ae92fa74982527868f72b2e48392bad7c439ab62 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Fri, 4 Jun 2021 21:00:57 +0800 Subject: [PATCH 005/281] format code --- .../local/test_normalization.py | 17 ++++++++++++++++- examples/text_normalization/path.sh | 1 - examples/text_normalization/run.sh | 1 - third_party/text_processing/__ini__.py | 1 + .../normalization/char_convert.py | 3 ++- .../text_processing/normalization/chronology.py | 3 ++- .../text_processing/normalization/constants.py | 1 + .../text_processing/normalization/num.py | 1 + .../text_processing/normalization/phone.py | 3 ++- .../text_processing/normalization/quantifier.py | 3 ++- .../normalization/sentence_split.py | 1 + 11 files changed, 28 insertions(+), 7 deletions(-) create mode 100644 third_party/text_processing/__ini__.py diff --git a/examples/text_normalization/local/test_normalization.py b/examples/text_normalization/local/test_normalization.py index 38a38460e..bcf7ee0da 100644 --- a/examples/text_normalization/local/test_normalization.py +++ b/examples/text_normalization/local/test_normalization.py @@ -1,7 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + from text_processing import normalization -parser = argparse.ArgumentParser(description="Normalize text in Chinese with some rules.") +parser = argparse.ArgumentParser( + description="Normalize text in Chinese with some rules.") parser.add_argument("input", type=str, help="the input sentences") parser.add_argument("output", type=str, help="path to save the output file.") args = parser.parse_args() diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh index c8b1f1c2f..7cec3a24d 100644 --- a/examples/text_normalization/path.sh +++ b/examples/text_normalization/path.sh @@ -1,5 +1,4 @@ export MAIN_ROOT=${PWD}/../../ - export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/text_normalization/run.sh b/examples/text_normalization/run.sh index b39de2a20..c4043a319 100755 --- a/examples/text_normalization/run.sh +++ b/examples/text_normalization/run.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - source path.sh stage=-1 diff --git a/third_party/text_processing/__ini__.py b/third_party/text_processing/__ini__.py new file mode 100644 index 000000000..8d1c8b69c --- /dev/null +++ b/third_party/text_processing/__ini__.py @@ -0,0 +1 @@ + diff --git a/third_party/text_processing/normalization/char_convert.py b/third_party/text_processing/normalization/char_convert.py index 1c035a80e..bd328f695 100644 --- a/third_party/text_processing/normalization/char_convert.py +++ b/third_party/text_processing/normalization/char_convert.py @@ -2,6 +2,7 @@ `opencc `_. 
""" + import opencc _t2s_converter = opencc.OpenCC("t2s.json") @@ -11,4 +12,4 @@ def tranditional_to_simplified(text: str) -> str: return _t2s_converter.convert(text) def simplified_to_traditional(text: str) -> str: - return _s2t_converter.convert(text) \ No newline at end of file + return _s2t_converter.convert(text) diff --git a/third_party/text_processing/normalization/chronology.py b/third_party/text_processing/normalization/chronology.py index 727bbd650..7143eb58c 100644 --- a/third_party/text_processing/normalization/chronology.py +++ b/third_party/text_processing/normalization/chronology.py @@ -1,6 +1,7 @@ import re from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS + def _time_num2str(num_string: str) -> str: """A special case for verbalizing number in time.""" result = num2str(num_string.lstrip('0')) @@ -60,4 +61,4 @@ def replace_date2(match: re.Match) -> str: result += f"{verbalize_cardinal(month)}月" if day: result += f"{verbalize_cardinal(day)}日" - return result \ No newline at end of file + return result diff --git a/third_party/text_processing/normalization/constants.py b/third_party/text_processing/normalization/constants.py index bbfccb67b..d5c04a761 100644 --- a/third_party/text_processing/normalization/constants.py +++ b/third_party/text_processing/normalization/constants.py @@ -2,6 +2,7 @@ import string import re from pypinyin.constants import SUPPORT_UCS4 + # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) F2H_ASCII_LETTERS = { diff --git a/third_party/text_processing/normalization/num.py b/third_party/text_processing/normalization/num.py index 9b8b0ab3a..60fc1686d 100644 --- a/third_party/text_processing/normalization/num.py +++ b/third_party/text_processing/normalization/num.py @@ -2,6 +2,7 @@ Rules to verbalize numbers into Chinese characters. 
https://zh.wikipedia.org/wiki/中文数字#現代中文
 """
+
 import re
 from typing import List
 from collections import OrderedDict
diff --git a/third_party/text_processing/normalization/phone.py b/third_party/text_processing/normalization/phone.py
index e8bdecd75..1acc18365 100644
--- a/third_party/text_processing/normalization/phone.py
+++ b/third_party/text_processing/normalization/phone.py
@@ -1,6 +1,7 @@
 import re
 from .num import verbalize_digit
 
+
 # 规范化固话/手机号码
 # 手机
 # http://www.jihaoba.com/news/show/13680
@@ -27,4 +28,4 @@ def phone2str(phone_string: str, mobile=True) -> str:
 
 
 def replace_phone(match: re.Match) -> str:
-    return phone2str(match.group(0))
\ No newline at end of file
+    return phone2str(match.group(0))
diff --git a/third_party/text_processing/normalization/quantifier.py b/third_party/text_processing/normalization/quantifier.py
index 836fc88c2..024eb6e01 100644
--- a/third_party/text_processing/normalization/quantifier.py
+++ b/third_party/text_processing/normalization/quantifier.py
@@ -1,6 +1,7 @@
 import re
 from .num import num2str
 
+
 # 温度表达式,温度会影响负号的读法
 # -3°C 零下三度
 RE_TEMPERATURE = re.compile(
     r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
 )
@@ -14,4 +15,4 @@ def replace_temperature(match: re.Match) -> str:
     temperature: str = num2str(temperature)
     unit: str = "摄氏度" if unit == "摄氏度" else "度"
     result = f"{sign}{temperature}{unit}"
-    return result
\ No newline at end of file
+    return result
diff --git a/third_party/text_processing/normalization/sentence_split.py b/third_party/text_processing/normalization/sentence_split.py
index 451371da8..5867342ba 100644
--- a/third_party/text_processing/normalization/sentence_split.py
+++ b/third_party/text_processing/normalization/sentence_split.py
@@ -1,6 +1,7 @@
 import re
 from typing import List
 
+
 SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)')
 
 def split(text: str) -> List[str]:
     """Split long text into sentences with sentence-splitting punctuations.

From 279348d7860cc3ba45a80c86f3d2c9194972db53 Mon Sep 17 00:00:00 2001
From: Haoxin Ma
Date: Tue, 8 Jun 2021 10:32:05 +0000
Subject: [PATCH 006/281] move process utt to collator

---
 deepspeech/exps/deepspeech2/model.py   |   2 +-
 deepspeech/io/collator.py              | 117 ++++++++++++++++++++++++-
 deepspeech/io/dataset.py               |  82 +----------------
 examples/tiny/s0/conf/deepspeech2.yaml |   4 +-
 4 files changed, 120 insertions(+), 85 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 468bc6521..50ff3c17b 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(config, keep_transcription_text=False)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 3bec9875f..d725b0b1e 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -16,14 +16,22 @@ import numpy as np
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.io.utility import pad_sequence
 from deepspeech.utils.log import Log
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+import io
+import tarfile
+import time
+from collections import namedtuple

 __all__ = ["SpeechCollator"]

 logger = Log(__name__).getlog()

+# namedtuple needs to be global for pickle.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) class SpeechCollator(): - def __init__(self, keep_transcription_text=True): + def __init__(self, config, keep_transcription_text=True): """ Padding audio features with zeros to make them have the same shape (or a user-defined shape) within one bach. @@ -32,6 +40,112 @@ class SpeechCollator(): """ self._keep_transcription_text = keep_transcription_text + if isinstance(config.data.augmentation_config, (str, bytes)): + if config.data.augmentation_config: + aug_file = io.open( + config.data.augmentation_config, mode='r', encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.data.augmentation_config + assert isinstance(aug_file, io.StringIO) + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), + random_seed=config.data.random_seed) + + self._normalizer = FeatureNormalizer( + config.data.mean_std_filepath) if config.data.mean_std_filepath else None + + self._stride_ms = config.data.stride_ms + self._target_sample_rate = config.data.target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=config.data.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.data.spm_model_prefix, + specgram_type=config.data.specgram_type, + feat_dim=config.data.feat_dim, + delta_delta=config.data.delta_delta, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + dither=config.data.dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. 
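The two tar helpers above assume audio paths of the form "tar:<tarpath>#<filename>". A condensed, standalone sketch of that convention and of why the parsed archive is cached (names here are illustrative, not the project's API):

    import tarfile

    _cache = {}  # tarpath -> (open TarFile, {member name: TarInfo})

    def open_tar_member(uri):
        # "tar:/data/part-0.tar#utt_001.wav" -> ("/data/part-0.tar", "utt_001.wav")
        tarpath, member = uri.split(':', 1)[1].split('#', 1)
        if tarpath not in _cache:
            f = tarfile.open(tarpath)
            _cache[tarpath] = (f, {info.name: info for info in f.getmembers()})
        f, members = _cache[tarpath]
        # Reusing the cached handle avoids re-scanning the archive per utterance.
        return f.extractfile(members[member])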
+ :rtype: tuple of (2darray, list) + """ + start_time = time.time() + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + load_wav_time = time.time() - start_time + #logger.debug(f"load wav time: {load_wav_time}") + + # audio augment + start_time = time.time() + self._augmentation_pipeline.transform_audio(speech_segment) + audio_aug_time = time.time() - start_time + #logger.debug(f"audio augmentation time: {audio_aug_time}") + + start_time = time.time() + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + feature_time = time.time() - start_time + #logger.debug(f"audio & test feature time: {feature_time}") + + # specgram augment + start_time = time.time() + specgram = self._augmentation_pipeline.transform_feature(specgram) + feature_aug_time = time.time() - start_time + #logger.debug(f"audio feature augmentation time: {feature_aug_time}") + return specgram, transcript_part + def __call__(self, batch): """batch examples @@ -53,6 +167,7 @@ class SpeechCollator(): text_lens = [] utts = [] for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) #utt utts.append(utt) # audio diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index eaa57a4ec..fc6879026 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -34,9 +34,6 @@ __all__ = [ logger = Log(__name__).getlog() -# namedtupe need global for pickle. -TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - class ManifestDataset(Dataset): @classmethod @@ -192,10 +189,6 @@ class ManifestDataset(Dataset): self._stride_ms = stride_ms self._target_sample_rate = target_sample_rate - self._normalizer = FeatureNormalizer( - mean_std_filepath) if mean_std_filepath else None - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=augmentation_config, random_seed=random_seed) self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, @@ -214,8 +207,6 @@ class ManifestDataset(Dataset): self._rng = np.random.RandomState(random_seed) self._keep_transcription_text = keep_transcription_text - # for caching tar files info - self._local_data = TarLocalData(tar2info={}, tar2object={}) # read manifest self._manifest = read_manifest( @@ -256,74 +247,7 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - It will return a subfile object from tar file - and cached tar file info for next reading request. 
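The per-utterance pipeline that process_utterance implements above, reduced to a framework-free sketch (the five callables are stand-ins for the project's components, not its API):

    def process(load, augment_audio, featurize, normalize, augment_feature,
                audio_path, transcript):
        segment = load(audio_path, transcript)  # wav, possibly a tar member
        augment_audio(segment)                  # waveform-level augmentation, in place
        feat, tokens = featurize(segment)       # spectrogram + token ids or raw text
        feat = normalize(feat)                  # apply precomputed mean/std (CMVN)
        feat = augment_feature(feat)            # feature-level augmentation last
        return feat, tokens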
- """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - def process_utterance(self, utt, audio_file, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param transcript: Transcription text. - :type transcript: str - :return: Tuple of audio feature tensor and data of transcription part, - where transcription part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - start_time = time.time() - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) - else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) - load_wav_time = time.time() - start_time - #logger.debug(f"load wav time: {load_wav_time}") - - # audio augment - start_time = time.time() - self._augmentation_pipeline.transform_audio(speech_segment) - audio_aug_time = time.time() - start_time - #logger.debug(f"audio augmentation time: {audio_aug_time}") - - start_time = time.time() - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - feature_time = time.time() - start_time - #logger.debug(f"audio & test feature time: {feature_time}") - - # specgram augment - start_time = time.time() - specgram = self._augmentation_pipeline.transform_feature(specgram) - feature_aug_time = time.time() - start_time - #logger.debug(f"audio feature augmentation time: {feature_aug_time}") - return utt, specgram, transcript_part def _instance_reader_creator(self, manifest): """ @@ -336,8 +260,6 @@ class ManifestDataset(Dataset): def reader(): for instance in manifest: - # inst = self.process_utterance(instance["feat"], - # instance["text"]) inst = self.process_utterance(instance["utt"], instance["feat"], instance["text"]) yield inst @@ -349,6 +271,4 @@ class ManifestDataset(Dataset): def __getitem__(self, idx): instance = self._manifest[idx] - return self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - # return self.process_utterance(instance["feat"], instance["text"]) + return(instance["utt"], instance["feat"], instance["text"]) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index dd9ce51f0..aeb4f0997 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,7 @@ data: mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 4 + batch_size: 2 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -37,7 +37,7 @@ model: share_rnn_weights: True training: - n_epoch: 20 + n_epoch: 10 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From c706dfec2ab292c91fe95cc1947330772c3bc493 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 12:54:01 +0000 Subject: [PATCH 007/281] fix bug --- 
deepspeech/exps/deepspeech2/model.py | 4 ++-- deepspeech/io/collator.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 50ff3c17b..bcd66d19e 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config, keep_transcription_text=False) + collate_fn = SpeechCollator(config=config, keep_transcription_text=False) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, @@ -342,7 +342,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index d725b0b1e..0f86b8e72 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -23,6 +23,8 @@ from deepspeech.frontend.speech import SpeechSegment import io import time +from collections import namedtuple + __all__ = ["SpeechCollator"] logger = Log(__name__).getlog() @@ -50,7 +52,7 @@ class SpeechCollator(): aug_file = config.data.augmentation_config assert isinstance(aug_file, io.StringIO) - self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), random_seed=config.data.random_seed) From 2b51d612dd64653bb407f76b648a48ad71b090de Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 13:42:19 +0000 Subject: [PATCH 008/281] delete _instance_reader_creator func in dataset --- deepspeech/io/dataset.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index fc6879026..929a6cf85 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -249,22 +249,22 @@ class ManifestDataset(Dataset): - def _instance_reader_creator(self, manifest): - """ - Instance reader creator. Create a callable function to produce - instances of data. - - Instance: a tuple of ndarray of audio spectrogram and a list of - token indices for transcript. - """ - - def reader(): - for instance in manifest: - inst = self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - yield inst - - return reader + # def _instance_reader_creator(self, manifest): + # """ + # Instance reader creator. Create a callable function to produce + # instances of data. + + # Instance: a tuple of ndarray of audio spectrogram and a list of + # token indices for transcript. 
+ # """ + + # def reader(): + # for instance in manifest: + # inst = self.process_utterance(instance["utt"], instance["feat"], + # instance["text"]) + # yield inst + + # return reader def __len__(self): return len(self._manifest) From 3d5f294363ebc3a732b5f29714f9b057431ed52c Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 03:13:35 +0000 Subject: [PATCH 009/281] dataset --- deepspeech/io/dataset.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 929a6cf85..6083d7ec8 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -247,25 +247,6 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - - - # def _instance_reader_creator(self, manifest): - # """ - # Instance reader creator. Create a callable function to produce - # instances of data. - - # Instance: a tuple of ndarray of audio spectrogram and a list of - # token indices for transcript. - # """ - - # def reader(): - # for instance in manifest: - # inst = self.process_utterance(instance["utt"], instance["feat"], - # instance["text"]) - # yield inst - - # return reader - def __len__(self): return len(self._manifest) From 3855522ee3b43bc5726eb7f37a0dd8bd0e9355a2 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 11:37:25 +0000 Subject: [PATCH 010/281] config --- deepspeech/exps/deepspeech2/config.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index a8d452a99..37b000867 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -38,8 +38,6 @@ _C.data = CN( target_sample_rate=16000, # target sample rate use_dB_normalization=True, target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -55,6 +53,28 @@ _C.model = CN( share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
)) +_C.collator =CN( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) + DeepSpeech2Model.params(_C.model) _C.training = CN( From b9110af9d340caf4e3e32e0eafa2fca6946d7296 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 11 Jun 2021 02:44:02 +0000 Subject: [PATCH 011/281] feat_dim, vocab_size --- deepspeech/exps/deepspeech2/model.py | 4 +- .../frontend/featurizer/speech_featurizer.py | 43 ----- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 166 +++++++++++++++--- deepspeech/io/dataset.py | 128 +++++++------- examples/tiny/s0/conf/deepspeech2.yaml | 23 ++- 6 files changed, 227 insertions(+), 139 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index bcd66d19e..679261cf7 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -137,7 +137,7 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.data.keep_transcription_text = False + config.collator.keep_transcription_text = False config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config=config, keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index e6761cb52..bcb8e3f47 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,50 +104,7 @@ class SpeechFeaturizer(object): speech_segment.transcript) return spec_feature, text_ids - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. - """ - return self._text_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - Returns: - List[str]: - """ - return self._text_featurizer.vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - return self._text_featurizer.vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. 
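For the stride_ms=10.0 / window_ms=20.0 defaults above, the frame arithmetic works out as below (assumes 16 kHz input; a standalone sketch, not the repo's featurizer):

    def num_frames(num_samples, sample_rate=16000, stride_ms=10.0, window_ms=20.0):
        stride = int(sample_rate * stride_ms / 1000)   # 160 samples per hop
        window = int(sample_rate * window_ms / 1000)   # 320 samples per window
        return max(0, (num_samples - window) // stride + 1)

    assert num_frames(16000) == 99  # one second of 16 kHz audio -> 99 frames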
- """ - return self._audio_featurizer.feature_size - - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms @property def text_feature(self): diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index b2dd9601f..610104f90 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest + return manifest, json_data["feat_shape"][-1] def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 0f86b8e72..4efc69a01 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -22,6 +22,8 @@ from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment import io import time +from yacs.config import CfgNode +from typing import Optional from collections import namedtuple @@ -33,51 +35,134 @@ logger = Log(__name__).getlog() TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) class SpeechCollator(): - def __init__(self, config, keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) - if ``keep_transcription_text`` is False, text is token ids else is raw string. + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. 
""" - self._keep_transcription_text = keep_transcription_text + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.data + assert 'specgram_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') + config.collator.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.data.augmentation_config + aug_file = config.collator.augmentation_config assert isinstance(aug_file, io.StringIO) + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text + ) + return speech_collator + + def __init__(self, aug_file, mean_std_filepath, + vocab_filepath, spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + if ``keep_transcription_text`` is False, text is token ids else is raw string. 
+ """ + self._keep_transcription_text = keep_transcription_text + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), - random_seed=config.data.random_seed) + random_seed=random_seed) self._normalizer = FeatureNormalizer( - config.data.mean_std_filepath) if config.data.mean_std_filepath else None + mean_std_filepath) if mean_std_filepath else None - self._stride_ms = config.data.stride_ms - self._target_sample_rate = config.data.target_sample_rate + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate self._speech_featurizer = SpeechFeaturizer( - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - spm_model_prefix=config.data.spm_model_prefix, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - stride_ms=config.data.stride_ms, - window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - dither=config.data.dither) + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) def _parse_tar(self, file): """Parse a tar file to get a tarfile object @@ -196,3 +281,28 @@ class SpeechCollator(): texts, padding_value=IGNORE_ID).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._text_featurizer + self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index aa5b29ed3..1e3bbcd30 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,20 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - raw_wav=True, # use raw_wav or kaldi feature - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - dither=1.0, # feature dither - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -116,21 +102,19 @@ class ManifestDataset(Dataset): min_output_len=config.data.min_output_len, max_output_input_ratio=config.data.max_output_input_ratio, min_output_input_ratio=config.data.min_output_input_ratio, - stride_ms=config.data.stride_ms, - 
window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - dither=config.data.dither, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - random_seed=config.data.random_seed, - keep_transcription_text=config.data.keep_transcription_text) + ) return dataset + + def _read_vocab(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with open(vocab_filepath, 'r', encoding='utf-8') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + return vocab_list + + def __init__(self, manifest_path, unit_type, @@ -143,20 +127,7 @@ class ManifestDataset(Dataset): max_output_len=float('inf'), min_output_len=0.0, max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, - stride_ms=10.0, - window_ms=20.0, - n_fft=None, - max_freq=None, - target_sample_rate=16000, - specgram_type='linear', - feat_dim=None, - delta_delta=False, - dither=1.0, - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False): + min_output_input_ratio=0.0): """Manifest Dataset Args: @@ -186,30 +157,11 @@ class ManifestDataset(Dataset): keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. """ super().__init__() - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - - self._speech_featurizer = SpeechFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - stride_ms=stride_ms, - window_ms=window_ms, - n_fft=n_fft, - max_freq=max_freq, - target_sample_rate=target_sample_rate, - use_dB_normalization=use_dB_normalization, - target_dB=target_dB, - dither=dither) - - self._rng = np.random.RandomState(random_seed) - self._keep_transcription_text = keep_transcription_text + + # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest = read_manifest( + self._manifest, self._feature_size = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -219,9 +171,59 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._vocab_list = self._read_vocab(vocab_filepath) + @property def manifest(self): return self._manifest + + @property + def vocab_size(self): + """Return the vocabulary size. + + Returns: + int: Vocabulary size. + """ + return len(self._vocab_list) + + @property + def vocab_list(self): + """Return the vocabulary in list. + + Returns: + List[str]: + """ + return self._vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + + Returns: + Dict[str, int]: + """ + vocab_dict = dict( + [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + return vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + + Returns: + int: audio feature size. 
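The vocabulary handling added above, as one self-contained sketch: one token per line, the trailing newline stripped, and line order defining the integer id (assumes every line ends with a newline):

    def read_vocab(vocab_filepath):
        with open(vocab_filepath, 'r', encoding='utf-8') as f:
            vocab_list = [line[:-1] for line in f.readlines()]
        vocab_dict = {token: idx for idx, token in enumerate(vocab_list)}
        return vocab_list, vocab_dict

    # A file containing "<blank>\na\nb\n" yields
    # vocab_dict == {'<blank>': 0, 'a': 1, 'b': 2}.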
+ """ + return self._feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms + def __len__(self): return len(self._manifest) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index aeb4f0997..eda7c3cb8 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -4,9 +4,10 @@ data: dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny mean_std_filepath: data/mean_std.json + unit_type: char vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 2 + batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -28,6 +29,24 @@ data: sortagrad: True shuffle_method: batch_shuffle num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + mean_std_filepath: data/mean_std.json + spm_model_prefix: + specgram_type: linear + feat_dim: + delta_delta: False + stride_ms: 10.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 + use_dB_normalization: True + target_dB: -20 + dither: 1.0 + keep_transcription_text: True model: num_conv_layers: 2 @@ -37,7 +56,7 @@ model: share_rnn_weights: True training: - n_epoch: 10 + n_epoch: 21 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From 7bae32f3844166d549d0180da70b13bd10ef4cf7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 15 Jun 2021 03:05:22 +0000 Subject: [PATCH 012/281] revise example/ting/s1 --- deepspeech/exps/deepspeech2/config.py | 2 +- deepspeech/exps/deepspeech2/model.py | 3 ++- deepspeech/exps/u2/config.py | 7 +++++++ deepspeech/exps/u2/model.py | 9 +++++---- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 23 ++++----------------- deepspeech/io/dataset.py | 12 ++--------- examples/tiny/s0/conf/deepspeech2.yaml | 16 +-------------- examples/tiny/s1/conf/transformer.yaml | 28 ++++++++++++++------------ 9 files changed, 38 insertions(+), 64 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 37b000867..1ce5346f6 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -72,7 +72,7 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 679261cf7..7769c3776 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -336,13 +336,14 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) + config.collator.keep_transcription_text = True # return text ord id self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 5a0b53f9a..19080be76 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -22,6 +22,13 @@ _C = CfgNode() _C.data = ManifestDataset.params() +_C.collator =CfgNode( + dict( + augmentation_config="", 
+ unit_type="char", + keep_transcription_text=False + )) + _C.model = U2Model.params() _C.training = U2Trainer.params() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 334d6bc8e..895270870 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -221,7 +221,7 @@ class U2Trainer(Trainer): config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) - collate_fn = SpeechCollator(keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, @@ -266,12 +266,13 @@ class U2Trainer(Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id + config.collator.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup train/valid/test Dataloader!") def setup_model(self): @@ -375,7 +376,7 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.dataset.text_feature + text_feature = self.test_loader.collate_fn.text_feature target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( audio, @@ -423,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.dataset.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 610104f90..b2dd9601f 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest, json_data["feat_shape"][-1] + return manifest def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 4efc69a01..51384ec4e 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -56,7 +56,7 @@ class SpeechCollator(): use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) if config is not None: @@ -75,7 +75,7 @@ class SpeechCollator(): """ assert 'augmentation_config' in config.collator assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator + assert 'mean_std_filepath' in config.data assert 'vocab_filepath' in config.data assert 'specgram_type' in config.collator assert 'n_fft' in config.collator @@ -94,7 +94,7 @@ class SpeechCollator(): speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, + mean_std_filepath=config.data.mean_std_filepath, unit_type=config.collator.unit_type, vocab_filepath=config.data.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix, @@ -282,26 +282,11 @@ class SpeechCollator(): text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - return 
self._speech_featurizer.vocab_list - - @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict @property def text_feature(self): - return self._text_featurizer - self._speech_featurizer.text_feature + return self._speech_featurizer.text_feature - @property - def feature_size(self): - return self._speech_featurizer.feature_size @property def stride_ms(self): diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 1e3bbcd30..0da347f3e 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -161,7 +161,7 @@ class ManifestDataset(Dataset): # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest, self._feature_size = read_manifest( + self._manifest = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -213,16 +213,8 @@ class ManifestDataset(Dataset): Returns: int: audio feature size. """ - return self._feature_size + return self._manifest[0]["feat_shape"][-1] - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms def __len__(self): diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index eda7c3cb8..bfed8d59d 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,6 @@ data: mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 @@ -14,18 +13,6 @@ data: max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle num_workers: 0 @@ -33,7 +20,6 @@ data: collator: augmentation_config: conf/augmentation.json random_seed: 0 - mean_std_filepath: data/mean_std.json spm_model_prefix: specgram_type: linear feat_dim: @@ -46,7 +32,7 @@ collator: use_dB_normalization: True target_dB: -20 dither: 1.0 - keep_transcription_text: True + keep_transcription_text: False model: num_conv_layers: 2 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 0a7cf3be8..cc1725853 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -7,7 +7,6 @@ data: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' mean_std_filepath: "" - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second @@ -16,23 +15,26 @@ data: min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 #2 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: fbank feat_dim: 80 delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None stride_ms: 10.0 - window_ms: 25.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - 
num_workers: 2 - # network architecture model: @@ -70,7 +72,7 @@ model: training: - n_epoch: 2 + n_epoch: 3 accum_grad: 1 global_grad_clip: 5.0 optim: adam From d179fc92d94ec8b89a6a7f0175171dcb3aa732cd Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 16 Jun 2021 09:08:33 +0000 Subject: [PATCH 013/281] speech deployment architecture --- speechnn/CMakeLists.txt | 0 speechnn/core/CMakeLists.txt | 0 speechnn/core/decoder/CMakeLists.txt | 0 speechnn/core/frontend/CMakeLists.txt | 0 speechnn/core/frontend/audio/CMakeLists.txt | 0 speechnn/core/frontend/text/CMakeLists.txt | 0 speechnn/core/model/CMakeLists.txt | 0 speechnn/core/protocol/CMakeLists.txt | 0 speechnn/core/utils/CMakeLists.txt | 0 speechnn/third_party/CMakeLists.txt | 0 10 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 speechnn/CMakeLists.txt create mode 100644 speechnn/core/CMakeLists.txt create mode 100644 speechnn/core/decoder/CMakeLists.txt create mode 100644 speechnn/core/frontend/CMakeLists.txt create mode 100644 speechnn/core/frontend/audio/CMakeLists.txt create mode 100644 speechnn/core/frontend/text/CMakeLists.txt create mode 100644 speechnn/core/model/CMakeLists.txt create mode 100644 speechnn/core/protocol/CMakeLists.txt create mode 100644 speechnn/core/utils/CMakeLists.txt create mode 100644 speechnn/third_party/CMakeLists.txt diff --git a/speechnn/CMakeLists.txt b/speechnn/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/CMakeLists.txt b/speechnn/core/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/decoder/CMakeLists.txt b/speechnn/core/decoder/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/frontend/CMakeLists.txt b/speechnn/core/frontend/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/frontend/audio/CMakeLists.txt b/speechnn/core/frontend/audio/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/frontend/text/CMakeLists.txt b/speechnn/core/frontend/text/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/model/CMakeLists.txt b/speechnn/core/model/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/protocol/CMakeLists.txt b/speechnn/core/protocol/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/core/utils/CMakeLists.txt b/speechnn/core/utils/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/third_party/CMakeLists.txt b/speechnn/third_party/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb From 9ddae26a362998c5e3404b2c4cd69962bb098948 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 16 Jun 2021 09:12:05 +0000 Subject: [PATCH 014/281] add delpoy mergify label --- .mergify.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.mergify.yml b/.mergify.yml index b11fd5c1f..03e57e14b 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -87,3 +87,9 @@ pull_request_rules: actions: label: add: ["Docker"] + - name: "auto add label=Deployment" + conditions: + - files~=^speechnn/ + actions: + label: + add: ["Deployment"] From 6ee3033cc4561ab3109ee036c3c8db9101d1c2b7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 16 Jun 2021 14:39:00 +0000 Subject: [PATCH 015/281] finish aishell/s0 --- deepspeech/exps/deepspeech2/model.py | 12 +-- deepspeech/exps/u2/model.py | 6 +- .../frontend/featurizer/speech_featurizer.py | 49 +++++++++- 
deepspeech/io/collator.py | 32 ++++++- deepspeech/io/dataset.py | 90 +++++++++---------- examples/aishell/s0/conf/deepspeech2.yaml | 24 ++--- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- 8 files changed, 147 insertions(+), 70 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 7769c3776..5833382a4 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -102,8 +102,8 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.train_loader.dataset.feature_size, - dict_size=self.train_loader.dataset.vocab_size, + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, @@ -199,7 +199,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer - vocab_list = self.test_loader.dataset.vocab_list + vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( @@ -272,7 +272,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): infer_model = DeepSpeech2InferModel.from_pretrained( self.test_loader.dataset, self.config, self.args.checkpoint_path) infer_model.eval() - feat_dim = self.test_loader.dataset.feature_size + feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( infer_model, input_spec=[ @@ -308,8 +308,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.test_loader.dataset.feature_size, - dict_size=self.test_loader.dataset.vocab_size, + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 895270870..676768cea 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -279,8 +279,8 @@ class U2Trainer(Trainer): config = self.config model_conf = config.model model_conf.defrost() - model_conf.input_dim = self.train_loader.dataset.feature_size - model_conf.output_dim = self.train_loader.dataset.vocab_size + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.freeze() model = U2Model.from_config(model_conf) @@ -497,7 +497,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.dataset.feature_size + feat_dim = self.test_loader.collate_fn.feature_size input_spec = [ paddle.static.InputSpec( shape=[None, feat_dim, None], diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index bcb8e3f47..852d26c9a 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,13 +104,60 @@ class SpeechFeaturizer(object): speech_segment.transcript) return 
spec_feature, text_ids + @property + def vocab_size(self): + """Return the vocabulary size. + Returns: + int: Vocabulary size. + """ + return self._text_featurizer.vocab_size + @property + def vocab_list(self): + """Return the vocabulary in list. + Returns: + List[str]: + """ + return self._text_featurizer.vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + Returns: + Dict[str, int]: + """ + return self._text_featurizer.vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + Returns: + int: audio feature size. + """ + return self._audio_featurizer.feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms @property def text_feature(self): """Return the text feature object. - Returns: TextFeaturizer: object. """ return self._text_featurizer + + + # @property + # def text_feature(self): + # """Return the text feature object. + + # Returns: + # TextFeaturizer: object. + # """ + # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 51384ec4e..8b8575dbd 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -283,11 +283,41 @@ class SpeechCollator(): return utts, padded_audios, audio_lens, padded_texts, text_lens + # @property + # def text_feature(self): + # return self._speech_featurizer.text_feature + + + # @property + # def stride_ms(self): + # return self._speech_featurizer.stride_ms + +########### + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + @property def text_feature(self): return self._speech_featurizer.text_feature + @property + def feature_size(self): + return self._speech_featurizer.feature_size @property def stride_ms(self): - return self._speech_featurizer.stride_ms + return self._speech_featurizer.stride_ms \ No newline at end of file diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 0da347f3e..24d8486a8 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,10 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) if config is not None: @@ -77,7 +73,7 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.data + assert 'keep_transcription_text' in config.collator if isinstance(config.data.augmentation_config, (str, bytes)): if config.data.augmentation_config: @@ -171,51 +167,51 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - self._vocab_list = self._read_vocab(vocab_filepath) + # self._vocab_list = self._read_vocab(vocab_filepath) - @property - def manifest(self): - return self._manifest - - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. 
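After this patch the collator owns the featurizer, so trainers size the model from collate_fn rather than from the dataset. The delegation pattern in isolation, with toy classes rather than the repo's:

    class Featurizer:
        vocab_list = ['<blank>', 'a', 'b']
        feature_size = 161  # e.g. linear spectrogram bins

    class Collator:
        def __init__(self, featurizer):
            self._speech_featurizer = featurizer

        @property
        def vocab_size(self):
            return len(self._speech_featurizer.vocab_list)

        @property
        def feature_size(self):
            return self._speech_featurizer.feature_size

    collate_fn = Collator(Featurizer())
    assert (collate_fn.feature_size, collate_fn.vocab_size) == (161, 3)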
- """ - return len(self._vocab_list) - - @property - def vocab_list(self): - """Return the vocabulary in list. - - Returns: - List[str]: - """ - return self._vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - vocab_dict = dict( - [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - return vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. - """ - return self._manifest[0]["feat_shape"][-1] + # @property + # def manifest(self): + # return self._manifest + # @property + # def vocab_size(self): + # """Return the vocabulary size. + + # Returns: + # int: Vocabulary size. + # """ + # return len(self._vocab_list) + + # @property + # def vocab_list(self): + # """Return the vocabulary in list. + + # Returns: + # List[str]: + # """ + # return self._vocab_list + + # @property + # def vocab_dict(self): + # """Return the vocabulary in dict. + + # Returns: + # Dict[str, int]: + # """ + # vocab_dict = dict( + # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + # return vocab_dict + + # @property + # def feature_size(self): + # """Return the audio feature size. + + # Returns: + # int: audio feature size. + # """ + # return self._manifest[0]["feat_shape"][-1] + def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 8b08ee308..e5ab8e046 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -5,7 +5,6 @@ data: test_manifest: data/manifest.test mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 64 # one gpu min_input_len: 0.0 max_input_len: 27.0 # second @@ -13,21 +12,26 @@ data: max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None + feat_dim: + delta_delta: False stride_ms: 10.0 window_ms: 20.0 - delta_delta: False - dither: 1.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 model: num_conv_layers: 2 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index bfed8d59d..6680e5686 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -42,7 +42,7 @@ model: share_rnn_weights: True training: - n_epoch: 21 + n_epoch: 23 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index cc1725853..5e28e4e87 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -72,7 +72,7 @@ model: training: - n_epoch: 3 + n_epoch: 21 accum_grad: 1 global_grad_clip: 5.0 optim: adam From 89a00eabeb6aaf0512be2283a563d087423c23bd Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 00:36:57 +0000 Subject: [PATCH 016/281] revise deepspeech/exps/u2/model.py --- deepspeech/exps/u2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 676768cea..164903e69 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -424,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.test_loader.collate_fn.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 From 698d7a9bdb3de1a763ed8ba7a71b68241e3eea17 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 07:16:52 +0000 Subject: [PATCH 017/281] move batch_size, work_nums, shuffle_method, sortagrad to collator --- deepspeech/exps/deepspeech2/config.py | 20 +++++------------ deepspeech/exps/deepspeech2/model.py | 18 +++++++-------- deepspeech/exps/u2/config.py | 6 ++++- .../frontend/featurizer/speech_featurizer.py | 10 --------- deepspeech/io/collator.py | 22 ------------------- examples/aishell/s0/conf/deepspeech2.yaml | 9 ++++---- examples/tiny/s0/conf/deepspeech2.yaml | 9 ++++---- 7 files changed, 29 insertions(+), 65 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 1ce5346f6..faaff1aad 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -28,20 +28,6 @@ _C.data = CN( augmentation_config="", max_duration=float('inf'), min_duration=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delat_delta=False, # 'mfcc', 'fbank' - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) _C.model = CN( @@ -72,7 +58,11 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 5833382a4..b54192dd3 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -55,7 +55,7 @@ class DeepSpeech2Trainer(Trainer): 'train_loss': float(loss), } msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.data.batch_size) + msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) logger.info(msg) @@ -149,31 +149,31 @@ class DeepSpeech2Trainer(Trainer): if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.data.batch_size, + 
batch_size=config.collator.batch_size, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, - num_workers=config.data.num_workers) + num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn) diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 19080be76..42725c74f 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -26,7 +26,11 @@ _C.collator =CfgNode( dict( augmentation_config="", unit_type="char", - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle" # 'batch_shuffle', 'instance_shuffle' )) _C.model = U2Model.params() diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 852d26c9a..0fbbc5648 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -151,13 +151,3 @@ class SpeechFeaturizer(object): TextFeaturizer: object. """ return self._text_featurizer - - - # @property - # def text_feature(self): - # """Return the text feature object. - - # Returns: - # TextFeaturizer: object. - # """ - # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 8b8575dbd..ac817a192 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -203,34 +203,22 @@ class SpeechCollator(): where transcription part could be token ids or text. 
:rtype: tuple of (2darray, list) """ - start_time = time.time() if isinstance(audio_file, str) and audio_file.startswith('tar:'): speech_segment = SpeechSegment.from_file( self._subfile_from_tar(audio_file), transcript) else: speech_segment = SpeechSegment.from_file(audio_file, transcript) - load_wav_time = time.time() - start_time - #logger.debug(f"load wav time: {load_wav_time}") # audio augment - start_time = time.time() self._augmentation_pipeline.transform_audio(speech_segment) - audio_aug_time = time.time() - start_time - #logger.debug(f"audio augmentation time: {audio_aug_time}") - start_time = time.time() specgram, transcript_part = self._speech_featurizer.featurize( speech_segment, self._keep_transcription_text) if self._normalizer: specgram = self._normalizer.apply(specgram) - feature_time = time.time() - start_time - #logger.debug(f"audio & test feature time: {feature_time}") # specgram augment - start_time = time.time() specgram = self._augmentation_pipeline.transform_feature(specgram) - feature_aug_time = time.time() - start_time - #logger.debug(f"audio feature augmentation time: {feature_aug_time}") return specgram, transcript_part def __call__(self, batch): @@ -283,16 +271,6 @@ class SpeechCollator(): return utts, padded_audios, audio_lens, padded_texts, text_lens - # @property - # def text_feature(self): - # return self._speech_featurizer.text_feature - - - # @property - # def stride_ms(self): - # return self._speech_featurizer.stride_ms - -########### @property def manifest(self): diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index e5ab8e046..54ce240e7 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -5,16 +5,13 @@ data: test_manifest: data/manifest.test mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt - batch_size: 64 # one gpu min_input_len: 0.0 max_input_len: 27.0 # second min_output_len: 0.0 max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 + collator: augmentation_config: conf/augmentation.json @@ -32,6 +29,10 @@ collator: target_dB: -20 dither: 1.0 keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + batch_size: 64 # one gpu model: num_conv_layers: 2 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 6680e5686..434cf264c 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,16 +6,13 @@ data: mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt - batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 + collator: augmentation_config: conf/augmentation.json @@ -33,6 +30,10 @@ collator: target_dB: -20 dither: 1.0 keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + batch_size: 4 model: num_conv_layers: 2 From 557427736e9f2fba6715cc3ce18b3175a3c42cd8 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 18 Jun 2021 06:41:28 +0000 Subject: [PATCH 018/281] move redundant params --- deepspeech/exps/deepspeech2/config.py | 30 +++---- deepspeech/exps/deepspeech2/model.py | 14 ++-- deepspeech/exps/u2/config.py | 12 +-- deepspeech/exps/u2/model.py | 35 
++++---- deepspeech/io/collator.py | 36 ++++++-- deepspeech/io/dataset.py | 105 +----------------------- examples/aishell/s1/conf/conformer.yaml | 14 ++-- examples/tiny/s0/conf/deepspeech2.yaml | 10 +-- examples/tiny/s1/conf/transformer.yaml | 22 ++--- 9 files changed, 96 insertions(+), 182 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index faaff1aad..050a50b00 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -21,32 +21,18 @@ _C.data = CN( train_manifest="", dev_manifest="", test_manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_duration=float('inf'), min_duration=0.0, )) -_C.model = CN( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - _C.collator =CN( dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", unit_type="char", vocab_filepath="", spm_model_prefix="", + mean_std_filepath="", + augmentation_config="", + random_seed=0, specgram_type='linear', # 'linear', 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank' delta_delta=False, # 'mfcc', 'fbank' @@ -65,6 +51,16 @@ _C.collator =CN( shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) +_C.model = CN( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. + share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
+ )) + + DeepSpeech2Model.params(_C.model) _C.training = CN( diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index b54192dd3..1eefc871b 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer): train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) if self.parallel: @@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.collator.sortagrad, shuffle_method=config.collator.shuffle_method) - collate_fn = SpeechCollator.from_config(config) + collate_fn_train = SpeechCollator.from_config(config) + + + config.collator.augmentation_config = "" + collate_fn_dev = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, - collate_fn=collate_fn, + collate_fn=collate_fn_train, num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn) + collate_fn=collate_fn_dev) logger.info("Setup train/valid Dataloader!") @@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): # return raw text config.data.manifest = config.data.test_manifest - config.data.keep_transcription_text = True - config.data.augmentation_config = "" # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. # config.data.min_input_len = 0.0 # second @@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): test_dataset = ManifestDataset.from_config(config) config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" # return text ord id self.test_loader = DataLoader( test_dataset, diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 42725c74f..d8735453c 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester from deepspeech.exps.u2.model import U2Trainer from deepspeech.io.dataset import ManifestDataset from deepspeech.models.u2 import U2Model +from deepspeech.io.collator import SpeechCollator _C = CfgNode() _C.data = ManifestDataset.params() -_C.collator =CfgNode( - dict( - augmentation_config="", - unit_type="char", - keep_transcription_text=False, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle" # 'batch_shuffle', 'instance_shuffle' - )) +_C.collator = SpeechCollator.params() _C.model = U2Model.params() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 164903e69..836afa361 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -100,7 +100,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.data.batch_size) + msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -211,51 +211,52 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - 
config.data.keep_transcription_text = False + config.collator.keep_transcription_text = False # train/valid dataset, return token ids config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) - collate_fn = SpeechCollator.from_config(config) + collate_fn_train = SpeechCollator.from_config(config) + + config.collator.augmentation_config = "" + collate_fn_dev = SpeechCollator.from_config(config) + if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, - collate_fn=collate_fn, - num_workers=config.data.num_workers, ) + collate_fn=collate_fn_train, + num_workers=config.collator.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn) + collate_fn=collate_fn_dev) # test dataset, return raw text config.data.manifest = config.data.test_manifest - config.data.keep_transcription_text = True - config.data.augmentation_config = "" # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
# config.data.min_input_len = 0.0 # second @@ -264,9 +265,11 @@ class U2Trainer(Trainer): # config.data.max_output_len = float('inf') # tokens # config.data.min_output_input_ratio = 0.00 # config.data.max_output_input_ratio = float('inf') + test_dataset = ManifestDataset.from_config(config) # return text ord id config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index ac817a192..ab1e91652 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -75,8 +75,8 @@ class SpeechCollator(): """ assert 'augmentation_config' in config.collator assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.data - assert 'vocab_filepath' in config.data + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator assert 'specgram_type' in config.collator assert 'n_fft' in config.collator assert config.collator @@ -94,9 +94,9 @@ class SpeechCollator(): speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.data.mean_std_filepath, + mean_std_filepath=config.collator.mean_std_filepath, unit_type=config.collator.unit_type, - vocab_filepath=config.data.vocab_filepath, + vocab_filepath=config.collator.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix, specgram_type=config.collator.specgram_type, feat_dim=config.collator.feat_dim, @@ -129,11 +129,31 @@ class SpeechCollator(): target_dB=-20, dither=1.0, keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. + """SpeechCollator Collator - if ``keep_transcription_text`` is False, text is token ids else is raw string. + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, which suffix is *.npy + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. 
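+        The collated batch is the tuple ``(utts, padded_audios, audio_lens,
+        padded_texts, text_lens)`` returned by ``__call__`` below.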
""" self._keep_transcription_text = keep_transcription_text diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 24d8486a8..70383b4da 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -40,15 +40,7 @@ class ManifestDataset(Dataset): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: default = CfgNode( dict( - train_manifest="", - dev_manifest="", - test_manifest="", manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_input_len=27.0, min_input_len=0.0, max_output_len=float('inf'), @@ -73,25 +65,10 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.collator - - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: - aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.data.augmentation_config - assert isinstance(aug_file, io.StringIO) + dataset = cls( manifest_path=config.data.manifest, - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - mean_std_filepath=config.data.mean_std_filepath, - spm_model_prefix=config.data.spm_model_prefix, - augmentation_config=aug_file.read(), max_input_len=config.data.max_input_len, min_input_len=config.data.min_input_len, max_output_len=config.data.max_output_len, @@ -101,23 +78,8 @@ class ManifestDataset(Dataset): ) return dataset - - def _read_vocab(self, vocab_filepath): - """Load vocabulary from file.""" - vocab_lines = [] - with open(vocab_filepath, 'r', encoding='utf-8') as file: - vocab_lines.extend(file.readlines()) - vocab_list = [line[:-1] for line in vocab_lines] - return vocab_list - - def __init__(self, manifest_path, - unit_type, - vocab_filepath, - mean_std_filepath, - spm_model_prefix=None, - augmentation_config='{}', max_input_len=float('inf'), min_input_len=0.0, max_output_len=float('inf'), @@ -128,34 +90,16 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. - n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. 
Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. - target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + """ super().__init__() - # self._rng = np.random.RandomState(random_seed) - # read manifest self._manifest = read_manifest( manifest_path=manifest_path, @@ -167,51 +111,6 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - # self._vocab_list = self._read_vocab(vocab_filepath) - - - # @property - # def manifest(self): - # return self._manifest - - # @property - # def vocab_size(self): - # """Return the vocabulary size. - - # Returns: - # int: Vocabulary size. - # """ - # return len(self._vocab_list) - - # @property - # def vocab_list(self): - # """Return the vocabulary in list. - - # Returns: - # List[str]: - # """ - # return self._vocab_list - - # @property - # def vocab_dict(self): - # """Return the vocabulary in dict. - - # Returns: - # Dict[str, int]: - # """ - # vocab_dict = dict( - # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - # return vocab_dict - - # @property - # def feature_size(self): - # """Return the audio feature size. - - # Returns: - # int: audio feature size. - # """ - # return self._manifest[0]["feat_shape"][-1] - def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index b880f8587..116c91927 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -3,17 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 max_input_len: 20.0 # second min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -32,7 +35,6 @@ data: shuffle_method: batch_shuffle num_workers: 2 - # network architecture model: cmvn_file: "data/mean_std.json" diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 434cf264c..6737d1b75 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -2,10 +2,7 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/vocab.txt + test_manifest: data/manifest.tiny min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -15,6 +12,9 @@ data: collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json 
random_seed: 0 spm_model_prefix: @@ -43,7 +43,7 @@ model: share_rnn_weights: True training: - n_epoch: 23 + n_epoch: 24 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 5e28e4e87..250995faa 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -3,26 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" - batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - raw_wav: True # use raw_wav or kaldi feature - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 #2 - + collator: + vocab_filepath: data/vocab.txt + mean_std_filepath: "" augmentation_config: conf/augmentation.json random_seed: 0 - spm_model_prefix: + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_200' specgram_type: fbank feat_dim: 80 delta_delta: False @@ -35,6 +29,12 @@ collator: target_dB: -20 dither: 1.0 keep_transcription_text: False + batch_size: 4 + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 #2 + raw_wav: True # use raw_wav or kaldi feature + # network architecture model: From 089a8ed602721acf43c676b37249987ebd8bfa3b Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 18 Jun 2021 09:47:53 +0000 Subject: [PATCH 019/281] fix deepspeech2/model.py and deepspeech2/config.py --- deepspeech/exps/deepspeech2/config.py | 76 ++++----------------------- deepspeech/exps/deepspeech2/model.py | 39 ++++++++++++++ 2 files changed, 50 insertions(+), 65 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 050a50b00..7d2250fc7 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -11,80 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
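# A minimal sketch (not part of this patch; names and values are made up) of
# the ``params()`` composition the commit below introduces: each component
# class owns its defaults, and config.py only assembles them.
from typing import Optional

from yacs.config import CfgNode


class ToyTrainer:
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(dict(lr=5e-4, n_epoch=50))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default


_C = CfgNode()
_C.training = ToyTrainer.params()  # defaults live beside the code that reads them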
-from yacs.config import CfgNode as CN +from yacs.config import CfgNode from deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.io.dataset import ManifestDataset +from deepspeech.io.collator import SpeechCollator +from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester -_C = CN() -_C.data = CN( - dict( - train_manifest="", - dev_manifest="", - test_manifest="", - max_duration=float('inf'), - min_duration=0.0, - )) -_C.collator =CN( - dict( - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", - random_seed=0, - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' - )) +_C = CfgNode() -_C.model = CN( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) +_C.data = ManifestDataset.params() +_C.collator = SpeechCollator.params() -DeepSpeech2Model.params(_C.model) +_C.model = DeepSpeech2Model.params() -_C.training = CN( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) +_C.training = DeepSpeech2Trainer.params() -_C.decoding = CN( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. 
- batch_size=128, # decoding batch size - )) +_C.decoding = DeepSpeech2Tester.params() def get_cfg_defaults(): diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 1eefc871b..c11d1e259 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -34,10 +34,28 @@ from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils.log import Log +from typing import Optional +from yacs.config import CfgNode logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + # training config + default = CfgNode( + dict( + lr=5e-4, # learning rate + lr_decay=1.0, # learning rate decay + weight_decay=1e-6, # the coeff of weight decay + global_grad_clip=5.0, # the global norm clip + n_epoch=50, # train epochs + )) + + if config is not None: + config.merge_from_other_cfg(default) + return default + def __init__(self, config, args): super().__init__(config, args) @@ -184,6 +202,27 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + # testing config + default = CfgNode( + dict( + alpha=2.5, # Coef of LM for beam search. + beta=0.3, # Coef of WC for beam search. + cutoff_prob=1.0, # Cutoff probability for pruning. + cutoff_top_n=40, # Cutoff number for pruning. + lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. + decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy + error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' + num_proc_bsearch=8, # # of CPUs for beam search. + beam_size=500, # Beam search width. + batch_size=128, # decoding batch size + )) + + if config is not None: + config.merge_from_other_cfg(default) + return default + def __init__(self, config, args): super().__init__(config, args) From 3a743f3717f692ff9cdbbcb24244fbc8ae5ce93b Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 18 Jun 2021 10:09:35 +0000 Subject: [PATCH 020/281] fix pre-commit --- deepspeech/exps/deepspeech2/config.py | 9 +-- deepspeech/exps/deepspeech2/model.py | 58 +++++++------- deepspeech/exps/u2/config.py | 2 +- deepspeech/exps/u2/model.py | 19 +++-- deepspeech/io/collator.py | 108 +++++++++++++------------- deepspeech/io/dataset.py | 16 +--- deepspeech/models/u2.py | 1 - 7 files changed, 108 insertions(+), 105 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 7d2250fc7..2f0f5c24b 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -13,12 +13,11 @@ # limitations under the License. 
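# Sketch of the intended call pattern for the composed config in this file,
# under the assumption of a user-supplied yaml (the path is hypothetical;
# ``merge_from_file`` and ``freeze`` are standard yacs CfgNode methods, and
# ``get_cfg_defaults()`` is this module's function shown above):
config = get_cfg_defaults()                      # defaults from the params() calls
config.merge_from_file("conf/deepspeech2.yaml")  # user values override defaults
config.freeze()                                  # guard against accidental mutation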
from yacs.config import CfgNode -from deepspeech.models.deepspeech2 import DeepSpeech2Model -from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.collator import SpeechCollator -from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester - +from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.dataset import ManifestDataset +from deepspeech.models.deepspeech2 import DeepSpeech2Model _C = CfgNode() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index c11d1e259..deb8752b7 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -15,11 +15,13 @@ import time from collections import defaultdict from pathlib import Path +from typing import Optional import numpy as np import paddle from paddle import distributed as dist from paddle.io import DataLoader +from yacs.config import CfgNode from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset @@ -33,9 +35,6 @@ from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils.log import Log - -from typing import Optional -from yacs.config import CfgNode logger = Log(__name__).getlog() @@ -44,13 +43,13 @@ class DeepSpeech2Trainer(Trainer): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: # training config default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) + dict( + lr=5e-4, # learning rate + lr_decay=1.0, # learning rate decay + weight_decay=1e-6, # the coeff of weight decay + global_grad_clip=5.0, # the global norm clip + n_epoch=50, # train epochs + )) if config is not None: config.merge_from_other_cfg(default) @@ -184,7 +183,6 @@ class DeepSpeech2Trainer(Trainer): collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) self.train_loader = DataLoader( @@ -206,18 +204,18 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: # testing config default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) + dict( + alpha=2.5, # Coef of LM for beam search. + beta=0.3, # Coef of WC for beam search. + cutoff_prob=1.0, # Cutoff probability for pruning. + cutoff_top_n=40, # Cutoff number for pruning. + lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. + decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy + error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' + num_proc_bsearch=8, # # of CPUs for beam search. 
+ beam_size=500, # Beam search width. + batch_size=128, # decoding batch size + )) if config is not None: config.merge_from_other_cfg(default) @@ -235,7 +233,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): trans.append(''.join([chr(i) for i in ids])) return trans - def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None): + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): cfg = self.config.decoding errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors @@ -257,7 +261,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): cutoff_top_n=cfg.cutoff_top_n, num_processes=cfg.num_proc_bsearch) - for utt, target, result in zip(utts, target_transcripts, result_transcripts): + for utt, target, result in zip(utts, target_transcripts, + result_transcripts): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref @@ -287,7 +292,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): with open(self.args.result_file, 'w') as fout: for i, batch in enumerate(self.test_loader): utts, audio, audio_len, texts, texts_len = batch - metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout) + metrics = self.compute_metrics(utts, audio, audio_len, texts, + texts_len, fout) errors_sum += metrics['errors_sum'] len_refs += metrics['len_refs'] num_ins += metrics['num_ins'] diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index d8735453c..4ec7bd190 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -15,9 +15,9 @@ from yacs.config import CfgNode from deepspeech.exps.u2.model import U2Tester from deepspeech.exps.u2.model import U2Trainer +from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.u2 import U2Model -from deepspeech.io.collator import SpeechCollator _C = CfgNode() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 836afa361..055518755 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -78,7 +78,8 @@ class U2Trainer(Trainer): start = time.time() utt, audio, audio_len, text, text_len = batch_data - loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len) + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) # loss div by `batch_size * accum_grad` loss /= train_conf.accum_grad loss.backward() @@ -121,7 +122,8 @@ class U2Trainer(Trainer): total_loss = 0.0 for i, batch in enumerate(self.valid_loader): utt, audio, audio_len, text, text_len = batch - loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len) + loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, + text_len) if paddle.isfinite(loss): num_utts = batch[1].shape[0] num_seen_utts += num_utts @@ -221,7 +223,7 @@ class U2Trainer(Trainer): dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - + config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) @@ -372,7 +374,13 @@ class U2Tester(U2Trainer): trans.append(''.join([chr(i) for i in ids])) return trans - def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None): + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): cfg = self.config.decoding errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = 
error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors @@ -399,7 +407,8 @@ class U2Tester(U2Trainer): simulate_streaming=cfg.simulate_streaming) decode_time = time.time() - start_time - for utt, target, result in zip(utts, target_transcripts, result_transcripts): + for utt, target, result in zip(utts, target_transcripts, + result_transcripts): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index ab1e91652..ecf7024c1 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -11,21 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import io +import time +from collections import namedtuple +from typing import Optional + import numpy as np +from yacs.config import CfgNode -from deepspeech.frontend.utility import IGNORE_ID -from deepspeech.io.utility import pad_sequence -from deepspeech.utils.log import Log from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment -import io -import time -from yacs.config import CfgNode -from typing import Optional - -from collections import namedtuple +from deepspeech.frontend.utility import IGNORE_ID +from deepspeech.io.utility import pad_sequence +from deepspeech.utils.log import Log __all__ = ["SpeechCollator"] @@ -34,6 +34,7 @@ logger = Log(__name__).getlog() # namedtupe need global for pickle. TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + class SpeechCollator(): @classmethod def params(cls, config: Optional[CfgNode]=None) -> CfgNode: @@ -56,8 +57,7 @@ class SpeechCollator(): use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=False - )) + keep_transcription_text=False)) if config is not None: config.merge_from_other_cfg(default) @@ -84,7 +84,9 @@ class SpeechCollator(): if isinstance(config.collator.augmentation_config, (str, bytes)): if config.collator.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, mode='r', encoding='utf8') + config.collator.augmentation_config, + mode='r', + encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: @@ -92,43 +94,46 @@ class SpeechCollator(): assert isinstance(aug_file, io.StringIO) speech_collator = cls( - aug_file=aug_file, - random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - specgram_type=config.collator.specgram_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text - ) + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + 
vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text) return speech_collator - def __init__(self, aug_file, mean_std_filepath, - vocab_filepath, spm_model_prefix, - random_seed=0, - unit_type="char", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, - keep_transcription_text=True): + def __init__( + self, + aug_file, + mean_std_filepath, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): """SpeechCollator Collator Args: @@ -159,9 +164,8 @@ class SpeechCollator(): self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=aug_file.read(), - random_seed=random_seed) - + augmentation_config=aug_file.read(), random_seed=random_seed) + self._normalizer = FeatureNormalizer( mean_std_filepath) if mean_std_filepath else None @@ -290,8 +294,6 @@ class SpeechCollator(): text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens - - @property def manifest(self): return self._manifest @@ -318,4 +320,4 @@ class SpeechCollator(): @property def stride_ms(self): - return self._speech_featurizer.stride_ms \ No newline at end of file + return self._speech_featurizer.stride_ms diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 70383b4da..92c60f35c 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -12,19 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
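# Illustrative minimal config for ManifestDataset.from_config() after this
# refactor: the data section carries only the manifest path plus the
# length/ratio filters (values below mirror the tiny example configs):
from yacs.config import CfgNode

cfg = CfgNode()
cfg.data = CfgNode(
    dict(
        manifest="data/manifest.tiny",
        max_input_len=27.0,  # seconds of raw audio
        min_input_len=0.0,
        max_output_len=400.0,  # output tokens
        min_output_len=0.0,
        max_output_input_ratio=10.0,
        min_output_input_ratio=0.05, ))
# dataset = ManifestDataset.from_config(cfg)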
import io -import tarfile -import time -from collections import namedtuple from typing import Optional -import numpy as np from paddle.io import Dataset from yacs.config import CfgNode -from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline -from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer -from deepspeech.frontend.normalizer import FeatureNormalizer -from deepspeech.frontend.speech import SpeechSegment from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log @@ -46,8 +38,7 @@ class ManifestDataset(Dataset): max_output_len=float('inf'), min_output_len=0.0, max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, - )) + min_output_input_ratio=0.0, )) if config is not None: config.merge_from_other_cfg(default) @@ -66,7 +57,6 @@ class ManifestDataset(Dataset): assert 'manifest' in config.data assert config.data.manifest - dataset = cls( manifest_path=config.data.manifest, max_input_len=config.data.max_input_len, @@ -74,8 +64,7 @@ class ManifestDataset(Dataset): max_output_len=config.data.max_output_len, min_output_len=config.data.min_output_len, max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, - ) + min_output_input_ratio=config.data.min_output_input_ratio, ) return dataset def __init__(self, @@ -111,7 +100,6 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - def __len__(self): return len(self._manifest) diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index bcfddaef0..238e2d35c 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -905,7 +905,6 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) - def forward(self, feats, feats_lengths, From 3652b87f33877d4b64b75398f9f99c34b1e5b02e Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 18 Jun 2021 10:11:17 +0000 Subject: [PATCH 021/281] fix --- deepspeech/io/collator.py | 1 - deepspeech/io/dataset.py | 1 - 2 files changed, 2 deletions(-) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index ecf7024c1..1061f97cf 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import io -import time from collections import namedtuple from typing import Optional diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 92c60f35c..3fc4e9887 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
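# Background for the module-level TarLocalData namedtuple in collator.py above
# (the "namedtupe need global for pickle" note): multiprocessing DataLoader
# workers pickle the collate_fn, and pickle resolves classes by import path,
# so a function-local namedtuple cannot be serialized. A small repro sketch:
import pickle
from collections import namedtuple

TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])

def _local_namedtuple():
    Local = namedtuple('Local', ['tar2info'])  # defined inside a function
    return Local(tar2info={})

pickle.dumps(TarLocalData(tar2info={}, tar2object={}))  # fine: module scope
# pickle.dumps(_local_namedtuple())  # raises PicklingError: not importable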
-import io from typing import Optional from paddle.io import Dataset From 8c1bf1a730de9bd6a2a0d8393fd99be4bb8b9657 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 21 Jun 2021 03:10:14 +0000 Subject: [PATCH 022/281] fix ds2 conf for new data pipeline --- examples/aishell/s0/conf/deepspeech2.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 54ce240e7..8cc4c4c9c 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -3,8 +3,6 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - mean_std_filepath: data/mean_std.json - vocab_filepath: data/vocab.txt min_input_len: 0.0 max_input_len: 27.0 # second min_output_len: 0.0 @@ -14,6 +12,9 @@ data: collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: From aa78205293a9314d16fd0e5c39561dfd1ad925e7 Mon Sep 17 00:00:00 2001 From: zhangyinhui Date: Mon, 21 Jun 2021 16:18:32 +0800 Subject: [PATCH 023/281] Add compilation framework --- speechnn/CMakeLists.txt | 77 ++++++++++++++++++++++++++++ speechnn/core/decoder/CMakeLists.txt | 2 + 2 files changed, 79 insertions(+) diff --git a/speechnn/CMakeLists.txt b/speechnn/CMakeLists.txt index e69de29bb..878374bab 100644 --- a/speechnn/CMakeLists.txt +++ b/speechnn/CMakeLists.txt @@ -0,0 +1,77 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(deepspeech VERSION 0.1) + +set(CMAKE_VERBOSE_MAKEFILE on) +# set std-14 +set(CMAKE_CXX_STANDARD 14) + +# include file +include(FetchContent) +include(ExternalProject) +# fc_patch dir +set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + + +############################################################################### +# Option Configurations +############################################################################### +# option configurations +option(TEST_DEBUG "option for debug" OFF) + + +############################################################################### +# Include third party +############################################################################### +# #example for include third party +# FetchContent_Declare() +# # FetchContent_MakeAvailable was not added until CMake 3.14 +# FetchContent_MakeAvailable() +# include_directories() + +# ABSEIL-CPP +include(FetchContent) +FetchContent_Declare( + absl + GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" + GIT_TAG "20210324.1" +) +FetchContent_MakeAvailable(absl) + +# libsndfile +include(FetchContent) +FetchContent_Declare( + libsndfile + GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git" + GIT_TAG "1.0.31" +) +FetchContent_MakeAvailable(libsndfile) + + +############################################################################### +# Add local library +############################################################################### +# system lib +find_package() +# if dir have CmakeLists.txt +add_subdirectory() +# if dir do not have CmakeLists.txt +add_library(lib_name STATIC file.cc) +target_link_libraries(lib_name item0 item1) +add_dependencies(lib_name depend-target) + + +############################################################################### +# Library installation 
+############################################################################### +install() + + +############################################################################### +# Build binary file +############################################################################### +add_executable() +target_link_libraries() + diff --git a/speechnn/core/decoder/CMakeLists.txt b/speechnn/core/decoder/CMakeLists.txt index e69de29bb..259261bdf 100644 --- a/speechnn/core/decoder/CMakeLists.txt +++ b/speechnn/core/decoder/CMakeLists.txt @@ -0,0 +1,2 @@ +aux_source_directory(. DIR_LIB_SRCS) +add_library(decoder STATIC ${DIR_LIB_SRCS}) From 5a3a9e1f5055260f966d24680d5bb2e83f1d5b54 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 22 Jun 2021 02:58:58 +0000 Subject: [PATCH 024/281] fix chunk default config; tarball ckpt prfix dir; --- examples/aishell/s1/README.md | 10 ++++++++++ examples/aishell/s1/conf/chunk_conformer.yaml | 2 +- utils/tarball.sh | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 2048c4d58..c306f8aa1 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -9,6 +9,16 @@ | conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | | conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | +## Chunk Conformer + +| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | +| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | +| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | +| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | + + ## Transformer | Model | Config | Augmentation| Test set | Decode method | Loss | WER | diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 904624c3c..e626e1064 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -78,7 +78,7 @@ model: training: - n_epoch: 180 + n_epoch: 240 accum_grad: 4 global_grad_clip: 5.0 optim: adam diff --git a/utils/tarball.sh b/utils/tarball.sh index 100b4719e..224b740cd 100755 --- a/utils/tarball.sh +++ b/utils/tarball.sh @@ -18,7 +18,8 @@ function clean() { } trap clean EXIT -cp ${ckpt_prefix}.* ${output} +# ckpt_prfix.{json,...} and ckpt_prfix dir +cp -r ${ckpt_prefix}* ${output} cp ${model_config} ${mean_std} ${vocab} ${output} tar zcvf release.tar.gz ${output} From 68149cb9a7d39c14e95ada2979a4b7200eaf4902 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 22 Jun 2021 03:25:26 +0000 Subject: [PATCH 025/281] fix config for new datapipeline --- examples/aishell/s1/README.md | 2 +- examples/aishell/s1/conf/chunk_conformer.yaml | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index c306f8aa1..601b0a8d0 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -12,7 +12,7 @@ ## Chunk Conformer | Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | +| --- | --- | --- | --- | --- | --- | 
--- | --- |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index e626e1064..0e5b8699f 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -3,17 +3,20 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
-  batch_size: 32
   min_input_len: 0.5
   max_input_len: 20.0 # second
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
+
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/augmentation.json
+  batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
@@ -30,7 +33,7 @@
   keep_transcription_text: False
   sortagrad: True
   shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2


# network architecture

From 1b84f21ccfda2794e323a69a163411ab15c17288 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 22 Jun 2021 06:27:19 +0000
Subject: [PATCH 026/281] fix mismatch

---
 utils/tarball.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/utils/tarball.sh b/utils/tarball.sh
index 224b740cd..a4611c75b 100755
--- a/utils/tarball.sh
+++ b/utils/tarball.sh
@@ -18,8 +18,11 @@ function clean() {
 }
 trap clean EXIT

-# ckpt_prfix.{json,...} and ckpt_prfix dir
-cp -r ${ckpt_prefix}* ${output}
+# ckpt_prefix dir
+cp -r ${ckpt_prefix} ${output}
+# ckpt_prefix.{json,...}
+cp ${ckpt_prefix}.* ${output}
+# model config, mean std, vocab
 cp ${model_config} ${mean_std} ${vocab} ${output}

 tar zcvf release.tar.gz ${output}

From 3c6eea077b2b077b9c3f5cc7baf339c545053d35 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 22 Jun 2021 07:27:43 +0000
Subject: [PATCH 027/281] cp dir when it exists

---
 utils/tarball.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/tarball.sh b/utils/tarball.sh
index a4611c75b..5f7c21a34 100755
--- a/utils/tarball.sh
+++ b/utils/tarball.sh
@@ -19,7 +19,9 @@ function clean() {
 trap clean EXIT

 # ckpt_prefix dir
-cp -r ${ckpt_prefix} ${output}
+if [ -d ${ckpt_prefix} ];then
+    cp -r ${ckpt_prefix} ${output}
+fi
 # ckpt_prefix.{json,...}
 cp ${ckpt_prefix}.* ${output}
 # model config, mean std, vocab
 cp ${model_config} ${mean_std} ${vocab} ${output}

 tar zcvf release.tar.gz ${output}

From 68bcc4694055584e25844004379634a7e1f8b769 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Tue, 22 Jun 2021 07:46:50 +0000
Subject: [PATCH 028/281] save best and test on tiny/s0

---
 deepspeech/training/trainer.py         |  14 +-
 deepspeech/utils/checkpoint.py         | 336 ++++++++++++++++---
 examples/tiny/s0/conf/deepspeech2.yaml |   5 +-
 3 files changed, 230 insertions(+), 125 deletions(-)

diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 56de32617..246175e3f 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -18,7 +18,7 @@ import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter

-from deepspeech.utils import checkpoint
+from deepspeech.utils.checkpoint import
KBestCheckpoint from deepspeech.utils import mp_tools from deepspeech.utils.log import Log @@ -139,9 +139,12 @@ class Trainer(): "epoch": self.epoch, "lr": self.optimizer.get_lr() }) - checkpoint.save_parameters(self.checkpoint_dir, self.iteration + self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration if tag is None else tag, self.model, self.optimizer, infos) + # checkpoint.save_parameters(self.checkpoint_dir, self.iteration + # if tag is None else tag, self.model, + # self.optimizer, infos) def resume_or_scratch(self): """Resume from latest checkpoint at checkpoints in the output @@ -151,7 +154,7 @@ class Trainer(): resume training. """ scratch = None - infos = checkpoint.load_parameters( + infos = self.checkpoint.load_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, @@ -180,7 +183,7 @@ class Trainer(): from_scratch = self.resume_or_scratch() if from_scratch: # save init model, i.e. 0 epoch - self.save(tag='init') + self.save(tag='init', infos=None) self.lr_scheduler.step(self.iteration) if self.parallel: @@ -263,6 +266,9 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir + self.checkpoint = KBestCheckpoint(max_size=self.config.training.max_epoch, + last_size=self.config.training.last_epoch) + @mp_tools.rank_zero_only def destory(self): """Close visualizer to avoid hanging after training""" diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 8ede6b8fd..ef73eb705 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -23,130 +23,226 @@ from paddle.optimizer import Optimizer from deepspeech.utils import mp_tools from deepspeech.utils.log import Log +import glob + logger = Log(__name__).getlog() __all__ = ["load_parameters", "save_parameters"] +class KBestCheckpoint(object): + def __init__(self, + max_size: int=5, + last_size: int=1): + self.best_records: Mapping[Path, float] = {} + self.last_records = [] + self.max_size = max_size + self.last_size = last_size + self._save_all = (max_size == -1) + + def should_save_best(self, metric: float) -> bool: + if not self.best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def best_full(self): + return (not self._save_all) and len(self.best_records) == self.max_size + + def last_full(self): + return len(self.last_records) == self.last_size + + def add_checkpoint(self, + checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + if("val_loss" not in infos.keys()): + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + return + + #save best + if self.should_save_best(infos["val_loss"]): + self.save_checkpoint_and_update(infos["val_loss"], + checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + #save last + self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + + if isinstance(tag_or_iteration, int): + self._save_record(checkpoint_dir, tag_or_iteration) + + def save_checkpoint_and_update(self, metric, + checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + # remove the worst + if self.best_full(): + worst_record_path = max(self.best_records, + key=self.best_records.get) + self.best_records.pop(worst_record_path) + if(worst_record_path not in self.last_records): + print('----to remove (best)----') + print(worst_record_path) + self.del_checkpoint(checkpoint_dir, worst_record_path) + + # add the new 
one + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + self.best_records[tag_or_iteration] = metric + + def save_last_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + # remove the old + if self.last_full(): + to_del_fn = self.last_records.pop(0) + if(to_del_fn not in self.best_records.keys()): + print('----to remove (last)----') + print(to_del_fn) + self.del_checkpoint(checkpoint_dir, to_del_fn) + self.last_records.append(tag_or_iteration) + + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle: + # for iteration in self.best_records + # handle.write("model_checkpoint_path:{}\n".format(iteration)) + + + def del_checkpoint(self, checkpoint_dir, tag_or_iteration): + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path+".*"): + os.remove(filename) + print("delete file: "+filename) + + + + def _load_latest_checkpoint(self, checkpoint_dir: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + Returns: + int: the latest iteration number. -1 for no checkpoint to load. + """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last") + if not os.path.isfile(checkpoint_record): + return -1 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readlines()[-1].strip() + iteration = int(latest_checkpoint.split(":")[-1]) + return iteration + + + def _save_record(self, checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpoint record. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + Returns: + None + """ + checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last") + checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") + # Update the latest checkpoint index. + # with open(checkpoint_record, "a+") as handle: + # handle.write("model_checkpoint_path:{}\n".format(iteration)) + with open(checkpoint_record_best, "w") as handle: + for i in self.best_records.keys(): + handle.write("model_checkpoint_path:{}\n".format(i)) + with open(checkpoint_record_last, "w") as handle: + for i in self.last_records: + handle.write("model_checkpoint_path:{}\n".format(i)) + + + def load_parameters(self, model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a specific model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + configs = {} + + if checkpoint_path is not None: + tag = os.path.basename(checkpoint_path).split(":")[-1] + elif checkpoint_dir is not None: + iteration = self._load_latest_checkpoint(checkpoint_dir) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" + ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: loaded model from {}".format(rank, params_path)) -def _load_latest_checkpoint(checkpoint_dir: str) -> int: - """Get the iteration number corresponding to the latest saved checkpoint. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - Returns: - int: the latest iteration number. -1 for no checkpoint to load. - """ - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") - if not os.path.isfile(checkpoint_record): - return -1 - - # Fetch the latest checkpoint index. - with open(checkpoint_record, "rt") as handle: - latest_checkpoint = handle.readlines()[-1].strip() - iteration = int(latest_checkpoint.split(":")[-1]) - return iteration - - -def _save_record(checkpoint_dir: str, iteration: int): - """Save the iteration number of the latest model to be checkpoint record. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. - Returns: - None - """ - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") - # Update the latest checkpoint index. - with open(checkpoint_record, "a+") as handle: - handle.write("model_checkpoint_path:{}\n".format(iteration)) - - -def load_parameters(model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): - """Load a specific model checkpoint from disk. - Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. - Returns: - configs (dict): epoch or step, lr and other meta info should be saved. - """ - configs = {} - - if checkpoint_path is not None: - tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None: - iteration = _load_latest_checkpoint(checkpoint_dir) - if iteration == -1: - return configs - checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) - else: - raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" 
- ) - - rank = dist.get_rank() - - params_path = checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - model.set_state_dict(model_dict) - logger.info("Rank {}: loaded model from {}".format(rank, params_path)) - - optimizer_path = checkpoint_path + ".pdopt" - if optimizer and os.path.isfile(optimizer_path): - optimizer_dict = paddle.load(optimizer_path) - optimizer.set_state_dict(optimizer_dict) - logger.info("Rank {}: loaded optimizer state from {}".format( - rank, optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = json.load(fin) - return configs - - -@mp_tools.rank_zero_only -def save_parameters(checkpoint_dir: str, - tag_or_iteration: Union[int, str], - model: paddle.nn.Layer, - optimizer: Optimizer=None, - infos: dict=None): - """Checkpoint the latest trained model parameters. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - tag_or_iteration (int or str): the latest iteration(step or epoch) number. - model (Layer): model to be checkpointed. - optimizer (Optimizer, optional): optimizer to be checkpointed. - Defaults to None. - infos (dict or None): any info you want to save. - Returns: - None - """ - checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(tag_or_iteration)) - - model_dict = model.state_dict() - params_path = checkpoint_path + ".pdparams" - paddle.save(model_dict, params_path) - logger.info("Saved model to {}".format(params_path)) - - if optimizer: - opt_dict = optimizer.state_dict() optimizer_path = checkpoint_path + ".pdopt" - paddle.save(opt_dict, optimizer_path) - logger.info("Saved optimzier state to {}".format(optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - infos = {} if infos is None else infos - with open(info_path, 'w') as fout: - data = json.dumps(infos) - fout.write(data) + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: loaded optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs + + + @mp_tools.rank_zero_only + def save_parameters(self, checkpoint_dir: str, + tag_or_iteration: Union[int, str], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None): + """Checkpoint the latest trained model parameters. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + tag_or_iteration (int or str): the latest iteration(step or epoch) number. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + Defaults to None. + infos (dict or None): any info you want to save. 
+ Returns: + None + """ + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + + model_dict = model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + logger.info("Saved model to {}".format(params_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + logger.info("Saved optimzier state to {}".format(optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + infos = {} if infos is None else infos + with open(info_path, 'w') as fout: + data = json.dumps(infos) + fout.write(data) - if isinstance(tag_or_iteration, int): - _save_record(checkpoint_dir, tag_or_iteration) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 6737d1b75..9ff6803d8 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -43,12 +43,15 @@ model: share_rnn_weights: True training: - n_epoch: 24 + n_epoch: 6 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 + max_epoch: 3 + last_epoch: 2 + decoding: batch_size: 128 From 8af2eb073adff6bf7c12c04c1b1c47aa650732f0 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 22 Jun 2021 11:36:27 +0000 Subject: [PATCH 029/281] revise config --- deepspeech/training/trainer.py | 4 ++-- examples/tiny/s0/conf/deepspeech2.yaml | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 246175e3f..6563e7c4d 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -266,8 +266,8 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir - self.checkpoint = KBestCheckpoint(max_size=self.config.training.max_epoch, - last_size=self.config.training.last_epoch) + self.checkpoint = KBestCheckpoint(max_size=self.config.training.checkpoint.kbest_n, + last_size=self.config.training.checkpoint.latest_n) @mp_tools.rank_zero_only def destory(self): diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 9ff6803d8..b9c2556c7 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -49,8 +49,9 @@ training: weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 - max_epoch: 3 - last_epoch: 2 + checkpoint: + kbest_n: 3 + latest_n: 2 decoding: From fd7c1b70cd0e52affb4d871537e03f2daae3733d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 23 Jun 2021 02:35:46 +0000 Subject: [PATCH 030/281] using venv python --- tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index c129bf5a2..dd5902373 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -31,5 +31,5 @@ sox.done: soxbindings.done: test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git - source venv/bin/activate; cd soxbindings && python3 setup.py install - touch soxbindings.done + source venv/bin/activate; cd soxbindings && python setup.py install + touch soxbindings.done \ No newline at end of file From 133a522fbb7e5ec6503cdcc570d4203699b51584 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 23 Jun 2021 02:56:44 +0000 Subject: [PATCH 031/281] ds2 default using 4gpu; new result of ds2 --- examples/aishell/s0/README.md | 9 +++++---- examples/aishell/s0/run.sh | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/aishell/s0/README.md 
b/examples/aishell/s0/README.md index 8c1a51b62..ae3fb401a 100644 --- a/examples/aishell/s0/README.md +++ b/examples/aishell/s0/README.md @@ -4,7 +4,8 @@ | Model | release | Config | Test set | Loss | CER | | --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | -| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | -| DeepSpeech2 | 1.8.5 | - | test | - | 0.080447 | +| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | +| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | +| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | +| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | +| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 | diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 4073c81b9..05829136a 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -2,7 +2,7 @@ set -e source path.sh -gpus=0 +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml From d55e6b5a0a58212c13169ba1d3297f9431b62b6b Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 03:16:49 +0000 Subject: [PATCH 032/281] revise from_pretrained function --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 10 +++++++++- deepspeech/exps/deepspeech2/bin/deploy/server.py | 10 +++++++++- deepspeech/exps/deepspeech2/bin/tune.py | 2 +- deepspeech/exps/deepspeech2/model.py | 2 +- deepspeech/exps/u2/model.py | 2 +- deepspeech/models/deepspeech2.py | 8 ++++---- deepspeech/models/u2.py | 8 ++++---- 7 files changed, 29 insertions(+), 13 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index f3125e04d..0ec36b5dd 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -29,6 +29,9 @@ from deepspeech.utils.socket_server import warm_up_test from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments +from paddle.io import DataLoader +from deepspeech.io.collator import SpeechCollator + def init_predictor(args): if args.model_dir is not None: @@ -83,7 +86,12 @@ def start_server(config, args): config.data.keep_transcription_text = True dataset = ManifestDataset.from_config(config) - model = DeepSpeech2Model.from_pretrained(dataset, config, + config.collator.batch_size=1 + config.collator.num_workers=0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index b2ff37e06..40ba4c725 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -28,6 +28,9 @@ from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments +from paddle.io import DataLoader +from deepspeech.io.collator import SpeechCollator + def start_server(config, args): """Start the ASR server""" config.defrost() @@ -36,7 +39,12 @@ def start_server(config, args): 
config.data.keep_transcription_text = True dataset = ManifestDataset.from_config(config) - model = DeepSpeech2Model.from_pretrained(dataset, config, + config.collator.batch_size=1 + config.collator.num_workers=0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py index 02e329a11..f10dc27ce 100644 --- a/deepspeech/exps/deepspeech2/bin/tune.py +++ b/deepspeech/exps/deepspeech2/bin/tune.py @@ -47,7 +47,7 @@ def tune(config, args): drop_last=False, collate_fn=SpeechCollator(keep_transcription_text=True)) - model = DeepSpeech2Model.from_pretrained(dev_dataset, config, + model = DeepSpeech2Model.from_pretrained(valid_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index deb8752b7..209e8b023 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def export(self): infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader.dataset, self.config, self.args.checkpoint_path) + self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 055518755..308569cd7 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -506,7 +506,7 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. """ from deepspeech.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, + infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py index 0ff5514de..d2c03a18e 100644 --- a/deepspeech/models/deepspeech2.py +++ b/deepspeech/models/deepspeech2.py @@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer): cutoff_top_n, num_processes) @classmethod - def from_pretrained(cls, dataset, config, checkpoint_path): + def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. Parameters ---------- - dataset: paddle.io.Dataset + dataloader: paddle.io.DataLoader config: yacs.config.CfgNode model configs @@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer): DeepSpeech2Model The model built from pretrained result. """ - model = cls(feat_size=dataset.feature_size, - dict_size=dataset.vocab_size, + model = cls(feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index 238e2d35c..23ae3423d 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -876,11 +876,11 @@ class U2Model(U2BaseModel): return model @classmethod - def from_pretrained(cls, dataset, config, checkpoint_path): + def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. 
Args: - dataset (paddle.io.Dataset): not used. + dataloader (paddle.io.DataLoader): not used. config (yacs.config.CfgNode): model configs checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name @@ -888,8 +888,8 @@ class U2Model(U2BaseModel): DeepSpeech2Model: The model built from pretrained result. """ config.defrost() - config.input_dim = dataset.feature_size - config.output_dim = dataset.vocab_size + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size config.freeze() model = cls.from_config(config) From c753b9ddf2b321caf873187bd7a498fb61d4bf0a Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 09:05:34 +0000 Subject: [PATCH 033/281] fix runtime.py and server.py --- .../exps/deepspeech2/bin/deploy/runtime.py | 20 ++++++++--------- .../exps/deepspeech2/bin/deploy/server.py | 22 +++++++++++-------- deepspeech/io/collator.py | 3 ++- deepspeech/utils/socket_server.py | 4 ++-- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index 0ec36b5dd..26365820f 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -81,15 +81,15 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manfiest = config.data.test_manifest - config.data.augmentation_config = "" - config.data.keep_transcription_text = True + config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) - + + config.collator.augmentation_config = "" + config.collator.keep_transcription_text = True config.collator.batch_size=1 config.collator.num_workers=0 collate_fn = SpeechCollator.from_config(config) - test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) @@ -97,15 +97,15 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = dataset.process_utterance(filename, "") - audio = np.array([feature[0]]).astype('float32') #[1, D, T] - audio_len = feature[0].shape[1] + feature = collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + audio_len = feature[0].shape[0] audio_len = np.array([audio_len]).astype('int64') # [1] result_transcript = model.decode( paddle.to_tensor(audio), paddle.to_tensor(audio_len), - vocab_list=dataset.vocab_list, + vocab_list=test_loader.collate_fn.vocab_list, decoding_method=config.decoding.decoding_method, lang_model_path=config.decoding.lang_model_path, beam_alpha=config.decoding.alpha, @@ -146,7 +146,7 @@ if __name__ == "__main__": add_arg('host_ip', str, 'localhost', "Server's IP address.") - add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('host_port', int, 8089, "Server's IP port.") add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index 40ba4c725..73a3fc17f 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -34,15 +34,15 @@ from deepspeech.io.collator import SpeechCollator def start_server(config, 
args): """Start the ASR server""" config.defrost() - config.data.manfiest = config.data.test_manifest - config.data.augmentation_config = "" - config.data.keep_transcription_text = True + config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) + config.collator.augmentation_config = "" + config.collator.keep_transcription_text = True config.collator.batch_size=1 config.collator.num_workers=0 collate_fn = SpeechCollator.from_config(config) - test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) @@ -50,15 +50,19 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = dataset.process_utterance(filename, "") - audio = np.array([feature[0]]).astype('float32') #[1, D, T] - audio_len = feature[0].shape[1] + feature = test_loader.collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + # audio = audio.swapaxes(1,2) + print('---file_to_transcript feature----') + print(audio.shape) + audio_len = feature[0].shape[0] + print(audio_len) audio_len = np.array([audio_len]).astype('int64') # [1] result_transcript = model.decode( paddle.to_tensor(audio), paddle.to_tensor(audio_len), - vocab_list=dataset.vocab_list, + vocab_list=test_loader.collate_fn.vocab_list, decoding_method=config.decoding.decoding_method, lang_model_path=config.decoding.lang_model_path, beam_alpha=config.decoding.alpha, @@ -99,7 +103,7 @@ if __name__ == "__main__": add_arg('host_ip', str, 'localhost', "Server's IP address.") - add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('host_port', int, 8088, "Server's IP port.") add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 1061f97cf..94264d6f5 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -242,6 +242,7 @@ class SpeechCollator(): # specgram augment specgram = self._augmentation_pipeline.transform_feature(specgram) + specgram=specgram.transpose([1,0]) return specgram, transcript_part def __call__(self, batch): @@ -269,7 +270,7 @@ class SpeechCollator(): #utt utts.append(utt) # audio - audios.append(audio.T) # [T, D] + audios.append(audio) # [T, D] audio_lens.append(audio.shape[1]) # text # for training, text is token ids diff --git a/deepspeech/utils/socket_server.py b/deepspeech/utils/socket_server.py index adcbf3bb2..8fd7c2fa2 100644 --- a/deepspeech/utils/socket_server.py +++ b/deepspeech/utils/socket_server.py @@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler, rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): - print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) + print("Warm-up Test Case %d: %s"%(idx, sample['feat'])) start_time = time.time() - transcript = audio_process_handler(sample['audio_filepath']) + transcript = audio_process_handler(sample['feat']) finish_time = time.time() print("Response Time: %f, Transcript: %s" % (finish_time - start_time, transcript)) From 340e622953ba941c3e7ae75bbb59e45018a0d1a5 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 09:14:56 +0000 Subject: [PATCH 034/281] fix runtime and server --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 11 +++++------ 
deepspeech/exps/deepspeech2/bin/deploy/server.py | 9 ++++----- deepspeech/io/collator.py | 2 +- deepspeech/utils/socket_server.py | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index 26365820f..dad8459e3 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -18,8 +18,10 @@ import numpy as np import paddle from paddle.inference import Config from paddle.inference import create_predictor +from paddle.io import DataLoader from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.training.cli import default_argument_parser @@ -29,9 +31,6 @@ from deepspeech.utils.socket_server import warm_up_test from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments -from paddle.io import DataLoader -from deepspeech.io.collator import SpeechCollator - def init_predictor(args): if args.model_dir is not None: @@ -83,11 +82,11 @@ def start_server(config, args): config.defrost() config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) - + config.collator.augmentation_config = "" config.collator.keep_transcription_text = True - config.collator.batch_size=1 - config.collator.num_workers=0 + config.collator.batch_size = 1 + config.collator.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index 73a3fc17f..b473a8fd4 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -16,8 +16,10 @@ import functools import numpy as np import paddle +from paddle.io import DataLoader from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.training.cli import default_argument_parser @@ -28,9 +30,6 @@ from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments -from paddle.io import DataLoader -from deepspeech.io.collator import SpeechCollator - def start_server(config, args): """Start the ASR server""" config.defrost() @@ -39,8 +38,8 @@ def start_server(config, args): config.collator.augmentation_config = "" config.collator.keep_transcription_text = True - config.collator.batch_size=1 - config.collator.num_workers=0 + config.collator.batch_size = 1 + config.collator.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 94264d6f5..305ca9400 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -242,7 +242,7 @@ class SpeechCollator(): # specgram augment specgram = self._augmentation_pipeline.transform_feature(specgram) - specgram=specgram.transpose([1,0]) + specgram = specgram.transpose([1, 0]) return specgram, transcript_part def __call__(self, batch): diff --git a/deepspeech/utils/socket_server.py 
b/deepspeech/utils/socket_server.py index 8fd7c2fa2..45c659f60 100644 --- a/deepspeech/utils/socket_server.py +++ b/deepspeech/utils/socket_server.py @@ -48,7 +48,7 @@ def warm_up_test(audio_process_handler, rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): - print("Warm-up Test Case %d: %s"%(idx, sample['feat'])) + print("Warm-up Test Case %d: %s" % (idx, sample['feat'])) start_time = time.time() transcript = audio_process_handler(sample['feat']) finish_time = time.time() From 4b80b172d3b163b196868965510709a7b96c93ad Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 24 Jun 2021 02:32:34 +0000 Subject: [PATCH 035/281] add model params --- examples/aishell/s0/README.md | 14 +++++++------- examples/aishell/s1/README.md | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md index ae3fb401a..c25888457 100644 --- a/examples/aishell/s0/README.md +++ b/examples/aishell/s0/README.md @@ -2,10 +2,10 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | CER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | -| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | -| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 | +| Model | Params | Release | Config | Test set | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | +| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | +| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | +| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | +| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 601b0a8d0..72a03b618 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -2,21 +2,21 @@ ## Conformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | ## Chunk Conformer -| Model | Config | Augmentation| Test set | 
Decode method | Chunk | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
 
 
 ## Transformer
 
 | Model | Config | Augmentation| Test set | Decode method | Loss | WER |
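The next patch touches every stage of the CTC forced-alignment path. A minimal sketch of how those utilities chain together; `ctc_probs`, `y`, `subsample`, `stride_ms` and `token_dict` are assumed inputs, and the output path is hypothetical:

# Sketch only: ctc_probs is a (T, vocab_size) log-probability tensor, y a 1-D
# tensor of label ids; subsample, stride_ms and token_dict come from the data
# pipeline, as in U2Tester.align() below.
from deepspeech.utils import ctc_utils
from deepspeech.utils import text_grid

alignment = ctc_utils.forced_align(ctc_probs, y, blank_id=0)  # List[int], length T
align_segs = text_grid.segment_alignment(alignment)  # cut on blanks / repeats
tierformat = text_grid.align_to_tierformat(align_segs, subsample, token_dict)
second_per_frame = 1. / (1000. / stride_ms)  # frame stride in seconds
text_grid.generate_textgrid(
    maxtime=(len(alignment) + 1) * subsample * second_per_frame,
    lines=tierformat,
    output="sample.TextGrid")  # hypothetical output path
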
From 90788b116d85c26cf91bcb76544aaf5b2b189734 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 24 Jun 2021 04:05:34 +0000
Subject: [PATCH 036/281] more comment; fix datapipe of align

---
 deepspeech/exps/u2/model.py   | 23 ++++++++++++++---------
 deepspeech/utils/ctc_utils.py | 20 +++++++++++---------
 deepspeech/utils/text_grid.py |  8 +++++---
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index f00d5af63..ba7bc45c8 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -355,7 +355,7 @@ class U2Tester(U2Trainer):
         decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
         # <0: for decoding, use full chunk.
         # >0: for decoding, use fixed chunk size as set.
-        # 0: used for training, it's prohibited here. 
+        # 0: used for training, it's prohibited here.
         num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
         simulate_streaming=False,  # simulate streaming inference. Defaults to False.
     ))
@@ -512,11 +512,13 @@ class U2Tester(U2Trainer):
         self.model.eval()
         logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}")
 
-        stride_ms = self.test_loader.dataset.stride_ms
-        token_dict = self.test_loader.dataset.vocab_list
+        stride_ms = self.test_loader.collate_fn.stride_ms
+        token_dict = self.test_loader.collate_fn.vocab_list
         with open(self.args.result_file, 'w') as fout:
+            # one example in batch
             for i, batch in enumerate(self.test_loader):
                 key, feat, feats_length, target, target_length = batch
+
                 # 1. Encoder
                 encoder_out, encoder_mask = self.model._forward_encoder(
                     feat, feats_length)  # (B, maxlen, encoder_dim)
                 maxlen = encoder_out.size(1)
                 ctc_probs = self.model.ctc.log_softmax(
                     encoder_out)  # (1, maxlen, vocab_size)
 
                 # 2. alignment
                 # print(ctc_probs.size(1))
                 ctc_probs = ctc_probs.squeeze(0)
                 target = target.squeeze(0)
                 alignment = ctc_utils.forced_align(ctc_probs, target)
-                print(alignment)
+                print(key[0], alignment)
                 fout.write('{} {}\n'.format(key[0], alignment))
 
                 # 3. gen praat
                 # segment alignment
                 align_segs = text_grid.segment_alignment(alignment)
-                print(align_segs)
+                print(key[0], align_segs)
                 # IntervalTier, List["start end token\n"]
                 subsample = get_subsample(self.config)
                 tierformat = text_grid.align_to_tierformat(
                     align_segs, subsample, token_dict)
+                # write tier
                 tier_path = os.path.join(
                     os.path.dirname(self.args.result_file), key[0] + ".tier")
                 with open(tier_path, 'w') as f:
                     f.writelines(tierformat)
-
+                # write textgrid
                 textgrid_path = os.path.join(
                     os.path.dirname(self.args.result_file), key[0] + ".TextGrid")
-                second_per_frame = 1. / (1000. / stride_ms
-                                         )  # 25ms window, 10ms stride
+                second_per_frame = 1. / (1000. /
+                                         stride_ms)  # 25ms window, 10ms stride
+                second_per_example = (
+                    len(alignment) + 1) * subsample * second_per_frame
                 text_grid.generate_textgrid(
-                    maxtime=(len(alignment) + 1) * subsample * second_per_frame,
+                    maxtime=second_per_example,
                     lines=tierformat,
                     output=textgrid_path)

diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py
index 76c1898be..6201233df 100644
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -38,8 +38,10 @@ def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]:
     new_hyp: List[int] = []
     cur = 0
     while cur < len(hyp):
+        # add non-blank into new_hyp
         if hyp[cur] != blank_id:
             new_hyp.append(hyp[cur])
+        # skip repeat label
         prev = cur
         while cur < len(hyp) and hyp[cur] == hyp[prev]:
             cur += 1
@@ -52,7 +54,7 @@ def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray:
     "abcdefg" -> "-a-b-c-d-e-f-g-"
 
     Args:
-        label ([np.ndarray]): label ids, (L).
+        label ([np.ndarray]): label ids, List[int], (L).
         blank_id (int, optional): blank id. Defaults to 0.
 
     Returns:
@@ -61,8 +63,8 @@ def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray:
     label = np.expand_dims(label, 1)  #[L, 1]
     blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
     label = np.concatenate([blanks, label], axis=1)  #[L, 2]
-    label = label.reshape(-1)  #[2L]
-    label = np.append(label, label[0])  #[2L + 1]
+    label = label.reshape(-1)  #[2L], -l-l-l
+    label = np.append(label, label[0])  #[2L + 1], -l-l-l-
     return label
 
 
@@ -79,21 +81,21 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     Returns:
         List[int]: best alignment result, (T). 
""" - y_insert_blank = insert_blank(y, blank_id) + y_insert_blank = insert_blank(y, blank_id) #(2L+1) log_alpha = paddle.zeros( (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero state_path = (paddle.zeros( (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1 - ) # state path + ) # state path, Tuple((T, 2L+1)) # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # Sb - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # Snb + log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): + for t in range(1, ctc_probs.size(0)): # T + for s in range(len(y_insert_blank)): # 2L+1 if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ s] == y_insert_blank[s - 2]: candidates = paddle.to_tensor( diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py index 9afed89e0..b774130db 100644 --- a/deepspeech/utils/text_grid.py +++ b/deepspeech/utils/text_grid.py @@ -22,11 +22,13 @@ def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]: """segment ctc alignment ids by continuous blank and repeat label. Args: - alignment (List[int]): ctc alignment id sequence. e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] + alignment (List[int]): ctc alignment id sequence. + e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] blank_id (int, optional): blank id. Defaults to 0. Returns: - List[List[int]]: segment aligment id sequence. e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] + List[List[int]]: token align, segment aligment id sequence. + e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] """ # convert alignment to a praat format, which is a doing phonetics # by computer and helps analyzing alignment @@ -61,7 +63,7 @@ def align_to_tierformat(align_segs: List[List[int]], token_dict (Dict[int, Text]): int -> str map. Returns: - List[Text]: list of textgrid.Interval. + List[Text]: list of textgrid.Interval text, str(start, end, text). 
""" hop_length = 10 # ms second_ms = 1000 # ms From 019ae4b35c2f713e17e69f2f8a8bd6b199642b0e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Jun 2021 03:13:40 +0000 Subject: [PATCH 037/281] fix conf for ds2 --- examples/aishell/s0/conf/deepspeech2.yaml | 3 +-- examples/aishell/s0/run.sh | 4 ++-- examples/librispeech/s0/README.md | 10 +++++----- examples/librispeech/s0/conf/deepspeech2.yaml | 13 +++++++++---- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 8cc4c4c9c..1004fde0e 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -10,8 +10,8 @@ data: min_output_input_ratio: 0.00 max_output_input_ratio: .inf - collator: + batch_size: 64 # one gpu mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt @@ -33,7 +33,6 @@ collator: sortagrad: True shuffle_method: batch_shuffle num_workers: 0 - batch_size: 64 # one gpu model: num_conv_layers: 2 diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 05829136a..c9708dcc9 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -31,10 +31,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/s0/README.md index 393dd4579..dde288bdd 100644 --- a/examples/librispeech/s0/README.md +++ b/examples/librispeech/s0/README.md @@ -2,8 +2,8 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | WER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | -| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | -| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 | +| Model | Params | Release | Config | Test set | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | +| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | +| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 | diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index d1746bff3..b419cbe26 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -3,16 +3,21 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev-clean test_manifest: data/manifest.test-clean - mean_std_filepath: data/mean_std.json - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - batch_size: 20 min_input_len: 0.0 max_input_len: 27.0 # second min_output_len: 0.0 max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf + +collator: + batch_size: 20 + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: 
data/vocab.txt + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: specgram_type: linear target_sample_rate: 16000 max_freq: None From 3965dbc2c33661fda86c1f29b5a5afbeddeb653c Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 05:23:44 +0000 Subject: [PATCH 038/281] runtime.py --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index dad8459e3..01f01b651 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -96,7 +96,7 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = collate_fn.process_utterance(filename, "") + feature = test_loader.collate_fn.process_utterance(filename, "") audio = np.array([feature[0]]).astype('float32') #[1, T, D] audio_len = feature[0].shape[0] audio_len = np.array([audio_len]).astype('int64') # [1] From 91e70a2857c62b7db1db958d9b0528beb2bf0b77 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 09:02:59 +0000 Subject: [PATCH 039/281] multi gpus --- deepspeech/training/trainer.py | 18 ++-- deepspeech/utils/checkpoint.py | 144 ++++++++++++++++--------- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- 3 files changed, 105 insertions(+), 59 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 6563e7c4d..7f68e67cb 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -18,8 +18,8 @@ import paddle from paddle import distributed as dist from tensorboardX import SummaryWriter -from deepspeech.utils.checkpoint import KBestCheckpoint from deepspeech.utils import mp_tools +from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log __all__ = ["Trainer"] @@ -64,7 +64,7 @@ class Trainer(): The parsed command line arguments. Examples -------- - >>> def main_sp(config, args): + >>> def p(config, args): >>> exp = Trainer(config, args) >>> exp.setup() >>> exp.run() @@ -140,11 +140,8 @@ class Trainer(): "lr": self.optimizer.get_lr() }) self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration - if tag is None else tag, self.model, - self.optimizer, infos) - # checkpoint.save_parameters(self.checkpoint_dir, self.iteration - # if tag is None else tag, self.model, - # self.optimizer, infos) + if tag is None else tag, self.model, + self.optimizer, infos) def resume_or_scratch(self): """Resume from latest checkpoint at checkpoints in the output @@ -154,7 +151,7 @@ class Trainer(): resume training. 
""" scratch = None - infos = self.checkpoint.load_parameters( + infos = self.checkpoint.load_last_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, @@ -266,8 +263,9 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir - self.checkpoint = KBestCheckpoint(max_size=self.config.training.checkpoint.kbest_n, - last_size=self.config.training.checkpoint.latest_n) + self.checkpoint = Checkpoint( + kbest_n=self.config.training.checkpoint.kbest_n, + latest_n=self.config.training.checkpoint.latest_n) @mp_tools.rank_zero_only def destory(self): diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index ef73eb705..52eccb673 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -24,20 +24,22 @@ from deepspeech.utils import mp_tools from deepspeech.utils.log import Log import glob +# import operator +from pathlib import Path logger = Log(__name__).getlog() -__all__ = ["load_parameters", "save_parameters"] +__all__ = ["Checkpoint"] -class KBestCheckpoint(object): +class Checkpoint(object): def __init__(self, - max_size: int=5, - last_size: int=1): + kbest_n: int=5, + latest_n: int=1): self.best_records: Mapping[Path, float] = {} - self.last_records = [] - self.max_size = max_size - self.last_size = last_size - self._save_all = (max_size == -1) + self.latest_records = [] + self.kbest_n = kbest_n + self.latest_n = latest_n + self._save_all = (kbest_n == -1) def should_save_best(self, metric: float) -> bool: if not self.best_full(): @@ -45,36 +47,36 @@ class KBestCheckpoint(object): # already full worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] worst_metric = self.best_records[worst_record_path] return metric < worst_metric def best_full(self): - return (not self._save_all) and len(self.best_records) == self.max_size + return (not self._save_all) and len(self.best_records) == self.kbest_n - def last_full(self): - return len(self.last_records) == self.last_size + def latest_full(self): + return len(self.latest_records) == self.latest_n - def add_checkpoint(self, - checkpoint_dir, tag_or_iteration, - model, optimizer, infos): - if("val_loss" not in infos.keys()): + def add_checkpoint(self, checkpoint_dir, tag_or_iteration, + model, optimizer, infos, metric_type = "val_loss"): + if(metric_type not in infos.keys()): self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) return #save best - if self.should_save_best(infos["val_loss"]): - self.save_checkpoint_and_update(infos["val_loss"], + if self.should_save_best(infos[metric_type]): + self.save_best_checkpoint_and_update(infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) - #save last - self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + #save latest + self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): - self._save_record(checkpoint_dir, tag_or_iteration) + self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) - def save_checkpoint_and_update(self, metric, + def save_best_checkpoint_and_update(self, metric, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the worst @@ -82,9 +84,8 @@ class KBestCheckpoint(object): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) - if(worst_record_path not in self.last_records): - print('----to 
remove (best)----') - print(worst_record_path) + if(worst_record_path not in self.latest_records): + logger.info("remove the worst checkpoint: {}".format(worst_record_path)) self.del_checkpoint(checkpoint_dir, worst_record_path) # add the new one @@ -92,22 +93,18 @@ class KBestCheckpoint(object): model, optimizer, infos) self.best_records[tag_or_iteration] = metric - def save_last_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, + def save_latest_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old - if self.last_full(): - to_del_fn = self.last_records.pop(0) + if self.latest_full(): + to_del_fn = self.latest_records.pop(0) if(to_del_fn not in self.best_records.keys()): - print('----to remove (last)----') - print(to_del_fn) + logger.info("remove the latest checkpoint: {}".format(to_del_fn)) self.del_checkpoint(checkpoint_dir, to_del_fn) - self.last_records.append(tag_or_iteration) + self.latest_records.append(tag_or_iteration) self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) - # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle: - # for iteration in self.best_records - # handle.write("model_checkpoint_path:{}\n".format(iteration)) def del_checkpoint(self, checkpoint_dir, tag_or_iteration): @@ -115,18 +112,17 @@ class KBestCheckpoint(object): "{}".format(tag_or_iteration)) for filename in glob.glob(checkpoint_path+".*"): os.remove(filename) - print("delete file: "+filename) + logger.info("delete file: {}".format(filename)) - def _load_latest_checkpoint(self, checkpoint_dir: str) -> int: + def load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: - checkpoint_dir (str): the directory where checkpoint is saved. + checkpoint_path (str): the saved path of checkpoint. Returns: int: the latest iteration number. -1 for no checkpoint to load. """ - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last") if not os.path.isfile(checkpoint_record): return -1 @@ -135,9 +131,9 @@ class KBestCheckpoint(object): latest_checkpoint = handle.readlines()[-1].strip() iteration = int(latest_checkpoint.split(":")[-1]) return iteration + - - def _save_record(self, checkpoint_dir: str, iteration: int): + def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. Args: checkpoint_dir (str): the directory where checkpoint is saved. @@ -145,24 +141,22 @@ class KBestCheckpoint(object): Returns: None """ - checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last") + checkpoint_record_latest = os.path.join(checkpoint_dir, "checkpoint_latest") checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") - # Update the latest checkpoint index. - # with open(checkpoint_record, "a+") as handle: - # handle.write("model_checkpoint_path:{}\n".format(iteration)) + with open(checkpoint_record_best, "w") as handle: for i in self.best_records.keys(): handle.write("model_checkpoint_path:{}\n".format(i)) - with open(checkpoint_record_last, "w") as handle: - for i in self.last_records: + with open(checkpoint_record_latest, "w") as handle: + for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def load_parameters(self, model, + def load_last_parameters(self, model, optimizer=None, checkpoint_dir=None, checkpoint_path=None): - """Load a specific model checkpoint from disk. 
+ """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. optimizer (Optimizer, optional): optimizer to load states if needed. @@ -179,7 +173,8 @@ class KBestCheckpoint(object): if checkpoint_path is not None: tag = os.path.basename(checkpoint_path).split(":")[-1] elif checkpoint_dir is not None: - iteration = self._load_latest_checkpoint(checkpoint_dir) + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_latest") + iteration = self.load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) @@ -209,6 +204,59 @@ class KBestCheckpoint(object): return configs + def load_best_parameters(self, model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + configs = {} + + if checkpoint_path is not None: + tag = os.path.basename(checkpoint_path).split(":")[-1] + elif checkpoint_dir is not None: + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_best") + iteration = self.load_checkpoint_idx(checkpoint_record) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" + ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: loaded model from {}".format(rank, params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: loaded optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs + + + @mp_tools.rank_zero_only def save_parameters(self, checkpoint_dir: str, tag_or_iteration: Union[int, str], diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index b9c2556c7..ea433f341 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -43,7 +43,7 @@ model: share_rnn_weights: True training: - n_epoch: 6 + n_epoch: 10 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From 16210c058763f6ad3426ed53da10a9aa4e33ff49 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 09:08:30 +0000 Subject: [PATCH 040/281] fix bug --- deepspeech/training/trainer.py | 2 +- deepspeech/utils/checkpoint.py | 121 +++++++++++++++++---------------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 7f68e67cb..f8668370a 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -64,7 +64,7 @@ class Trainer(): The parsed command line arguments. 
Examples -------- - >>> def p(config, args): + >>> def main_sp(config, args): >>> exp = Trainer(config, args) >>> exp.setup() >>> exp.run() diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 52eccb673..b29ef2ab5 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import glob import json import os import re +from pathlib import Path from typing import Union import paddle @@ -22,25 +24,21 @@ from paddle.optimizer import Optimizer from deepspeech.utils import mp_tools from deepspeech.utils.log import Log - -import glob # import operator -from pathlib import Path logger = Log(__name__).getlog() __all__ = ["Checkpoint"] + class Checkpoint(object): - def __init__(self, - kbest_n: int=5, - latest_n: int=1): + def __init__(self, kbest_n: int=5, latest_n: int=1): self.best_records: Mapping[Path, float] = {} self.latest_records = [] self.kbest_n = kbest_n self.latest_n = latest_n self._save_all = (kbest_n == -1) - + def should_save_best(self, metric: float) -> bool: if not self.best_full(): return True @@ -53,68 +51,72 @@ class Checkpoint(object): def best_full(self): return (not self._save_all) and len(self.best_records) == self.kbest_n - + def latest_full(self): return len(self.latest_records) == self.latest_n - def add_checkpoint(self, checkpoint_dir, tag_or_iteration, - model, optimizer, infos, metric_type = "val_loss"): - if(metric_type not in infos.keys()): - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + def add_checkpoint(self, + checkpoint_dir, + tag_or_iteration, + model, + optimizer, + infos, + metric_type="val_loss"): + if (metric_type not in infos.keys()): + self.save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) return #save best if self.should_save_best(infos[metric_type]): - self.save_best_checkpoint_and_update(infos[metric_type], - checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self.save_best_checkpoint_and_update( + infos[metric_type], checkpoint_dir, tag_or_iteration, model, + optimizer, infos) #save latest self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) - + model, optimizer, infos) + if isinstance(tag_or_iteration, int): self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) - - def save_best_checkpoint_and_update(self, metric, - checkpoint_dir, tag_or_iteration, - model, optimizer, infos): + + def save_best_checkpoint_and_update(self, metric, checkpoint_dir, + tag_or_iteration, model, optimizer, + infos): # remove the worst if self.best_full(): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) - if(worst_record_path not in self.latest_records): - logger.info("remove the worst checkpoint: {}".format(worst_record_path)) + if (worst_record_path not in self.latest_records): + logger.info( + "remove the worst checkpoint: {}".format(worst_record_path)) self.del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + infos) self.best_records[tag_or_iteration] = metric - - def save_latest_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, - model, optimizer, 
infos): + + def save_latest_checkpoint_and_update( + self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old if self.latest_full(): to_del_fn = self.latest_records.pop(0) - if(to_del_fn not in self.best_records.keys()): - logger.info("remove the latest checkpoint: {}".format(to_del_fn)) + if (to_del_fn not in self.best_records.keys()): + logger.info( + "remove the latest checkpoint: {}".format(to_del_fn)) self.del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) - + self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + infos) def del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(tag_or_iteration)) - for filename in glob.glob(checkpoint_path+".*"): + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path + ".*"): os.remove(filename) logger.info("delete file: {}".format(filename)) - - def load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. @@ -131,7 +133,6 @@ class Checkpoint(object): latest_checkpoint = handle.readlines()[-1].strip() iteration = int(latest_checkpoint.split(":")[-1]) return iteration - def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. @@ -141,9 +142,10 @@ class Checkpoint(object): Returns: None """ - checkpoint_record_latest = os.path.join(checkpoint_dir, "checkpoint_latest") + checkpoint_record_latest = os.path.join(checkpoint_dir, + "checkpoint_latest") checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") - + with open(checkpoint_record_best, "w") as handle: for i in self.best_records.keys(): handle.write("model_checkpoint_path:{}\n".format(i)) @@ -151,11 +153,11 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - - def load_last_parameters(self, model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + def load_last_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -173,11 +175,13 @@ class Checkpoint(object): if checkpoint_path is not None: tag = os.path.basename(checkpoint_path).split(":")[-1] elif checkpoint_dir is not None: - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_latest") + checkpoint_record = os.path.join(checkpoint_dir, + "checkpoint_latest") iteration = self.load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs - checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(iteration)) else: raise ValueError( "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" @@ -203,11 +207,11 @@ class Checkpoint(object): configs = json.load(fin) return configs - - def load_best_parameters(self, model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + def load_best_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. 
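
Taken together, the `Checkpoint` bookkeeping above reduces to two retention policies: keep the `kbest_n` records with the lowest metric (evicting the current worst when full) and keep the `latest_n` most recent records (evicting the oldest), deleting a file only when neither list still references it. A minimal, self-contained sketch of that selection logic follows — plain Python with illustrative names, assuming a lower `val_loss` is better; it is not the patched class itself:

```python
# Sketch of the k-best / latest retention policy (illustrative only).
# Assumes a lower metric (e.g. val_loss) is better; integer tags stand
# in for checkpoint file prefixes.

class RetentionPolicy:
    def __init__(self, kbest_n=5, latest_n=1):
        self.best = {}      # tag -> metric, at most kbest_n entries
        self.latest = []    # tags, oldest first, at most latest_n entries
        self.kbest_n = kbest_n
        self.latest_n = latest_n

    def add(self, tag, metric):
        deleted = []
        # k-best: evict the worst record once the list is full and the
        # new metric beats it (mirrors _should_save_best)
        if len(self.best) == self.kbest_n and metric < max(self.best.values()):
            worst = max(self.best, key=self.best.get)
            self.best.pop(worst)
            if worst not in self.latest:
                deleted.append(worst)
        if len(self.best) < self.kbest_n:
            self.best[tag] = metric
        # latest: evict the oldest record once the list is full
        if len(self.latest) == self.latest_n:
            old = self.latest.pop(0)
            if old not in self.best:
                deleted.append(old)
        self.latest.append(tag)
        return deleted  # tags whose files could now be removed from disk


policy = RetentionPolicy(kbest_n=2, latest_n=1)
for step, loss in enumerate([3.0, 2.0, 4.0, 1.0]):
    print(step, policy.add(step, loss))
# step 3 reports [0, 2]: tag 0 lost its k-best slot and tag 2 aged out.
```
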
@@ -229,7 +233,8 @@ class Checkpoint(object):
             iteration = self.load_checkpoint_idx(checkpoint_record)
             if iteration == -1:
                 return configs
-            checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration))
+            checkpoint_path = os.path.join(checkpoint_dir,
+                                           "{}".format(iteration))
         else:
             raise ValueError(
                 "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
@@ -255,10 +260,9 @@ class Checkpoint(object):
                 configs = json.load(fin)
         return configs
 
-
     @mp_tools.rank_zero_only
-    def save_parameters(self, checkpoint_dir: str,
+    def save_parameters(self,
+                        checkpoint_dir: str,
                         tag_or_iteration: Union[int, str],
                         model: paddle.nn.Layer,
                         optimizer: Optimizer=None,
@@ -275,7 +279,7 @@ class Checkpoint(object):
             None
         """
         checkpoint_path = os.path.join(checkpoint_dir,
-                                      "{}".format(tag_or_iteration))
+                                       "{}".format(tag_or_iteration))
 
         model_dict = model.state_dict()
         params_path = checkpoint_path + ".pdparams"
@@ -293,4 +297,3 @@ class Checkpoint(object):
         with open(info_path, 'w') as fout:
             data = json.dumps(infos)
             fout.write(data)
-

From 03e695250163b5f725595c1902b765d4c4755ba0 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 28 Jun 2021 10:00:45 +0000
Subject: [PATCH 041/281] more detail of result

---
 examples/aishell/s0/README.md     |  2 +-
 examples/aishell/s1/README.md     | 30 +++++++++++++++---------------
 examples/librispeech/s0/README.md | 10 +++++-----
 examples/librispeech/s1/README.md | 22 +++++++++++-----------
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md
index ae3fb401a..40d7c1581 100644
--- a/examples/aishell/s0/README.md
+++ b/examples/aishell/s0/README.md
@@ -4,7 +4,7 @@
 
 | Model | release | Config | Test set | Loss | CER |
 | --- | --- | --- | --- | --- | --- |
-| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 |
+| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 ~ 0.073507 |
 | DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
 | DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
 | DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md
index 601b0a8d0..1072eabd8 100644
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/s1/README.md
@@ -2,25 +2,25 @@
 
 ## Conformer
 
-| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | 
attention_rescoring | - | 0.054694 | ## Chunk Conformer -| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | ## Transformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | ---| -| transformer | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | ---| +| transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/s0/README.md index 393dd4579..76aa5e78a 100644 --- a/examples/librispeech/s0/README.md +++ b/examples/librispeech/s0/README.md @@ -2,8 +2,8 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | WER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | -| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | -| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 | +| Model | Params | release | Config | Test set | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | +| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | +| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 | diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index 73f6156d9..5e23c0ab5 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -2,17 +2,17 @@ ## Conformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/conformer.yaml | spec_aug + shift | test-all | attention | test-all 6.35 | 0.057117 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.35 | 0.030162 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | test-all 6.35 | 0.037910 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | test-all 6.35 | 0.037761 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | test-all 6.35 | 0.032115 | +| Model | Params | Config | 
Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention | 6.35 | 0.057117 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 |
 
 ## Transformer
 
-| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- |
-| transformer | conf/transformer.yaml | spec_aug + shift | test-all | attention | test-all 6.98 | 0.066500 |
-| transformer | conf/transformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.98 | 0.036 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention | 6.98 | 0.066500 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 |

From 9c0b6c5bb0e91ad68f2b91d7d991664e3acfd038 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 28 Jun 2021 12:11:12 +0000
Subject: [PATCH 042/281] fix audio shape bug for audio len

---
 deepspeech/io/collator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 305ca9400..2ef119666 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -154,8 +154,8 @@ class SpeechCollator():
             random_seed (int, optional): for random generator. Defaults to 0.
             keep_transcription_text (bool, optional): if True (used when not in training mode), the raw transcription text is kept untokenized;
                 if False, the text is converted to token ids. Defaults to False.
-        
-        Do augmentations
+
+        Do augmentations
         Padding audio features with zeros to make them have the same shape (or a user-defined shape) within one batch. 
""" @@ -271,7 +271,7 @@ class SpeechCollator(): utts.append(utt) # audio audios.append(audio) # [T, D] - audio_lens.append(audio.shape[1]) + audio_lens.append(audio.shape[0]) # text # for training, text is token ids # else text is string, convert to unicode ord From 9b3acddd5d7a1469d9dadb5ce959756bc5e98771 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 04:51:32 +0000 Subject: [PATCH 043/281] fix conf for new datapipe; u2 export inputspec --- deepspeech/exps/u2/model.py | 7 +++---- .../librispeech/s1/conf/chunk_confermer.yaml | 16 +++++++++------- .../librispeech/s1/conf/chunk_transformer.yaml | 16 +++++++++------- examples/librispeech/s1/conf/conformer.yaml | 14 ++++++++------ examples/librispeech/s1/conf/transformer.yaml | 14 ++++++++------ 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 308569cd7..05a37b21b 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -511,10 +511,9 @@ class U2Tester(U2Trainer): self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size input_spec = [ - paddle.static.InputSpec( - shape=[None, feat_dim, None], - dtype='float32'), # audio, [B,D,T] - paddle.static.InputSpec(shape=[None], + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], dtype='int64'), # audio_length, [B] ] return infer_model, input_spec diff --git a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml index ec945a188..ef08daa84 100644 --- a/examples/librispeech/s1/conf/chunk_confermer.yaml +++ b/examples/librispeech/s1/conf/chunk_confermer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 4 min_input_len: 0.5 max_input_len: 20.0 min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -80,7 +82,7 @@ model: training: n_epoch: 120 - accum_grad: 1 + accum_grad: 8 global_grad_clip: 5.0 optim: adam optim_conf: diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index 3939ffc68..5ec2ad126 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: 
True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -103,6 +105,6 @@ decoding: # >0: for decoding, use fixed chunk size as set. # 0: used for training, it's prohibited here. num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. + simulate_streaming: true # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 8f8bf4539..cce31b163 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 16 min_input_len: 0.5 # seconds max_input_len: 20.0 # seconds min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index a094b0fba..8ea494772 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 From 6d92417edd57b73996cf042633ff1d06219c95f1 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 29 Jun 2021 06:05:26 +0000 Subject: [PATCH 044/281] optimize the function --- deepspeech/training/trainer.py | 5 +- deepspeech/utils/checkpoint.py | 109 +++++++++------------------------ 2 files changed, 32 insertions(+), 82 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index f8668370a..cd915760d 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -151,11 +151,12 @@ class Trainer(): resume training. 
""" scratch = None - infos = self.checkpoint.load_last_parameters( + infos = self.checkpoint._load_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path) + checkpoint_path=self.args.checkpoint_path, + checkpoint_file='checkpoint_latest') if infos: # restore from ckpt self.iteration = infos["step"] diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index b29ef2ab5..be36fdbb2 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -39,8 +39,8 @@ class Checkpoint(object): self.latest_n = latest_n self._save_all = (kbest_n == -1) - def should_save_best(self, metric: float) -> bool: - if not self.best_full(): + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): return True # already full @@ -49,10 +49,10 @@ class Checkpoint(object): worst_metric = self.best_records[worst_record_path] return metric < worst_metric - def best_full(self): + def _best_full(self): return (not self._save_all) and len(self.best_records) == self.kbest_n - def latest_full(self): + def _latest_full(self): return len(self.latest_records) == self.latest_n def add_checkpoint(self, @@ -63,62 +63,62 @@ class Checkpoint(object): infos, metric_type="val_loss"): if (metric_type not in infos.keys()): - self.save_parameters(checkpoint_dir, tag_or_iteration, model, + self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) return #save best - if self.should_save_best(infos[metric_type]): - self.save_best_checkpoint_and_update( + if self._should_save_best(infos[metric_type]): + self._save_best_checkpoint_and_update( infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) #save latest - self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + self._save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): - self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) + self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) - def save_best_checkpoint_and_update(self, metric, checkpoint_dir, + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the worst - if self.best_full(): + if self._best_full(): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) if (worst_record_path not in self.latest_records): logger.info( "remove the worst checkpoint: {}".format(worst_record_path)) - self.del_checkpoint(checkpoint_dir, worst_record_path) + self._del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) self.best_records[tag_or_iteration] = metric - def save_latest_checkpoint_and_update( + def _save_latest_checkpoint_and_update( self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old - if self.latest_full(): + if self._latest_full(): to_del_fn = self.latest_records.pop(0) if (to_del_fn not in self.best_records.keys()): logger.info( "remove the latest checkpoint: {}".format(to_del_fn)) - self.del_checkpoint(checkpoint_dir, to_del_fn) + self._del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + self._save_parameters(checkpoint_dir, 
tag_or_iteration, model, optimizer, infos) - def del_checkpoint(self, checkpoint_dir, tag_or_iteration): + def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, "{}".format(tag_or_iteration)) for filename in glob.glob(checkpoint_path + ".*"): os.remove(filename) logger.info("delete file: {}".format(filename)) - def load_checkpoint_idx(self, checkpoint_record: str) -> int: + def _load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: checkpoint_path (str): the saved path of checkpoint. @@ -134,7 +134,7 @@ class Checkpoint(object): iteration = int(latest_checkpoint.split(":")[-1]) return iteration - def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): + def _save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. Args: checkpoint_dir (str): the directory where checkpoint is saved. @@ -153,65 +153,13 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def load_last_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): - """Load a last model checkpoint from disk. - Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. - Returns: - configs (dict): epoch or step, lr and other meta info should be saved. - """ - configs = {} - - if checkpoint_path is not None: - tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None: - checkpoint_record = os.path.join(checkpoint_dir, - "checkpoint_latest") - iteration = self.load_checkpoint_idx(checkpoint_record) - if iteration == -1: - return configs - checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(iteration)) - else: - raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" - ) - - rank = dist.get_rank() - - params_path = checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - model.set_state_dict(model_dict) - logger.info("Rank {}: loaded model from {}".format(rank, params_path)) - - optimizer_path = checkpoint_path + ".pdopt" - if optimizer and os.path.isfile(optimizer_path): - optimizer_dict = paddle.load(optimizer_path) - optimizer.set_state_dict(optimizer_dict) - logger.info("Rank {}: loaded optimizer state from {}".format( - rank, optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = json.load(fin) - return configs - def load_best_parameters(self, + def _load_parameters(self, model, optimizer=None, checkpoint_dir=None, - checkpoint_path=None): + checkpoint_path=None, + checkpoint_file=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -221,6 +169,7 @@ class Checkpoint(object): checkpoint_path (str, optional): if specified, load the checkpoint stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will be ignored. Defaults to None. 
+            checkpoint_file (str, optional): "checkpoint_latest" or "checkpoint_best". Defaults to None.
         Returns:
             configs (dict): epoch or step, lr and other meta info should be saved.
         """
@@ -228,16 +177,16 @@ class Checkpoint(object):
 
         if checkpoint_path is not None:
             tag = os.path.basename(checkpoint_path).split(":")[-1]
-        elif checkpoint_dir is not None:
-            checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_best")
-            iteration = self.load_checkpoint_idx(checkpoint_record)
+        elif checkpoint_dir is not None and checkpoint_file is not None:
+            checkpoint_record = os.path.join(checkpoint_dir, checkpoint_file)
+            iteration = self._load_checkpoint_idx(checkpoint_record)
             if iteration == -1:
                 return configs
             checkpoint_path = os.path.join(checkpoint_dir,
                                            "{}".format(iteration))
         else:
             raise ValueError(
-                "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
+                "At least one of 'checkpoint_dir' and 'checkpoint_file' and 'checkpoint_path' should be specified!"
             )

From e106f243b4f765fad466cc0608ba5b1240e2050c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 29 Jun 2021 08:33:26 +0000
Subject: [PATCH 045/281] add thchs30 dataset

---
 examples/dataset/aishell/aishell.py |   4 +-
 examples/dataset/thchs30/.gitignore |   5 +
 examples/dataset/thchs30/thchs30.py | 169 ++++++++++++++++++++++++++++
 3 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 examples/dataset/thchs30/.gitignore
 create mode 100644 examples/dataset/thchs30/thchs30.py

diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py
index a0cabe352..b8aede2fc 100644
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@@ -60,7 +60,7 @@ def create_manifest(data_dir, manifest_path_prefix):
         if line == '':
             continue
         audio_id, text = line.split(' ', 1)
-        # remove withespace
+        # remove whitespace, character text
         text = ''.join(text.split())
         transcript_dict[audio_id] = text
 
@@ -123,6 +123,8 @@ def main():
         target_dir=args.target_dir,
         manifest_path=args.manifest_prefix)
 
+    print("Data download and manifest prepare done!")
+
 
 if __name__ == '__main__':
     main()
diff --git a/examples/dataset/thchs30/.gitignore b/examples/dataset/thchs30/.gitignore
new file mode 100644
index 000000000..47dd6268f
--- /dev/null
+++ b/examples/dataset/thchs30/.gitignore
@@ -0,0 +1,5 @@
+*.tgz
+manifest.*
+data_thchs30
+resource
+test-noise
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
new file mode 100644
index 000000000..225adb092
--- /dev/null
+++ b/examples/dataset/thchs30/thchs30.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare THCHS-30 Mandarin dataset
+
+Download, unpack and create manifest files. 
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+from multiprocessing.pool import Pool
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import download
+from utils.utility import unpack
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://www.openslr.org/resources/18'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/18'
+DATA_URL = URL_ROOT + '/data_thchs30.tgz'
+TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz'
+RESOURCE_URL = URL_ROOT + '/resource.tgz'
+MD5_DATA = '2d2252bde5c8429929e1841d4cb95e90'
+MD5_TEST_NOISE = '7e8a985fb965b84141b68c68556c2030'
+MD5_RESOURCE = 'c0b2a565b4970a0c4fe89fefbf2d97e1'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/THCHS30",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def read_trn(filepath):
+    """Read a .trn transcription file.
+    Word text is on the first line,
+    syllable text on the second line,
+    and phoneme text on the third line.
+
+    Args:
+        filepath (str): trn path.
+
+    Returns:
+        list(str): (word, syllable, phone)
+    """
+    texts = []
+    with open(filepath, 'r') as f:
+        lines = f.read().split('\n')
+        # last line is `empty`
+        lines = lines[:3]
+        assert len(lines) == 3, lines
+        # character text, remove whitespace
+        texts.append(''.join(lines[0].split()))
+        texts.extend(lines[1:])
+    return texts
+
+
+def resolve_symlink(filepath):
+    """Resolve a symlink whose content is the relative path of the real file.
+
+    Args:
+        filepath (str): path of the symlink file.
+
+    Returns:
+        Path: absolute path of the real file.
+    """
+    sym_path = Path(filepath)
+    relative_link = sym_path.read_text().strip()
+    relative = Path(relative_link)
+    relpath = sym_path.parent / relative
+    return relpath.resolve()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        del json_lines[:]
+        audio_dir = os.path.join(data_dir, dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                file_path = os.path.join(subfolder, fname)
+                if file_path.endswith('.wav'):
+                    audio_path = os.path.abspath(file_path)
+                    text_path = resolve_symlink(audio_path + '.trn')
+                else:
+                    continue
+
+                assert os.path.exists(audio_path) and os.path.exists(text_path)
+
+                audio_id = os.path.basename(audio_path)[:-4]
+                word_text, syllable_text, phone_text = read_trn(text_path)
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'utt': audio_id,
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': word_text,
+                            'syllable': syllable_text,
+                            'phone': phone_text,
+                        },
+                        ensure_ascii=False))
+
+        manifest_path = manifest_path_prefix + '.' 
+ dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): + """Download, unpack and create manifest file.""" + datadir = os.path.join(target_dir, subset) + if not os.path.exists(datadir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + + if subset == 'data_thchs30': + create_manifest(datadir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + tasks = [ + (DATA_URL, MD5_DATA, args.target_dir, args.manifest_prefix, + "data_thchs30"), + (TEST_NOISE_URL, MD5_TEST_NOISE, args.target_dir, args.manifest_prefix, + "test-noise"), + (RESOURCE_URL, MD5_RESOURCE, args.target_dir, args.manifest_prefix, + "resource"), + ] + with Pool(7) as pool: + pool.starmap(prepare_dataset, tasks) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() From 9e99f99b3c498f080f0b34e7763139f90ce6d751 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:11:32 +0000 Subject: [PATCH 046/281] add thchs30, aidatatang; --- examples/dataset/aidatatang_200zh/.gitignore | 4 + examples/dataset/aidatatang_200zh/README.md | 14 ++ .../aidatatang_200zh/aidatatang_200zh.py | 151 ++++++++++++++++++ examples/dataset/aishell/README.md | 3 + examples/dataset/aishell/aishell.py | 32 ++-- examples/dataset/aishell3/README.md | 3 + examples/dataset/librispeech/librispeech.py | 22 ++- examples/dataset/magicdata/README.md | 15 ++ .../mini_librispeech/mini_librispeech.py | 18 +++ examples/dataset/multi_cn/README.md | 11 ++ examples/dataset/primewords/README.md | 6 + examples/dataset/st-cmds/README.md | 1 + examples/dataset/thchs30/README.md | 55 +++++++ examples/dataset/thchs30/thchs30.py | 4 +- 14 files changed, 326 insertions(+), 13 deletions(-) create mode 100644 examples/dataset/aidatatang_200zh/.gitignore create mode 100644 examples/dataset/aidatatang_200zh/README.md create mode 100644 examples/dataset/aidatatang_200zh/aidatatang_200zh.py create mode 100644 examples/dataset/aishell/README.md create mode 100644 examples/dataset/aishell3/README.md create mode 100644 examples/dataset/magicdata/README.md create mode 100644 examples/dataset/multi_cn/README.md create mode 100644 examples/dataset/primewords/README.md create mode 100644 examples/dataset/st-cmds/README.md create mode 100644 examples/dataset/thchs30/README.md diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/examples/dataset/aidatatang_200zh/.gitignore new file mode 100644 index 000000000..fcb887790 --- /dev/null +++ b/examples/dataset/aidatatang_200zh/.gitignore @@ -0,0 +1,4 @@ +*.tgz +manifest.* +*.meta +aidatatang_200zh/ diff --git a/examples/dataset/aidatatang_200zh/README.md b/examples/dataset/aidatatang_200zh/README.md new file mode 100644 index 000000000..e6f1eefbd --- /dev/null +++ b/examples/dataset/aidatatang_200zh/README.md @@ -0,0 +1,14 @@ +# [Aidatatang_200zh](http://www.openslr.org/62/) + +Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. +The contents and the corresponding descriptions of the corpus include: + +* The corpus contains 200 hours of acoustic data, which is mostly mobile recorded data. 
+* 600 speakers from different accent areas in China are invited to participate in the recording. +* The transcription accuracy for each sentence is larger than 98%. +* Recordings are conducted in a quiet indoor environment. +* The database is divided into training set, validation set, and testing set in a ratio of 7: 1: 2. +* Detail information such as speech data coding and speaker information is preserved in the metadata file. +* Segmented transcripts are also provided. + +The corpus aims to support researchers in speech recognition, machine translation, voiceprint recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use. diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py new file mode 100644 index 000000000..cc77c3c48 --- /dev/null +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare aidatatang_200zh mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/62' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62' +DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz' +MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/aidatatang_200zh", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix)
+    json_lines = []
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aidatatang_200_zh_transcript.txt')
+    transcript_dict = {}
+    for line in codecs.open(transcript_path, 'r', 'utf-8'):
+        line = line.strip()
+        if line == '':
+            continue
+        audio_id, text = line.split(' ', 1)
+        # remove whitespace, character text
+        text = ''.join(text.split())
+        transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        del json_lines[:]
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
+        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                if not fname.endswith('.wav'):
+                    continue
+
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
+
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'utt': audio_id,
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': text,
+                        },
+                        ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
+        manifest_path = manifest_path_prefix + '.' + dtype
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+
+        with open(dtype + '.meta', 'w') as f:
+            print(f"{dtype}:", file=f)
+            print(f"{total_num} utts", file=f)
+            print(f"{total_sec / (60*60)} h", file=f)
+            print(f"{total_text} text", file=f)
+            print(f"{total_text / total_sec} text/sec", file=f)
+            print(f"{total_sec / total_num} sec/utt", file=f)
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
+    """Download, unpack and create manifest file."""
+    data_dir = os.path.join(target_dir, subset)
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files
+        audio_dir = os.path.join(data_dir, 'corpus')
+        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
+            for sub in dirlist:
+                print(f"unpack dir {sub}...")
+                for folder, _, filelist in sorted(
+                        os.walk(os.path.join(subfolder, sub))):
+                    for ftar in filelist:
+                        unpack(os.path.join(folder, ftar), folder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+
+    create_manifest(data_dir, manifest_path)
+
+
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix,
+        subset='aidatatang_200zh')
+
+    print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/dataset/aishell/README.md b/examples/dataset/aishell/README.md
new file mode 100644
index 000000000..6770cd207
--- /dev/null
+++ b/examples/dataset/aishell/README.md
@@ -0,0 +1,3 @@
+# [Aishell1](http://www.openslr.org/33/)
+
+This open-source Mandarin speech corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). 
The high-fidelity recordings were re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; it may not be used commercially without permission.)
diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py
index b8aede2fc..5811a401a 100644
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@@ -31,7 +31,7 @@ from utils.utility import unpack
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
 
 URL_ROOT = 'http://www.openslr.org/resources/33'
-URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
 DATA_URL = URL_ROOT + '/data_aishell.tgz'
 MD5_DATA = '2f494334227864a8a8fec932999db9d8'
 
@@ -67,11 +67,15 @@ def create_manifest(data_dir, manifest_path_prefix):
     data_types = ['train', 'dev', 'test']
     for dtype in data_types:
         del json_lines[:]
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
         audio_dir = os.path.join(data_dir, 'wav', dtype)
         for subfolder, _, filelist in sorted(os.walk(audio_dir)):
             for fname in filelist:
-                audio_path = os.path.join(subfolder, fname)
-                audio_id = fname[:-4]
+                audio_path = os.path.abspath(os.path.join(subfolder, fname))
+                audio_id = os.path.basename(fname)[:-4]
                 # if no transcription for audio then skipped
                 if audio_id not in transcript_dict:
                     continue
@@ -81,20 +85,30 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json_lines.append(
                     json.dumps(
                         {
-                            'utt':
-                            os.path.splitext(os.path.basename(audio_path))[0],
-                            'feat':
-                            audio_path,
+                            'utt': audio_id,
+                            'feat': audio_path,
                             'feat_shape': (duration, ),  # second
-                            'text':
-                            text
+                            'text': text
                         },
                         ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
         manifest_path = manifest_path_prefix + '.' + dtype
         with codecs.open(manifest_path, 'w', 'utf-8') as fout:
             for line in json_lines:
                 fout.write(line + '\n')
 
+        with open(dtype + '.meta', 'w') as f:
+            print(f"{dtype}:", file=f)
+            print(f"{total_num} utts", file=f)
+            print(f"{total_sec / (60*60)} h", file=f)
+            print(f"{total_text} text", file=f)
+            print(f"{total_text / total_sec} text/sec", file=f)
+            print(f"{total_sec / total_num} sec/utt", file=f)
+
 
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """Download, unpack and create manifest file."""
diff --git a/examples/dataset/aishell3/README.md b/examples/dataset/aishell3/README.md
new file mode 100644
index 000000000..8a29a6d0f
--- /dev/null
+++ b/examples/dataset/aishell3/README.md
@@ -0,0 +1,3 @@
+# [Aishell3](http://www.openslr.org/93/)
+
+AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus that can be used to train multi-speaker Text-to-Speech (TTS) systems. The corpus contains roughly **85 hours** of emotion-neutral recordings spoken by 218 native Chinese Mandarin speakers, with 88,035 utterances in total. Their auxiliary attributes such as gender, age group and native accents are explicitly marked and provided in the corpus. Accordingly, transcripts in Chinese character-level and pinyin-level are provided along with the recordings. The word & tone transcription accuracy rate is above 98%, through professional speech annotation and strict quality inspection for tone and prosody. 
(This database is free for academic research; it may not be used commercially without permission.)
diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py
index 55012f73c..f549a95f1 100644
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@@ -77,6 +77,10 @@ def create_manifest(data_dir, manifest_path):
     """
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+
     for subfolder, _, filelist in sorted(os.walk(data_dir)):
         text_filelist = [
             filename for filename in filelist if filename.endswith('trans.txt')
@@ -86,7 +90,9 @@ def create_manifest(data_dir, manifest_path):
             for line in io.open(text_filepath, encoding="utf8"):
                 segments = line.strip().split()
                 text = ' '.join(segments[1:]).lower()
-                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
+
+                audio_filepath = os.path.abspath(
+                    os.path.join(subfolder, segments[0] + '.flac'))
                 audio_data, samplerate = soundfile.read(audio_filepath)
                 duration = float(len(audio_data)) / samplerate
                 json_lines.append(
@@ -99,10 +105,24 @@ def create_manifest(data_dir, manifest_path):
                         'text': text
                     }))
 
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
 
+    subset = os.path.splitext(manifest_path)[1][1:]  # drop the leading '.' of the extension
+    with open(subset + '.meta', 'w') as f:
+        print(f"{subset}:", file=f)
+        print(f"{total_num} utts", file=f)
+        print(f"{total_sec / (60*60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_sec / total_num} sec/utt", file=f)
+
 
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """Download, unpack and create summary manifest file.
diff --git a/examples/dataset/magicdata/README.md b/examples/dataset/magicdata/README.md
new file mode 100644
index 000000000..083aee97b
--- /dev/null
+++ b/examples/dataset/magicdata/README.md
@@ -0,0 +1,15 @@
+# [MagicData](http://www.openslr.org/68/)
+
+MAGICDATA Mandarin Chinese Read Speech Corpus was developed by MAGIC DATA Technology Co., Ltd. and freely published for non-commercial use.
+The contents and the corresponding descriptions of the corpus include:
+
+* The corpus contains 755 hours of speech data, which is mostly mobile recorded data.
+* 1080 speakers from different accent areas in China are invited to participate in the recording.
+* The sentence transcription accuracy is higher than 98%.
+* Recordings are conducted in a quiet indoor environment.
+* The database is divided into training set, validation set, and testing set in a ratio of 51: 1: 2.
+* Detail information such as speech data coding and speaker information is preserved in the metadata file.
+* The domain of recording texts is diversified, including interactive Q&A, music search, SNS messages, home command and control, etc.
+* Segmented transcripts are also provided.
+
+The corpus aims to support researchers in speech recognition, machine translation, speaker recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use. 
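
Each dataset script in these patches repeats the same meta-summary bookkeeping: accumulate total duration, text length, and utterance count while writing the manifest, then dump utts/hours/text-per-second to a `.meta` file. The same numbers can also be recomputed afterwards from any finished manifest. A stand-alone sketch of that, assuming the manifest format written above (JSON lines whose `feat_shape` holds the duration in seconds and whose `text` holds the transcript); the helper name is illustrative and not part of the patches:

```python
# Recompute a *.meta summary from an existing manifest file.
# Hypothetical checking helper, not part of the patch series.
import json
import sys


def manifest_meta(manifest_path):
    total_sec, total_text, total_num = 0.0, 0, 0
    with open(manifest_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            total_sec += float(rec['feat_shape'][0])  # duration in seconds
            total_text += len(rec['text'])
            total_num += 1
    return {
        'utts': total_num,
        'h': total_sec / 3600.0,
        'text': total_text,
        'text/sec': total_text / total_sec if total_sec else 0.0,
        'sec/utt': total_sec / total_num if total_num else 0.0,
    }


if __name__ == '__main__':
    for key, value in manifest_meta(sys.argv[1]).items():
        print(key, value)
```
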
diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py
index f5bc13933..44a6d3671 100644
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path):
     """
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+
     for subfolder, _, filelist in sorted(os.walk(data_dir)):
         text_filelist = [
             filename for filename in filelist if filename.endswith('trans.txt')
@@ -80,10 +84,24 @@ def create_manifest(data_dir, manifest_path):
                         'text': text
                     }))
 
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
 
+    subset = os.path.splitext(manifest_path)[1][1:]  # drop the leading '.' of the extension
+    with open(subset + '.meta', 'w') as f:
+        print(f"{subset}:", file=f)
+        print(f"{total_num} utts", file=f)
+        print(f"{total_sec / (60*60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_sec / total_num} sec/utt", file=f)
+
 
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """Download, unpack and create summary manifest file.
diff --git a/examples/dataset/multi_cn/README.md b/examples/dataset/multi_cn/README.md
new file mode 100644
index 000000000..d59b11b6d
--- /dev/null
+++ b/examples/dataset/multi_cn/README.md
@@ -0,0 +1,11 @@
+# multi-cn
+
+This is a Chinese speech recognition recipe that trains on all Chinese corpora on OpenSLR, including:
+
+* Aidatatang (140 hours)
+* Aishell (151 hours)
+* MagicData (712 hours)
+* Primewords (99 hours)
+* ST-CMDS (110 hours)
+* THCHS-30 (26 hours)
+* optional AISHELL2 (~1000 hours) if available
diff --git a/examples/dataset/primewords/README.md b/examples/dataset/primewords/README.md
new file mode 100644
index 000000000..a4f1ed65d
--- /dev/null
+++ b/examples/dataset/primewords/README.md
@@ -0,0 +1,6 @@
+# [Primewords](http://www.openslr.org/47/)
+
+This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd.
+The corpus is recorded by smart mobile phones from 296 native Chinese speakers. The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use.
+
+The mapping between the transcript and utterance is given in JSON format.
diff --git a/examples/dataset/st-cmds/README.md b/examples/dataset/st-cmds/README.md
new file mode 100644
index 000000000..c7ae50e59
--- /dev/null
+++ b/examples/dataset/st-cmds/README.md
@@ -0,0 +1 @@
+# [FreeST](http://www.openslr.org/38/)
diff --git a/examples/dataset/thchs30/README.md b/examples/dataset/thchs30/README.md
new file mode 100644
index 000000000..6b59d663a
--- /dev/null
+++ b/examples/dataset/thchs30/README.md
@@ -0,0 +1,55 @@
+# [THCHS30](http://www.openslr.org/18/)
+
+This is the *data part* of the `THCHS30 2015` acoustic data
+& scripts dataset.
+
+The dataset is described in more detail in the paper ``THCHS-30 : A Free
+Chinese Speech Corpus`` by Dong Wang, Xuewei Zhang.
+
+A paper (if it can be called a paper) from 13 years earlier regarding the database:
+
+Dong Wang, Dalei Wu, Xiaoyan Zhu, ``TCMSD: A new Chinese Continuous Speech Database``,
+International Conference on Chinese Computing (ICCC'01), 2001, Singapore. 
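
As the layout description below notes, the `{train,dev,test}` directories hold links into `data`, and each resolved `*.wav.trn` carries the word, syllable, and phone transcriptions on three lines. Loading one utterance therefore reduces to resolving the link and splitting three lines, mirroring the `read_trn`/`resolve_symlink` helpers added to `thchs30.py` above. A minimal sketch, with an illustrative path:

```python
# Load one THCHS-30 utterance's transcriptions. Minimal sketch mirroring
# the thchs30.py helpers; the path at the bottom is illustrative.
from pathlib import Path


def resolve_trn(trn_link):
    # The *.trn entries under train/dev/test are plain text files whose
    # single line is a relative path to the real file under data/
    # (this is what resolve_symlink() in thchs30.py relies on).
    link = Path(trn_link)
    return (link.parent / link.read_text().strip()).resolve()


def load_transcripts(wav_path):
    # Each resolved .trn file holds three lines: word, syllable, phone.
    trn_path = resolve_trn(str(wav_path) + '.trn')
    word, syllable, phone = trn_path.read_text().strip().split('\n')
    # character-level text with whitespace removed, as in read_trn()
    return ''.join(word.split()), syllable, phone


# Illustrative path -- adjust to wherever data_thchs30 was unpacked.
print(load_transcripts('data_thchs30/train/A11_0.wav'))
```
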
+
+The layout of this data pack is the following:
+
+  ``data``
+      ``*.wav``
+          audio data
+      ``*.wav.trn``
+          transcriptions
+
+  ``{train,dev,test}``
+      contain symlinks into the ``data`` directory for both audio and
+      transcription files. Contents of these directories define the
+      train/dev/test split of the data.
+
+  ``{lm_word}``
+      ``word.3gram.lm``
+          trigram LM based on word
+      ``lexicon.txt``
+          lexicon based on word
+
+  ``{lm_phone}``
+      ``phone.3gram.lm``
+          trigram LM based on phone
+      ``lexicon.txt``
+          lexicon based on phone
+
+  ``README.TXT``
+      this file
+
+
+Data statistics
+===============
+
+Statistics for the data are as follows:
+
+    ===========  ============  ==========  ===========
+    **dataset**  **audio(h)**  **#sents**  **#words**
+    ===========  ============  ==========  ===========
+    train        25            10,000      198,252
+    dev          2:14          893         17,743
+    test         6:15          2,495       49,085
+    ===========  ============  ==========  ===========
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 225adb092..5613d7685 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -69,9 +69,7 @@ def read_trn(filepath):
     """
     texts = []
     with open(filepath, 'r') as f:
-        lines = f.read().split('\n')
-        # last line is `empty`
-        lines = lines[:3]
+        lines = f.read().strip().split('\n')
     assert len(lines) == 3, lines
     # character text, remove whitespace
     texts.append(''.join(lines[0].split()))

From e106f243b4f765fad466cc0608ba5b1240e2050c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 29 Jun 2021 12:13:04 +0000
Subject: [PATCH 047/281] dump dataset metadata

---
 examples/dataset/thchs30/thchs30.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 5613d7685..c28fa56ff 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -96,6 +96,10 @@ def create_manifest(data_dir, manifest_path_prefix):
     data_types = ['train', 'dev', 'test']
     for dtype in data_types:
         del json_lines[:]
+        total_sec = 0.0
+        total_text = 0.0
+        total_num = 0
+
         audio_dir = os.path.join(data_dir, dtype)
         for subfolder, _, filelist in sorted(os.walk(audio_dir)):
             for fname in filelist:
@@ -125,11 +129,23 @@ def create_manifest(data_dir, manifest_path_prefix):
                         },
                         ensure_ascii=False))
+
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
         manifest_path = manifest_path_prefix + '.'
+ dtype with codecs.open(manifest_path, 'w', 'utf-8') as fout: for line in json_lines: fout.write(line + '\n') + with open(dtype + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): """Download, unpack and create manifest file.""" From 8c0923b86532c5750ecaea52ce74a60e3c310465 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:18:03 +0000 Subject: [PATCH 048/281] update gitignore; add gigaspeech --- examples/dataset/aidatatang_200zh/.gitignore | 2 +- examples/dataset/aishell/.gitignore | 3 +++ examples/dataset/gigaspeech/.gitignore | 1 + examples/dataset/gigaspeech/README.md | 10 ++++++++++ examples/dataset/gigaspeech/gigaspeech.py | 13 +++++++++++++ examples/dataset/gigaspeech/run.sh | 10 ++++++++++ examples/dataset/librispeech/.gitignore | 2 ++ examples/dataset/mini_librispeech/.gitignore | 1 + examples/dataset/thchs30/.gitignore | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 examples/dataset/gigaspeech/.gitignore create mode 100644 examples/dataset/gigaspeech/README.md create mode 100644 examples/dataset/gigaspeech/gigaspeech.py create mode 100644 examples/dataset/gigaspeech/run.sh diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/examples/dataset/aidatatang_200zh/.gitignore index fcb887790..fc56525e6 100644 --- a/examples/dataset/aidatatang_200zh/.gitignore +++ b/examples/dataset/aidatatang_200zh/.gitignore @@ -1,4 +1,4 @@ *.tgz manifest.* *.meta -aidatatang_200zh/ +aidatatang_200zh/ \ No newline at end of file diff --git a/examples/dataset/aishell/.gitignore b/examples/dataset/aishell/.gitignore index 9c6e517e5..eea6573e1 100644 --- a/examples/dataset/aishell/.gitignore +++ b/examples/dataset/aishell/.gitignore @@ -1 +1,4 @@ data_aishell* +*.meta +manifest.* +*.tgz \ No newline at end of file diff --git a/examples/dataset/gigaspeech/.gitignore b/examples/dataset/gigaspeech/.gitignore new file mode 100644 index 000000000..7f78176b7 --- /dev/null +++ b/examples/dataset/gigaspeech/.gitignore @@ -0,0 +1 @@ +GigaSpeech/ diff --git a/examples/dataset/gigaspeech/README.md b/examples/dataset/gigaspeech/README.md new file mode 100644 index 000000000..4a1715cb8 --- /dev/null +++ b/examples/dataset/gigaspeech/README.md @@ -0,0 +1,10 @@ +# [GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + +``` +git clone https://github.com/SpeechColab/GigaSpeech.git + +cd GigaSpeech +utils/gigaspeech_download.sh /disk1/audio_data/gigaspeech +toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data +cd .. +``` diff --git a/examples/dataset/gigaspeech/gigaspeech.py b/examples/dataset/gigaspeech/gigaspeech.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/examples/dataset/gigaspeech/gigaspeech.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/dataset/gigaspeech/run.sh b/examples/dataset/gigaspeech/run.sh new file mode 100644 index 000000000..0f7b46ab9 --- /dev/null +++ b/examples/dataset/gigaspeech/run.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +curdir=$PWD + +test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git +cd GigaSpeech +source env_vars.sh +utils/gigaspeech_download.sh ${curdir}/ diff --git a/examples/dataset/librispeech/.gitignore b/examples/dataset/librispeech/.gitignore index dfd5c67b5..465806def 100644 --- a/examples/dataset/librispeech/.gitignore +++ b/examples/dataset/librispeech/.gitignore @@ -5,3 +5,5 @@ test-other train-clean-100 train-clean-360 train-other-500 +*.meta +manifest.* diff --git a/examples/dataset/mini_librispeech/.gitignore b/examples/dataset/mini_librispeech/.gitignore index 61f54c966..7fbcfd65d 100644 --- a/examples/dataset/mini_librispeech/.gitignore +++ b/examples/dataset/mini_librispeech/.gitignore @@ -2,3 +2,4 @@ dev-clean/ manifest.dev-clean manifest.train-clean train-clean/ +*.meta diff --git a/examples/dataset/thchs30/.gitignore b/examples/dataset/thchs30/.gitignore index 47dd6268f..b94cd7e40 100644 --- a/examples/dataset/thchs30/.gitignore +++ b/examples/dataset/thchs30/.gitignore @@ -3,3 +3,4 @@ manifest.* data_thchs30 resource test-noise +*.meta From 08b6213bc8b88378cb090534be74eaeb7df306ce Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 30 Jun 2021 03:00:18 +0000 Subject: [PATCH 049/281] fix private function --- deepspeech/training/trainer.py | 5 +- deepspeech/utils/checkpoint.py | 114 ++++++++++++++++++++++----------- 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index cd915760d..5ebba1a98 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -151,12 +151,11 @@ class Trainer(): resume training. 
""" scratch = None - infos = self.checkpoint._load_parameters( + infos = self.checkpoint.load_latest_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path, - checkpoint_file='checkpoint_latest') + checkpoint_path=self.args.checkpoint_path) if infos: # restore from ckpt self.iteration = infos["step"] diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index be36fdbb2..000fa87ba 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -38,23 +38,7 @@ class Checkpoint(object): self.kbest_n = kbest_n self.latest_n = latest_n self._save_all = (kbest_n == -1) - - def _should_save_best(self, metric: float) -> bool: - if not self._best_full(): - return True - - # already full - worst_record_path = max(self.best_records, key=self.best_records.get) - # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] - worst_metric = self.best_records[worst_record_path] - return metric < worst_metric - - def _best_full(self): - return (not self._save_all) and len(self.best_records) == self.kbest_n - - def _latest_full(self): - return len(self.latest_records) == self.latest_n - + def add_checkpoint(self, checkpoint_dir, tag_or_iteration, @@ -64,7 +48,7 @@ class Checkpoint(object): metric_type="val_loss"): if (metric_type not in infos.keys()): self._save_parameters(checkpoint_dir, tag_or_iteration, model, - optimizer, infos) + optimizer, infos) return #save best @@ -73,15 +57,71 @@ class Checkpoint(object): infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) #save latest - self._save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self._save_latest_checkpoint_and_update( + checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) + def load_latest_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, + "checkpoint_latest") + + def load_best_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, + "checkpoint_best") + + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def _best_full(self): + return (not self._save_all) and len(self.best_records) == self.kbest_n + + def _latest_full(self): + return len(self.latest_records) == self.latest_n + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, - tag_or_iteration, model, optimizer, - infos): + tag_or_iteration, model, optimizer, + infos): # remove the worst if self._best_full(): worst_record_path = max(self.best_records, @@ -93,8 +133,8 @@ class Checkpoint(object): self._del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, - infos) + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) self.best_records[tag_or_iteration] = metric def _save_latest_checkpoint_and_update( @@ -108,8 +148,8 @@ class Checkpoint(object): self._del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, - infos) + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, @@ -153,13 +193,12 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def _load_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None, - checkpoint_file=None): + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None, + checkpoint_file=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -209,13 +248,14 @@ class Checkpoint(object): configs = json.load(fin) return configs + @mp_tools.rank_zero_only def _save_parameters(self, - checkpoint_dir: str, - tag_or_iteration: Union[int, str], - model: paddle.nn.Layer, - optimizer: Optimizer=None, - infos: dict=None): + checkpoint_dir: str, + tag_or_iteration: Union[int, str], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None): """Checkpoint the latest trained model parameters. Args: checkpoint_dir (str): the directory where checkpoint is saved. 
From c0f7aac8fce3d1fbacbcf146e3e2b42abfe607ae Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Wed, 30 Jun 2021 03:10:34 +0000
Subject: [PATCH 050/281] revise conf/*.yaml

---
 deepspeech/utils/checkpoint.py                | 28 +++++++++----------
 examples/aishell/s0/conf/deepspeech2.yaml     |  3 ++
 examples/aishell/s1/conf/chunk_conformer.yaml |  3 ++
 examples/aishell/s1/conf/conformer.yaml       |  3 ++
 examples/librispeech/s0/conf/deepspeech2.yaml |  3 ++
 .../librispeech/s1/conf/chunk_confermer.yaml  |  3 ++
 .../s1/conf/chunk_transformer.yaml            |  3 ++
 examples/librispeech/s1/conf/conformer.yaml   |  3 ++
 examples/librispeech/s1/conf/transformer.yaml |  3 ++
 examples/tiny/s1/conf/chunk_confermer.yaml    |  3 ++
 examples/tiny/s1/conf/chunk_transformer.yaml  |  3 ++
 examples/tiny/s1/conf/conformer.yaml          |  3 ++
 examples/tiny/s1/conf/transformer.yaml        |  3 ++
 13 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py
index 000fa87ba..8c5d8d605 100644
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
@@ -24,7 +24,6 @@ from paddle.optimizer import Optimizer
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log

-# import operator
 logger = Log(__name__).getlog()

@@ -38,7 +37,7 @@ class Checkpoint(object):
         self.kbest_n = kbest_n
         self.latest_n = latest_n
         self._save_all = (kbest_n == -1)
-
+
     def add_checkpoint(self,
                        checkpoint_dir,
                        tag_or_iteration,
@@ -64,10 +63,10 @@ class Checkpoint(object):
             self._save_checkpoint_record(checkpoint_dir, tag_or_iteration)

     def load_latest_parameters(self,
-                              model,
-                              optimizer=None,
-                              checkpoint_dir=None,
-                              checkpoint_path=None):
+                               model,
+                               optimizer=None,
+                               checkpoint_dir=None,
+                               checkpoint_path=None):
         """Load the latest model checkpoint from disk.
         Args:
             model (Layer): model to load parameters.
@@ -80,14 +79,14 @@ class Checkpoint(object):
         Returns:
             configs (dict): epoch or step, lr and other meta info should be saved.
         """
-        return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path,
-                                     "checkpoint_latest")
+        return self._load_parameters(model, optimizer, checkpoint_dir,
+                                     checkpoint_path, "checkpoint_latest")

     def load_best_parameters(self,
-                            model,
-                            optimizer=None,
-                            checkpoint_dir=None,
-                            checkpoint_path=None):
+                             model,
+                             optimizer=None,
+                             checkpoint_dir=None,
+                             checkpoint_path=None):
         """Load the best model checkpoint from disk.
         Args:
             model (Layer): model to load parameters.
@@ -100,8 +99,8 @@ class Checkpoint(object):
         Returns:
             configs (dict): epoch or step, lr and other meta info should be saved.
""" - return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, - "checkpoint_best") + return self._load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_best") def _should_save_best(self, metric: float) -> bool: if not self._best_full(): @@ -248,7 +247,6 @@ class Checkpoint(object): configs = json.load(fin) return configs - @mp_tools.rank_zero_only def _save_parameters(self, checkpoint_dir: str, diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 54ce240e7..27ede01bc 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -48,6 +48,9 @@ training: weight_decay: 1e-06 global_grad_clip: 3.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: batch_size: 128 diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 904624c3c..1065dcb03 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -90,6 +90,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index 116c91927..4b1430c58 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -88,6 +88,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index d1746bff3..9f06a3802 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -43,6 +43,9 @@ training: weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: batch_size: 128 diff --git a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml index ec945a188..979121639 100644 --- a/examples/librispeech/s1/conf/chunk_confermer.yaml +++ b/examples/librispeech/s1/conf/chunk_confermer.yaml @@ -91,6 +91,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index 3939ffc68..dc2a51f92 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 8f8bf4539..989af22a0 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -87,6 +87,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index a094b0fba..931d7524b 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -82,6 +82,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git 
a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index 790066264..606300bdf 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -91,6 +91,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index aa2b145a6..72d368485 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 3813daa04..a6f730501 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -87,6 +87,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 250995faa..71cbdde7f 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: From 6ee67785f6b6d8445a0995df595bb7cbcb0204ad Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 1 Jul 2021 05:17:05 +0000 Subject: [PATCH 051/281] fix ctc alignment --- deepspeech/exps/u2/model.py | 40 ++++++++++++++++----------- deepspeech/utils/ctc_utils.py | 16 ++++++----- deepspeech/utils/text_grid.py | 2 +- deepspeech/utils/utility.py | 19 +++++++++++++ examples/aishell/s1/local/align.sh | 43 ++++++++++++++++++++++++++++++ tools/Makefile | 4 +-- 6 files changed, 100 insertions(+), 24 deletions(-) create mode 100755 examples/aishell/s1/local/align.sh diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 8802143d6..dd62f537e 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -39,6 +39,7 @@ from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils import text_grid +from deepspeech.utils import utility from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -280,7 +281,15 @@ class U2Trainer(Trainer): shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) - logger.info("Setup train/valid/test Dataloader!") + # return text token id + config.collator.keep_transcription_text = False + self.align_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator.from_config(config)) + logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): config = self.config @@ -507,16 +516,17 @@ class U2Tester(U2Trainer): sys.exit(1) # xxx.align - assert self.args.result_file + assert self.args.result_file and self.args.result_file.endswith( + '.align') self.model.eval() - logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}") + logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms - token_dict = self.test_loader.collate_fn.vocab_list + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list with open(self.args.result_file, 
'w') as fout:
+            # one example in batch
-            for i, batch in enumerate(self.test_loader):
+            for i, batch in enumerate(self.align_loader):
                 key, feat, feats_length, target, target_length = batch

                 # 1. Encoder
@@ -527,36 +537,36 @@ class U2Tester(U2Trainer):
                     encoder_out)  # (1, maxlen, vocab_size)

                 # 2. alignment
-                # print(ctc_probs.size(1))
                 ctc_probs = ctc_probs.squeeze(0)
                 target = target.squeeze(0)
                 alignment = ctc_utils.forced_align(ctc_probs, target)
-                print(kye[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                 fout.write('{} {}\n'.format(key[0], alignment))

                 # 3. gen praat
                 # segment alignment
                 align_segs = text_grid.segment_alignment(alignment)
-                print(kye[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                 # IntervalTier, List["start end token\n"]
-                subsample = get_subsample(self.config)
+                subsample = utility.get_subsample(self.config)
                 tierformat = text_grid.align_to_tierformat(
                     align_segs, subsample, token_dict)
                 # write tier
-                tier_path = os.path.join(
-                    os.path.dirname(args.result_file), key[0] + ".tier")
+                align_output_path = os.path.join(
+                    os.path.dirname(self.args.result_file), "align")
+                tier_path = os.path.join(align_output_path, key[0] + ".tier")
                 with open(tier_path, 'w') as f:
                     f.writelines(tierformat)
                 # write textgrid
-                textgrid_path = s.path.join(
-                    os.path.dirname(args.result_file), key[0] + ".TextGrid")
+                textgrid_path = os.path.join(align_output_path,
+                                             key[0] + ".TextGrid")
                 second_per_frame = 1. / (1000. /
                                          stride_ms)  # 25ms window, 10ms stride
                 second_per_example = (
                     len(alignment) + 1) * subsample * second_per_frame
                 text_grid.generate_textgrid(
                     maxtime=second_per_example,
-                    lines=tierformat,
+                    intervals=tierformat,
                     output=textgrid_path)

     def run_align(self):
diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py
index 6201233df..09543d48d 100644
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -86,13 +86,15 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     log_alpha = paddle.zeros(
         (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
     log_alpha = log_alpha - float('inf')  # log of zero
+    # TODO(Hui Zhang): zeros not support paddle.int16
     state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1
+        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
     )  # state path, Tuple((T, 2L+1))

     # init start state
-    log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
-    log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb
+    # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
+    log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])]  # State-b, Sb
+    log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])]  # State-nb, Snb

     for t in range(1, ctc_probs.size(0)):  # T
         for s in range(len(y_insert_blank)):  # 2L+1
@@ -108,11 +110,13 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                     log_alpha[t - 1, s - 2],
                 ])
                 prev_state = [s, s - 1, s - 2]
-            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
-                y_insert_blank[s]]
+            # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
+            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int(
+                y_insert_blank[s])]
             state_path[t, s] = prev_state[paddle.argmax(candidates)]

-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16)
+    # TODO(Hui Zhang): zeros not support paddle.int16
+    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)

     candidates = paddle.to_tensor([
         log_alpha[-1, len(y_insert_blank) - 1],  # Sb
diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py
index b774130db..3af58c9ba 100644
--- a/deepspeech/utils/text_grid.py
+++ b/deepspeech/utils/text_grid.py
@@ -110,7 +110,7 @@ def generate_textgrid(maxtime: float,
     """
     # Download Praat: https://www.fon.hum.uva.nl/praat/
     avg_interval = maxtime / (len(intervals) + 1)
-    print(f"average duration per {name}: {avg_interval}")
+    print(f"average second/token: {avg_interval}")
     margin = 0.0001

     tg = textgrid.TextGrid(maxTime=maxtime)
diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py
index 64570026b..a0639e065 100644
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@@ -79,3 +79,22 @@ def log_add(args: List[int]) -> float:
     a_max = max(args)
     lsp = math.log(sum(math.exp(a - a_max) for a in args))
     return a_max + lsp
+
+
+def get_subsample(config):
+    """Subsample rate from config.
+
+    Args:
+        config (yacs.config.CfgNode): yaml config
+
+    Returns:
+        int: subsample rate.
+    """
+    input_layer = config["model"]["encoder_conf"]["input_layer"]
+    assert input_layer in ["conv2d", "conv2d6", "conv2d8"]
+    if input_layer == "conv2d":
+        return 4
+    elif input_layer == "conv2d6":
+        return 6
+    elif input_layer == "conv2d8":
+        return 8
diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/s1/local/align.sh
new file mode 100755
index 000000000..926cb9397
--- /dev/null
+++ b/examples/aishell/s1/local/align.sh
@@ -0,0 +1,43 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ $ngpu == 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
+ckpt_name=$(basename ${ckpt_prefix})
+
+mkdir -p exp
+
+
+
+batch_size=1
+output_dir=${ckpt_prefix}
+mkdir -p ${output_dir}
+
+# align dump in `result_file`
+# .tier, .TextGrid dump in `dir of result_file`
+python3 -u ${BIN_DIR}/alignment.py \
+--device ${device} \
+--nproc 1 \
+--config ${config_path} \
+--result_file ${output_dir}/${type}.align \
+--checkpoint_path ${ckpt_prefix} \
+--opts decoding.batch_size ${batch_size}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in ctc alignment!"
+    exit 1
+fi
+
+exit 0
diff --git a/tools/Makefile b/tools/Makefile
index dd5902373..94e5ea2f7 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -19,7 +19,7 @@ kenlm.done:
 	apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50
 	test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
 	mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install
-	cd kenlm && python setup.py install
+	source venv/bin/activate; cd kenlm && python setup.py install
 	touch kenlm.done

 sox.done:
@@ -32,4 +32,4 @@ sox.done:
 soxbindings.done:
 	test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git
 	source venv/bin/activate; cd soxbindings && python setup.py install
-	touch soxbindings.done
\ No newline at end of file
+	touch soxbindings.done

From 4c9a1f6dc7def927d5c8b32ff7bbf87224eed693 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 1 Jul 2021 07:41:27 +0000
Subject: [PATCH 052/281] add align.sh and update run.sh

---
 examples/aishell/s1/run.sh             |  7 ++++-
 examples/librispeech/s1/local/align.sh | 43 ++++++++++++++++++++++++++
 examples/librispeech/s1/run.sh         |  5 +++
 examples/tiny/s1/local/align.sh        | 43 ++++++++++++++++++++++++++
 examples/tiny/s1/run.sh                |  8 ++++-
 5 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100755 examples/librispeech/s1/local/align.sh
 create mode 100755 examples/tiny/s1/local/align.sh

diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh
index 4cf09553b..562cfa04d 100644
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@@ -30,10 +30,15 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=4 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # ctc alignment of test data
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/s1/local/align.sh
new file mode 100755
index 000000000..926cb9397
--- /dev/null
+++ b/examples/librispeech/s1/local/align.sh
@@ -0,0 +1,43 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ $ngpu == 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
+ckpt_name=$(basename ${ckpt_prefix})
+
+mkdir -p exp
+
+
+
+batch_size=1
+output_dir=${ckpt_prefix}
+mkdir -p ${output_dir}
+
+# align dump in `result_file`
+# .tier, .TextGrid dump in `dir of result_file`
+python3 -u ${BIN_DIR}/alignment.py \
+--device ${device} \
+--nproc 1 \
+--config ${config_path} \
+--result_file ${output_dir}/${type}.align \
+--checkpoint_path ${ckpt_prefix} \
+--opts decoding.batch_size ${batch_size}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in ctc alignment!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh
index 65194d902..b81e8dcfd 100755
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
@@ -33,6 +33,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # ctc alignment of test data
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/s1/local/align.sh
new file mode 100755
index 000000000..926cb9397
--- /dev/null
+++ b/examples/tiny/s1/local/align.sh
@@ -0,0 +1,43 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ $ngpu == 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
+ckpt_name=$(basename ${ckpt_prefix})
+
+mkdir -p exp
+
+
+
+batch_size=1
+output_dir=${ckpt_prefix}
+mkdir -p ${output_dir}
+
+# align dump in `result_file`
+# .tier, .TextGrid dump in `dir of result_file`
+python3 -u ${BIN_DIR}/alignment.py \
+--device ${device} \
+--nproc 1 \
+--config ${config_path} \
+--result_file ${output_dir}/${type}.align \
+--checkpoint_path ${ckpt_prefix} \
+--opts decoding.batch_size ${batch_size}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in ctc alignment!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh
index b148869b7..41f845b05 100755
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
@@ -34,6 +34,12 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # ctc alignment of test data
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
+

From ccfecd17b2baacd05a9c6b93181017569f10d46a Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 2 Jul 2021 09:26:02 +0000
Subject: [PATCH 053/281] mfa and kaldi deps install

---
 tools/install/install_gcc.sh      | 15 ++++++++++++
 tools/install/install_kaldi.sh    | 34 +++++++++++++++++++++++++++
 tools/install/install_openblas.sh | 39 +++++++++++++++++++++++++++
 tools/install/ngram.sh            | 17 ++++++++++++++
 tools/install/openfst.sh          | 21 +++++++++++++++++
 tools/install/pynini.sh           | 13 +++++++++++
 6 files changed, 139 insertions(+)
 create mode 100755 tools/install/install_gcc.sh
 create mode 100755 tools/install/install_kaldi.sh
 create mode 100755 tools/install/install_openblas.sh
 create mode 100755 tools/install/ngram.sh
 create mode 100755 tools/install/openfst.sh
 create mode 100755 tools/install/pynini.sh

diff --git a/tools/install/install_gcc.sh b/tools/install/install_gcc.sh
new file mode 100755
index 000000000..6eda8ea71
--- /dev/null
+++ b/tools/install/install_gcc.sh
@@ -0,0 +1,15 @@
+set -e
+set -x
+
+# gcc
+apt update -y
+apt install build-essential -y
+apt install
software-properties-common -y +add-apt-repository ppa:ubuntu-toolchain-r/test +apt install gcc-8 g++-8 -y +update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 80 +update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 80 +update-alternatives --config gcc + +# gfortran +apt-get install gfortran-8 diff --git a/tools/install/install_kaldi.sh b/tools/install/install_kaldi.sh new file mode 100755 index 000000000..545fbe4ee --- /dev/null +++ b/tools/install/install_kaldi.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Installation script for Kaldi +# +set -e + +apt-get install subversion -y + +KALDI_GIT="--depth 1 -b master https://github.com/kaldi-asr/kaldi.git" + +KALDI_DIR="$PWD/kaldi" + +if [ ! -d "$KALDI_DIR" ]; then + git clone $KALDI_GIT $KALDI_DIR +else + echo "$KALDI_DIR already exists!" +fi + +cd "$KALDI_DIR/tools" +git pull + +# Prevent kaldi from switching default python version +mkdir -p "python" +touch "python/.use_default_python" + +./extras/check_dependencies.sh + +make -j4 + +cd ../src +./configure --shared --use-cuda=no --static-math +make clean -j && make depend -j && make -j4 + +echo "Done installing Kaldi." diff --git a/tools/install/install_openblas.sh b/tools/install/install_openblas.sh new file mode 100755 index 000000000..b1e4d3da8 --- /dev/null +++ b/tools/install/install_openblas.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +OPENBLAS_VERSION=0.3.13 + +WGET=${WGET:-wget} + +set -e + +if ! command -v gfortran 2>/dev/null; then + echo "$0: gfortran is not installed. Please install it, e.g. by:" + echo " apt-get install gfortran" + echo "(if on Debian or Ubuntu), or:" + echo " yum install gcc-gfortran" + echo "(if on RedHat/CentOS). On a Mac, if brew is installed, it's:" + echo " brew install gfortran" + exit 1 +fi + + +tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz + +rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz + +if [ -d "$DOWNLOAD_DIR" ]; then + cp -p "$DOWNLOAD_DIR/$tarball" . +else + url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') + test -n "$url" + $WGET -t3 -nv -O $tarball "$url" +fi + +tar xzf $tarball +mv xianyi-OpenBLAS-* OpenBLAS + +make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install +if [ $? -eq 0 ]; then + echo "OpenBLAS is installed successfully." 
+ rm $tarball +fi diff --git a/tools/install/ngram.sh b/tools/install/ngram.sh new file mode 100755 index 000000000..304bb93d0 --- /dev/null +++ b/tools/install/ngram.sh @@ -0,0 +1,17 @@ +set -e +set -x + +# need support c++17, so need gcc >= 8 +# openfst +ngram=ngram-1.3.13 +shared=true + +test -e ${ngram}.tar.gz || wget http://www.openfst.org/twiki/pub/GRM/NGramDownload/${ngram}.tar.gz +test -d ${ngram} || tar -xvf ${ngram}.tar.gz && chown -R root:root ${ngram} + +if [ $shared == true ];then + pushd ${ngram} && ./configure --enable-shared && popd +else + pushd ${ngram} && ./configure --enable-static && popd +fi +pushd ${ngram} && make -j && make install && popd diff --git a/tools/install/openfst.sh b/tools/install/openfst.sh new file mode 100755 index 000000000..8c6a45748 --- /dev/null +++ b/tools/install/openfst.sh @@ -0,0 +1,21 @@ +set -e +set -x + +# need support c++17, so need gcc >= 8 +# openfst +openfst=openfst-1.8.1 +shared=true + +test -e ${openfst}.tar.gz || wget http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz +test -d ${openfst} || tar -xvf ${openfst}.tar.gz && chown -R root:root ${openfst} + +wfst_so_path=$(python3 -c 'import sysconfig; import os; from pathlib import Path; site = sysconfig.get_paths()["purelib"]; site=Path(site); suffix = ("/usr/local/lib",) + site.parts[-2:]; print(os.path.join(*suffix));') + +if [ $shared == true ];then + pushd ${openfst} && ./configure --enable-shared --enable-compact-fsts --enable-compress --enable-const-fsts --enable-far --enable-linear-fsts --enable-lookahead-fsts --enable-mpdt --enable-ngram-fsts --enable-pdt --enable-python --enable-special --enable-bin --enable-grm --prefix ${PWD}/output && popd +else + pushd ${openfst} && ./configure --enable-static --enable-compact-fsts --enable-compress --enable-const-fsts --enable-far --enable-linear-fsts --enable-lookahead-fsts --enable-mpdt --enable-ngram-fsts --enable-pdt --enable-python --enable-special --enable-bin --enable-grm --prefix ${PWD}/output && popd +fi +pushd ${openfst} && make -j && make install && popd + +cp ${wfst_so_path}/pywrapfst.* $(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') diff --git a/tools/install/pynini.sh b/tools/install/pynini.sh new file mode 100755 index 000000000..a6f86f550 --- /dev/null +++ b/tools/install/pynini.sh @@ -0,0 +1,13 @@ +set -e +set -x + +pynini=pynini-2.1.4 + +test -e ${pynini}.tar.gz || wget http://www.openfst.org/twiki/pub/GRM/PyniniDownload/${pynini}.tar.gz +test -d ${pynini} || tar -xvf ${pynini}.tar.gz && chown -R root:root ${pynini} + +#wfst_so_path=$(python3 -c 'import sysconfig; import os; from pathlib import Path; site = sysconfig.get_paths()["purelib"]; site=Path(site); suffix = ("/usr/local/lib",) + site.parts[-2:]; print(os.path.join(*suffix));') + +pushd ${pynini} && python setup.py install && popd + +#cp ${wfst_so_path}/pywrapfst.* $(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') From f6fb815364d3e8a39688f72ad7d5fef624be011a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 2 Jul 2021 10:46:21 +0000 Subject: [PATCH 054/281] mfa, kaldi install scripts --- tools/install/README.md | 11 + tools/install/install_mfa.sh | 13 + tools/install/install_miniconda.sh | 19 ++ tools/install/install_mkl.sh | 240 ++++++++++++++++++ tools/install/{ngram.sh => install_ngram.sh} | 0 .../{openfst.sh => install_openfst.sh} | 0 .../install/{pynini.sh => install_pynini.sh} | 0 7 files changed, 283 insertions(+) create mode 100644 tools/install/README.md create mode 100755 
tools/install/install_mfa.sh
 create mode 100755 tools/install/install_miniconda.sh
 create mode 100755 tools/install/install_mkl.sh
 rename tools/install/{ngram.sh => install_ngram.sh} (100%)
 rename tools/install/{openfst.sh => install_openfst.sh} (100%)
 rename tools/install/{pynini.sh => install_pynini.sh} (100%)

diff --git a/tools/install/README.md b/tools/install/README.md
new file mode 100644
index 000000000..19c06a134
--- /dev/null
+++ b/tools/install/README.md
@@ -0,0 +1,11 @@
+1. kaldi
+
+deps gcc, mkl or openblas
+
+2. OpenFST/ngram/pynini
+
+deps gcc
+
+3. MFA
+
+deps kaldi
diff --git a/tools/install/install_mfa.sh b/tools/install/install_mfa.sh
new file mode 100755
index 000000000..fdcdaa948
--- /dev/null
+++ b/tools/install/install_mfa.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git
+
+pushd Montreal-Forced-Aligner && python setup.py install
+
+test -d kaldi || { echo "need install kaldi first"; exit 1;}
+
+mfa thirdparty kaldi $PWD/kaldi
+
+mfa thirdparty validate
+
+echo "install mfa pass."
diff --git a/tools/install/install_miniconda.sh b/tools/install/install_miniconda.sh
new file mode 100755
index 000000000..3d1909af6
--- /dev/null
+++ b/tools/install/install_miniconda.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+WGET=${WGET:-wget}
+
+# The script automatically chooses the default settings of miniconda for installation.
+# Miniconda will be installed in the HOME directory ($HOME/miniconda3).
+# It also does not make miniconda's python the default python.
+
+if [ -d "$DOWNLOAD_DIR" ]; then
+  cp -p "$DOWNLOAD_DIR/Miniconda3-latest-Linux-x86_64.sh" . || exit 1
+else
+  $WGET https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh || exit 1
+fi
+bash Miniconda3-latest-Linux-x86_64.sh -b
+
+$HOME/miniconda3/bin/python -m pip install --user tqdm
+$HOME/miniconda3/bin/python -m pip install --user scikit-learn
+$HOME/miniconda3/bin/python -m pip install --user librosa
+$HOME/miniconda3/bin/python -m pip install --user h5py
diff --git a/tools/install/install_mkl.sh b/tools/install/install_mkl.sh
new file mode 100755
index 000000000..84fdc9c37
--- /dev/null
+++ b/tools/install/install_mkl.sh
@@ -0,0 +1,240 @@
+#!/usr/bin/env bash
+
+# Intel MKL is now freely available even for commercial use. This script
+# attempts to install the MKL package automatically from Intel's repository.
+#
+# For manual repository setup instructions, see:
+#   https://software.intel.com/articles/installing-intel-free-libs-and-python-yum-repo
+#   https://software.intel.com/articles/installing-intel-free-libs-and-python-apt-repo
+#
+# For other package managers, or non-Linux platforms, see:
+#   https://software.intel.com/mkl/choose-download

+set -o pipefail
+
+default_package=intel-mkl-64bit-2020.0-088
+
+yum_repo='https://yum.repos.intel.com/mkl/setup/intel-mkl.repo'
+apt_repo='https://apt.repos.intel.com/mkl'
+intel_key_url='https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB'
+
+Usage () {
+  cat >&2 <<EOF
+Usage: $0 [-s] [-p <suse|redhat|debian|fedora|arch>] [<MKL-PACKAGE>]
+
+  Checks if MKL is present on the system, and/or attempts to install it.
+
+  If <MKL-PACKAGE> is not provided, ${default_package} will be installed.
+
+  Intel packages are installed under the /opt/intel directory. You should be root
+  to install MKL into this directory; run this script using the sudo command.
+
+  Options:
+    -s  - Skip check for MKL being already present.
+    -p <suse|redhat|debian|fedora|arch>  -- Force type of package management. Use only
+          if automatic detection fails, as instructed.
+    -h  - Show this message.
+
+  Environment:
+    CC   The C compiler to use for MKL check. If not set, uses 'cc'.
+EOF
+  exit 2
+}
+
+Fatal () { echo "$0: $@"; exit 1; }
+
+Have () { type -t "$1" >/dev/null; }
+
+# Option values.
+skip_cc=
+distro=
+
+while getopts ":hksp:" opt; do
+  case ${opt} in
+    h) Usage ;;
+    s) skip_cc=yes ;;
+    p) case $OPTARG in
+         suse|redhat|debian|fedora|arch) distro=$OPTARG ;;
+         *) Fatal "invalid value -p '${OPTARG}'. " \
+                  "Allowed: 'suse', 'redhat', 'debian', 'fedora', or 'arch'."
+       esac ;;
+    \?) echo >&2 "$0: invalid option -${OPTARG}."; Usage ;;
+  esac
+done
+shift $((OPTIND-1))
+
+orig_arg_package=${1-''}
+package=${1:-$default_package}
+
+# Check that we are actually on Linux, otherwise give a helpful reference.
+[[ $(uname) == Linux ]] || Fatal "\
+This script can be used on Linux only, and your system is $(uname).
+
+Installer packages for Mac and Windows are available for download from Intel:
+https://software.intel.com/mkl/choose-download"
+
+# Test if MKL is already installed on the system.
+if [[ ! $skip_cc ]]; then
+  : ${CC:=cc}
+  Have "$CC" || Fatal "\
+C compiler $CC not found.
+
+You can skip the check for MKL presence by invoking this script with the '-s'
+option to this script, but you will need a functional compiler anyway, so we
+recommend that you install it first."
+
+  mkl_version=$($CC -E -I /opt/intel/mkl/include - <<< \
+                    '#include <mkl_version.h>
+__INTEL_MKL__.__INTEL_MKL_MINOR__.__INTEL_MKL_UPDATE__' 2>/dev/null |
+                  tail -n 1 ) || mkl_version=
+  mkl_version=${mkl_version// /}
+
+  [[ $mkl_version ]] && Fatal "\
+MKL version $mkl_version is already installed.
+
+You can skip the check for MKL presence by invoking this script with the '-s'
+option and proceed with automated installation, but we highly discourage
+this. This script will register Intel repositories with your system, and it
+seems that they have been already registered, or MKL has been installed some
+other way.
+
+You should use your package manager to check which MKL package is already
+installed. Note that Intel packages register the latest installed version of
+the library as the default. If your installed version is older than
+$package, it makes sense to upgrade."
+fi
+
+# Try to determine which package manager the distro uses, unless overridden.
+if [[ ! $distro ]]; then
+  dist_vars=$(cat /etc/os-release 2>/dev/null)
+  eval "$dist_vars"
+  for rune in $CPE_NAME $ID $ID_LIKE; do
+    case "$rune" in
+      cpe:/o:fedoraproject:fedora:2[01]) distro=redhat; break;;  # Use yum.
+      rhel|centos) distro=redhat; break;;
+      redhat|suse|fedora|debian|arch) distro=$rune; break;;
+    esac
+  done
+
+  # Certain old distributions do not have /etc/os-release. We are unlikely to
+  # encounter these in the wild, but just in case.
+  # NOTE: Do not try to guess Fedora specifically here! Fedora 20 and below
+  # detect as redhat, and this is good, because they use yum by default.
+  [[ ! $distro && -f /etc/redhat-release ]] && distro=redhat
+  [[ ! $distro && -f /etc/SuSE-release ]] && distro=suse
+  [[ ! $distro && -f /etc/debian_release ]] && distro=debian
+  [[ ! $distro && -f /etc/arch-release ]] && distro=arch
+
+  [[ ! $distro ]] && Fatal "\
+Unable to determine package management style.
+
+Invoke this script with the option '-p