From 30aba266930e84bc016f66457b3add4cee2054f4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 May 2021 12:07:59 +0000 Subject: [PATCH 01/25] add align code --- deepspeech/exps/u2/model.py | 63 +++++++++++++++++ deepspeech/utils/ctc_utils.py | 6 +- deepspeech/utils/text_grid.py | 125 ++++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 deepspeech/utils/text_grid.py diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index f166a071e..6da0c3bd1 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -34,9 +34,11 @@ from deepspeech.models.u2 import U2Model from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.scheduler import WarmupLR from deepspeech.training.trainer import Trainer +from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools +from deepspeech.utils import text_grid from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -483,6 +485,67 @@ class U2Tester(U2Trainer): except KeyboardInterrupt: sys.exit(-1) + @paddle.no_grad() + def align(self): + if self.config.decoding.batch_size > 1: + logger.fatal('alignment mode must be running with batch_size == 1') + sys.exit(1) + + # xxx.align + assert self.args.result_file + + self.model.eval() + logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}") + + stride_ms = self.test_loader.dataset.stride_ms + token_dict = self.test_loader.dataset.vocab_list + with open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + key, feat, feats_length, target, target_length = batch + # 1. Encoder + encoder_out, encoder_mask = self.model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + ctc_probs = self.model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + + # 2. 
alignment + # print(ctc_probs.size(1)) + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = ctc_utils.forced_align(ctc_probs, target) + print(alignment) + fout.write('{} {}\n'.format(key[0], alignment)) + + # 3. gen praat + # segment alignment + align_segs = text_grid.segment_alignment(alignment) + print(align_segs) + # IntervalTier, List["start end token\n"] + subsample = get_subsample(self.config) + tierformat = text_grid.align_to_tierformat( + align_segs, subsample, token_dict) + tier_path = os.path.join( + os.path.dirname(self.args.result_file), key[0] + ".tier") + with open(tier_path, 'w') as f: + f.writelines(tierformat) + + textgrid_path = os.path.join( + os.path.dirname(self.args.result_file), key[0] + ".TextGrid") + second_per_frame = 1. / (1000. / stride_ms + ) # 25ms window, 10ms stride + text_grid.generate_textgrid( + maxtime=(len(alignment) + 1) * subsample * second_per_frame, + intervals=tierformat, + output=textgrid_path) + + def run_align(self): + self.resume_or_scratch() + try: + self.align() + except KeyboardInterrupt: + sys.exit(-1) + def load_inferspec(self): """infer model and input spec. diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 73669fea6..76c1898be 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -46,7 +46,7 @@ def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]: return new_hyp -def insert_blank(label: np.ndarray, blank_id: int=0): +def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray: """Insert blank token between every two label token. "abcdefg" -> "-a-b-c-d-e-f-g-" @@ -67,7 +67,7 @@ def insert_blank(label: np.ndarray, blank_id: int=0): def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, - blank_id=0) -> list: + blank_id=0) -> List[int]: """ctc forced alignment. 
https://distill.pub/2017/ctc/ @@ -77,7 +77,7 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, y (paddle.Tensor): label id sequence tensor, 1d tensor (L) blank_id (int): blank symbol index Returns: - paddle.Tensor: best alignment result, (T). + List[int]: best alignment result, (T). """ y_insert_blank = insert_blank(y, blank_id) diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py new file mode 100644 index 000000000..9afed89e0 --- /dev/null +++ b/deepspeech/utils/text_grid.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict +from typing import List +from typing import Text + +import textgrid + + +def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]: + """segment ctc alignment ids by continuous blank and repeat label. + + Args: + alignment (List[int]): ctc alignment id sequence. e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + List[List[int]]: segment aligment id sequence. e.g. 
[[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] + """ + # convert alignment to a praat format, which is a doing phonetics + # by computer and helps analyzing alignment + align_segs = [] + # get frames level duration for each token + start = 0 + end = 0 + while end < len(alignment): + while end < len(alignment) and alignment[end] == blank_id: # blank + end += 1 + if end == len(alignment): + align_segs[-1].extend(alignment[start:]) + break + end += 1 + while end < len(alignment) and alignment[end - 1] == alignment[ + end]: # repeat label + end += 1 + align_segs.append(alignment[start:end]) + start = end + return align_segs + + +def align_to_tierformat(align_segs: List[List[int]], + subsample: int, + token_dict: Dict[int, Text], + blank_id=0) -> List[Text]: + """Generate textgrid.Interval format from alignment segmentations. + + Args: + align_segs (List[List[int]]): segmented ctc alignment ids. + subsample (int): 25ms frame_length, 10ms hop_length, 1/subsample + token_dict (Dict[int, Text]): int -> str map. + + Returns: + List[Text]: list of textgrid.Interval. 
+ """ + hop_length = 10 # ms + second_ms = 1000 # ms + frame_per_second = second_ms / hop_length # 25ms frame_length, 10ms hop_length + second_per_frame = 1.0 / frame_per_second + + begin = 0 + duration = 0 + tierformat = [] + + for idx, tokens in enumerate(align_segs): + token_len = len(tokens) + token = tokens[-1] + # time duration in second + duration = token_len * subsample * second_per_frame + if idx < len(align_segs) - 1: + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + else: + for i in tokens: + if i != blank_id: + token = i + break + print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}") + tierformat.append( + f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n") + begin = begin + duration + + return tierformat + + +def generate_textgrid(maxtime: float, + intervals: List[Text], + output: Text, + name: Text='ali') -> None: + """Create alignment textgrid file. + + Args: + maxtime (float): audio duartion. + intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item. + output (Text): textgrid filepath. + name (Text, optional): tier or layer name. Defaults to 'ali'. 
+ """ + # Download Praat: https://www.fon.hum.uva.nl/praat/ + avg_interval = maxtime / (len(intervals) + 1) + print(f"average duration per {name}: {avg_interval}") + margin = 0.0001 + + tg = textgrid.TextGrid(maxTime=maxtime) + tier = textgrid.IntervalTier(name=name, maxTime=maxtime) + + i = 0 + for dur in intervals: + s, e, text = dur.split() + tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text) + + tg.append(tier) + + tg.write(output) + print("successfully generator textgrid {}.".format(output)) From 92381451fbdb7fdf56af531de9e7ca145d4df815 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 May 2021 12:08:06 +0000 Subject: [PATCH 02/25] format --- README.md | 2 +- deepspeech/frontend/normalizer.py | 3 ++- doc/src/asr_text_backend.md | 2 +- doc/src/benchmark.md | 1 - doc/src/chinese_syllable.md | 2 +- doc/src/dataset.md | 2 +- doc/src/feature_list.md | 2 +- doc/src/ngram_lm.md | 2 +- doc/src/praat_textgrid.md | 15 +++++++-------- doc/src/tools.md | 1 - doc/src/tts_text_front_end.md | 6 +++--- requirements.txt | 4 ++-- 12 files changed, 20 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index a2de1783a..424dc485e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ ## Features - See [feature list](doc/src/feature_list.md) for more information. + See [feature list](doc/src/feature_list.md) for more information. 
## Setup diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 6b224080b..287b51e58 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -179,7 +179,8 @@ class FeatureNormalizer(object): wav_number += batch_size if wav_number % 1000 == 0: - logger.info(f'process {wav_number} wavs,{all_number} frames.') + logger.info( + f'process {wav_number} wavs,{all_number} frames.') self.cmvn_info = { 'mean_stat': list(all_mean_stat.tolist()), diff --git a/doc/src/asr_text_backend.md b/doc/src/asr_text_backend.md index 879e56f8a..c3c9896c7 100644 --- a/doc/src/asr_text_backend.md +++ b/doc/src/asr_text_backend.md @@ -98,4 +98,4 @@ ## Text Filter -* 敏感词(黄暴、涉政、违法违禁等) \ No newline at end of file +* 敏感词(黄暴、涉政、违法违禁等) diff --git a/doc/src/benchmark.md b/doc/src/benchmark.md index f3af25552..9c1c86fd7 100644 --- a/doc/src/benchmark.md +++ b/doc/src/benchmark.md @@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L | 8 | 6.95 X | `utils/profile.sh` provides such a demo profiling tool, you can change it as need. 
- diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md index 676ecb531..fd5a6159a 100644 --- a/doc/src/chinese_syllable.md +++ b/doc/src/chinese_syllable.md @@ -48,4 +48,4 @@ ## Zhuyin * [Bopomofo](https://en.wikipedia.org/wiki/Bopomofo) -* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) \ No newline at end of file +* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) diff --git a/doc/src/dataset.md b/doc/src/dataset.md index d70d0e0d2..aaa805510 100644 --- a/doc/src/dataset.md +++ b/doc/src/dataset.md @@ -18,4 +18,4 @@ ### ASR Noise -* [asr-noises](https://github.com/speechio/asr-noises) \ No newline at end of file +* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/feature_list.md b/doc/src/feature_list.md index 57641d5ea..573669fa2 100644 --- a/doc/src/feature_list.md +++ b/doc/src/feature_list.md @@ -58,4 +58,4 @@ ### Grapheme To Phoneme * syallable -* phoneme \ No newline at end of file +* phoneme diff --git a/doc/src/ngram_lm.md b/doc/src/ngram_lm.md index 07aa5411c..119a3b21c 100644 --- a/doc/src/ngram_lm.md +++ b/doc/src/ngram_lm.md @@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified ``` build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm - ``` \ No newline at end of file + ``` diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md index c25c760ae..06c4f8791 100644 --- a/doc/src/praat_textgrid.md +++ b/doc/src/praat_textgrid.md @@ -76,7 +76,7 @@ pip3 install textgrid tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 ``` - tg.tiers属性: + tg.tiers属性: 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: ```text @@ -86,7 +86,7 @@ pip3 install textgrid Interval(1361.89250, 1362.01250, R), Interval(1362.01250, 1362.13250, AY1), Interval(1362.13250, 1362.16250, T), - + ... ] ) @@ -113,7 +113,7 @@ pip3 install textgrid Interval 可以理解为时长 ``` - + 2. 
textgrid库中的对象 **IntervalTier** 对象: @@ -148,7 +148,7 @@ pip3 install textgrid strict -- > 返回bool值, 表示是否严格TextGrid格式 ``` - ​ + ​ **PointTier** 对象: 方法 @@ -174,7 +174,7 @@ pip3 install textgrid name 返回name ``` - + **Point** 对象: 支持比较大小, 支持加减运算 @@ -185,7 +185,7 @@ pip3 install textgrid time: ``` - ​ + ​ **Interval** 对象: 支持比较大小, 支持加减运算 @@ -250,10 +250,9 @@ pip3 install textgrid grids: --> 返回读取的grids的列表 ``` - + ## Reference * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 * https://blog.csdn.net/duxin_csdn/article/details/88966295 - diff --git a/doc/src/tools.md b/doc/src/tools.md index 4ec09f6a2..5fcca9239 100644 --- a/doc/src/tools.md +++ b/doc/src/tools.md @@ -1,4 +1,3 @@ # Useful Tools * [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) - diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md index 6eb9ae5d9..9f2f91097 100644 --- a/doc/src/tts_text_front_end.md +++ b/doc/src/tts_text_front_end.md @@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese ### Tokenization -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. +**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary. @@ -31,7 +31,7 @@ These “tags” label the part of speech. There are 24 part of speech tags and ### Stop Words -In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. +In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. 
More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data. @@ -188,4 +188,4 @@ TN: 基于规则的方法 ## Reference * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/) * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f) -* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb) \ No newline at end of file +* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb) diff --git a/requirements.txt b/requirements.txt index a6facb6cb..57a951bbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage pre-commit +pybind11 resampy==0.2.2 scipy==1.2.1 sentencepiece @@ -7,7 +8,6 @@ snakeviz SoundFile==0.9.0.post1 sox tensorboardX +textgrid typeguard yacs -pybind11 -textgrid From 3a76707062452d775330382ca1ad6e04b3483443 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 1 Jun 2021 08:32:41 +0000 Subject: [PATCH 03/25] rm useless --- doc/src/chinese_syllable.md | 51 ------- doc/src/dataset.md | 21 --- doc/src/praat_textgrid.md | 258 ---------------------------------- doc/src/tools.md | 3 - doc/src/tts_text_front_end.md | 191 ------------------------- 5 files changed, 524 deletions(-) delete mode 100644 doc/src/chinese_syllable.md delete mode 100644 doc/src/dataset.md delete mode 100644 doc/src/praat_textgrid.md delete mode 100644 doc/src/tools.md delete mode 100644 doc/src/tts_text_front_end.md diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md deleted file mode 100644 index fd5a6159a..000000000 --- 
a/doc/src/chinese_syllable.md +++ /dev/null @@ -1,51 +0,0 @@ -# chinese syllable - - - -## Syllable - -* [List of Syllables in Pinyin](https://resources.allsetlearning.com/chinese/pronunciation/syllable) - The word syllable is a term referring to the units of a word, composed on an (optional) initial, a final, and a tone. - - The word "syllable" is 音节 (yīnjié) in Chinese. - - Most spoken syllables in Mandarin Chinese correspond to one written Chinese character. - - There are a total of 410 common pinyin syllables. - -* [Rare syllable](https://resources.allsetlearning.com/chinese/pronunciation/Rare_syllable) - -* [Chinese Pronunciation: The Complete Guide for Beginner](https://www.digmandarin.com/chinese-pronunciation-guide.html) - -* [Mandarin Chinese Phonetics](http://www.zein.se/patrick/chinen8p.html) - -* [chinese phonetics](https://www.easymandarin.cn/online-chinese-lessons/chinese-phonetics/) - Chinese Characters, called “Hanzi”, are the writing symbols of the Chinese language. - Pinyin is the Romanization of a phonetic notation for Chinese Characters. - Each syllable is composed of three parts: initials, finals, and tones. - In the Pinyin system there are 23 initials, 24 finals, 4 tones and a neutral tone. 
- - - -## Pinyin -* [Pinyin](https://en.wikipedia.org/wiki/Pinyin) -* [Pinyin quick start guide](https://resources.allsetlearning.com/chinese/pronunciation/Pinyin_quick_start_guide) -* [Pinyin Table](https://en.wikipedia.org/wiki/Pinyin_table) -* [Piyin Chat](https://resources.allsetlearning.com/chinese/pronunciation/Pinyin_chart) -* [Mandarin Chinese Pinyin Table](https://www.archchinese.com/chinese_pinyin.html) -* [Chinese Pinyin Table ](http://www.quickmandarin.com/chinesepinyintable/) - - - -## Tones -* [Four tones](https://resources.allsetlearning.com/chinese/pronunciation/Four_tones) -* [Neutral tone](https://resources.allsetlearning.com/chinese/pronunciation/Neutral_tone) -* [Where do the tone marks go?](http://www.pinyin.info/rules/where.html) -* [声调符号标在哪儿?](http://www.hwjyw.com/resource/content/2010/06/04/8183.shtml) - - - -## Zhuyin - -* [Bopomofo](https://en.wikipedia.org/wiki/Bopomofo) -* [Zhuyin table](https://en.wikipedia.org/wiki/Zhuyin_table) diff --git a/doc/src/dataset.md b/doc/src/dataset.md deleted file mode 100644 index aaa805510..000000000 --- a/doc/src/dataset.md +++ /dev/null @@ -1,21 +0,0 @@ -# Dataset - -## Text - -* [Tatoeba](https://tatoeba.org/cmn) - - **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition. - - - -## Speech - -* [Tatoeba](https://tatoeba.org/cmn) - - **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition. 
- - - -### ASR Noise - -* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md deleted file mode 100644 index 06c4f8791..000000000 --- a/doc/src/praat_textgrid.md +++ /dev/null @@ -1,258 +0,0 @@ -# Praat and TextGrid - -* [**Praat: doing phonetics by computer**](https://www.fon.hum.uva.nl/praat/) -* [TextGrid](https://github.com/kylebgorman/textgrid) - -## Praat - -**Praat语音学软件**,原名**Praat: doing phonetics by computer**,通常简称**Praat**,是一款[跨平台](https://zh.wikipedia.org/wiki/跨平台)的多功能[语音学](https://zh.wikipedia.org/wiki/语音学)专业[软件](https://zh.wikipedia.org/wiki/软件),主要用于对[数字化](https://zh.wikipedia.org/wiki/数字化)的[语音](https://zh.wikipedia.org/wiki/语音)[信号](https://zh.wikipedia.org/wiki/信号)进行[分析](https://zh.wikipedia.org/w/index.php?title=语音分析&action=edit&redlink=1)、标注、[处理](https://zh.wikipedia.org/wiki/数字信号处理)及[合成](https://zh.wikipedia.org/wiki/语音合成)等实验,同时生成各种[语图](https://zh.wikipedia.org/w/index.php?title=语图&action=edit&redlink=1)和文字报表。 - - - - - - - -## TextGrid - -### TextGrid文件结构 - -```text -第一行是固定的:File type = "ooTextFile" -第二行也是固定的:Object class = "TextGrid" -空一行 -xmin = xxxx.xxxx  # 表示开始时间 -xmax = xxxx.xxxx  # 表示结束时间 -tiers?  # 这一行固定 -size = 4  # 表示这个文件有几个item, item也叫tiers, 可以翻译为'层', 这个值是几,就表示有几个item -item []: -    item [1]: -        class = "IntervalTier" -        name = "phone" -        xmin = 1358.8925 -        xmax = 1422.5525 -        intervals: size = 104 -        intervals [1]: -            xmin = 1358.8925 -            xmax = 1361.8925 -            text = "sil" -        intervals [2]: -            xmin = 1361.8925 -            xmax = 1362.0125 -            text = "R" -        intervals [3]: -            ... 
-        intervals [104]: -            xmin = 1422.2325 -            xmax = 1422.5525 -            text = "sil" -    item [2]: -        class = "IntervalTier" -        name = "word" -        xmin = 1358.8925 -        xmax = 1422.5525 -        intervals: size = 3 -        intervals [1]: -            xmin = 1358.8925 -            xmax = 1361.8925 -            text = "sp" -``` - -textgrid 文件中的 size 的值是几就表示有几个 item, 每个 item 下面包含 class, name, xmin, xmax, intervals 的键值对,item 中的 intervals: size 是几就表示这个 item 中有几个 intervals,每个 intervals 有 xmin, xmax, text 三个键值参数。所有 item 中的 xmax - xmin 的值是一样的。 - -### 安装 - -```python -pip3 install textgrid -``` - -### 使用 - -1. 读一个textgrid文件 - - ```python - import textgrid - tg = textgrid.TextGrid() - tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 - ``` - - tg.tiers属性: - 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: - - ```text - [IntervalTier( - phone, [ - Interval(1358.89250, 1361.89250, sil), - Interval(1361.89250, 1362.01250, R), - Interval(1362.01250, 1362.13250, AY1), - Interval(1362.13250, 1362.16250, T), - - ... - ] - ) - ] - ``` - - 此外, tg.tiers[0] 表示第一个 IntervalTier, 支持继续用中括号取序列, '.'来取属性. - 比如: - - ```text - tg.tiers[0][0].mark --> 'sil' - tg.tiers[0].name --> 'phone' - tg.tiers[0][0].minTime --> 1358.8925 - tg.tiers[0].intervals --> [Interval(1358.89250, 1361.89250, sil), ..., Interval(1422.23250, 1422.55250, sil)] - tg.tiers[0].maxTime --> 1422.55250 - ``` - - TextGrid 模块中包含四种对象 - - ``` - PointTier 可以理解为标记(点)的集合 - IntervalTier 可以理解为时长(区间)的集合 - Point 可以理解为标记 - Interval 可以理解为时长 - ``` - - - -2. textgrid库中的对象 - **IntervalTier** 对象: - 方法 - - ``` - add(minTime, maxTime, mark): 添加一个标记,需要同时传入起止时间, 和mark的名字. - addInterval(interval): 添加一个Interval对象, 该Interval对象中已经封装了起止时间. 
- remove(minTime, maxTime, mark): 删除一个Interval - removeInterval(interval): 删除一个Interval - indexContaining(time): 传入时间或Point对象, 返回包含该时间的Interval对象的下标 - 例如: - print(tg[0].indexContaining(1362)) --> 1 - 表示tg[0] 中包含1362时间点的是 下标为1的 Interval 对象 - intervalContaining(): 传入时间或Point对象, 返回包含该时间的Interval对象 - 例如 - print(tg[0].intervalContaining(1362)) --> Interval(1361.89250, 1362.01250, R) - read(f): f是文件对象, 读一个TextGrid文件 - write(f): f是文件对象, 写一个TextGrid文件 - fromFile(f_path): f_path是文件路径, 从一个文件读 - bounds(): 返回一个元组, (minTime, maxTime) - ``` - - - 属性 - - ``` - intervals --> 返回所有的 interval 的列表 - maxTime --> 返回 number(decimal.Decimal)类型, 表示结束时间 - minTime --> 返回 number(decimal.Decimal)类型, 表示开始时间 - name --> 返回字符串 - strict -- > 返回bool值, 表示是否严格TextGrid格式 - ``` - - ​ - - **PointTier** 对象: - 方法 - - ``` - add(minTime, maxTime, mark): 添加一个标记,需要同时传入起止时间, 和mark的名字. - addPoint(point): 添加一个Point对象, 该Point对象中已经封装了起止时间. - remove(time, mark): 删除一个 point, 传入时间和mark - removePoint(point): 删除一个 point, 传入point对象 - read(f): 读, f是文件对象 - write(f): 写, f是文件对象 - fromFile(f_path): f_path是文件路径, 从一个文件读 - bounds(): 返回一个元组, (minTime, maxTime) - ``` - - - 属性 - - ``` - points 返回所有的 point 的列表 - maxTime 和IntervalTier一样, 返回结束时间 - minTime 和IntervalTier一样, 返回开始时间 - name 返回name - ``` - - - - **Point** 对象: - 支持比较大小, 支持加减运算 - 属性: - - ``` - mark: - time: - ``` - - ​ - - **Interval** 对象: - 支持比较大小, 支持加减运算 - 支持 in, not in 的运算 - 方法: - - ``` - duration(): 返回number 类型, 表示这个Interval的持续时间 - bounds(): --> 返回元组, (minTime, maxTime) - overlaps(Interval): --> 返回bool值, 判断本Interval的时间和传入的的Interval的时间是否重叠, 是返回True - ``` - - 属性: - - ``` - mark - maxTime - minTime - strick: --> 返回bool值, 判断格式是否严格的TextGrid格式 - ``` - - **TextGrid** 对象: - 支持列表的取值,复制, 迭代, 求长度, append, extend, pop方法 - 方法: - - ``` - getFirst(tierName) 返回第一个名字为tierName的tier - getList(tierName) 返回名字为tierName的tier的列表 - getNames() 返回所有tier的名字的列表 - append(tier) 添加一个tier作为其中的元素 - extend(tiers) 添加多个tier作为其中的元素 - pop(tier) 删除一个tier - read(f) f是文件对象 - write(f) f是文件对象 - fromFile(f_path) 
f_path是文件路径 - ``` - - 属性: - - ``` - maxTime - minTime - name - strict - tiers 返回所有tiers的列表 - ``` - - **MLF** 对象 - MLF('xxx.mlf') - 'xxx.mlf'为mlf格式的文件, - 读取hvite-o sm生成的htk.mlf文件并将其转换为 TextGrid的列表 - 方法: - - ``` - read(f) f是文件对象 - write(prefix='') prefix是写出路径的前缀,可选 - ``` - - 属性: - - ``` - grids: --> 返回读取的grids的列表 - ``` - - - -## Reference - -* https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 -* https://blog.csdn.net/duxin_csdn/article/details/88966295 diff --git a/doc/src/tools.md b/doc/src/tools.md deleted file mode 100644 index 5fcca9239..000000000 --- a/doc/src/tools.md +++ /dev/null @@ -1,3 +0,0 @@ -# Useful Tools - -* [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md deleted file mode 100644 index 9f2f91097..000000000 --- a/doc/src/tts_text_front_end.md +++ /dev/null @@ -1,191 +0,0 @@ -# Text Front End - - - -## Text Segmentation - -There are various libraries including some of the most popular ones like NLTK, Spacy, Stanford CoreNLP that that provide excellent, easy to use functions for sentence segmentation. - -* https://github.com/bminixhofer/nnsplit -* [DeepSegment](https://github.com/notAI-tech/deepsegment) [blog](http://bpraneeth.com/projects/deepsegment) [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d) [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f) - - - -## Text Normalization(文本正则) - -The **basic preprocessing steps** that occur in English NLP, including data cleaning, stemming/lemmatization, tokenization and stop words. 
**not all of these steps are necessary for Chinese text data!** - -### Lexicon Normalization - -There’s a concept similar to stems in this language, and they’re called Radicals. **Radicals are basically the building blocks of Chinese characters.** All Chinese characters are made up of a finite number of components which are put together in different orders and combinations. Radicals are usually the leftmost part of the character. There are around 200 radicals in Chinese, and they are used to index and categorize characters. - -Therefore, procedures like stemming and lemmatization are not useful for Chinese text data because seperating the radicals would **change the word’s meaning entirely**. - -### Tokenization - -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. - -These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary. - - - -### Stop Words - -In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. - -Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data. - -```python -!pip install stopwordsiso -import stopwordsiso -from stopwordsiso import stopwords -stopwords(["zh"]) # Chinese -``` - - - -文本正则化 文本正则化主要是讲非标准词(NSW)进行转化,比如: - -数字、电话号码: 10086 -> 一千零八十六/幺零零八六 -时间,比分: 23:20 -> 二十三点二十分/二十三比二十 -分数、小数、百分比: 3/4 -> 四分之三,3.24 -> 三点一四, 15% -> 百分之十五 -符号、单位: ¥ -> 元, kg -> 千克 -网址、文件后缀: www. 
-> 三W点 - -* https://github.com/google/re2 - -* https://github.com/speechio/chinese_text_normalization - -* [vinorm](https://github.com/NoahDrisort/vinorm) [cpp_verion](https://github.com/NoahDrisort/vinorm_cpp_version) - - Python package for text normalization, use for frontend of Text-to-speech Reseach - -* https://github.com/candlewill/CNTN - - This is a ChiNese Text Normalization (CNTN) tool for Text-to-speech system, which is based on [sparrowhawk](https://github.com/google/sparrowhawk). - - - -## Word Segmentation(分词) - -分词之所以重要可以通过这个例子来说明: -广州市长隆马戏欢迎你 -> 广州市 长隆 马戏 欢迎你 -如果没有分词错误会导致句意完全不正确:  -广州 市长 隆马戏 欢迎你 - -分词常用方法分为最大前向匹配(基于字典)和基于CRF的分词方法。用CRF的方法相当于是把这个任务转换成了序列标注,相比于基于字典的方法好处是对于歧义或者未登录词有较强的识别能力,缺点是不能快速fix bug,并且性能略低于词典。 - - -中文分词的常见工具: -* https://github.com/lancopku/PKUSeg-python -* https://github.com/thunlp/THULAC-Python -* https://github.com/fxsjy/jieba -* CRF++ -* https://github.com/isnowfy/snownlp - -### MMSEG -* [MMSEG: A Word Identification System for Mandarin Chinese Text Based on Two Variants of the Maximum Matching Algorithm](http://technology.chtsai.org/mmseg/) -* [`中文分词`简单高效的MMSeg](https://www.cnblogs.com/en-heng/p/5872308.html) -* [mmseg分词算法及实现](https://blog.csdn.net/daniel_ustc/article/details/50488040) -* [Mmseg算法](https://www.jianshu.com/p/e4ae8d194487) -* [浅谈中文分词](http://www.isnowfy.com/introduction-to-chinese-segmentation/) - -* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git) -* [ustcdane/mmseg](https://github.com/ustcdane/mmseg) -* [jkom-cloud/mmseg](https://github.com/jkom-cloud/mmseg) - - -### CScanner -* [CScanner - A Chinese Lexical Scanner](http://technology.chtsai.org/cscanner/) - - - -## Part of Speech(词性预测) - -词性解释 -n/名词 np/人名 ns/地名 ni/机构名 nz/其它专名 -m/数词 q/量词 mq/数量词 t/时间词 f/方位词 s/处所词 -v/动词 a/形容词 d/副词 h/前接成分 k/后接成分 -i/习语 j/简称 r/代词 c/连词 p/介词 u/助词 y/语气助词 -e/叹词 o/拟声词 g/语素 w/标点 x/其它 - - - -## G2P(注音) - -注音是需要将词转换成对应的发音,对于中文是将其转换成拼音,比如 绿色->(lv4 se4) 这里的数字表示声调。 - -传统方法是使用字典,但是对于未登录词就很难解决。基于模型的方法是使用 
[Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus)。 论文可以参考 - WFST-based Grapheme-to-Phoneme Conversion: Open Source Tools for Alignment, Model-Building and Decoding - -当然这个问题也可以看做是序列标注用CRF或者基于神经网络的模型都可以做。 基于神经网络工具: [g2pM](https://github.com/kakaobrain/g2pM)。 - - - - -## Prosody(韵律预测) - -ToBI(an abbreviation of tones and break indices) is a set of conventions for transcribing and annotating the prosody of speech. 中文主要关注break。 - - -韵律等级结构: - -音素 -> 音节 -> 韵律词(Prosody Word, PW) -> 韵律短语(prosody phrase, PPH) -> 语调短句(intonational phrase, IPH) -> 子句子 -> 主句子 -> 段落 -> 篇章 -LP -> LO -> L1(#1) -> L2(#2) -> L3(#3) -> L4(#4) -> L5 -> L6 -> L7 -主要关注 PW, PPH, IPH - -| | 停顿时长 | 前后音高特征 | -| --- | ----------| --- | -| 韵律词边界 | 不停顿或从听感上察觉不到停顿 | 无 | -| 韵律短语边界 | 可以感知停顿,但无明显的静音段 | 音高不下倾或稍下倾,韵末不可做句末 | -| 语调短语边界 | 有较长停顿 | 音高下倾比较完全,韵末可以作为句末 | - -常用方法使用的是级联CRF,首先预测如果是PW,再继续预测是否是PPH,再预测是否是IPH - - - -论文: 2015 .Ding Et al. - Automatic Prosody Prediction For Chinese Speech Synthesis Using BLSTM-RNN and Embedding Features - - - -## Polyphone(多音字) - - - -## Linguistic Features(语言学特征) - - - -## 基于神经网络的前端文本分析模型 - -最近这两年基本都是基于 BERT,所以这里记录一下相关的论文: - -- g2p: 2019. Sevinj Et al. Transformer based Grapheme-to-Phoneme Conversion -- 分词: 2019 huang Et al. - Toward Fast and Accurate Neural Chinese Word Segmentation with Multi-Criteria Learning -- 韵律: 2020 Zhang Et al. 
- Chinese Prosodic Structure Prediction Based on a Pretrained Language Representation Model - -除此之外,BLSTM + CRF 也比较主流。 - - - -## 总结 - -总结一下,文本分析各个模块的方法: - -TN: 基于规则的方法 - -分词: 字典/CRF/BLSTM+CRF/BERT - -注音: ngram/CRF/BLSTM/seq2seq - -韵律: CRF/BLSTM + CRF/ BERT - - - -考虑到分词,注音,韵律都是基于序列标注任务,所以理论上来说可以通过一个模型搞定。 - - - -## Reference -* [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/) -* [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f) -* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb) From aa78205293a9314d16fd0e5c39561dfd1ad925e7 Mon Sep 17 00:00:00 2001 From: zhangyinhui Date: Mon, 21 Jun 2021 16:18:32 +0800 Subject: [PATCH 04/25] Add compilation framework --- speechnn/CMakeLists.txt | 77 ++++++++++++++++++++++++++++ speechnn/core/decoder/CMakeLists.txt | 2 + 2 files changed, 79 insertions(+) diff --git a/speechnn/CMakeLists.txt b/speechnn/CMakeLists.txt index e69de29bb..878374bab 100644 --- a/speechnn/CMakeLists.txt +++ b/speechnn/CMakeLists.txt @@ -0,0 +1,77 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(deepspeech VERSION 0.1) + +set(CMAKE_VERBOSE_MAKEFILE on) +# set std-14 +set(CMAKE_CXX_STANDARD 14) + +# include file +include(FetchContent) +include(ExternalProject) +# fc_patch dir +set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + + +############################################################################### +# Option Configurations +############################################################################### +# option configurations +option(TEST_DEBUG "option for debug" OFF) + + 
+############################################################################### +# Include third party +############################################################################### +# #example for include third party +# FetchContent_Declare() +# # FetchContent_MakeAvailable was not added until CMake 3.14 +# FetchContent_MakeAvailable() +# include_directories() + +# ABSEIL-CPP +include(FetchContent) +FetchContent_Declare( + absl + GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" + GIT_TAG "20210324.1" +) +FetchContent_MakeAvailable(absl) + +# libsndfile +include(FetchContent) +FetchContent_Declare( + libsndfile + GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git" + GIT_TAG "1.0.31" +) +FetchContent_MakeAvailable(libsndfile) + + +############################################################################### +# Add local library +############################################################################### +# system lib +find_package() +# if dir have CmakeLists.txt +add_subdirectory() +# if dir do not have CmakeLists.txt +add_library(lib_name STATIC file.cc) +target_link_libraries(lib_name item0 item1) +add_dependencies(lib_name depend-target) + + +############################################################################### +# Library installation +############################################################################### +install() + + +############################################################################### +# Build binary file +############################################################################### +add_executable() +target_link_libraries() + diff --git a/speechnn/core/decoder/CMakeLists.txt b/speechnn/core/decoder/CMakeLists.txt index e69de29bb..259261bdf 100644 --- a/speechnn/core/decoder/CMakeLists.txt +++ b/speechnn/core/decoder/CMakeLists.txt @@ -0,0 +1,2 @@ +aux_source_directory(. 
DIR_LIB_SRCS) +add_library(decoder STATIC ${DIR_LIB_SRCS}) From 68bcc4694055584e25844004379634a7e1f8b769 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 22 Jun 2021 07:46:50 +0000 Subject: [PATCH 05/25] save best and test on tiny/s0 --- deepspeech/training/trainer.py | 14 +- deepspeech/utils/checkpoint.py | 336 ++++++++++++++++--------- examples/tiny/s0/conf/deepspeech2.yaml | 5 +- 3 files changed, 230 insertions(+), 125 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 56de32617..246175e3f 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -18,7 +18,7 @@ import paddle from paddle import distributed as dist from tensorboardX import SummaryWriter -from deepspeech.utils import checkpoint +from deepspeech.utils.checkpoint import KBestCheckpoint from deepspeech.utils import mp_tools from deepspeech.utils.log import Log @@ -139,9 +139,12 @@ class Trainer(): "epoch": self.epoch, "lr": self.optimizer.get_lr() }) - checkpoint.save_parameters(self.checkpoint_dir, self.iteration + self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration if tag is None else tag, self.model, self.optimizer, infos) + # checkpoint.save_parameters(self.checkpoint_dir, self.iteration + # if tag is None else tag, self.model, + # self.optimizer, infos) def resume_or_scratch(self): """Resume from latest checkpoint at checkpoints in the output @@ -151,7 +154,7 @@ class Trainer(): resume training. """ scratch = None - infos = checkpoint.load_parameters( + infos = self.checkpoint.load_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, @@ -180,7 +183,7 @@ class Trainer(): from_scratch = self.resume_or_scratch() if from_scratch: # save init model, i.e. 
0 epoch - self.save(tag='init') + self.save(tag='init', infos=None) self.lr_scheduler.step(self.iteration) if self.parallel: @@ -263,6 +266,9 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir + self.checkpoint = KBestCheckpoint(max_size=self.config.training.max_epoch, + last_size=self.config.training.last_epoch) + @mp_tools.rank_zero_only def destory(self): """Close visualizer to avoid hanging after training""" diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 8ede6b8fd..ef73eb705 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -23,130 +23,226 @@ from paddle.optimizer import Optimizer from deepspeech.utils import mp_tools from deepspeech.utils.log import Log +import glob + logger = Log(__name__).getlog() __all__ = ["load_parameters", "save_parameters"] +class KBestCheckpoint(object): + def __init__(self, + max_size: int=5, + last_size: int=1): + self.best_records: Mapping[Path, float] = {} + self.last_records = [] + self.max_size = max_size + self.last_size = last_size + self._save_all = (max_size == -1) + + def should_save_best(self, metric: float) -> bool: + if not self.best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def best_full(self): + return (not self._save_all) and len(self.best_records) == self.max_size + + def last_full(self): + return len(self.last_records) == self.last_size + + def add_checkpoint(self, + checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + if("val_loss" not in infos.keys()): + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + return + + #save best + if self.should_save_best(infos["val_loss"]): + self.save_checkpoint_and_update(infos["val_loss"], + checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + #save last + 
self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + + if isinstance(tag_or_iteration, int): + self._save_record(checkpoint_dir, tag_or_iteration) + + def save_checkpoint_and_update(self, metric, + checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + # remove the worst + if self.best_full(): + worst_record_path = max(self.best_records, + key=self.best_records.get) + self.best_records.pop(worst_record_path) + if(worst_record_path not in self.last_records): + print('----to remove (best)----') + print(worst_record_path) + self.del_checkpoint(checkpoint_dir, worst_record_path) + + # add the new one + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + self.best_records[tag_or_iteration] = metric + + def save_last_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, + model, optimizer, infos): + # remove the old + if self.last_full(): + to_del_fn = self.last_records.pop(0) + if(to_del_fn not in self.best_records.keys()): + print('----to remove (last)----') + print(to_del_fn) + self.del_checkpoint(checkpoint_dir, to_del_fn) + self.last_records.append(tag_or_iteration) + + self.save_parameters(checkpoint_dir, tag_or_iteration, + model, optimizer, infos) + # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle: + # for iteration in self.best_records + # handle.write("model_checkpoint_path:{}\n".format(iteration)) + + + def del_checkpoint(self, checkpoint_dir, tag_or_iteration): + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path+".*"): + os.remove(filename) + print("delete file: "+filename) + + + + def _load_latest_checkpoint(self, checkpoint_dir: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + Returns: + int: the latest iteration number. -1 for no checkpoint to load. 
+ """ + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last") + if not os.path.isfile(checkpoint_record): + return -1 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readlines()[-1].strip() + iteration = int(latest_checkpoint.split(":")[-1]) + return iteration + + + def _save_record(self, checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpoint record. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + Returns: + None + """ + checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last") + checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") + # Update the latest checkpoint index. + # with open(checkpoint_record, "a+") as handle: + # handle.write("model_checkpoint_path:{}\n".format(iteration)) + with open(checkpoint_record_best, "w") as handle: + for i in self.best_records.keys(): + handle.write("model_checkpoint_path:{}\n".format(i)) + with open(checkpoint_record_last, "w") as handle: + for i in self.last_records: + handle.write("model_checkpoint_path:{}\n".format(i)) + + + def load_parameters(self, model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a specific model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + configs = {} + + if checkpoint_path is not None: + tag = os.path.basename(checkpoint_path).split(":")[-1] + elif checkpoint_dir is not None: + iteration = self._load_latest_checkpoint(checkpoint_dir) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" + ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: loaded model from {}".format(rank, params_path)) -def _load_latest_checkpoint(checkpoint_dir: str) -> int: - """Get the iteration number corresponding to the latest saved checkpoint. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - Returns: - int: the latest iteration number. -1 for no checkpoint to load. - """ - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") - if not os.path.isfile(checkpoint_record): - return -1 - - # Fetch the latest checkpoint index. - with open(checkpoint_record, "rt") as handle: - latest_checkpoint = handle.readlines()[-1].strip() - iteration = int(latest_checkpoint.split(":")[-1]) - return iteration - - -def _save_record(checkpoint_dir: str, iteration: int): - """Save the iteration number of the latest model to be checkpoint record. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - iteration (int): the latest iteration number. - Returns: - None - """ - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint") - # Update the latest checkpoint index. - with open(checkpoint_record, "a+") as handle: - handle.write("model_checkpoint_path:{}\n".format(iteration)) - - -def load_parameters(model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): - """Load a specific model checkpoint from disk. - Args: - model (Layer): model to load parameters. 
- optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. - Returns: - configs (dict): epoch or step, lr and other meta info should be saved. - """ - configs = {} - - if checkpoint_path is not None: - tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None: - iteration = _load_latest_checkpoint(checkpoint_dir) - if iteration == -1: - return configs - checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) - else: - raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" - ) - - rank = dist.get_rank() - - params_path = checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - model.set_state_dict(model_dict) - logger.info("Rank {}: loaded model from {}".format(rank, params_path)) - - optimizer_path = checkpoint_path + ".pdopt" - if optimizer and os.path.isfile(optimizer_path): - optimizer_dict = paddle.load(optimizer_path) - optimizer.set_state_dict(optimizer_dict) - logger.info("Rank {}: loaded optimizer state from {}".format( - rank, optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = json.load(fin) - return configs - - -@mp_tools.rank_zero_only -def save_parameters(checkpoint_dir: str, - tag_or_iteration: Union[int, str], - model: paddle.nn.Layer, - optimizer: Optimizer=None, - infos: dict=None): - """Checkpoint the latest trained model parameters. - Args: - checkpoint_dir (str): the directory where checkpoint is saved. - tag_or_iteration (int or str): the latest iteration(step or epoch) number. - model (Layer): model to be checkpointed. 
- optimizer (Optimizer, optional): optimizer to be checkpointed. - Defaults to None. - infos (dict or None): any info you want to save. - Returns: - None - """ - checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(tag_or_iteration)) - - model_dict = model.state_dict() - params_path = checkpoint_path + ".pdparams" - paddle.save(model_dict, params_path) - logger.info("Saved model to {}".format(params_path)) - - if optimizer: - opt_dict = optimizer.state_dict() optimizer_path = checkpoint_path + ".pdopt" - paddle.save(opt_dict, optimizer_path) - logger.info("Saved optimzier state to {}".format(optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - infos = {} if infos is None else infos - with open(info_path, 'w') as fout: - data = json.dumps(infos) - fout.write(data) + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: loaded optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs + + + @mp_tools.rank_zero_only + def save_parameters(self, checkpoint_dir: str, + tag_or_iteration: Union[int, str], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None): + """Checkpoint the latest trained model parameters. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + tag_or_iteration (int or str): the latest iteration(step or epoch) number. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + Defaults to None. + infos (dict or None): any info you want to save. 
+ Returns: + None + """ + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + + model_dict = model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + logger.info("Saved model to {}".format(params_path)) + + if optimizer: + opt_dict = optimizer.state_dict() + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + logger.info("Saved optimzier state to {}".format(optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + infos = {} if infos is None else infos + with open(info_path, 'w') as fout: + data = json.dumps(infos) + fout.write(data) - if isinstance(tag_or_iteration, int): - _save_record(checkpoint_dir, tag_or_iteration) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 6737d1b75..9ff6803d8 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -43,12 +43,15 @@ model: share_rnn_weights: True training: - n_epoch: 24 + n_epoch: 6 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 + max_epoch: 3 + last_epoch: 2 + decoding: batch_size: 128 From 8af2eb073adff6bf7c12c04c1b1c47aa650732f0 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 22 Jun 2021 11:36:27 +0000 Subject: [PATCH 06/25] revise config --- deepspeech/training/trainer.py | 4 ++-- examples/tiny/s0/conf/deepspeech2.yaml | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 246175e3f..6563e7c4d 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -266,8 +266,8 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir - self.checkpoint = KBestCheckpoint(max_size=self.config.training.max_epoch, - last_size=self.config.training.last_epoch) + self.checkpoint = KBestCheckpoint(max_size=self.config.training.checkpoint.kbest_n, + 
last_size=self.config.training.checkpoint.latest_n) @mp_tools.rank_zero_only def destory(self): diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 9ff6803d8..b9c2556c7 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -49,8 +49,9 @@ training: weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 - max_epoch: 3 - last_epoch: 2 + checkpoint: + kbest_n: 3 + latest_n: 2 decoding: From 90788b116d85c26cf91bcb76544aaf5b2b189734 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 24 Jun 2021 04:05:34 +0000 Subject: [PATCH 07/25] more comment; fix datapipe of align --- deepspeech/exps/u2/model.py | 23 ++++++++++++++--------- deepspeech/utils/ctc_utils.py | 20 +++++++++++--------- deepspeech/utils/text_grid.py | 8 +++++--- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index f00d5af63..ba7bc45c8 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -355,7 +355,7 @@ class U2Tester(U2Trainer): decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. + # 0: used for training, it's prohibited here. num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. simulate_streaming=False, # simulate streaming inference. Defaults to False. 
)) @@ -512,11 +512,13 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.dataset.stride_ms - token_dict = self.test_loader.dataset.vocab_list + stride_ms = self.test_loader.collate_fn.stride_ms + token_dict = self.test_loader.collate_fn.vocab_list with open(self.args.result_file, 'w') as fout: + # one example in batch for i, batch in enumerate(self.test_loader): key, feat, feats_length, target, target_length = batch + # 1. Encoder encoder_out, encoder_mask = self.model._forward_encoder( feat, feats_length) # (B, maxlen, encoder_dim) @@ -529,28 +531,31 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - print(alignment) + print(kye[0], alignment) fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - print(align_segs) + print(kye[0], align_segs) # IntervalTier, List["start end token\n"] subsample = get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) + # write tier tier_path = os.path.join( os.path.dirname(args.result_file), key[0] + ".tier") with open(tier_path, 'w') as f: f.writelines(tierformat) - + # write textgrid textgrid_path = s.path.join( os.path.dirname(args.result_file), key[0] + ".TextGrid") - second_per_frame = 1. / (1000. / stride_ms - ) # 25ms window, 10ms stride + second_per_frame = 1. / (1000. 
/ + stride_ms) # 25ms window, 10ms stride + second_per_example = ( + len(alignment) + 1) * subsample * second_per_frame text_grid.generate_textgrid( - maxtime=(len(alignment) + 1) * subsample * second_per_frame, + maxtime=second_per_example, lines=tierformat, output=textgrid_path) diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 76c1898be..6201233df 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -38,8 +38,10 @@ def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]: new_hyp: List[int] = [] cur = 0 while cur < len(hyp): + # add non-blank into new_hyp if hyp[cur] != blank_id: new_hyp.append(hyp[cur]) + # skip repeat label prev = cur while cur < len(hyp) and hyp[cur] == hyp[prev]: cur += 1 @@ -52,7 +54,7 @@ def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray: "abcdefg" -> "-a-b-c-d-e-f-g-" Args: - label ([np.ndarray]): label ids, (L). + label ([np.ndarray]): label ids, List[int], (L). blank_id (int, optional): blank id. Defaults to 0. Returns: @@ -61,8 +63,8 @@ def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray: label = np.expand_dims(label, 1) #[L, 1] blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id label = np.concatenate([blanks, label], axis=1) #[L, 2] - label = label.reshape(-1) #[2L] - label = np.append(label, label[0]) #[2L + 1] + label = label.reshape(-1) #[2L], -l-l-l + label = np.append(label, label[0]) #[2L + 1], -l-l-l- return label @@ -79,21 +81,21 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, Returns: List[int]: best alignment result, (T). 
""" - y_insert_blank = insert_blank(y, blank_id) + y_insert_blank = insert_blank(y, blank_id) #(2L+1) log_alpha = paddle.zeros( (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero state_path = (paddle.zeros( (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1 - ) # state path + ) # state path, Tuple((T, 2L+1)) # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # Sb - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # Snb + log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb - for t in range(1, ctc_probs.size(0)): - for s in range(len(y_insert_blank)): + for t in range(1, ctc_probs.size(0)): # T + for s in range(len(y_insert_blank)): # 2L+1 if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ s] == y_insert_blank[s - 2]: candidates = paddle.to_tensor( diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py index 9afed89e0..b774130db 100644 --- a/deepspeech/utils/text_grid.py +++ b/deepspeech/utils/text_grid.py @@ -22,11 +22,13 @@ def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]: """segment ctc alignment ids by continuous blank and repeat label. Args: - alignment (List[int]): ctc alignment id sequence. e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] + alignment (List[int]): ctc alignment id sequence. + e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3] blank_id (int, optional): blank id. Defaults to 0. Returns: - List[List[int]]: segment aligment id sequence. e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] + List[List[int]]: token align, segment aligment id sequence. + e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]] """ # convert alignment to a praat format, which is a doing phonetics # by computer and helps analyzing alignment @@ -61,7 +63,7 @@ def align_to_tierformat(align_segs: List[List[int]], token_dict (Dict[int, Text]): int -> str map. 
Returns: - List[Text]: list of textgrid.Interval. + List[Text]: list of textgrid.Interval text, str(start, end, text). """ hop_length = 10 # ms second_ms = 1000 # ms From 91e70a2857c62b7db1db958d9b0528beb2bf0b77 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 09:02:59 +0000 Subject: [PATCH 08/25] multi gpus --- deepspeech/training/trainer.py | 18 ++-- deepspeech/utils/checkpoint.py | 144 ++++++++++++++++--------- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- 3 files changed, 105 insertions(+), 59 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 6563e7c4d..7f68e67cb 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -18,8 +18,8 @@ import paddle from paddle import distributed as dist from tensorboardX import SummaryWriter -from deepspeech.utils.checkpoint import KBestCheckpoint from deepspeech.utils import mp_tools +from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log __all__ = ["Trainer"] @@ -64,7 +64,7 @@ class Trainer(): The parsed command line arguments. Examples -------- - >>> def main_sp(config, args): + >>> def p(config, args): >>> exp = Trainer(config, args) >>> exp.setup() >>> exp.run() @@ -140,11 +140,8 @@ class Trainer(): "lr": self.optimizer.get_lr() }) self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration - if tag is None else tag, self.model, - self.optimizer, infos) - # checkpoint.save_parameters(self.checkpoint_dir, self.iteration - # if tag is None else tag, self.model, - # self.optimizer, infos) + if tag is None else tag, self.model, + self.optimizer, infos) def resume_or_scratch(self): """Resume from latest checkpoint at checkpoints in the output @@ -154,7 +151,7 @@ class Trainer(): resume training. 
""" scratch = None - infos = self.checkpoint.load_parameters( + infos = self.checkpoint.load_last_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, @@ -266,8 +263,9 @@ class Trainer(): self.checkpoint_dir = checkpoint_dir - self.checkpoint = KBestCheckpoint(max_size=self.config.training.checkpoint.kbest_n, - last_size=self.config.training.checkpoint.latest_n) + self.checkpoint = Checkpoint( + kbest_n=self.config.training.checkpoint.kbest_n, + latest_n=self.config.training.checkpoint.latest_n) @mp_tools.rank_zero_only def destory(self): diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index ef73eb705..52eccb673 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -24,20 +24,22 @@ from deepspeech.utils import mp_tools from deepspeech.utils.log import Log import glob +# import operator +from pathlib import Path logger = Log(__name__).getlog() -__all__ = ["load_parameters", "save_parameters"] +__all__ = ["Checkpoint"] -class KBestCheckpoint(object): +class Checkpoint(object): def __init__(self, - max_size: int=5, - last_size: int=1): + kbest_n: int=5, + latest_n: int=1): self.best_records: Mapping[Path, float] = {} - self.last_records = [] - self.max_size = max_size - self.last_size = last_size - self._save_all = (max_size == -1) + self.latest_records = [] + self.kbest_n = kbest_n + self.latest_n = latest_n + self._save_all = (kbest_n == -1) def should_save_best(self, metric: float) -> bool: if not self.best_full(): @@ -45,36 +47,36 @@ class KBestCheckpoint(object): # already full worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] worst_metric = self.best_records[worst_record_path] return metric < worst_metric def best_full(self): - return (not self._save_all) and len(self.best_records) == self.max_size + return (not self._save_all) and len(self.best_records) == self.kbest_n - 
def last_full(self): - return len(self.last_records) == self.last_size + def latest_full(self): + return len(self.latest_records) == self.latest_n - def add_checkpoint(self, - checkpoint_dir, tag_or_iteration, - model, optimizer, infos): - if("val_loss" not in infos.keys()): + def add_checkpoint(self, checkpoint_dir, tag_or_iteration, + model, optimizer, infos, metric_type = "val_loss"): + if(metric_type not in infos.keys()): self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) return #save best - if self.should_save_best(infos["val_loss"]): - self.save_checkpoint_and_update(infos["val_loss"], + if self.should_save_best(infos[metric_type]): + self.save_best_checkpoint_and_update(infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) - #save last - self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + #save latest + self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): - self._save_record(checkpoint_dir, tag_or_iteration) + self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) - def save_checkpoint_and_update(self, metric, + def save_best_checkpoint_and_update(self, metric, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the worst @@ -82,9 +84,8 @@ class KBestCheckpoint(object): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) - if(worst_record_path not in self.last_records): - print('----to remove (best)----') - print(worst_record_path) + if(worst_record_path not in self.latest_records): + logger.info("remove the worst checkpoint: {}".format(worst_record_path)) self.del_checkpoint(checkpoint_dir, worst_record_path) # add the new one @@ -92,22 +93,18 @@ class KBestCheckpoint(object): model, optimizer, infos) self.best_records[tag_or_iteration] = metric - def save_last_checkpoint_and_update(self, checkpoint_dir, 
tag_or_iteration, + def save_latest_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old - if self.last_full(): - to_del_fn = self.last_records.pop(0) + if self.latest_full(): + to_del_fn = self.latest_records.pop(0) if(to_del_fn not in self.best_records.keys()): - print('----to remove (last)----') - print(to_del_fn) + logger.info("remove the latest checkpoint: {}".format(to_del_fn)) self.del_checkpoint(checkpoint_dir, to_del_fn) - self.last_records.append(tag_or_iteration) + self.latest_records.append(tag_or_iteration) self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) - # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle: - # for iteration in self.best_records - # handle.write("model_checkpoint_path:{}\n".format(iteration)) def del_checkpoint(self, checkpoint_dir, tag_or_iteration): @@ -115,18 +112,17 @@ class KBestCheckpoint(object): "{}".format(tag_or_iteration)) for filename in glob.glob(checkpoint_path+".*"): os.remove(filename) - print("delete file: "+filename) + logger.info("delete file: {}".format(filename)) - def _load_latest_checkpoint(self, checkpoint_dir: str) -> int: + def load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: - checkpoint_dir (str): the directory where checkpoint is saved. + checkpoint_path (str): the saved path of checkpoint. Returns: int: the latest iteration number. -1 for no checkpoint to load. 
""" - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last") if not os.path.isfile(checkpoint_record): return -1 @@ -135,9 +131,9 @@ class KBestCheckpoint(object): latest_checkpoint = handle.readlines()[-1].strip() iteration = int(latest_checkpoint.split(":")[-1]) return iteration + - - def _save_record(self, checkpoint_dir: str, iteration: int): + def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. Args: checkpoint_dir (str): the directory where checkpoint is saved. @@ -145,24 +141,22 @@ class KBestCheckpoint(object): Returns: None """ - checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last") + checkpoint_record_latest = os.path.join(checkpoint_dir, "checkpoint_latest") checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") - # Update the latest checkpoint index. - # with open(checkpoint_record, "a+") as handle: - # handle.write("model_checkpoint_path:{}\n".format(iteration)) + with open(checkpoint_record_best, "w") as handle: for i in self.best_records.keys(): handle.write("model_checkpoint_path:{}\n".format(i)) - with open(checkpoint_record_last, "w") as handle: - for i in self.last_records: + with open(checkpoint_record_latest, "w") as handle: + for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def load_parameters(self, model, + def load_last_parameters(self, model, optimizer=None, checkpoint_dir=None, checkpoint_path=None): - """Load a specific model checkpoint from disk. + """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. optimizer (Optimizer, optional): optimizer to load states if needed. 
@@ -179,7 +173,8 @@ class KBestCheckpoint(object): if checkpoint_path is not None: tag = os.path.basename(checkpoint_path).split(":")[-1] elif checkpoint_dir is not None: - iteration = self._load_latest_checkpoint(checkpoint_dir) + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_latest") + iteration = self.load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) @@ -209,6 +204,59 @@ class KBestCheckpoint(object): return configs + def load_best_parameters(self, model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + configs = {} + + if checkpoint_path is not None: + tag = os.path.basename(checkpoint_path).split(":")[-1] + elif checkpoint_dir is not None: + checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_best") + iteration = self.load_checkpoint_idx(checkpoint_record) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" 
+ ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: loaded model from {}".format(rank, params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: loaded optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs + + + @mp_tools.rank_zero_only def save_parameters(self, checkpoint_dir: str, tag_or_iteration: Union[int, str], diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index b9c2556c7..ea433f341 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -43,7 +43,7 @@ model: share_rnn_weights: True training: - n_epoch: 6 + n_epoch: 10 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From 16210c058763f6ad3426ed53da10a9aa4e33ff49 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 09:08:30 +0000 Subject: [PATCH 09/25] fix bug --- deepspeech/training/trainer.py | 2 +- deepspeech/utils/checkpoint.py | 121 +++++++++++++++++---------------- 2 files changed, 63 insertions(+), 60 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 7f68e67cb..f8668370a 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -64,7 +64,7 @@ class Trainer(): The parsed command line arguments. 
Examples -------- - >>> def p(config, args): + >>> def main_sp(config, args): >>> exp = Trainer(config, args) >>> exp.setup() >>> exp.run() diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 52eccb673..b29ef2ab5 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import glob import json import os import re +from pathlib import Path from typing import Union import paddle @@ -22,25 +24,21 @@ from paddle.optimizer import Optimizer from deepspeech.utils import mp_tools from deepspeech.utils.log import Log - -import glob # import operator -from pathlib import Path logger = Log(__name__).getlog() __all__ = ["Checkpoint"] + class Checkpoint(object): - def __init__(self, - kbest_n: int=5, - latest_n: int=1): + def __init__(self, kbest_n: int=5, latest_n: int=1): self.best_records: Mapping[Path, float] = {} self.latest_records = [] self.kbest_n = kbest_n self.latest_n = latest_n self._save_all = (kbest_n == -1) - + def should_save_best(self, metric: float) -> bool: if not self.best_full(): return True @@ -53,68 +51,72 @@ class Checkpoint(object): def best_full(self): return (not self._save_all) and len(self.best_records) == self.kbest_n - + def latest_full(self): return len(self.latest_records) == self.latest_n - def add_checkpoint(self, checkpoint_dir, tag_or_iteration, - model, optimizer, infos, metric_type = "val_loss"): - if(metric_type not in infos.keys()): - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + def add_checkpoint(self, + checkpoint_dir, + tag_or_iteration, + model, + optimizer, + infos, + metric_type="val_loss"): + if (metric_type not in infos.keys()): + self.save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) return #save best if 
self.should_save_best(infos[metric_type]): - self.save_best_checkpoint_and_update(infos[metric_type], - checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self.save_best_checkpoint_and_update( + infos[metric_type], checkpoint_dir, tag_or_iteration, model, + optimizer, infos) #save latest self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) - + model, optimizer, infos) + if isinstance(tag_or_iteration, int): self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) - - def save_best_checkpoint_and_update(self, metric, - checkpoint_dir, tag_or_iteration, - model, optimizer, infos): + + def save_best_checkpoint_and_update(self, metric, checkpoint_dir, + tag_or_iteration, model, optimizer, + infos): # remove the worst if self.best_full(): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) - if(worst_record_path not in self.latest_records): - logger.info("remove the worst checkpoint: {}".format(worst_record_path)) + if (worst_record_path not in self.latest_records): + logger.info( + "remove the worst checkpoint: {}".format(worst_record_path)) self.del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + infos) self.best_records[tag_or_iteration] = metric - - def save_latest_checkpoint_and_update(self, checkpoint_dir, tag_or_iteration, - model, optimizer, infos): + + def save_latest_checkpoint_and_update( + self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old if self.latest_full(): to_del_fn = self.latest_records.pop(0) - if(to_del_fn not in self.best_records.keys()): - logger.info("remove the latest checkpoint: {}".format(to_del_fn)) + if (to_del_fn not in self.best_records.keys()): + logger.info( + "remove the latest checkpoint: 
{}".format(to_del_fn)) self.del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self.save_parameters(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) - + self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + infos) def del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(tag_or_iteration)) - for filename in glob.glob(checkpoint_path+".*"): + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path + ".*"): os.remove(filename) logger.info("delete file: {}".format(filename)) - - def load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. @@ -131,7 +133,6 @@ class Checkpoint(object): latest_checkpoint = handle.readlines()[-1].strip() iteration = int(latest_checkpoint.split(":")[-1]) return iteration - def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. @@ -141,9 +142,10 @@ class Checkpoint(object): Returns: None """ - checkpoint_record_latest = os.path.join(checkpoint_dir, "checkpoint_latest") + checkpoint_record_latest = os.path.join(checkpoint_dir, + "checkpoint_latest") checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best") - + with open(checkpoint_record_best, "w") as handle: for i in self.best_records.keys(): handle.write("model_checkpoint_path:{}\n".format(i)) @@ -151,11 +153,11 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - - def load_last_parameters(self, model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + def load_last_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. 
@@ -173,11 +175,13 @@ class Checkpoint(object): if checkpoint_path is not None: tag = os.path.basename(checkpoint_path).split(":")[-1] elif checkpoint_dir is not None: - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_latest") + checkpoint_record = os.path.join(checkpoint_dir, + "checkpoint_latest") iteration = self.load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs - checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(iteration)) else: raise ValueError( "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" @@ -203,11 +207,11 @@ class Checkpoint(object): configs = json.load(fin) return configs - - def load_best_parameters(self, model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + def load_best_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -229,7 +233,8 @@ class Checkpoint(object): iteration = self.load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs - checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(iteration)) else: raise ValueError( "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" 
@@ -255,10 +260,9 @@ class Checkpoint(object): configs = json.load(fin) return configs - - @mp_tools.rank_zero_only - def save_parameters(self, checkpoint_dir: str, + def save_parameters(self, + checkpoint_dir: str, tag_or_iteration: Union[int, str], model: paddle.nn.Layer, optimizer: Optimizer=None, @@ -275,7 +279,7 @@ class Checkpoint(object): None """ checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(tag_or_iteration)) + "{}".format(tag_or_iteration)) model_dict = model.state_dict() params_path = checkpoint_path + ".pdparams" @@ -293,4 +297,3 @@ class Checkpoint(object): with open(info_path, 'w') as fout: data = json.dumps(infos) fout.write(data) - From 03e695250163b5f725595c1902b765d4c4755ba0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 28 Jun 2021 10:00:45 +0000 Subject: [PATCH 10/25] more detial of result --- examples/aishell/s0/README.md | 2 +- examples/aishell/s1/README.md | 30 +++++++++++++++--------------- examples/librispeech/s0/README.md | 10 +++++----- examples/librispeech/s1/README.md | 22 +++++++++++----------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md index ae3fb401a..40d7c1581 100644 --- a/examples/aishell/s0/README.md +++ b/examples/aishell/s0/README.md @@ -4,7 +4,7 @@ | Model | release | Config | Test set | Loss | CER | | --- | --- | --- | --- | --- | --- | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | +| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 ~ 0.073507| | DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | | DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | | DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 
601b0a8d0..1072eabd8 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -2,25 +2,25 @@ ## Conformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | ## Chunk Conformer -| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + 
shift | test | attention | 16 | - | 0.061939 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | ## Transformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | ---| -| transformer | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | ---| +| transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/s0/README.md index 393dd4579..76aa5e78a 100644 --- a/examples/librispeech/s0/README.md +++ b/examples/librispeech/s0/README.md @@ -2,8 +2,8 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | WER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | -| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | -| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 | +| Model | Params | release | Config | Test set | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | +| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | +| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 | diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index 73f6156d9..5e23c0ab5 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -2,17 +2,17 @@ ## Conformer -| 
Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/conformer.yaml | spec_aug + shift | test-all | attention | test-all 6.35 | 0.057117 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.35 | 0.030162 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | test-all 6.35 | 0.037910 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | test-all 6.35 | 0.037761 | -| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | test-all 6.35 | 0.032115 | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention | 6.35 | 0.057117 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 | ## Transformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| transformer | conf/transformer.yaml | spec_aug + shift | test-all | attention | test-all 6.98 | 0.066500 | -| transformer | conf/transformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.98 | 0.036 | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention | 6.98 | 0.066500 | +| 
transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 | From 9c0b6c5bb0e91ad68f2b91d7d991664e3acfd038 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 28 Jun 2021 12:11:12 +0000 Subject: [PATCH 11/25] fix audio shape bug for audio len --- deepspeech/io/collator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 305ca9400..2ef119666 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -154,8 +154,8 @@ class SpeechCollator(): random_seed (int, optional): for random generator. Defaults to 0. keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. if ``keep_transcription_text`` is False, text is token ids else is raw string. - - Do augmentations + + Do augmentations Padding audio features with zeros to make them have the same shape (or a user-defined shape) within one batch. 
""" @@ -271,7 +271,7 @@ class SpeechCollator(): utts.append(utt) # audio audios.append(audio) # [T, D] - audio_lens.append(audio.shape[1]) + audio_lens.append(audio.shape[0]) # text # for training, text is token ids # else text is string, convert to unicode ord From 9b3acddd5d7a1469d9dadb5ce959756bc5e98771 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 04:51:32 +0000 Subject: [PATCH 12/25] fix conf for new datapipe; u2 export inputspec --- deepspeech/exps/u2/model.py | 7 +++---- .../librispeech/s1/conf/chunk_confermer.yaml | 16 +++++++++------- .../librispeech/s1/conf/chunk_transformer.yaml | 16 +++++++++------- examples/librispeech/s1/conf/conformer.yaml | 14 ++++++++------ examples/librispeech/s1/conf/transformer.yaml | 14 ++++++++------ 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 308569cd7..05a37b21b 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -511,10 +511,9 @@ class U2Tester(U2Trainer): self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size input_spec = [ - paddle.static.InputSpec( - shape=[None, feat_dim, None], - dtype='float32'), # audio, [B,D,T] - paddle.static.InputSpec(shape=[None], + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], dtype='int64'), # audio_length, [B] ] return infer_model, input_spec diff --git a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml index ec945a188..ef08daa84 100644 --- a/examples/librispeech/s1/conf/chunk_confermer.yaml +++ b/examples/librispeech/s1/conf/chunk_confermer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: 
conf/augmentation.json - batch_size: 4 min_input_len: 0.5 max_input_len: 20.0 min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -80,7 +82,7 @@ model: training: n_epoch: 120 - accum_grad: 1 + accum_grad: 8 global_grad_clip: 5.0 optim: adam optim_conf: diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index 3939ffc68..5ec2ad126 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -103,6 +105,6 @@ decoding: # >0: for decoding, use fixed chunk size as set. # 0: used for training, it's prohibited here. num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. + simulate_streaming: true # simulate streaming inference. 
Defaults to False. diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 8f8bf4539..cce31b163 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 16 min_input_len: 0.5 # seconds max_input_len: 20.0 # seconds min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index a094b0fba..8ea494772 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use 
raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 From 6d92417edd57b73996cf042633ff1d06219c95f1 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 29 Jun 2021 06:05:26 +0000 Subject: [PATCH 13/25] optimize the function --- deepspeech/training/trainer.py | 5 +- deepspeech/utils/checkpoint.py | 109 +++++++++------------------------ 2 files changed, 32 insertions(+), 82 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index f8668370a..cd915760d 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -151,11 +151,12 @@ class Trainer(): resume training. """ scratch = None - infos = self.checkpoint.load_last_parameters( + infos = self.checkpoint._load_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path) + checkpoint_path=self.args.checkpoint_path, + checkpoint_file='checkpoint_latest') if infos: # restore from ckpt self.iteration = infos["step"] diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index b29ef2ab5..be36fdbb2 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -39,8 +39,8 @@ class Checkpoint(object): self.latest_n = latest_n self._save_all = (kbest_n == -1) - def should_save_best(self, metric: float) -> bool: - if not self.best_full(): + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): return True # already full @@ -49,10 +49,10 @@ class Checkpoint(object): worst_metric = self.best_records[worst_record_path] return metric < worst_metric - def best_full(self): + def _best_full(self): return (not self._save_all) and len(self.best_records) == self.kbest_n - def latest_full(self): + def _latest_full(self): return len(self.latest_records) == self.latest_n def add_checkpoint(self, @@ -63,62 +63,62 @@ class Checkpoint(object): infos, metric_type="val_loss"): if (metric_type not in 
infos.keys()): - self.save_parameters(checkpoint_dir, tag_or_iteration, model, + self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) return #save best - if self.should_save_best(infos[metric_type]): - self.save_best_checkpoint_and_update( + if self._should_save_best(infos[metric_type]): + self._save_best_checkpoint_and_update( infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) #save latest - self.save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, + self._save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): - self.save_checkpoint_record(checkpoint_dir, tag_or_iteration) + self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) - def save_best_checkpoint_and_update(self, metric, checkpoint_dir, + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the worst - if self.best_full(): + if self._best_full(): worst_record_path = max(self.best_records, key=self.best_records.get) self.best_records.pop(worst_record_path) if (worst_record_path not in self.latest_records): logger.info( "remove the worst checkpoint: {}".format(worst_record_path)) - self.del_checkpoint(checkpoint_dir, worst_record_path) + self._del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) self.best_records[tag_or_iteration] = metric - def save_latest_checkpoint_and_update( + def _save_latest_checkpoint_and_update( self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): # remove the old - if self.latest_full(): + if self._latest_full(): to_del_fn = self.latest_records.pop(0) if (to_del_fn not in self.best_records.keys()): logger.info( "remove the latest checkpoint: {}".format(to_del_fn)) - 
self.del_checkpoint(checkpoint_dir, to_del_fn) + self._del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self.save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, + self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) - def del_checkpoint(self, checkpoint_dir, tag_or_iteration): + def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, "{}".format(tag_or_iteration)) for filename in glob.glob(checkpoint_path + ".*"): os.remove(filename) logger.info("delete file: {}".format(filename)) - def load_checkpoint_idx(self, checkpoint_record: str) -> int: + def _load_checkpoint_idx(self, checkpoint_record: str) -> int: """Get the iteration number corresponding to the latest saved checkpoint. Args: checkpoint_path (str): the saved path of checkpoint. @@ -134,7 +134,7 @@ class Checkpoint(object): iteration = int(latest_checkpoint.split(":")[-1]) return iteration - def save_checkpoint_record(self, checkpoint_dir: str, iteration: int): + def _save_checkpoint_record(self, checkpoint_dir: str, iteration: int): """Save the iteration number of the latest model to be checkpoint record. Args: checkpoint_dir (str): the directory where checkpoint is saved. @@ -153,65 +153,13 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def load_last_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): - """Load a last model checkpoint from disk. - Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. 
- Returns: - configs (dict): epoch or step, lr and other meta info should be saved. - """ - configs = {} - - if checkpoint_path is not None: - tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None: - checkpoint_record = os.path.join(checkpoint_dir, - "checkpoint_latest") - iteration = self.load_checkpoint_idx(checkpoint_record) - if iteration == -1: - return configs - checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(iteration)) - else: - raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" - ) - - rank = dist.get_rank() - - params_path = checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - model.set_state_dict(model_dict) - logger.info("Rank {}: loaded model from {}".format(rank, params_path)) - - optimizer_path = checkpoint_path + ".pdopt" - if optimizer and os.path.isfile(optimizer_path): - optimizer_dict = paddle.load(optimizer_path) - optimizer.set_state_dict(optimizer_dict) - logger.info("Rank {}: loaded optimizer state from {}".format( - rank, optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = json.load(fin) - return configs - def load_best_parameters(self, + def _load_parameters(self, model, optimizer=None, checkpoint_dir=None, - checkpoint_path=None): + checkpoint_path=None, + checkpoint_file=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -221,6 +169,7 @@ class Checkpoint(object): checkpoint_path (str, optional): if specified, load the checkpoint stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will be ignored. Defaults to None. + checkpoint_file "checkpoint_latest" or "checkpoint_best" Returns: configs (dict): epoch or step, lr and other meta info should be saved. 
""" @@ -228,16 +177,16 @@ class Checkpoint(object): if checkpoint_path is not None: tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None: - checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_best") - iteration = self.load_checkpoint_idx(checkpoint_record) + elif checkpoint_dir is not None and checkpoint_file is not None: + checkpoint_record = os.path.join(checkpoint_dir, checkpoint_file) + iteration = self._load_checkpoint_idx(checkpoint_record) if iteration == -1: return configs checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration)) else: raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!" + "At least one of 'checkpoint_dir' and 'checkpoint_file' and 'checkpoint_path' should be specified!" ) rank = dist.get_rank() @@ -261,7 +210,7 @@ class Checkpoint(object): return configs @mp_tools.rank_zero_only - def save_parameters(self, + def _save_parameters(self, checkpoint_dir: str, tag_or_iteration: Union[int, str], model: paddle.nn.Layer, From b8f190c12cff4ee311501973f612c4e893ac9cea Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 08:33:26 +0000 Subject: [PATCH 14/25] add thchs30 dataset --- examples/dataset/aishell/aishell.py | 4 +- examples/dataset/thchs30/.gitignore | 5 + examples/dataset/thchs30/thchs30.py | 169 ++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 examples/dataset/thchs30/.gitignore create mode 100644 examples/dataset/thchs30/thchs30.py diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index a0cabe352..b8aede2fc 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -60,7 +60,7 @@ def create_manifest(data_dir, manifest_path_prefix): if line == '': continue audio_id, text = line.split(' ', 1) - # remove withespace + # remove withespace, charactor text text = ''.join(text.split()) transcript_dict[audio_id] = 
text @@ -123,6 +123,8 @@ def main(): target_dir=args.target_dir, manifest_path=args.manifest_prefix) + print("Data download and manifest prepare done!") + if __name__ == '__main__': main() diff --git a/examples/dataset/thchs30/.gitignore b/examples/dataset/thchs30/.gitignore new file mode 100644 index 000000000..47dd6268f --- /dev/null +++ b/examples/dataset/thchs30/.gitignore @@ -0,0 +1,5 @@ +*.tgz +manifest.* +data_thchs30 +resource +test-noise diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py new file mode 100644 index 000000000..225adb092 --- /dev/null +++ b/examples/dataset/thchs30/thchs30.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare THCHS-30 mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. 
+""" +import argparse +import codecs +import json +import os +from multiprocessing.pool import Pool +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/18' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/18' +DATA_URL = URL_ROOT + '/data_thchs30.tgz' +TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz' +RESOURCE_URL = URL_ROOT + '/resource.tgz' +MD5_DATA = '2d2252bde5c8429929e1841d4cb95e90' +MD5_TEST_NOISE = '7e8a985fb965b84141b68c68556c2030' +MD5_RESOURCE = 'c0b2a565b4970a0c4fe89fefbf2d97e1' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/THCHS30", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def read_trn(filepath): + """read trn file. + word text in first line. + syllable text in second line. + phoneme text in third line. + + Args: + filepath (str): trn path. + + Returns: + list(str): (word, syllable, phone) + """ + texts = [] + with open(filepath, 'r') as f: + lines = f.read().split('\n') + # last line is `empty` + lines = lines[:3] + assert len(lines) == 3, lines + # charactor text, remove withespace + texts.append(''.join(lines[0].split())) + texts.extend(lines[1:]) + return texts + + +def resolve_symlink(filepath): + """resolve symlink which content is norm file. + + Args: + filepath (str): norm file symlink. + """ + sym_path = Path(filepath) + relative_link = sym_path.read_text().strip() + relative = Path(relative_link) + relpath = sym_path.parent / relative + return relpath.resolve() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + audio_dir = os.path.join(data_dir, dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + file_path = os.path.join(subfolder, fname) + if file_path.endswith('.wav'): + audio_path = os.path.abspath(file_path) + text_path = resolve_symlink(audio_path + '.trn') + else: + continue + + assert os.path.exists(audio_path) and os.path.exists(text_path) + + audio_id = os.path.basename(audio_path)[:-4] + word_text, syllable_text, phone_text = read_trn(text_path) + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': word_text, + 'syllable': syllable_text, + 'phone': phone_text, + }, + ensure_ascii=False)) + + manifest_path = manifest_path_prefix + '.' + dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): + """Download, unpack and create manifest file.""" + datadir = os.path.join(target_dir, subset) + if not os.path.exists(datadir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
% + target_dir) + + if subset == 'data_thchs30': + create_manifest(datadir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + tasks = [ + (DATA_URL, MD5_DATA, args.target_dir, args.manifest_prefix, + "data_thchs30"), + (TEST_NOISE_URL, MD5_TEST_NOISE, args.target_dir, args.manifest_prefix, + "test-noise"), + (RESOURCE_URL, MD5_RESOURCE, args.target_dir, args.manifest_prefix, + "resource"), + ] + with Pool(7) as pool: + pool.starmap(prepare_dataset, tasks) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() From 9e99f99b3c498f080f0b34e7763139f90ce6d751 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:11:32 +0000 Subject: [PATCH 15/25] add thchs30, aidatatang; --- examples/dataset/aidatatang_200zh/.gitignore | 4 + examples/dataset/aidatatang_200zh/README.md | 14 ++ .../aidatatang_200zh/aidatatang_200zh.py | 151 ++++++++++++++++++ examples/dataset/aishell/README.md | 3 + examples/dataset/aishell/aishell.py | 32 ++-- examples/dataset/aishell3/README.md | 3 + examples/dataset/librispeech/librispeech.py | 22 ++- examples/dataset/magicdata/README.md | 15 ++ .../mini_librispeech/mini_librispeech.py | 18 +++ examples/dataset/multi_cn/README.md | 11 ++ examples/dataset/primewords/README.md | 6 + examples/dataset/st-cmds/README.md | 1 + examples/dataset/thchs30/README.md | 55 +++++++ examples/dataset/thchs30/thchs30.py | 4 +- 14 files changed, 326 insertions(+), 13 deletions(-) create mode 100644 examples/dataset/aidatatang_200zh/.gitignore create mode 100644 examples/dataset/aidatatang_200zh/README.md create mode 100644 examples/dataset/aidatatang_200zh/aidatatang_200zh.py create mode 100644 examples/dataset/aishell/README.md create mode 100644 examples/dataset/aishell3/README.md create mode 100644 examples/dataset/magicdata/README.md create mode 100644 examples/dataset/multi_cn/README.md create mode 100644 
examples/dataset/primewords/README.md create mode 100644 examples/dataset/st-cmds/README.md create mode 100644 examples/dataset/thchs30/README.md diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/examples/dataset/aidatatang_200zh/.gitignore new file mode 100644 index 000000000..fcb887790 --- /dev/null +++ b/examples/dataset/aidatatang_200zh/.gitignore @@ -0,0 +1,4 @@ +*.tgz +manifest.* +*.meta +aidatatang_200zh/ diff --git a/examples/dataset/aidatatang_200zh/README.md b/examples/dataset/aidatatang_200zh/README.md new file mode 100644 index 000000000..e6f1eefbd --- /dev/null +++ b/examples/dataset/aidatatang_200zh/README.md @@ -0,0 +1,14 @@ +# [Aidatatang_200zh](http://www.openslr.org/62/) + +Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd. under the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. +The contents and the corresponding descriptions of the corpus include: + +* The corpus contains 200 hours of acoustic data, which is mostly mobile-recorded data. +* 600 speakers from different accent areas in China were invited to participate in the recording. +* The transcription accuracy for each sentence is higher than 98%. +* Recordings were conducted in a quiet indoor environment. +* The database is divided into training, validation, and testing sets in a ratio of 7:1:2. +* Detailed information such as speech data coding and speaker information is preserved in the metadata file. +* Segmented transcripts are also provided. + +The corpus aims to support researchers in speech recognition, machine translation, voiceprint recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use.
diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py new file mode 100644 index 000000000..cc77c3c48 --- /dev/null +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare aidatatang_200zh mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/62' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62' +DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz' +MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/aidatatang_200zh", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. 
(default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aidatatang_200_zh_transcript.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'corpus/', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + if not fname.endswith('.wav'): + continue + + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': text, + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' 
+ dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + with open(dtype + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, subset) + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'corpus') + for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)): + for sub in dirlist: + print(f"unpack dir {sub}...") + for folder, _, filelist in sorted( + os.walk(os.path.join(subfolder, sub))): + for ftar in filelist: + unpack(os.path.join(folder, ftar), folder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix, + subset='aidatatang_200zh') + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/examples/dataset/aishell/README.md b/examples/dataset/aishell/README.md new file mode 100644 index 000000000..6770cd207 --- /dev/null +++ b/examples/dataset/aishell/README.md @@ -0,0 +1,3 @@ +# [Aishell1](http://www.openslr.org/33/) + +This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. 
It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using 3 different devices at the same time: a high fidelity microphone (44.1kHz, 16-bit); an Android-system mobile phone (16kHz, 16-bit); and an iOS-system mobile phone (16kHz, 16-bit). The high fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, achieved through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; commercial use is not allowed without permission.) diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index b8aede2fc..5811a401a 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -31,7 +31,7 @@ from utils.utility import unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') URL_ROOT = 'http://www.openslr.org/resources/33' -URL_ROOT = 'https://openslr.magicdatatech.com/resources/33' +# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33' DATA_URL = URL_ROOT + '/data_aishell.tgz' MD5_DATA = '2f494334227864a8a8fec932999db9d8' @@ -67,11 +67,15 @@ def create_manifest(data_dir, manifest_path_prefix): data_types = ['train', 'dev', 'test'] for dtype in data_types: del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + audio_dir = os.path.join(data_dir, 'wav', dtype) for subfolder, _, filelist in sorted(os.walk(audio_dir)): for fname in filelist: - audio_path = os.path.join(subfolder, fname) - audio_id = fname[:-4] + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] # if no transcription for audio then skipped if audio_id not in
transcript_dict: continue @@ -81,20 +85,30 @@ def create_manifest(data_dir, manifest_path_prefix): json_lines.append( json.dumps( { - 'utt': - os.path.splitext(os.path.basename(audio_path))[0], - 'feat': - audio_path, + 'utt': audio_id, + 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': - text + 'text': text }, ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + manifest_path = manifest_path_prefix + '.' + dtype with codecs.open(manifest_path, 'w', 'utf-8') as fout: for line in json_lines: fout.write(line + '\n') + with open(dtype + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path): """Download, unpack and create manifest file.""" diff --git a/examples/dataset/aishell3/README.md b/examples/dataset/aishell3/README.md new file mode 100644 index 000000000..8a29a6d0f --- /dev/null +++ b/examples/dataset/aishell3/README.md @@ -0,0 +1,3 @@ +# [Aishell3](http://www.openslr.org/93/) + +AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems. The corpus contains roughly **85 hours** of emotion-neutral recordings spoken by 218 native Chinese mandarin speakers and total 88035 utterances. Their auxiliary attributes such as gender, age group and native accents are explicitly marked and provided in the corpus. Accordingly, transcripts in Chinese character-level and pinyin-level are provided along with the recordings. The word & tone transcription accuracy rate is above 98%, through professional speech annotation and strict quality inspection for tone and prosody. 
( This database is free for academic research, not in the commerce, if without permission. ) diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index 55012f73c..f549a95f1 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -77,6 +77,10 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..." % manifest_path) json_lines = [] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') @@ -86,7 +90,9 @@ def create_manifest(data_dir, manifest_path): for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() text = ' '.join(segments[1:]).lower() - audio_filepath = os.path.join(subfolder, segments[0] + '.flac') + + audio_filepath = os.path.abspath( + os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate json_lines.append( @@ -99,10 +105,24 @@ def create_manifest(data_dir, manifest_path): 'text': text })) + + total_sec += duration + total_text += len(text) + total_num += 1 + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: for line in json_lines: out_file.write(line + '\n') + subset = os.path.splitext(manifest_path)[1] + with open(subset + '.meta', 'w') as f: + print(f"{subset}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path): """Download, unpack and create summmary manifest file. 
diff --git a/examples/dataset/magicdata/README.md b/examples/dataset/magicdata/README.md new file mode 100644 index 000000000..083aee97b --- /dev/null +++ b/examples/dataset/magicdata/README.md @@ -0,0 +1,15 @@ +# [MagicData](http://www.openslr.org/68/) + +MAGICDATA Mandarin Chinese Read Speech Corpus was developed by MAGIC DATA Technology Co., Ltd. and freely published for non-commercial use. +The contents and the corresponding descriptions of the corpus include: + +* The corpus contains 755 hours of speech data, which is mostly mobile-recorded data. +* 1080 speakers from different accent areas in China were invited to participate in the recording. +* The sentence transcription accuracy is higher than 98%. +* Recordings were conducted in a quiet indoor environment. +* The database is divided into training, validation, and testing sets in a ratio of 51:1:2. +* Detailed information such as speech data coding and speaker information is preserved in the metadata file. +* The domain of recording texts is diversified, including interactive Q&A, music search, SNS messages, home command and control, etc. +* Segmented transcripts are also provided. + +The corpus aims to support researchers in speech recognition, machine translation, speaker recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use. diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py index f5bc13933..44a6d3671 100644 --- a/examples/dataset/mini_librispeech/mini_librispeech.py +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path): """ print("Creating manifest %s ..."
% manifest_path) json_lines = [] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + for subfolder, _, filelist in sorted(os.walk(data_dir)): text_filelist = [ filename for filename in filelist if filename.endswith('trans.txt') @@ -80,10 +84,24 @@ def create_manifest(data_dir, manifest_path): 'text': text })) + + total_sec += duration + total_text += len(text) + total_num += 1 + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: for line in json_lines: out_file.write(line + '\n') + subset = os.path.splitext(manifest_path)[1] + with open(subset + '.meta', 'w') as f: + print(f"{subset}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path): """Download, unpack and create summmary manifest file. diff --git a/examples/dataset/multi_cn/README.md b/examples/dataset/multi_cn/README.md new file mode 100644 index 000000000..d59b11b6d --- /dev/null +++ b/examples/dataset/multi_cn/README.md @@ -0,0 +1,11 @@ +# multi-cn + +This is a Chinese speech recognition recipe that trains on all Chinese corpora on OpenSLR, including: + +* Aidatatang (140 hours) +* Aishell (151 hours) +* MagicData (712 hours) +* Primewords (99 hours) +* ST-CMDS (110 hours) +* THCHS-30 (26 hours) +* optional AISHELL2 (~1000 hours) if available diff --git a/examples/dataset/primewords/README.md b/examples/dataset/primewords/README.md new file mode 100644 index 000000000..a4f1ed65d --- /dev/null +++ b/examples/dataset/primewords/README.md @@ -0,0 +1,6 @@ +# [Primewords](http://www.openslr.org/47/) + +This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd. +The corpus is recorded by smart mobile phones from 296 native Chinese speakers. 
The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use. + +The mapping between the transcript and utterance is given in JSON format. diff --git a/examples/dataset/st-cmds/README.md b/examples/dataset/st-cmds/README.md new file mode 100644 index 000000000..c7ae50e59 --- /dev/null +++ b/examples/dataset/st-cmds/README.md @@ -0,0 +1 @@ +# [FreeST](http://www.openslr.org/38/) diff --git a/examples/dataset/thchs30/README.md b/examples/dataset/thchs30/README.md new file mode 100644 index 000000000..6b59d663a --- /dev/null +++ b/examples/dataset/thchs30/README.md @@ -0,0 +1,55 @@ +# [THCHS30](http://www.openslr.org/18/) + +This is the *data part* of the `THCHS30 2015` acoustic data +& scripts dataset. + +The dataset is described in more detail in the paper ``THCHS-30 : A Free +Chinese Speech Corpus`` by Dong Wang, Xuewei Zhang. + +A paper (if it can be called a paper) 13 years ago regarding the database: + +Dong Wang, Dalei Wu, Xiaoyan Zhu, ``TCMSD: A new Chinese Continuous Speech Database``, +International Conference on Chinese Computing (ICCC'01), 2001, Singapore. + +The layout of this data pack is the following: + + ``data`` + ``*.wav`` + audio data + + ``*.wav.trn`` + transcriptions + + ``{train,dev,test}`` + contain symlinks into the ``data`` directory for both audio and + transcription files. Contents of these directories define the + train/dev/test split of the data. 
+ + ``{lm_word}`` + ``word.3gram.lm`` + trigram LM based on word + ``lexicon.txt`` + lexicon based on word + + ``{lm_phone}`` + ``phone.3gram.lm`` + trigram LM based on phone + ``lexicon.txt`` + lexicon based on phone + + ``README.TXT`` + this file + + +Data statistics +=============== + +Statistics for the data are as follows: + + =========== ========== ========== =========== + **dataset** **audio** **#sents** **#words** + =========== ========== ========== =========== + train 25 10,000 198,252 + dev 2:14 893 17,743 + test 6:15 2,495 49,085 + =========== ========== ========== =========== diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 225adb092..5613d7685 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -69,9 +69,7 @@ def read_trn(filepath): """ texts = [] with open(filepath, 'r') as f: - lines = f.read().split('\n') - # last line is `empty` - lines = lines[:3] + lines = f.read().strip().split('\n') assert len(lines) == 3, lines # charactor text, remove withespace texts.append(''.join(lines[0].split())) From e106f243b4f765fad466cc0608ba5b1240e2050c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:13:04 +0000 Subject: [PATCH 16/25] dump dataset metadata --- examples/dataset/thchs30/thchs30.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 5613d7685..c28fa56ff 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -96,6 +96,10 @@ def create_manifest(data_dir, manifest_path_prefix): data_types = ['train', 'dev', 'test'] for dtype in data_types: del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + audio_dir = os.path.join(data_dir, dtype) for subfolder, _, filelist in sorted(os.walk(audio_dir)): for fname in filelist: @@ -125,11 +129,23 @@ def create_manifest(data_dir, manifest_path_prefix): }, 
ensure_ascii=False)) + total_sec += duration + total_text += len(text) + total_num += 1 + manifest_path = manifest_path_prefix + '.' + dtype with codecs.open(manifest_path, 'w', 'utf-8') as fout: for line in json_lines: fout.write(line + '\n') + with open(dtype + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): """Download, unpack and create manifest file.""" From 8c0923b86532c5750ecaea52ce74a60e3c310465 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 29 Jun 2021 12:18:03 +0000 Subject: [PATCH 17/25] update gitignore; add gigaspeech --- examples/dataset/aidatatang_200zh/.gitignore | 2 +- examples/dataset/aishell/.gitignore | 3 +++ examples/dataset/gigaspeech/.gitignore | 1 + examples/dataset/gigaspeech/README.md | 10 ++++++++++ examples/dataset/gigaspeech/gigaspeech.py | 13 +++++++++++++ examples/dataset/gigaspeech/run.sh | 10 ++++++++++ examples/dataset/librispeech/.gitignore | 2 ++ examples/dataset/mini_librispeech/.gitignore | 1 + examples/dataset/thchs30/.gitignore | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 examples/dataset/gigaspeech/.gitignore create mode 100644 examples/dataset/gigaspeech/README.md create mode 100644 examples/dataset/gigaspeech/gigaspeech.py create mode 100644 examples/dataset/gigaspeech/run.sh diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/examples/dataset/aidatatang_200zh/.gitignore index fcb887790..fc56525e6 100644 --- a/examples/dataset/aidatatang_200zh/.gitignore +++ b/examples/dataset/aidatatang_200zh/.gitignore @@ -1,4 +1,4 @@ *.tgz manifest.* *.meta -aidatatang_200zh/ +aidatatang_200zh/ \ No newline at end of file diff --git a/examples/dataset/aishell/.gitignore 
b/examples/dataset/aishell/.gitignore index 9c6e517e5..eea6573e1 100644 --- a/examples/dataset/aishell/.gitignore +++ b/examples/dataset/aishell/.gitignore @@ -1 +1,4 @@ data_aishell* +*.meta +manifest.* +*.tgz \ No newline at end of file diff --git a/examples/dataset/gigaspeech/.gitignore b/examples/dataset/gigaspeech/.gitignore new file mode 100644 index 000000000..7f78176b7 --- /dev/null +++ b/examples/dataset/gigaspeech/.gitignore @@ -0,0 +1 @@ +GigaSpeech/ diff --git a/examples/dataset/gigaspeech/README.md b/examples/dataset/gigaspeech/README.md new file mode 100644 index 000000000..4a1715cb8 --- /dev/null +++ b/examples/dataset/gigaspeech/README.md @@ -0,0 +1,10 @@ +# [GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + +``` +git clone https://github.com/SpeechColab/GigaSpeech.git + +cd GigaSpeech +utils/gigaspeech_download.sh /disk1/audio_data/gigaspeech +toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data +cd .. +``` diff --git a/examples/dataset/gigaspeech/gigaspeech.py b/examples/dataset/gigaspeech/gigaspeech.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/examples/dataset/gigaspeech/gigaspeech.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/dataset/gigaspeech/run.sh b/examples/dataset/gigaspeech/run.sh new file mode 100644 index 000000000..0f7b46ab9 --- /dev/null +++ b/examples/dataset/gigaspeech/run.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +curdir=$PWD + +test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git +cd GigaSpeech +source env_vars.sh +utils/gigaspeech_download.sh ${curdir}/ diff --git a/examples/dataset/librispeech/.gitignore b/examples/dataset/librispeech/.gitignore index dfd5c67b5..465806def 100644 --- a/examples/dataset/librispeech/.gitignore +++ b/examples/dataset/librispeech/.gitignore @@ -5,3 +5,5 @@ test-other train-clean-100 train-clean-360 train-other-500 +*.meta +manifest.* diff --git a/examples/dataset/mini_librispeech/.gitignore b/examples/dataset/mini_librispeech/.gitignore index 61f54c966..7fbcfd65d 100644 --- a/examples/dataset/mini_librispeech/.gitignore +++ b/examples/dataset/mini_librispeech/.gitignore @@ -2,3 +2,4 @@ dev-clean/ manifest.dev-clean manifest.train-clean train-clean/ +*.meta diff --git a/examples/dataset/thchs30/.gitignore b/examples/dataset/thchs30/.gitignore index 47dd6268f..b94cd7e40 100644 --- a/examples/dataset/thchs30/.gitignore +++ b/examples/dataset/thchs30/.gitignore @@ -3,3 +3,4 @@ manifest.* data_thchs30 resource test-noise +*.meta From 08b6213bc8b88378cb090534be74eaeb7df306ce Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 30 Jun 2021 03:00:18 +0000 Subject: [PATCH 18/25] fix private function --- deepspeech/training/trainer.py | 5 +- deepspeech/utils/checkpoint.py | 114 ++++++++++++++++++++++----------- 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index cd915760d..5ebba1a98 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -151,12 +151,11 @@ class Trainer(): resume training. 
""" scratch = None - infos = self.checkpoint._load_parameters( + infos = self.checkpoint.load_latest_parameters( self.model, self.optimizer, checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path, - checkpoint_file='checkpoint_latest') + checkpoint_path=self.args.checkpoint_path) if infos: # restore from ckpt self.iteration = infos["step"] diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index be36fdbb2..000fa87ba 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -38,23 +38,7 @@ class Checkpoint(object): self.kbest_n = kbest_n self.latest_n = latest_n self._save_all = (kbest_n == -1) - - def _should_save_best(self, metric: float) -> bool: - if not self._best_full(): - return True - - # already full - worst_record_path = max(self.best_records, key=self.best_records.get) - # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] - worst_metric = self.best_records[worst_record_path] - return metric < worst_metric - - def _best_full(self): - return (not self._save_all) and len(self.best_records) == self.kbest_n - - def _latest_full(self): - return len(self.latest_records) == self.latest_n - + def add_checkpoint(self, checkpoint_dir, tag_or_iteration, @@ -64,7 +48,7 @@ class Checkpoint(object): metric_type="val_loss"): if (metric_type not in infos.keys()): self._save_parameters(checkpoint_dir, tag_or_iteration, model, - optimizer, infos) + optimizer, infos) return #save best @@ -73,15 +57,71 @@ class Checkpoint(object): infos[metric_type], checkpoint_dir, tag_or_iteration, model, optimizer, infos) #save latest - self._save_latest_checkpoint_and_update(checkpoint_dir, tag_or_iteration, - model, optimizer, infos) + self._save_latest_checkpoint_and_update( + checkpoint_dir, tag_or_iteration, model, optimizer, infos) if isinstance(tag_or_iteration, int): self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) + def load_latest_parameters(self, + 
model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. + """ + return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, + "checkpoint_latest") + + def load_best_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, + "checkpoint_best") + + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def _best_full(self): + return (not self._save_all) and len(self.best_records) == self.kbest_n + + def _latest_full(self): + return len(self.latest_records) == self.latest_n + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, - tag_or_iteration, model, optimizer, - infos): + tag_or_iteration, model, optimizer, + infos): # remove the worst if self._best_full(): worst_record_path = max(self.best_records, @@ -93,8 +133,8 @@ class Checkpoint(object): self._del_checkpoint(checkpoint_dir, worst_record_path) # add the new one - self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, - infos) + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) self.best_records[tag_or_iteration] = metric def _save_latest_checkpoint_and_update( @@ -108,8 +148,8 @@ class Checkpoint(object): self._del_checkpoint(checkpoint_dir, to_del_fn) self.latest_records.append(tag_or_iteration) - self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, - infos) + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): checkpoint_path = os.path.join(checkpoint_dir, @@ -153,13 +193,12 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def _load_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None, - checkpoint_file=None): + model, + optimizer=None, + checkpoint_dir=None, 
+ checkpoint_path=None, + checkpoint_file=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -209,13 +248,14 @@ class Checkpoint(object): configs = json.load(fin) return configs + @mp_tools.rank_zero_only def _save_parameters(self, - checkpoint_dir: str, - tag_or_iteration: Union[int, str], - model: paddle.nn.Layer, - optimizer: Optimizer=None, - infos: dict=None): + checkpoint_dir: str, + tag_or_iteration: Union[int, str], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None): """Checkpoint the latest trained model parameters. Args: checkpoint_dir (str): the directory where checkpoint is saved. From c0f7aac8fce3d1fbacbcf146e3e2b42abfe607ae Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 30 Jun 2021 03:10:34 +0000 Subject: [PATCH 19/25] revise conf/*.yaml --- deepspeech/utils/checkpoint.py | 28 +++++++++---------- examples/aishell/s0/conf/deepspeech2.yaml | 3 ++ examples/aishell/s1/conf/chunk_conformer.yaml | 3 ++ examples/aishell/s1/conf/conformer.yaml | 3 ++ examples/librispeech/s0/conf/deepspeech2.yaml | 3 ++ .../librispeech/s1/conf/chunk_confermer.yaml | 3 ++ .../s1/conf/chunk_transformer.yaml | 3 ++ examples/librispeech/s1/conf/conformer.yaml | 3 ++ examples/librispeech/s1/conf/transformer.yaml | 3 ++ examples/tiny/s1/conf/chunk_confermer.yaml | 3 ++ examples/tiny/s1/conf/chunk_transformer.yaml | 3 ++ examples/tiny/s1/conf/conformer.yaml | 3 ++ examples/tiny/s1/conf/transformer.yaml | 3 ++ 13 files changed, 49 insertions(+), 15 deletions(-) diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 000fa87ba..8c5d8d605 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -24,7 +24,6 @@ from paddle.optimizer import Optimizer from deepspeech.utils import mp_tools from deepspeech.utils.log import Log -# import operator logger = Log(__name__).getlog() @@ -38,7 +37,7 @@ class Checkpoint(object): self.kbest_n = kbest_n 
self.latest_n = latest_n self._save_all = (kbest_n == -1) - + def add_checkpoint(self, checkpoint_dir, tag_or_iteration, @@ -64,10 +63,10 @@ class Checkpoint(object): self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) def load_latest_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -80,14 +79,14 @@ class Checkpoint(object): Returns: configs (dict): epoch or step, lr and other meta info should be saved. """ - return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, - "checkpoint_latest") + return self._load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_latest") def load_best_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None): + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None): """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. @@ -100,8 +99,8 @@ class Checkpoint(object): Returns: configs (dict): epoch or step, lr and other meta info should be saved. 
""" - return self._load_parameters(model, optimizer, checkpoint_dir, checkpoint_path, - "checkpoint_best") + return self._load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_best") def _should_save_best(self, metric: float) -> bool: if not self._best_full(): @@ -248,7 +247,6 @@ class Checkpoint(object): configs = json.load(fin) return configs - @mp_tools.rank_zero_only def _save_parameters(self, checkpoint_dir: str, diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 54ce240e7..27ede01bc 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -48,6 +48,9 @@ training: weight_decay: 1e-06 global_grad_clip: 3.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: batch_size: 128 diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 904624c3c..1065dcb03 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -90,6 +90,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index 116c91927..4b1430c58 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -88,6 +88,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index d1746bff3..9f06a3802 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -43,6 +43,9 @@ training: weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: batch_size: 128 diff --git 
a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml index ec945a188..979121639 100644 --- a/examples/librispeech/s1/conf/chunk_confermer.yaml +++ b/examples/librispeech/s1/conf/chunk_confermer.yaml @@ -91,6 +91,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index 3939ffc68..dc2a51f92 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 8f8bf4539..989af22a0 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -87,6 +87,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index a094b0fba..931d7524b 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -82,6 +82,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 decoding: diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index 790066264..606300bdf 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -91,6 +91,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml 
index aa2b145a6..72d368485 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 3813daa04..a6f730501 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -87,6 +87,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 250995faa..71cbdde7f 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -84,6 +84,9 @@ training: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 + checkpoint: + kbest_n: 10 + latest_n: 1 decoding: From 6ee67785f6b6d8445a0995df595bb7cbcb0204ad Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 1 Jul 2021 05:17:05 +0000 Subject: [PATCH 20/25] fix ctc alignment --- deepspeech/exps/u2/model.py | 40 ++++++++++++++++----------- deepspeech/utils/ctc_utils.py | 16 ++++++----- deepspeech/utils/text_grid.py | 2 +- deepspeech/utils/utility.py | 19 +++++++++++++ examples/aishell/s1/local/align.sh | 43 ++++++++++++++++++++++++++++++ tools/Makefile | 4 +-- 6 files changed, 100 insertions(+), 24 deletions(-) create mode 100755 examples/aishell/s1/local/align.sh diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 8802143d6..dd62f537e 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -39,6 +39,7 @@ from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils import text_grid +from deepspeech.utils import utility from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -280,7 +281,15 @@ class 
U2Trainer(Trainer): shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) - logger.info("Setup train/valid/test Dataloader!") + # return text token id + config.collator.keep_transcription_text = False + self.align_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=SpeechCollator.from_config(config)) + logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): config = self.config @@ -507,16 +516,17 @@ class U2Tester(U2Trainer): sys.exit(1) # xxx.align - assert self.args.result_file + assert self.args.result_file and self.args.result_file.endswith( + '.align') self.model.eval() - logger.info(f"Align Total Examples: {len(self.test_loader.dataset)}") + logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms - token_dict = self.test_loader.collate_fn.vocab_list + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list with open(self.args.result_file, 'w') as fout: # one example in batch - for i, batch in enumerate(self.test_loader): + for i, batch in enumerate(self.align_loader): key, feat, feats_length, target, target_length = batch # 1. Encoder @@ -527,36 +537,36 @@ class U2Tester(U2Trainer): encoder_out) # (1, maxlen, vocab_size) # 2. alignment - # print(ctc_probs.size(1)) ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - print(kye[0], alignment) + logger.info("align ids", key[0], alignment) fout.write('{} {}\n'.format(key[0], alignment)) # 3. 
gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - print(kye[0], align_segs) + logger.info("align tokens", key[0], align_segs) # IntervalTier, List["start end token\n"] - subsample = get_subsample(self.config) + subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - tier_path = os.path.join( - os.path.dirname(args.result_file), key[0] + ".tier") + align_output_path = os.path.join( + os.path.dirname(self.args.result_file), "align") + tier_path = os.path.join(align_output_path, key[0] + ".tier") with open(tier_path, 'w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = s.path.join( - os.path.dirname(args.result_file), key[0] + ".TextGrid") + textgrid_path = os.path.join(align_output_path, + key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( len(alignment) + 1) * subsample * second_per_frame text_grid.generate_textgrid( maxtime=second_per_example, - lines=tierformat, + intervals=tierformat, output=textgrid_path) def run_align(self): diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 6201233df..09543d48d 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -86,13 +86,15 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha = paddle.zeros( (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero + # TODO(Hui Zhang): zeros not support paddle.int16 state_path = (paddle.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1 + (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 ) # state path, Tuple((T, 2L+1)) # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb + # TODO(Hui Zhang): VarBase.__getitem__() not support 
np.int64 + log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb for t in range(1, ctc_probs.size(0)): # T for s in range(len(y_insert_blank)): # 2L+1 @@ -108,11 +110,13 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha[t - 1, s - 2], ]) prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][ - y_insert_blank[s]] + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( + y_insert_blank[s])] state_path[t, s] = prev_state[paddle.argmax(candidates)] - state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16) + # TODO(Hui Zhang): zeros not support paddle.int16 + state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32) candidates = paddle.to_tensor([ log_alpha[-1, len(y_insert_blank) - 1], # Sb diff --git a/deepspeech/utils/text_grid.py b/deepspeech/utils/text_grid.py index b774130db..3af58c9ba 100644 --- a/deepspeech/utils/text_grid.py +++ b/deepspeech/utils/text_grid.py @@ -110,7 +110,7 @@ def generate_textgrid(maxtime: float, """ # Download Praat: https://www.fon.hum.uva.nl/praat/ avg_interval = maxtime / (len(intervals) + 1) - print(f"average duration per {name}: {avg_interval}") + print(f"average second/token: {avg_interval}") margin = 0.0001 tg = textgrid.TextGrid(maxTime=maxtime) diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py index 64570026b..a0639e065 100644 --- a/deepspeech/utils/utility.py +++ b/deepspeech/utils/utility.py @@ -79,3 +79,22 @@ def log_add(args: List[int]) -> float: a_max = max(args) lsp = math.log(sum(math.exp(a - a_max) for a in args)) return a_max + lsp + + +def get_subsample(config): + """Subsample rate from config. + + Args: + config (yacs.config.CfgNode): yaml config + + Returns: + int: subsample rate. 
+ """ + input_layer = config["model"]["encoder_conf"]["input_layer"] + assert input_layer in ["conv2d", "conv2d6", "conv2d8"] + if input_layer == "conv2d": + return 4 + elif input_layer == "conv2d6": + return 6 + elif input_layer == "conv2d8": + return 8 diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/s1/local/align.sh new file mode 100755 index 000000000..926cb9397 --- /dev/null +++ b/examples/aishell/s1/local/align.sh @@ -0,0 +1,43 @@ +#! /usr/bin/env bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +device=gpu +if [ ngpu == 0 ];then + device=cpu +fi +config_path=$1 +ckpt_prefix=$2 + +ckpt_name=$(basename ${ckpt_prefxi}) + +mkdir -p exp + + + +batch_size=1 +output_dir=${ckpt_prefix} +mkdir -p ${output_dir} + +# align dump in `result_file` +# .tier, .TextGrid dump in `dir of result_file` +python3 -u ${BIN_DIR}/alignment.py \ +--device ${device} \ +--nproc 1 \ +--config ${config_path} \ +--result_file ${output_dir}/${type}.align \ +--checkpoint_path ${ckpt_prefix} \ +--opts decoding.batch_size ${batch_size} + +if [ $? -ne 0 ]; then + echo "Failed in ctc alignment!" + exit 1 +fi + +exit 0 diff --git a/tools/Makefile b/tools/Makefile index dd5902373..94e5ea2f7 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -19,7 +19,7 @@ kenlm.done: apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50 test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz mkdir -p kenlm/build && cd kenlm/build && cmake .. 
&& make -j4 && make install - cd kenlm && python setup.py install + source venv/bin/activate; cd kenlm && python setup.py install touch kenlm.done sox.done: @@ -32,4 +32,4 @@ sox.done: soxbindings.done: test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git source venv/bin/activate; cd soxbindings && python setup.py install - touch soxbindings.done \ No newline at end of file + touch soxbindings.done From 4c9a1f6dc7def927d5c8b32ff7bbf87224eed693 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 1 Jul 2021 07:41:27 +0000 Subject: [PATCH 21/25] add align.sh and update run.sh --- examples/aishell/s1/run.sh | 7 ++++- examples/librispeech/s1/local/align.sh | 43 ++++++++++++++++++++++++++ examples/librispeech/s1/run.sh | 5 +++ examples/tiny/s1/local/align.sh | 43 ++++++++++++++++++++++++++ examples/tiny/s1/run.sh | 8 ++++- 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100755 examples/librispeech/s1/local/align.sh create mode 100755 examples/tiny/s1/local/align.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 4cf09553b..562cfa04d 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -30,10 +30,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=4 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/s1/local/align.sh new file mode 100755 
index 000000000..926cb9397 --- /dev/null +++ b/examples/librispeech/s1/local/align.sh @@ -0,0 +1,43 @@ +#! /usr/bin/env bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +device=gpu +if [ ngpu == 0 ];then + device=cpu +fi +config_path=$1 +ckpt_prefix=$2 + +ckpt_name=$(basename ${ckpt_prefxi}) + +mkdir -p exp + + + +batch_size=1 +output_dir=${ckpt_prefix} +mkdir -p ${output_dir} + +# align dump in `result_file` +# .tier, .TextGrid dump in `dir of result_file` +python3 -u ${BIN_DIR}/alignment.py \ +--device ${device} \ +--nproc 1 \ +--config ${config_path} \ +--result_file ${output_dir}/${type}.align \ +--checkpoint_path ${ckpt_prefix} \ +--opts decoding.batch_size ${batch_size} + +if [ $? -ne 0 ]; then + echo "Failed in ctc alignment!" + exit 1 +fi + +exit 0 diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 65194d902..b81e8dcfd 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -33,6 +33,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/s1/local/align.sh new file mode 100755 index 000000000..926cb9397 --- /dev/null +++ b/examples/tiny/s1/local/align.sh @@ -0,0 +1,43 @@ +#! /usr/bin/env bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." 
+ +device=gpu +if [ ngpu == 0 ];then + device=cpu +fi +config_path=$1 +ckpt_prefix=$2 + +ckpt_name=$(basename ${ckpt_prefxi}) + +mkdir -p exp + + + +batch_size=1 +output_dir=${ckpt_prefix} +mkdir -p ${output_dir} + +# align dump in `result_file` +# .tier, .TextGrid dump in `dir of result_file` +python3 -u ${BIN_DIR}/alignment.py \ +--device ${device} \ +--nproc 1 \ +--config ${config_path} \ +--result_file ${output_dir}/${type}.align \ +--checkpoint_path ${ckpt_prefix} \ +--opts decoding.batch_size ${batch_size} + +if [ $? -ne 0 ]; then + echo "Failed in ctc alignment!" + exit 1 +fi + +exit 0 diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index b148869b7..41f845b05 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -34,6 +34,12 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi + From 20117d99eeea968e28be3ea2c5b1b110f3f77981 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 2 Jul 2021 03:32:33 +0000 Subject: [PATCH 22/25] fix ckpt load --- deepspeech/models/u2.py | 16 ++-- deepspeech/utils/checkpoint.py | 132 ++++++++++++++++++--------------- examples/aishell/s1/run.sh | 2 +- 3 files changed, 82 insertions(+), 68 deletions(-) diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index 23ae3423d..6b266bdb4 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -599,26 +599,26 @@ class U2BaseModel(nn.Module): best_index = i return hyps[best_index][0] - @jit.export + 
#@jit.export def subsampling_rate(self) -> int: """ Export interface for c++ call, return subsampling_rate of the model """ return self.encoder.embed.subsampling_rate - @jit.export + #@jit.export def right_context(self) -> int: """ Export interface for c++ call, return right_context of the model """ return self.encoder.embed.right_context - @jit.export + #@jit.export def sos_symbol(self) -> int: """ Export interface for c++ call, return sos symbol id of the model """ return self.sos - @jit.export + #@jit.export def eos_symbol(self) -> int: """ Export interface for c++ call, return eos symbol id of the model """ @@ -654,12 +654,14 @@ class U2BaseModel(nn.Module): xs, offset, required_cache_size, subsampling_cache, elayers_output_cache, conformer_cnn_cache) - @jit.export + # @jit.export([ + # paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D] + # ]) def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: """ Export interface for c++ call, apply linear transform and log softmax before ctc Args: - xs (paddle.Tensor): encoder output + xs (paddle.Tensor): encoder output, (B, T, D) Returns: paddle.Tensor: activation before ctc """ @@ -894,7 +896,7 @@ class U2Model(U2BaseModel): model = cls.from_config(config) if checkpoint_path: - infos = checkpoint.load_parameters( + infos = checkpoint.Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") layer_tools.summary(model) diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index 8c5d8d605..a2f7e18ae 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -17,6 +17,7 @@ import os import re from pathlib import Path from typing import Union +from typing import Text import paddle from paddle import distributed as dist @@ -30,7 +31,7 @@ logger = Log(__name__).getlog() __all__ = ["Checkpoint"] -class Checkpoint(object): +class Checkpoint(): def __init__(self, kbest_n: int=5, 
latest_n: int=1): self.best_records: Mapping[Path, float] = {} self.latest_records = [] @@ -40,11 +41,21 @@ class Checkpoint(object): def add_checkpoint(self, checkpoint_dir, - tag_or_iteration, - model, - optimizer, - infos, + tag_or_iteration: Union[int, Text], + model: paddle.nn.Layer, + optimizer: Optimizer=None, + infos: dict=None, metric_type="val_loss"): + """Save checkpoint in best_n and latest_n. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + tag_or_iteration (int or str): the latest iteration(step or epoch) number or tag. + model (Layer): model to be checkpointed. + optimizer (Optimizer, optional): optimizer to be checkpointed. + infos (dict or None)): any info you want to save. + metric_type (str, optional): metric type. Defaults to "val_loss". + """ if (metric_type not in infos.keys()): self._save_parameters(checkpoint_dir, tag_or_iteration, model, optimizer, infos) @@ -61,6 +72,62 @@ class Checkpoint(object): if isinstance(tag_or_iteration, int): self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) + + def load_parameters(self, + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None, + record_file="checkpoint_latest"): + """Load a last model checkpoint from disk. + Args: + model (Layer): model to load parameters. + optimizer (Optimizer, optional): optimizer to load states if needed. + Defaults to None. + checkpoint_dir (str, optional): the directory where checkpoint is saved. + checkpoint_path (str, optional): if specified, load the checkpoint + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. + record_file "checkpoint_latest" or "checkpoint_best" + Returns: + configs (dict): epoch or step, lr and other meta info should be saved. 
+ """ + configs = {} + + if checkpoint_path is not None: + pass + elif checkpoint_dir is not None and record_file is not None: + # load checkpint from record file + checkpoint_record = os.path.join(checkpoint_dir, record_file) + iteration = self._load_checkpoint_idx(checkpoint_record) + if iteration == -1: + return configs + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(iteration)) + else: + raise ValueError( + "At least one of 'checkpoint_path' or 'checkpoint_dir' should be specified!" + ) + + rank = dist.get_rank() + + params_path = checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + model.set_state_dict(model_dict) + logger.info("Rank {}: loaded model from {}".format(rank, params_path)) + + optimizer_path = checkpoint_path + ".pdopt" + if optimizer and os.path.isfile(optimizer_path): + optimizer_dict = paddle.load(optimizer_path) + optimizer.set_state_dict(optimizer_dict) + logger.info("Rank {}: loaded optimizer state from {}".format( + rank, optimizer_path)) + + info_path = re.sub('.pdparams$', '.json', params_path) + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + configs = json.load(fin) + return configs def load_latest_parameters(self, model, @@ -192,61 +259,6 @@ class Checkpoint(object): for i in self.latest_records: handle.write("model_checkpoint_path:{}\n".format(i)) - def _load_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None, - checkpoint_file=None): - """Load a last model checkpoint from disk. - Args: - model (Layer): model to load parameters. - optimizer (Optimizer, optional): optimizer to load states if needed. - Defaults to None. - checkpoint_dir (str, optional): the directory where checkpoint is saved. - checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. 
- checkpoint_file "checkpoint_latest" or "checkpoint_best" - Returns: - configs (dict): epoch or step, lr and other meta info should be saved. - """ - configs = {} - - if checkpoint_path is not None: - tag = os.path.basename(checkpoint_path).split(":")[-1] - elif checkpoint_dir is not None and checkpoint_file is not None: - checkpoint_record = os.path.join(checkpoint_dir, checkpoint_file) - iteration = self._load_checkpoint_idx(checkpoint_record) - if iteration == -1: - return configs - checkpoint_path = os.path.join(checkpoint_dir, - "{}".format(iteration)) - else: - raise ValueError( - "At least one of 'checkpoint_dir' and 'checkpoint_file' and 'checkpoint_path' should be specified!" - ) - - rank = dist.get_rank() - - params_path = checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - model.set_state_dict(model_dict) - logger.info("Rank {}: loaded model from {}".format(rank, params_path)) - - optimizer_path = checkpoint_path + ".pdopt" - if optimizer and os.path.isfile(optimizer_path): - optimizer_dict = paddle.load(optimizer_path) - optimizer.set_state_dict(optimizer_dict) - logger.info("Rank {}: loaded optimizer state from {}".format( - rank, optimizer_path)) - - info_path = re.sub('.pdparams$', '.json', params_path) - if os.path.exists(info_path): - with open(info_path, 'r') as fin: - configs = json.load(fin) - return configs - @mp_tools.rank_zero_only def _save_parameters(self, checkpoint_dir: str, diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 562cfa04d..65b48a976 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -40,5 +40,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi From 
1216917ce03a6ae05753bc6d979c1abe11a00a3c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 5 Jul 2021 02:26:28 +0000 Subject: [PATCH 23/25] move scripts of ngram to s0 --- examples/ngram_lm/{ => s0}/README.md | 0 examples/ngram_lm/{ => s0}/data/README.md | 0 examples/ngram_lm/{ => s0}/data/custom_confusion.txt | 0 examples/ngram_lm/{ => s0}/data/text_correct.txt | 0 examples/ngram_lm/{ => s0}/local/build_zh_lm.sh | 0 examples/ngram_lm/{ => s0}/local/download_lm_zh.sh | 0 examples/ngram_lm/{ => s0}/local/kenlm_score_test.py | 0 examples/ngram_lm/{ => s0}/path.sh | 4 ++-- examples/ngram_lm/{ => s0}/requirements.txt | 0 examples/ngram_lm/{ => s0}/run.sh | 0 10 files changed, 2 insertions(+), 2 deletions(-) rename examples/ngram_lm/{ => s0}/README.md (100%) rename examples/ngram_lm/{ => s0}/data/README.md (100%) rename examples/ngram_lm/{ => s0}/data/custom_confusion.txt (100%) rename examples/ngram_lm/{ => s0}/data/text_correct.txt (100%) rename examples/ngram_lm/{ => s0}/local/build_zh_lm.sh (100%) rename examples/ngram_lm/{ => s0}/local/download_lm_zh.sh (100%) rename examples/ngram_lm/{ => s0}/local/kenlm_score_test.py (100%) rename examples/ngram_lm/{ => s0}/path.sh (69%) rename examples/ngram_lm/{ => s0}/requirements.txt (100%) rename examples/ngram_lm/{ => s0}/run.sh (100%) diff --git a/examples/ngram_lm/README.md b/examples/ngram_lm/s0/README.md similarity index 100% rename from examples/ngram_lm/README.md rename to examples/ngram_lm/s0/README.md diff --git a/examples/ngram_lm/data/README.md b/examples/ngram_lm/s0/data/README.md similarity index 100% rename from examples/ngram_lm/data/README.md rename to examples/ngram_lm/s0/data/README.md diff --git a/examples/ngram_lm/data/custom_confusion.txt b/examples/ngram_lm/s0/data/custom_confusion.txt similarity index 100% rename from examples/ngram_lm/data/custom_confusion.txt rename to examples/ngram_lm/s0/data/custom_confusion.txt diff --git a/examples/ngram_lm/data/text_correct.txt 
b/examples/ngram_lm/s0/data/text_correct.txt similarity index 100% rename from examples/ngram_lm/data/text_correct.txt rename to examples/ngram_lm/s0/data/text_correct.txt diff --git a/examples/ngram_lm/local/build_zh_lm.sh b/examples/ngram_lm/s0/local/build_zh_lm.sh similarity index 100% rename from examples/ngram_lm/local/build_zh_lm.sh rename to examples/ngram_lm/s0/local/build_zh_lm.sh diff --git a/examples/ngram_lm/local/download_lm_zh.sh b/examples/ngram_lm/s0/local/download_lm_zh.sh similarity index 100% rename from examples/ngram_lm/local/download_lm_zh.sh rename to examples/ngram_lm/s0/local/download_lm_zh.sh diff --git a/examples/ngram_lm/local/kenlm_score_test.py b/examples/ngram_lm/s0/local/kenlm_score_test.py similarity index 100% rename from examples/ngram_lm/local/kenlm_score_test.py rename to examples/ngram_lm/s0/local/kenlm_score_test.py diff --git a/examples/ngram_lm/path.sh b/examples/ngram_lm/s0/path.sh similarity index 69% rename from examples/ngram_lm/path.sh rename to examples/ngram_lm/s0/path.sh index 84e2de7d0..5f580bc4b 100644 --- a/examples/ngram_lm/path.sh +++ b/examples/ngram_lm/s0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=${PWD}/../../../ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C @@ -7,4 +7,4 @@ export LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH} \ No newline at end of file +export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH} diff --git a/examples/ngram_lm/requirements.txt b/examples/ngram_lm/s0/requirements.txt similarity index 100% rename from examples/ngram_lm/requirements.txt rename to examples/ngram_lm/s0/requirements.txt diff --git a/examples/ngram_lm/run.sh b/examples/ngram_lm/s0/run.sh similarity index 100% rename from examples/ngram_lm/run.sh rename to examples/ngram_lm/s0/run.sh From 2820537fcc0adea30e271c69803c026c94be83cc Mon Sep 17 00:00:00 2001 From: Hui Zhang 
Date: Tue, 6 Jul 2021 02:39:41 +0000 Subject: [PATCH 24/25] fix load param --- deepspeech/utils/checkpoint.py | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py index a2f7e18ae..a59f8be79 100644 --- a/deepspeech/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -16,8 +16,8 @@ import json import os import re from pathlib import Path -from typing import Union from typing import Text +from typing import Union import paddle from paddle import distributed as dist @@ -51,7 +51,7 @@ class Checkpoint(): Args: checkpoint_dir (str): the directory where checkpoint is saved. tag_or_iteration (int or str): the latest iteration(step or epoch) number or tag. - model (Layer): model to be checkpointed. + model (Layer): model to be checkpointed. optimizer (Optimizer, optional): optimizer to be checkpointed. infos (dict or None)): any info you want to save. metric_type (str, optional): metric type. Defaults to "val_loss". @@ -72,22 +72,22 @@ class Checkpoint(): if isinstance(tag_or_iteration, int): self._save_checkpoint_record(checkpoint_dir, tag_or_iteration) - + def load_parameters(self, - model, - optimizer=None, - checkpoint_dir=None, - checkpoint_path=None, - record_file="checkpoint_latest"): - """Load a last model checkpoint from disk. + model, + optimizer=None, + checkpoint_dir=None, + checkpoint_path=None, + record_file="checkpoint_latest"): + """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. optimizer (Optimizer, optional): optimizer to load states if needed. Defaults to None. checkpoint_dir (str, optional): the directory where checkpoint is saved. checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. 
+ stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. record_file "checkpoint_latest" or "checkpoint_best" Returns: configs (dict): epoch or step, lr and other meta info should be saved. @@ -134,40 +134,40 @@ class Checkpoint(): optimizer=None, checkpoint_dir=None, checkpoint_path=None): - """Load a last model checkpoint from disk. + """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. optimizer (Optimizer, optional): optimizer to load states if needed. Defaults to None. checkpoint_dir (str, optional): the directory where checkpoint is saved. checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. Returns: configs (dict): epoch or step, lr and other meta info should be saved. """ - return self._load_parameters(model, optimizer, checkpoint_dir, - checkpoint_path, "checkpoint_latest") + return self.load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_latest") def load_best_parameters(self, model, optimizer=None, checkpoint_dir=None, checkpoint_path=None): - """Load a last model checkpoint from disk. + """Load a last model checkpoint from disk. Args: model (Layer): model to load parameters. optimizer (Optimizer, optional): optimizer to load states if needed. Defaults to None. checkpoint_dir (str, optional): the directory where checkpoint is saved. checkpoint_path (str, optional): if specified, load the checkpoint - stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will - be ignored. Defaults to None. + stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will + be ignored. Defaults to None. Returns: configs (dict): epoch or step, lr and other meta info should be saved. 
""" - return self._load_parameters(model, optimizer, checkpoint_dir, - checkpoint_path, "checkpoint_best") + return self.load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_best") def _should_save_best(self, metric: float) -> bool: if not self._best_full(): From 8998f4c24d73521d08286983acb80924e3d97e4c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 6 Jul 2021 08:05:41 +0000 Subject: [PATCH 25/25] add timit --- examples/dataset/gigaspeech/run.sh | 8 +- examples/dataset/thchs30/thchs30.py | 5 +- examples/dataset/timit/.gitignore | 4 + examples/dataset/timit/timit.py | 239 ++++++++++++++++++++++++++++ utils/utility.py | 19 +++ 5 files changed, 271 insertions(+), 4 deletions(-) mode change 100644 => 100755 examples/dataset/gigaspeech/run.sh create mode 100644 examples/dataset/timit/.gitignore create mode 100644 examples/dataset/timit/timit.py diff --git a/examples/dataset/gigaspeech/run.sh b/examples/dataset/gigaspeech/run.sh old mode 100644 new mode 100755 index 0f7b46ab9..a1ad8610c --- a/examples/dataset/gigaspeech/run.sh +++ b/examples/dataset/gigaspeech/run.sh @@ -5,6 +5,10 @@ set -e curdir=$PWD test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git -cd GigaSpeech + + +pushd GigaSpeech source env_vars.sh -utils/gigaspeech_download.sh ${curdir}/ +./utils/download_gigaspeech.sh ${curdir}/ +#toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data +popd diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index c28fa56ff..d03e3a22e 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -117,20 +117,21 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + # not dump alignment infos json_lines.append( json.dumps( { 'utt': audio_id, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': 
word_text, + 'text': word_text, # charactor 'syllable': syllable_text, 'phone': phone_text, }, ensure_ascii=False)) total_sec += duration - total_text += len(text) + total_text += len(word_text) total_num += 1 manifest_path = manifest_path_prefix + '.' + dtype diff --git a/examples/dataset/timit/.gitignore b/examples/dataset/timit/.gitignore new file mode 100644 index 000000000..9a3f42281 --- /dev/null +++ b/examples/dataset/timit/.gitignore @@ -0,0 +1,4 @@ +TIMIT.* +TIMIT +manifest.* +*.meta diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py new file mode 100644 index 000000000..222d9af30 --- /dev/null +++ b/examples/dataset/timit/timit.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os +import re +import string +from pathlib import Path + +import soundfile + +from utils.utility import unzip + +URL_ROOT = "" +MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default='~/.cache/paddle/dataset/speech/timit', + type=str, + help="Directory to save the dataset. 
(default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + +#: A string containing Chinese punctuation marks (non-stops). +non_stops = ( + # Fullwidth ASCII variants + '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D' + '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F' + '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60' + + # Halfwidth CJK punctuation + '\uFF62\uFF63\uFF64' + + # CJK symbols and punctuation + '\u3000\u3001\u3003' + + # CJK angle and corner brackets + '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011' + + # CJK brackets and symbols/punctuation + '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F' + + # Other CJK symbols + '\u3030' + + # Special CJK indicators + '\u303E\u303F' + + # Dashes + '\u2013\u2014' + + # Quotation marks and apostrophe + '\u2018\u2019\u201B\u201C\u201D\u201E\u201F' + + # General punctuation + '\u2026\u2027' + + # Overscores and underscores + '\uFE4F' + + # Small form variants + '\uFE51\uFE54' + + # Latin punctuation + '\u00B7') + +#: A string of Chinese stops. +stops = ( + '\uFF01' # Fullwidth exclamation mark + '\uFF1F' # Fullwidth question mark + '\uFF61' # Halfwidth ideographic full stop + '\u3002' # Ideographic full stop +) + +#: A string containing all Chinese punctuation. +punctuation = non_stops + stops + + +def tn(text): + # lower text + text = text.lower() + # remove punc + text = re.sub(f'[{punctuation}{string.punctuation}]', "", text) + return text + + +def read_txt(filepath: str) -> str: + with open(filepath, 'r') as f: + line = f.read().strip().split(maxsplit=2)[2] + return tn(line) + + +def read_algin(filepath: str) -> str: + """read word or phone alignment file. 
+ + + Args: + filepath (str): [description] + + Returns: + str: token sepearte by + """ + aligns = [] # (start, end, token) + with open(filepath, 'r') as f: + for line in f: + items = line.strip().split() + # for phone: (Note: beginning and ending silence regions are marked with h#) + if items[2].strip() == 'h#': + continue + aligns.append(items) + return ' '.join([item[2] for item in aligns]) + + +def create_manifest(data_dir, manifest_path_prefix): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + utts = set() + + data_types = ['TRAIN', 'TEST'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = Path(os.path.join(data_dir, dtype)) + for fname in sorted(audio_dir.rglob('*.WAV')): + audio_path = fname.resolve() # .WAV + audio_id = audio_path.stem + # if uttid exits, then skipped + if audio_id in utts: + continue + + utts.add(audio_id) + text_path = audio_path.with_suffix('.TXT') + phone_path = audio_path.with_suffix('.PHN') + word_path = audio_path.with_suffix('.WRD') + + audio_data, samplerate = soundfile.read( + str(audio_path), dtype='int16') + duration = float(len(audio_data) / samplerate) + word_text = read_txt(text_path) + phone_text = read_algin(phone_path) + + gender_spk = str(audio_path.parent.stem) + spk = gender_spk[1:] + gender = gender_spk[0] + utt_id = '_'.join([spk, gender, audio_id]) + # not dump alignment infos + json_lines.append( + json.dumps( + { + 'utt': utt_id, + 'feat': str(audio_path), + 'feat_shape': (duration, ), # second + 'text': word_text, # word + 'phone': phone_text, + 'spk': spk, + 'gender': gender, + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(word_text.split()) + total_num += 1 + + manifest_path = 
manifest_path_prefix + '.' + dtype.lower() + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + with open(dtype.lower() + '.meta', 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file. + """ + filepath = os.path.join(target_dir, "TIMIT.zip") + if not os.path.exists(filepath): + print(f"Please download TIMIT.zip into {target_dir}.") + raise FileNotFoundError + + if not os.path.exists(os.path.join(target_dir, "TIMIT")): + # check md5sum + assert check_md5sum(filepath, md5sum) + # unpack + unzip(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file + create_manifest(os.path.join(target_dir, "TIMIT"), manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset(URL_ROOT, MD5_DATA, args.target_dir, args.manifest_prefix) + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/utils/utility.py b/utils/utility.py index 0333bc559..344900efa 100644 --- a/utils/utility.py +++ b/utils/utility.py @@ -14,9 +14,15 @@ import os import tarfile import zipfile +from typing import Text from paddle.dataset.common import md5file +__all__ = [ + "check_md5sum", "getfile_insensitive", "download_multi", "download", + "unpack", "unzip" +] + def getfile_insensitive(path): """Get the actual file path when given insensitive filename.""" @@ -54,6 +60,19 @@ def download(url, md5sum, target_dir): return filepath +def check_md5sum(filepath: Text, md5sum: Text) -> bool: + """check md5sum of 
file. + + Args: + filepath (Text): [description] + md5sum (Text): [description] + + Returns: + bool: same or not. + """ + return md5file(filepath) == md5sum + + def unpack(filepath, target_dir, rm_tar=False): """Unpack the file to the target_dir.""" print("Unpacking %s ..." % filepath)