From 5d4f3fbd7b2b81abebe87641dd25f1e2ebb1e53d Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 21 May 2021 09:55:52 +0000
Subject: [PATCH] format

---
 README.md                               |  2 +-
 deepspeech/__init__.py                  |  1 -
 deepspeech/frontend/normalizer.py       |  3 ++-
 deepspeech/modules/mask.py              |  1 +
 doc/src/alignment.md                    |  1 -
 doc/src/asr_text_backend.md             |  2 +-
 doc/src/benchmark.md                    |  1 -
 doc/src/chinese_syllable.md             |  2 +-
 doc/src/dataset.md                      |  2 +-
 doc/src/decoding.md                     |  1 -
 doc/src/feature_list.md                 |  2 +-
 doc/src/ngram_lm.md                     |  2 +-
 doc/src/praat_textgrid.md               | 15 ++++++------
 doc/src/tools.md                        |  1 -
 doc/src/tts_text_front_end.md           |  6 ++---
 doc/src/vad.md                          |  1 -
 examples/aishell/s1/conf/conformer.yaml |  4 ++--
 examples/cc-cedict/local/parser.py      | 31 +++++++++++++++----------
 requirements.txt                        |  4 ++--
 third_party/phkit/README.md             |  2 +-
 20 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index a2de1783a..424dc485e 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 
 ## Features
 
- See [feature list](doc/src/feature_list.md) for more information. 
+ See [feature list](doc/src/feature_list.md) for more information.
 
 ## Setup
 
diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index ac9ccdc77..c942de0cf 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -421,7 +421,6 @@ logger.warn(
 )
 F.ctc_loss = ctc_loss
 
-
 ########### hack paddle.nn #############
 if not hasattr(paddle.nn, 'Module'):
     logger.warn("register user Module to paddle.nn, remove this when fixed!")
diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py
index 6b224080b..287b51e58 100644
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -179,7 +179,8 @@ class FeatureNormalizer(object):
 
             wav_number += batch_size
             if wav_number % 1000 == 0:
-                logger.info(f'process {wav_number} wavs,{all_number} frames.')
+                logger.info(
+                    f'process {wav_number} wavs, {all_number} frames.')
 
         self.cmvn_info = {
             'mean_stat': list(all_mean_stat.tolist()),
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index c506f127b..74d4e30a6 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -23,6 +23,7 @@ __all__ = [
     "mask_finished_preds"
 ]
 
+
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.
     See description of make_non_pad_mask.
diff --git a/doc/src/alignment.md b/doc/src/alignment.md
index fa63894f2..9d3231c89 100644
--- a/doc/src/alignment.md
+++ b/doc/src/alignment.md
@@ -18,4 +18,3 @@
 
 * [ctc alignment](https://mp.weixin.qq.com/s/4aGehNN7PpIvCh03qTT5oA)
 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)
-
diff --git a/doc/src/asr_text_backend.md b/doc/src/asr_text_backend.md
index 879e56f8a..c3c9896c7 100644
--- a/doc/src/asr_text_backend.md
+++ b/doc/src/asr_text_backend.md
@@ -98,4 +98,4 @@
 
 ## Text Filter
 
-* Sensitive words (porn/violence, politics-related, illegal/prohibited, etc.)
\ No newline at end of file
+* Sensitive words (porn/violence, politics-related, illegal/prohibited, etc.)
diff --git a/doc/src/benchmark.md b/doc/src/benchmark.md
index f3af25552..9c1c86fd7 100644
--- a/doc/src/benchmark.md
+++ b/doc/src/benchmark.md
@@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L
 | 8   | 6.95 X |
 
 `utils/profile.sh` provides such a demo profiling tool; you can change it as needed.
-
diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md
index b7fd93223..3ada44f4e 100644
--- a/doc/src/chinese_syllable.md
+++ b/doc/src/chinese_syllable.md
@@ -67,4 +67,4 @@
 * https://github.com/KuangDD/phkit
 * https://github.com/mozillazg/python-pinyin
 * https://github.com/Kyubyong/g2pC
-* https://github.com/kakaobrain/g2pM
\ No newline at end of file
+* https://github.com/kakaobrain/g2pM
diff --git a/doc/src/dataset.md b/doc/src/dataset.md
index d70d0e0d2..aaa805510 100644
--- a/doc/src/dataset.md
+++ b/doc/src/dataset.md
@@ -18,4 +18,4 @@
 
 ### ASR Noise
 
-* [asr-noises](https://github.com/speechio/asr-noises)
\ No newline at end of file
+* [asr-noises](https://github.com/speechio/asr-noises)
diff --git a/doc/src/decoding.md b/doc/src/decoding.md
index ade06c4cb..347a4098b 100644
--- a/doc/src/decoding.md
+++ b/doc/src/decoding.md
@@ -3,4 +3,3 @@
 
 ## Reference
 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)
-
diff --git a/doc/src/feature_list.md b/doc/src/feature_list.md
index 57641d5ea..573669fa2 100644
--- a/doc/src/feature_list.md
+++ b/doc/src/feature_list.md
@@ -58,4 +58,4 @@
 ### Grapheme To Phoneme
 
 * syllable
-* phoneme
\ No newline at end of file
+* phoneme
diff --git a/doc/src/ngram_lm.md b/doc/src/ngram_lm.md
index 07aa5411c..119a3b21c 100644
--- a/doc/src/ngram_lm.md
+++ b/doc/src/ngram_lm.md
@@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified
   ```
   build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
-  ```
\ No newline at end of file
+  ```
diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md
index c25c760ae..06c4f8791 100644
--- a/doc/src/praat_textgrid.md
+++ b/doc/src/praat_textgrid.md
@@ -76,7 +76,7 @@ pip3 install textgrid
    tg.read('file.TextGrid')  # 'file.TextGrid' is the file name
    ```
 
-   The tg.tiers attribute: 
+   The tg.tiers attribute:
    prints every item in the file; the output of print(tg.tiers) looks like this:
 
    ```text
   [IntervalTier(
       phones, [
           Interval(1361.8925, 1362.0125, sil),
@@ -86,7 +86,7 @@ pip3 install textgrid
           Interval(1361.89250, 1362.01250, R),
           Interval(1362.01250, 1362.13250, AY1),
           Interval(1362.13250, 1362.16250, T),
-          
+
           ...
       ]
   )
@@ -113,7 +113,7 @@ pip3 install textgrid
    An Interval can be understood as a duration.
    ```
 
-   
+
 2. Objects in the textgrid library
 
    **IntervalTier** object:
    Methods
@@ -148,7 +148,7 @@ pip3 install textgrid
       strict --> returns a bool indicating whether the file is in strict TextGrid format
    ```
 
-  ​ 
+  ​
    **PointTier** object:
    Methods
@@ -174,7 +174,7 @@ pip3 install textgrid
       name: returns the name
    ```
 
-   
+
    **Point** object:
    Supports comparison; supports addition and subtraction
@@ -185,7 +185,7 @@ pip3 install textgrid
       time:
    ```
 
-  ​ 
+  ​
    **Interval** object:
    Supports comparison; supports addition and subtraction
@@ -250,10 +250,9 @@ pip3 install textgrid
       grids: --> returns the list of grids that were read
    ```
 
-   
+
 ## Reference
 
 * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6
 * https://blog.csdn.net/duxin_csdn/article/details/88966295
-
diff --git a/doc/src/tools.md b/doc/src/tools.md
index 4ec09f6a2..5fcca9239 100644
--- a/doc/src/tools.md
+++ b/doc/src/tools.md
@@ -1,4 +1,3 @@
 # Useful Tools
 
 * [Regex visualization and common regular expressions](https://wangwl.net/static/projects/visualRegex/#)
-
diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md
index fe0f7e247..b13ab615c 100644
--- a/doc/src/tts_text_front_end.md
+++ b/doc/src/tts_text_front_end.md
@@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese
 
 ### Tokenization
 
-**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. 
+**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.
 
 These “tags” label the part of speech. There are 24 part-of-speech tags and 4 proper-name category labels in the `jieba` package’s existing dictionary.
 
@@ -31,7 +31,7 @@
 
 ### Stop Words
 
-In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous. 
+In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.
 
 Instead of manually removing them, you could import the `stopwordsiso` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data.
@@ -209,4 +209,4 @@ TN: rule-based methods
 ## Reference
 * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
 * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
\ No newline at end of file
+* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
diff --git a/doc/src/vad.md b/doc/src/vad.md
index 56fe95879..e73e9cf7a 100644
--- a/doc/src/vad.md
+++ b/doc/src/vad.md
@@ -29,4 +29,3 @@
 * [Endpoint Detection](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247484024&idx=1&sn=12da2ee76347de4a18856274ba6ba61f&chksm=fcaacaaccbdd43ba6b3e996bbf1e2ac6d5f1b449dfd80fcaccfbbe0a240fa1668b931dbf4bd5&scene=21#wechat_redirect)
 * Kaldi: *https://github.com/kaldi-asr/kaldi/blob/6260b27d146e466c7e1e5c60858e8da9fd9c78ae/src/online2/online-endpoint.h#L132-L150*
 * End-to-End Automatic Speech Recognition Integrated with CTC-Based Voice Activity Detection: *https://arxiv.org/pdf/2002.00551.pdf*
-
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index 10c3a2822..36d56723b 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -24,7 +24,7 @@ data:
   n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: False
+  use_dB_normalization: True
   target_dB: -20
   random_seed: 0
   keep_transcription_text: False
@@ -76,7 +76,7 @@ model:
 training:
   n_epoch: 240
   accum_grad: 2
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
   optim: adam
   optim_conf:
     lr: 0.002
diff --git a/examples/cc-cedict/local/parser.py b/examples/cc-cedict/local/parser.py
index d6acb834f..e1e10b3d0 100644
--- a/examples/cc-cedict/local/parser.py
+++ b/examples/cc-cedict/local/parser.py
@@ -1,18 +1,24 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
-
 #A parser for CC-CEDICT. Converts the Chinese-English dictionary into a list of Python dictionaries with "traditional", "simplified", "pinyin", and "english" keys.
-
 #Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
-
 #Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
-
 #Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
-
 #This code was written by Franki Allegra in February 2020.
-
-
-import sys
 import json
+import sys
 
 # usage: bin ccedict dump.json
 
@@ -50,9 +56,10 @@ with open(sys.argv[1], 'rt') as file:
             list_of_dicts.append(parsed)
 
     def remove_surnames():
-        for x in range(len(list_of_dicts)-1, -1, -1):
+        for x in range(len(list_of_dicts) - 1, -1, -1):
             if "surname " in list_of_dicts[x]['english']:
-                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
+                        'traditional']:
                     list_of_dicts.pop(x)
 
     def main():
@@ -60,13 +67,12 @@ with open(sys.argv[1], 'rt') as file:
         #make each line into a dictionary
         print("Parsing dictionary . . .")
         for line in dict_lines:
-          parse_line(line)
+            parse_line(line)
 
         #remove entries for surnames from the data (optional):
         print("Removing Surnames . . .")
         remove_surnames()
-
         print("Saving to database (this may take a few minutes) . . .")
         with open(sys.argv[2], 'wt') as fout:
             for one_dict in list_of_dicts:
@@ -74,5 +80,6 @@ with open(sys.argv[1], 'rt') as file:
                 fout.write(json_str + "\n")
         print('Done!')
 
+
 list_of_dicts = []
 parsed_dict = main()
diff --git a/requirements.txt b/requirements.txt
index a6facb6cb..57a951bbd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 coverage
 pre-commit
+pybind11
 resampy==0.2.2
 scipy==1.2.1
 sentencepiece
@@ -7,7 +8,6 @@ snakeviz
 SoundFile==0.9.0.post1
 sox
 tensorboardX
+textgrid
 typeguard
 yacs
-pybind11
-textgrid
diff --git a/third_party/phkit/README.md b/third_party/phkit/README.md
index e8f0745ce..002425bad 100644
--- a/third_party/phkit/README.md
+++ b/third_party/phkit/README.md
@@ -39,7 +39,7 @@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 
 Punctuation: ! ? . , ; : " # ( )
 
-Note: !=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》 
+Note: !=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》
 
 Reserved: w y 0 6 7 8 9
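
Some usage notes for the code and docs touched above. For `make_pad_mask` in deepspeech/modules/mask.py: it turns a batch of utterance lengths into a boolean mask over time steps. A minimal sketch of the intended behavior (a sketch only; it assumes the repo is importable as `deepspeech` and that the function follows the usual make_pad_mask convention of `True` at padded positions):

```python
import paddle
from deepspeech.modules.mask import make_pad_mask

lengths = paddle.to_tensor([5, 3, 2])  # valid frames per utterance
mask = make_pad_mask(lengths)          # shape [batch, max_len]
print(mask)
# Expected pattern: row i is True past lengths[i]:
# [[False, False, False, False, False],
#  [False, False, False, True,  True ],
#  [False, False, True,  True,  True ]]
```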
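For the `textgrid` objects documented in doc/src/praat_textgrid.md: a minimal reading sketch, assuming the `textgrid` package from `pip3 install textgrid`, a hypothetical Praat-generated `file.TextGrid`, and that package's `minTime`/`maxTime`/`mark` attribute names:

```python
import textgrid

tg = textgrid.TextGrid()
tg.read('file.TextGrid')   # hypothetical alignment file produced by Praat

for tier in tg.tiers:      # IntervalTier here; PointTier points use .time/.mark
    print(tier.name)       # e.g. 'phones' or 'words'
    for interval in tier:  # an Interval is (start time, end time, label)
        print(interval.minTime, interval.maxTime, interval.mark)
```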
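For the tokenization and stop-word steps described in doc/src/tts_text_front_end.md: a small sketch assuming the `jieba` and `stopwordsiso` packages (the sample sentence is arbitrary):

```python
import jieba
import jieba.posseg as pseg          # POS tagging with jieba's tag set
from stopwordsiso import stopwords

text = "我们今天学习语音识别的文本前端"  # arbitrary sample sentence
print(pseg.lcut(text))               # [(word, part-of-speech tag), ...]

zh_stops = stopwords("zh")           # set of Chinese stop words
print([w for w in jieba.lcut(text) if w not in zh_stops])
```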
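And for examples/cc-cedict/local/parser.py: each CC-CEDICT entry has the shape `TRADITIONAL SIMPLIFIED [PINYIN] /gloss/.../`. A self-contained re-implementation of just that parsing step (not the patched code itself):

```python
def parse_entry(line):
    """Parse one CC-CEDICT line: TRADITIONAL SIMPLIFIED [PINYIN] /gloss/.../"""
    if not line.strip() or line.startswith('#'):
        return None  # skip blank lines and the comment header
    traditional, simplified, rest = line.split(' ', 2)
    pinyin = rest[rest.find('[') + 1:rest.find(']')]
    english = rest[rest.find('/') + 1:rest.rfind('/')]
    return {'traditional': traditional, 'simplified': simplified,
            'pinyin': pinyin, 'english': english}

print(parse_entry('中國 中国 [Zhong1 guo2] /China/'))
# -> {'traditional': '中國', 'simplified': '中国',
#     'pinyin': 'Zhong1 guo2', 'english': 'China'}
```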