diff --git a/README.md b/README.md
index a2de1783..424dc485 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@

 ## Features

- See [feature list](doc/src/feature_list.md) for more information.
+ See [feature list](doc/src/feature_list.md) for more information.

 ## Setup
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 04137419..8e8a1824 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -272,8 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             infer_model,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, feat_dim, None],
-                    dtype='float32'),  # audio, [B,D,T]
+                    shape=[None, None, feat_dim],
+                    dtype='float32'),  # audio, [B,T,D]
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]
             ])
diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py
index 6b224080..287b51e5 100644
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -179,7 +179,8 @@ class FeatureNormalizer(object):
             wav_number += batch_size

             if wav_number % 1000 == 0:
-                logger.info(f'process {wav_number} wavs,{all_number} frames.')
+                logger.info(
+                    f'processed {wav_number} wavs, {all_number} frames.')

         self.cmvn_info = {
             'mean_stat': list(all_mean_stat.tolist()),
diff --git a/deepspeech/modules/conv.py b/deepspeech/modules/conv.py
index f0f0d746..111f5d3b 100644
--- a/deepspeech/modules/conv.py
+++ b/deepspeech/modules/conv.py
@@ -15,7 +15,7 @@ from paddle import nn
 from paddle.nn import functional as F

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -111,8 +111,10 @@ class ConvBn(nn.Layer):
         ) // self.stride[1] + 1

         # reset padding part to 0
-        masks = sequence_mask(x_len)  #[B, T]
+        masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
+        # TODO(Hui Zhang): bool multiply is not supported yet; cast mask to x's dtype
+        masks = masks.type_as(x)
         x = x.multiply(masks)
         return x, x_len
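Note on the masking changes (here in conv.py, and in mask.py/rnn.py below): the deleted `sequence_mask` was buggy — its broadcast comparison used `>` where `<` was intended — and the new call sites cast the boolean mask before multiplying because elementwise multiply on bool tensors is not supported. A minimal sketch of the intended semantics, using only public Paddle APIs (illustrative, not the repo's exact code):

```python
import paddle

def non_pad_mask_sketch(lengths: paddle.Tensor) -> paddle.Tensor:
    """Sketch of make_non_pad_mask: True marks real frames, False marks padding."""
    max_len = int(lengths.max())
    row_vector = paddle.arange(max_len, dtype='int64')  # [T]
    # broadcast [T] against [B, 1]; `<` is the correct direction
    return row_vector < lengths.unsqueeze(-1)           # [B, T], bool

x_len = paddle.to_tensor([2, 4], dtype='int64')
masks = non_pad_mask_sketch(x_len)
# [[True, True, False, False],
#  [True, True, True,  True]]

x = paddle.rand([2, 4, 8])  # hypothetical [B, T, D] features
# bool * float is the unsupported multiply; cast first, as the hunks do
x = x.multiply(masks.unsqueeze(-1).astype(x.dtype))  # zero out padding frames
```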
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index 65a8ba31..74d4e30a 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -18,40 +18,12 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    'sequence_mask', "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]


-def sequence_mask(x_len, max_len=None, dtype='float32'):
-    """batch sequence mask.
-
-    Args:
-        x_len ([paddle.Tensor]): xs lenght, [B]
-        max_len ([type], optional): max sequence length. Defaults to None.
-        dtype (str, optional): mask data type. Defaults to 'float32'.
-
-    Returns:
-        paddle.Tensor: [B, Tmax]
-
-    Examples:
-        >>> sequence_mask([2, 4])
-        [[1., 1., 0., 0.],
-         [1., 1., 1., 1.]]
-    """
-    # (TODO: Hui Zhang): jit not support Tenosr.dim() and Tensor.ndim
-    # assert x_len.dim() == 1, (x_len.dim(), x_len)
-    max_len = max_len or x_len.max()
-    x_len = paddle.unsqueeze(x_len, -1)
-    row_vector = paddle.arange(max_len)
-    # TODO(Hui Zhang): fix this bug
-    #mask = row_vector < x_len
-    mask = row_vector > x_len  # a bug, broadcast 的时候出错了
-    mask = paddle.cast(mask, dtype)
-    return mask
-
-
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.

     See description of make_non_pad_mask.
@@ -66,7 +38,8 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         [0, 0, 0, 1, 1],
         [0, 0, 1, 1, 1]]
     """
-    assert lengths.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() or Tensor.ndim
+    # assert lengths.dim() == 1
     batch_size = int(lengths.shape[0])
     max_len = int(lengths.max())
     seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py
index cef731e3..29bd2883 100644
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -19,7 +19,7 @@ from paddle.nn import functional as F
 from paddle.nn import initializer as I

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -306,7 +306,9 @@ class RNNStack(nn.Layer):
         """
         for i, rnn in enumerate(self.rnn_stacks):
             x, x_len = rnn(x, x_len)
-            masks = sequence_mask(x_len)  #[B, T]
+            masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
+            # TODO(Hui Zhang): bool multiply is not supported yet; cast mask to x's dtype
+            masks = masks.type_as(x)
             x = x.multiply(masks)
         return x, x_len
diff --git a/doc/src/alignment.md b/doc/src/alignment.md
index fa63894f..9d3231c8 100644
--- a/doc/src/alignment.md
+++ b/doc/src/alignment.md
@@ -18,4 +18,3 @@

 * [ctc alignment](https://mp.weixin.qq.com/s/4aGehNN7PpIvCh03qTT5oA)
 * [时间戳和N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)
-
diff --git a/doc/src/asr_text_backend.md b/doc/src/asr_text_backend.md
index 879e56f8..c3c9896c 100644
--- a/doc/src/asr_text_backend.md
+++ b/doc/src/asr_text_backend.md
@@ -98,4 +98,4 @@

 ## Text Filter

-* 敏感词(黄暴、涉政、违法违禁等)
\ No newline at end of file
+* 敏感词(黄暴、涉政、违法违禁等)
diff --git a/doc/src/benchmark.md b/doc/src/benchmark.md
index f3af2555..9c1c86fd 100644
--- a/doc/src/benchmark.md
+++ b/doc/src/benchmark.md
@@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L
 | 8 | 6.95 X |

 `utils/profile.sh` provides such a demo profiling tool; you can change it as needed.
- diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md index b7fd9322..3ada44f4 100644 --- a/doc/src/chinese_syllable.md +++ b/doc/src/chinese_syllable.md @@ -67,4 +67,4 @@ * https://github.com/KuangDD/phkit * https://github.com/mozillazg/python-pinyin * https://github.com/Kyubyong/g2pC -* https://github.com/kakaobrain/g2pM \ No newline at end of file +* https://github.com/kakaobrain/g2pM diff --git a/doc/src/dataset.md b/doc/src/dataset.md index d70d0e0d..aaa80551 100644 --- a/doc/src/dataset.md +++ b/doc/src/dataset.md @@ -18,4 +18,4 @@ ### ASR Noise -* [asr-noises](https://github.com/speechio/asr-noises) \ No newline at end of file +* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/decoding.md b/doc/src/decoding.md index ade06c4c..347a4098 100644 --- a/doc/src/decoding.md +++ b/doc/src/decoding.md @@ -3,4 +3,3 @@ ## Reference * [时间戳和N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect) - diff --git a/doc/src/feature_list.md b/doc/src/feature_list.md index 57641d5e..573669fa 100644 --- a/doc/src/feature_list.md +++ b/doc/src/feature_list.md @@ -58,4 +58,4 @@ ### Grapheme To Phoneme * syallable -* phoneme \ No newline at end of file +* phoneme diff --git a/doc/src/ngram_lm.md b/doc/src/ngram_lm.md index 07aa5411..119a3b21 100644 --- a/doc/src/ngram_lm.md +++ b/doc/src/ngram_lm.md @@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified ``` build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm - ``` \ No newline at end of file + ``` diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md index c25c760a..06c4f879 100644 --- a/doc/src/praat_textgrid.md +++ b/doc/src/praat_textgrid.md @@ -76,7 +76,7 @@ pip3 install textgrid tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 ``` - tg.tiers属性: + tg.tiers属性: 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: ```text @@ -86,7 +86,7 @@ pip3 install textgrid Interval(1361.89250, 1362.01250, R), Interval(1362.01250, 1362.13250, AY1), Interval(1362.13250, 1362.16250, T), - + ... ] ) @@ -113,7 +113,7 @@ pip3 install textgrid Interval 可以理解为时长 ``` - + 2. textgrid库中的对象 **IntervalTier** 对象: @@ -148,7 +148,7 @@ pip3 install textgrid strict -- > 返回bool值, 表示是否严格TextGrid格式 ``` - ​ + ​ **PointTier** 对象: 方法 @@ -174,7 +174,7 @@ pip3 install textgrid name 返回name ``` - + **Point** 对象: 支持比较大小, 支持加减运算 @@ -185,7 +185,7 @@ pip3 install textgrid time: ``` - ​ + ​ **Interval** 对象: 支持比较大小, 支持加减运算 @@ -250,10 +250,9 @@ pip3 install textgrid grids: --> 返回读取的grids的列表 ``` - + ## Reference * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 * https://blog.csdn.net/duxin_csdn/article/details/88966295 - diff --git a/doc/src/tools.md b/doc/src/tools.md index 4ec09f6a..5fcca923 100644 --- a/doc/src/tools.md +++ b/doc/src/tools.md @@ -1,4 +1,3 @@ # Useful Tools * [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) - diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md index fe0f7e24..b13ab615 100644 --- a/doc/src/tts_text_front_end.md +++ b/doc/src/tts_text_front_end.md @@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese ### Tokenization -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. 
+**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.

 These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary.

@@ -31,7 +31,7 @@

 ### Stop Words

-In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.
+In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.

 Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data.

@@ -209,4 +209,4 @@ TN: 基于规则的方法

 ## Reference
 * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
 * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
\ No newline at end of file
+* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
diff --git a/doc/src/vad.md b/doc/src/vad.md
index 56fe9587..e73e9cf7 100644
--- a/doc/src/vad.md
+++ b/doc/src/vad.md
@@ -29,4 +29,3 @@

 * [Endpoint 检测](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247484024&idx=1&sn=12da2ee76347de4a18856274ba6ba61f&chksm=fcaacaaccbdd43ba6b3e996bbf1e2ac6d5f1b449dfd80fcaccfbbe0a240fa1668b931dbf4bd5&scene=21#wechat_redirect)
 * Kaldi: *https://github.com/kaldi-asr/kaldi/blob/6260b27d146e466c7e1e5c60858e8da9fd9c78ae/src/online2/online-endpoint.h#L132-L150*
 * End-to-End Automatic Speech Recognition Integrated with CTC-Based Voice Activity Detection: *https://arxiv.org/pdf/2002.00551.pdf*
-
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index 10c3a282..36d56723 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -24,7 +24,7 @@ data:
     n_fft: None
     stride_ms: 10.0
     window_ms: 25.0
-    use_dB_normalization: False
+    use_dB_normalization: True
     target_dB: -20
     random_seed: 0
     keep_transcription_text: False
@@ -76,7 +76,7 @@ model:
 training:
   n_epoch: 240
   accum_grad: 2
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
   optim: adam
   optim_conf:
     lr: 0.002
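Note on the two conformer.yaml flips above: `use_dB_normalization: True` with `target_dB: -20` asks the audio frontend to gain-normalize each utterance toward a target decibel level before feature extraction, and `global_grad_clip: 3.0` tightens the global-norm gradient clip. A rough sketch of RMS-based dB normalization, assuming that is what the flag controls (names are illustrative, not the repo's implementation):

```python
import numpy as np

def normalize_db_sketch(samples: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Scale audio so its RMS level lands roughly at target_db."""
    rms_db = 10.0 * np.log10(np.mean(samples**2) + 1e-20)  # current RMS level in dB
    gain_db = target_db - rms_db                           # gain needed to reach target
    return samples * 10.0**(gain_db / 20.0)                # dB gain -> amplitude factor
```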
diff --git a/examples/cc-cedict/local/parser.py b/examples/cc-cedict/local/parser.py
index d6acb834..e1e10b3d 100644
--- a/examples/cc-cedict/local/parser.py
+++ b/examples/cc-cedict/local/parser.py
@@ -1,18 +1,24 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
-
 #A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
-
 #Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
-
 #Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
-
 #Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
-
 #This code was written by Franki Allegra in February 2020.
-
-
-import sys
 import json
+import sys

 # usage: bin ccedict dump.json

@@ -50,9 +56,10 @@ with open(sys.argv[1], 'rt') as file:
             list_of_dicts.append(parsed)

     def remove_surnames():
-        for x in range(len(list_of_dicts)-1, -1, -1):
+        for x in range(len(list_of_dicts) - 1, -1, -1):
             if "surname " in list_of_dicts[x]['english']:
-                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
+                        'traditional']:
                     list_of_dicts.pop(x)

     def main():
@@ -60,13 +67,12 @@ with open(sys.argv[1], 'rt') as file:
         #make each line into a dictionary
         print("Parsing dictionary . . .")
         for line in dict_lines:
-                parse_line(line)
+            parse_line(line)

         #remove entries for surnames from the data (optional):
         print("Removing Surnames . . .")
         remove_surnames()
-
         print("Saving to database (this may take a few minutes) . .
.") with open(sys.argv[2], 'wt') as fout: for one_dict in list_of_dicts: @@ -74,5 +80,6 @@ with open(sys.argv[1], 'rt') as file: fout.write(json_str + "\n") print('Done!') + list_of_dicts = [] parsed_dict = main() diff --git a/requirements.txt b/requirements.txt index a6facb6c..57a951bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage pre-commit +pybind11 resampy==0.2.2 scipy==1.2.1 sentencepiece @@ -7,7 +8,6 @@ snakeviz SoundFile==0.9.0.post1 sox tensorboardX +textgrid typeguard yacs -pybind11 -textgrid diff --git a/tests/mask_test.py b/tests/mask_test.py index cd37a899..f44aca8f 100644 --- a/tests/mask_test.py +++ b/tests/mask_test.py @@ -18,7 +18,6 @@ import paddle from deepspeech.modules.mask import make_non_pad_mask from deepspeech.modules.mask import make_pad_mask -from deepspeech.modules.mask import sequence_mask class TestU2Model(unittest.TestCase): @@ -36,16 +35,10 @@ class TestU2Model(unittest.TestCase): [False, False, True, True, True], ]) - def test_sequence_mask(self): - res = sequence_mask(self.lengths, dtype='bool') - self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist()) - def test_make_non_pad_mask(self): res = make_non_pad_mask(self.lengths) - res1 = sequence_mask(self.lengths, dtype='bool') res2 = make_pad_mask(self.lengths).logical_not() self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist()) - self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist()) self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist()) def test_make_pad_mask(self): diff --git a/third_party/phkit/README.md b/third_party/phkit/README.md index e8f0745c..002425ba 100644 --- a/third_party/phkit/README.md +++ b/third_party/phkit/README.md @@ -39,7 +39,7 @@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 标点: ! ? . , ; : " # ( ) -注:!=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》 +注:!=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》 预留: w y 0 6 7 8 9