remove sequence_mask and change ds2 export audio shape to [B,T,D] (#639)

* remove sequence_mask

* format

* fix ds2 export audio shape from B,D,T to B,T,D
Hui Zhang authored 3 years ago, committed via GitHub
parent 749a113037
commit b3bc451328

@@ -11,7 +11,7 @@
 ## Features

-See [feature list](doc/src/feature_list.md) for more information.
+See [feature list](doc/src/feature_list.md) for more information.

 ## Setup

@@ -272,8 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             infer_model,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, feat_dim, None],
-                    dtype='float32'),  # audio, [B,D,T]
+                    shape=[None, None, feat_dim],
+                    dtype='float32'),  # audio, [B,T,D]
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]
             ])
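For context, a minimal runnable sketch of exporting a static graph against the new [B,T,D] spec. `TinyEncoder` and `feat_dim = 80` are placeholder assumptions, not the repo's `infer_model`; only the two `InputSpec` entries mirror the hunk above.

```python
import paddle
from paddle import nn

feat_dim = 80  # assumed feature dimension (e.g. filterbank bins)


class TinyEncoder(nn.Layer):
    """Stand-in for the inference network; consumes audio as [B, T, D]."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(feat_dim, 32)  # Linear acts on the last dim, D

    def forward(self, audio, audio_len):
        return self.proj(audio), audio_len


static_model = paddle.jit.to_static(
    TinyEncoder(),
    input_spec=[
        paddle.static.InputSpec(shape=[None, None, feat_dim],
                                dtype='float32'),  # audio, [B, T, D]
        paddle.static.InputSpec(shape=[None],
                                dtype='int64'),  # audio_length, [B]
    ])
paddle.jit.save(static_model, 'exported/ds2')  # writes model + params files
```

With the time axis second, callers can feed feature batches padded along T without transposing to the old [B,D,T] layout first.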

@@ -179,7 +179,8 @@ class FeatureNormalizer(object):
             wav_number += batch_size

             if wav_number % 1000 == 0:
-                logger.info(f'process {wav_number} wavs,{all_number} frames.')
+                logger.info(
+                    f'process {wav_number} wavs,{all_number} frames.')

         self.cmvn_info = {
             'mean_stat': list(all_mean_stat.tolist()),

@@ -15,7 +15,7 @@ from paddle import nn
 from paddle.nn import functional as F

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -111,8 +111,10 @@ class ConvBn(nn.Layer):
         ) // self.stride[1] + 1

         # reset padding part to 0
-        masks = sequence_mask(x_len)  #[B, T]
+        masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
+        # TODO(Hui Zhang): not support bool multiply
+        masks = masks.type_as(x)
         x = x.multiply(masks)

         return x, x_len
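The masking pattern in isolation: a self-contained sketch of how the new mask zeroes padded frames on a conv feature map. Shapes are toy values, `make_non_pad_mask` is inlined as a broadcast comparison, and plain Paddle's `astype` stands in for the codebase's `type_as` helper.

```python
import paddle

x = paddle.randn([2, 32, 20, 6])                 # conv feature map, [B, C, D, T]
x_len = paddle.to_tensor([4, 6], dtype='int64')  # valid frames per utterance

t = paddle.arange(0, int(x_len.max()), dtype=paddle.int64)  # [T]
masks = t.unsqueeze(0) < x_len.unsqueeze(-1)     # [B, T], True on valid frames
masks = masks.unsqueeze(1).unsqueeze(1)          # [B, 1, 1, T]
masks = masks.astype(x.dtype)                    # bool multiply is unsupported, cast first
x = x.multiply(masks)                            # broadcasts over C and D, zeroing padded steps
```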

@@ -18,40 +18,12 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    'sequence_mask', "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]

-def sequence_mask(x_len, max_len=None, dtype='float32'):
-    """Batch sequence mask.
-
-    Args:
-        x_len (paddle.Tensor): xs length, [B]
-        max_len (int, optional): max sequence length. Defaults to None.
-        dtype (str, optional): mask data type. Defaults to 'float32'.
-
-    Returns:
-        paddle.Tensor: [B, Tmax]
-
-    Examples:
-        >>> sequence_mask([2, 4])
-        [[1., 1., 0., 0.],
-         [1., 1., 1., 1.]]
-    """
-    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
-    # assert x_len.dim() == 1, (x_len.dim(), x_len)
-    max_len = max_len or x_len.max()
-    x_len = paddle.unsqueeze(x_len, -1)
-    row_vector = paddle.arange(max_len)
-    # TODO(Hui Zhang): fix this bug
-    #mask = row_vector < x_len
-    mask = row_vector > x_len  # a bug: the broadcast comparison is inverted
-    mask = paddle.cast(mask, dtype)
-    return mask
-

 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.
     See description of make_non_pad_mask.
@@ -66,7 +38,8 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         [0, 0, 0, 1, 1],
         [0, 0, 1, 1, 1]]
     """
-    assert lengths.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
+    # assert lengths.dim() == 1
     batch_size = int(lengths.shape[0])
     max_len = int(lengths.max())
     seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
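The removed helper's comparison was inverted (`row_vector > x_len` where `<` was intended), which is why callers now use `make_non_pad_mask`. A standalone sanity check of the surviving helpers' semantics, matching the docstring above (re-implemented here, not imported from the repo):

```python
import paddle


def make_pad_mask(lengths):
    """True at padded positions; see the printout below for lengths [5, 3, 2]."""
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)   # [Tmax]
    return seq_range.unsqueeze(0) >= lengths.unsqueeze(-1)      # [B, Tmax], broadcast


lengths = paddle.to_tensor([5, 3, 2], dtype='int64')
print(make_pad_mask(lengths).astype('int64').numpy())
# [[0 0 0 0 0]
#  [0 0 0 1 1]
#  [0 0 1 1 1]]

# make_non_pad_mask is simply the logical negation:
print(make_pad_mask(lengths).logical_not().astype('int64').numpy())
```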

@@ -19,7 +19,7 @@ from paddle.nn import functional as F
 from paddle.nn import initializer as I

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -306,7 +306,9 @@ class RNNStack(nn.Layer):
         """
         for i, rnn in enumerate(self.rnn_stacks):
             x, x_len = rnn(x, x_len)
-            masks = sequence_mask(x_len)  #[B, T]
+            masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
+            # TODO(Hui Zhang): not support bool multiply
+            masks = masks.type_as(x)
             x = x.multiply(masks)
         return x, x_len

@@ -18,4 +18,3 @@
 * [ctc alignment](https://mp.weixin.qq.com/s/4aGehNN7PpIvCh03qTT5oA)
 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)

@@ -98,4 +98,4 @@
 ## Text Filter

-* Sensitive words (porn/violence, politics, illegal or prohibited content, etc.)
+* Sensitive words (porn/violence, politics, illegal or prohibited content, etc.)

@@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L
 | 8 | 6.95 X |

 `utils/profile.sh` provides such a demo profiling tool; you can change it as needed.

@@ -67,4 +67,4 @@
 * https://github.com/KuangDD/phkit
 * https://github.com/mozillazg/python-pinyin
 * https://github.com/Kyubyong/g2pC
-* https://github.com/kakaobrain/g2pM
+* https://github.com/kakaobrain/g2pM

@@ -18,4 +18,4 @@
 ### ASR Noise

-* [asr-noises](https://github.com/speechio/asr-noises)
+* [asr-noises](https://github.com/speechio/asr-noises)

@@ -3,4 +3,3 @@
 ## Reference

 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)

@@ -58,4 +58,4 @@
 ### Grapheme To Phoneme

 * syllable
-* phoneme
+* phoneme

@@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified
 ```
 build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
-```
+```

@@ -76,7 +76,7 @@ pip3 install textgrid
 tg.read('file.TextGrid')  # 'file.TextGrid' is the file name
 ```

-The tg.tiers attribute:
+The tg.tiers attribute:
 prints all items in the file; the output of print(tg.tiers) looks like:

 ```text
@@ -86,7 +86,7 @@ pip3 install textgrid
     Interval(1361.89250, 1362.01250, R),
     Interval(1362.01250, 1362.13250, AY1),
     Interval(1362.13250, 1362.16250, T),
     ...
 ]
 )
@@ -113,7 +113,7 @@ pip3 install textgrid
 An Interval can be understood as a duration.
 ```

 2. Objects in the textgrid library

 The **IntervalTier** object:
@@ -148,7 +148,7 @@ pip3 install textgrid
 strict --> returns a bool indicating whether the file is in strict TextGrid format
 ```

 The **PointTier** object:

 Methods
@@ -174,7 +174,7 @@ pip3 install textgrid
 name: returns the name
 ```

 The **Point** object:

 Supports comparison and addition/subtraction.
@@ -185,7 +185,7 @@ pip3 install textgrid
 time:
 ```

 The **Interval** object:

 Supports comparison and addition/subtraction.
@@ -250,10 +250,9 @@ pip3 install textgrid
 grids: --> returns the list of grids that were read
 ```

 ## Reference

 * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6
 * https://blog.csdn.net/duxin_csdn/article/details/88966295
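A short recap of the walk-through above as one runnable snippet; attribute names follow the `textgrid` package as described, and 'file.TextGrid' is a placeholder path:

```python
import textgrid

tg = textgrid.TextGrid()
tg.read('file.TextGrid')   # parse the annotation file (placeholder path)

for tier in tg.tiers:      # the tg.tiers attribute holds the tier objects
    print(tier.name)
    for interval in tier:  # assumes IntervalTiers, as in the MFA output above
        print(interval.minTime, interval.maxTime, interval.mark)
```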

@@ -1,4 +1,3 @@
 # Useful Tools

 * [Regex visualizer and common regular expressions](https://wangwl.net/static/projects/visualRegex/#)

@@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese
 ### Tokenization

-**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.
+**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.

 These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package's existing dictionary.
@@ -31,7 +31,7 @@ These “tags” label the part of speech. There are 24 part of speech tags and
 ### Stop Words

-In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.
+In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.

 Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data.
@@ -209,4 +209,4 @@ TN: rule-based methods
 ## Reference

 * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
 * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginners Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
+* [Beginners Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
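The Tokenization and Stop Words passages above come down to just a few lines; a hedged sketch assuming `jieba` and `stopwordsiso` are installed:

```python
import jieba
import stopwordsiso

stops = stopwordsiso.stopwords('zh')        # set of Chinese stop words
tokens = jieba.lcut('我们今天去公园散步')     # tokens, e.g. ['我们', '今天', '去', '公园', '散步']
filtered = [t for t in tokens if t not in stops]
print(filtered)                             # stop words such as '我们' are dropped
```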

@@ -29,4 +29,3 @@
 * [Endpoint detection](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247484024&idx=1&sn=12da2ee76347de4a18856274ba6ba61f&chksm=fcaacaaccbdd43ba6b3e996bbf1e2ac6d5f1b449dfd80fcaccfbbe0a240fa1668b931dbf4bd5&scene=21#wechat_redirect)
 * Kaldi: *https://github.com/kaldi-asr/kaldi/blob/6260b27d146e466c7e1e5c60858e8da9fd9c78ae/src/online2/online-endpoint.h#L132-L150*
 * End-to-End Automatic Speech Recognition Integrated with CTC-Based Voice Activity Detection: *https://arxiv.org/pdf/2002.00551.pdf*

@@ -24,7 +24,7 @@ data:
   n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: False
+  use_dB_normalization: True
   target_dB: -20
   random_seed: 0
   keep_transcription_text: False
@@ -76,7 +76,7 @@ model:
 training:
   n_epoch: 240
   accum_grad: 2
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
   optim: adam
   optim_conf:
     lr: 0.002

@@ -1,18 +1,24 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py

 # A parser for CC-CEDICT. Converts the Chinese-English dictionary into a list of python dictionaries with "traditional", "simplified", "pinyin", and "english" keys.
 # Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
 # Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
 # Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
 # This code was written by Franki Allegra in February 2020.

-import sys
+import json
+import sys

 # usage: bin ccedict dump.json
@@ -50,9 +56,10 @@ with open(sys.argv[1], 'rt') as file:
             list_of_dicts.append(parsed)

     def remove_surnames():
-        for x in range(len(list_of_dicts)-1, -1, -1):
+        for x in range(len(list_of_dicts) - 1, -1, -1):
             if "surname " in list_of_dicts[x]['english']:
-                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
+                        'traditional']:
                     list_of_dicts.pop(x)

     def main():
@@ -60,13 +67,12 @@ with open(sys.argv[1], 'rt') as file:
         #make each line into a dictionary
         print("Parsing dictionary . . .")
         for line in dict_lines:
-            parse_line(line)
+            parse_line(line)

         #remove entries for surnames from the data (optional):
         print("Removing Surnames . . .")
         remove_surnames()

         print("Saving to database (this may take a few minutes) . . .")
         with open(sys.argv[2], 'wt') as fout:
             for one_dict in list_of_dicts:
@@ -74,5 +80,6 @@ with open(sys.argv[1], 'rt') as file:
                 fout.write(json_str + "\n")
         print('Done!')

     list_of_dicts = []
     parsed_dict = main()
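Assuming the script is saved as `parser.py` and invoked as the usage comment suggests (e.g. `python parser.py cedict_ts.u8 dump.json`), the dump holds one JSON object per line with the four keys named in the header comments, and can be read back like this:

```python
import json

# Read the parser's output: one JSON object per line.
with open('dump.json', 'rt') as fin:
    entries = [json.loads(line) for line in fin]

first = entries[0]
print(first['traditional'], first['simplified'], first['pinyin'], first['english'])
```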

@@ -1,5 +1,6 @@
 coverage
 pre-commit
+pybind11
 resampy==0.2.2
 scipy==1.2.1
 sentencepiece
@@ -7,7 +8,6 @@ snakeviz
 SoundFile==0.9.0.post1
 sox
 tensorboardX
+textgrid
 typeguard
 yacs
-pybind11
-textgrid

@@ -18,7 +18,6 @@ import paddle

 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.mask import make_pad_mask
-from deepspeech.modules.mask import sequence_mask


 class TestU2Model(unittest.TestCase):
@@ -36,16 +35,10 @@ class TestU2Model(unittest.TestCase):
             [False, False, True, True, True],
         ])

-    def test_sequence_mask(self):
-        res = sequence_mask(self.lengths, dtype='bool')
-        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
-
     def test_make_non_pad_mask(self):
         res = make_non_pad_mask(self.lengths)
-        res1 = sequence_mask(self.lengths, dtype='bool')
         res2 = make_pad_mask(self.lengths).logical_not()
         self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
-        self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
         self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())

     def test_make_pad_mask(self):

@@ -39,7 +39,7 @@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 Punctuation:
 ! ? . , ; : " # ( )

-Note: !=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#=#   |(=([{{【<《|)=)]}}】>》
+Note: !=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#=#   |(=([{{【<《|)=)]}}】>》

 Reserved:
 w y 0 6 7 8 9
