remove sequence_mask and change ds2 export audio shape to [B,T,D] (#639)

* remove sequence_mask

* format

* fix ds2 export audio shape from B,D,T to B,T,D
Hui Zhang authored 3 years ago, committed via GitHub
parent 749a113037
commit b3bc451328

@@ -11,7 +11,7 @@
 ## Features

-See [feature list](doc/src/feature_list.md) for more information.
+See [feature list](doc/src/feature_list.md) for more information.

 ## Setup

@@ -272,8 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             infer_model,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, feat_dim, None],
-                    dtype='float32'),  # audio, [B,D,T]
+                    shape=[None, None, feat_dim],
+                    dtype='float32'),  # audio, [B,T,D]
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]
             ])
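For context, a minimal runnable sketch of exporting a static graph against the new [B,T,D] spec. `TinyEncoder` and `feat_dim = 80` are placeholder assumptions, not the repo's `infer_model`; only the two `InputSpec` entries mirror the hunk above.

```python
import paddle
from paddle import nn

feat_dim = 80  # assumed feature dimension (e.g. filterbank bins)


class TinyEncoder(nn.Layer):
    """Stand-in for the inference network; consumes audio as [B, T, D]."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(feat_dim, 32)  # Linear acts on the last dim, D

    def forward(self, audio, audio_len):
        return self.proj(audio), audio_len


static_model = paddle.jit.to_static(
    TinyEncoder(),
    input_spec=[
        paddle.static.InputSpec(shape=[None, None, feat_dim],
                                dtype='float32'),  # audio, [B, T, D]
        paddle.static.InputSpec(shape=[None],
                                dtype='int64'),  # audio_length, [B]
    ])
paddle.jit.save(static_model, 'exported/ds2')  # writes model + params files
```

With the time axis second, callers can feed feature batches padded along T without transposing to the old [B,D,T] layout first.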

@@ -179,7 +179,8 @@ class FeatureNormalizer(object):
             wav_number += batch_size

             if wav_number % 1000 == 0:
-                logger.info(f'process {wav_number} wavs,{all_number} frames.')
+                logger.info(
+                    f'process {wav_number} wavs,{all_number} frames.')

         self.cmvn_info = {
             'mean_stat': list(all_mean_stat.tolist()),

@@ -15,7 +15,7 @@ from paddle import nn
 from paddle.nn import functional as F

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -111,8 +111,10 @@ class ConvBn(nn.Layer):
         ) // self.stride[1] + 1

         # reset padding part to 0
-        masks = sequence_mask(x_len)  #[B, T]
+        masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
+        # TODO(Hui Zhang): not support bool multiply
+        masks = masks.type_as(x)
         x = x.multiply(masks)

         return x, x_len
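The masking pattern in isolation: a self-contained sketch of how the new mask zeroes padded frames on a conv feature map. Shapes are toy values, `make_non_pad_mask` is inlined as a broadcast comparison, and plain Paddle's `astype` stands in for the codebase's `type_as` helper.

```python
import paddle

x = paddle.randn([2, 32, 20, 6])                 # conv feature map, [B, C, D, T]
x_len = paddle.to_tensor([4, 6], dtype='int64')  # valid frames per utterance

t = paddle.arange(0, int(x_len.max()), dtype=paddle.int64)  # [T]
masks = t.unsqueeze(0) < x_len.unsqueeze(-1)     # [B, T], True on valid frames
masks = masks.unsqueeze(1).unsqueeze(1)          # [B, 1, 1, T]
masks = masks.astype(x.dtype)                    # bool multiply is unsupported, cast first
x = x.multiply(masks)                            # broadcasts over C and D, zeroing padded steps
```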

@@ -18,40 +18,12 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    'sequence_mask', "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]

-def sequence_mask(x_len, max_len=None, dtype='float32'):
-    """Batch sequence mask.
-
-    Args:
-        x_len (paddle.Tensor): xs length, [B]
-        max_len (int, optional): max sequence length. Defaults to None.
-        dtype (str, optional): mask data type. Defaults to 'float32'.
-
-    Returns:
-        paddle.Tensor: [B, Tmax]
-
-    Examples:
-        >>> sequence_mask([2, 4])
-        [[1., 1., 0., 0.],
-         [1., 1., 1., 1.]]
-    """
-    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
-    # assert x_len.dim() == 1, (x_len.dim(), x_len)
-    max_len = max_len or x_len.max()
-    x_len = paddle.unsqueeze(x_len, -1)
-    row_vector = paddle.arange(max_len)
-    # TODO(Hui Zhang): fix this bug
-    #mask = row_vector < x_len
-    mask = row_vector > x_len  # a bug: the broadcast comparison is inverted
-    mask = paddle.cast(mask, dtype)
-    return mask
-

 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.
     See description of make_non_pad_mask.
@@ -66,7 +38,8 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         [0, 0, 0, 1, 1],
         [0, 0, 1, 1, 1]]
     """
-    assert lengths.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
+    # assert lengths.dim() == 1
     batch_size = int(lengths.shape[0])
     max_len = int(lengths.max())
     seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
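The removed helper's comparison was inverted (`row_vector > x_len` where `<` was intended), which is why callers now use `make_non_pad_mask`. A standalone sanity check of the surviving helpers' semantics, matching the docstring above (re-implemented here, not imported from the repo):

```python
import paddle


def make_pad_mask(lengths):
    """True at padded positions; see the printout below for lengths [5, 3, 2]."""
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)   # [Tmax]
    return seq_range.unsqueeze(0) >= lengths.unsqueeze(-1)      # [B, Tmax], broadcast


lengths = paddle.to_tensor([5, 3, 2], dtype='int64')
print(make_pad_mask(lengths).astype('int64').numpy())
# [[0 0 0 0 0]
#  [0 0 0 1 1]
#  [0 0 1 1 1]]

# make_non_pad_mask is simply the logical negation:
print(make_pad_mask(lengths).logical_not().astype('int64').numpy())
```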

@@ -19,7 +19,7 @@ from paddle.nn import functional as F
 from paddle.nn import initializer as I

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -306,7 +306,9 @@ class RNNStack(nn.Layer):
         """
         for i, rnn in enumerate(self.rnn_stacks):
             x, x_len = rnn(x, x_len)
-            masks = sequence_mask(x_len)  #[B, T]
+            masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
+            # TODO(Hui Zhang): not support bool multiply
+            masks = masks.type_as(x)
             x = x.multiply(masks)
         return x, x_len

@@ -18,4 +18,3 @@
 * [ctc alignment](https://mp.weixin.qq.com/s/4aGehNN7PpIvCh03qTT5oA)
 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)

@@ -98,4 +98,4 @@
 ## Text Filter

-* Sensitive words (porn/violence, politics, illegal or prohibited content, etc.)
+* Sensitive words (porn/violence, politics, illegal or prohibited content, etc.)

@@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L
 | 8 | 6.95 X |

 `utils/profile.sh` provides such a demo profiling tool; you can change it as needed.

@@ -67,4 +67,4 @@
 * https://github.com/KuangDD/phkit
 * https://github.com/mozillazg/python-pinyin
 * https://github.com/Kyubyong/g2pC
-* https://github.com/kakaobrain/g2pM
+* https://github.com/kakaobrain/g2pM

@@ -18,4 +18,4 @@
 ### ASR Noise

-* [asr-noises](https://github.com/speechio/asr-noises)
+* [asr-noises](https://github.com/speechio/asr-noises)

@@ -3,4 +3,3 @@
 ## Reference

 * [Timestamps and N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)

@@ -58,4 +58,4 @@
 ### Grapheme To Phoneme

 * syllable
-* phoneme
+* phoneme

@@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified
 ```
 build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
-```
+```

@@ -76,7 +76,7 @@ pip3 install textgrid
 tg.read('file.TextGrid')  # 'file.TextGrid' is the file name
 ```

-The tg.tiers attribute:
+The tg.tiers attribute:
 prints all items in the file; the output of print(tg.tiers) looks like:

 ```text
@@ -86,7 +86,7 @@ pip3 install textgrid
     Interval(1361.89250, 1362.01250, R),
     Interval(1362.01250, 1362.13250, AY1),
     Interval(1362.13250, 1362.16250, T),
     ...
 ]
 )
@@ -113,7 +113,7 @@ pip3 install textgrid
 An Interval can be understood as a duration.
 ```

 2. Objects in the textgrid library

 The **IntervalTier** object:
@@ -148,7 +148,7 @@ pip3 install textgrid
 strict --> returns a bool indicating whether the file is in strict TextGrid format
 ```

 The **PointTier** object:

 Methods
@@ -174,7 +174,7 @@ pip3 install textgrid
 name: returns the name
 ```

 The **Point** object:

 Supports comparison and addition/subtraction.
@@ -185,7 +185,7 @@ pip3 install textgrid
 time:
 ```

 The **Interval** object:

 Supports comparison and addition/subtraction.
@@ -250,10 +250,9 @@ pip3 install textgrid
 grids: --> returns the list of grids that were read
 ```

 ## Reference

 * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6
 * https://blog.csdn.net/duxin_csdn/article/details/88966295
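A short recap of the walk-through above as one runnable snippet; attribute names follow the `textgrid` package as described, and 'file.TextGrid' is a placeholder path:

```python
import textgrid

tg = textgrid.TextGrid()
tg.read('file.TextGrid')   # parse the annotation file (placeholder path)

for tier in tg.tiers:      # the tg.tiers attribute holds the tier objects
    print(tier.name)
    for interval in tier:  # assumes IntervalTiers, as in the MFA output above
        print(interval.minTime, interval.maxTime, interval.mark)
```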

@@ -1,4 +1,3 @@
 # Useful Tools

 * [Regex visualizer and common regular expressions](https://wangwl.net/static/projects/visualRegex/#)

@@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese
 ### Tokenization

-**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.
+**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.

 These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package's existing dictionary.
@@ -31,7 +31,7 @@ These “tags” label the part of speech. There are 24 part of speech tags and
 ### Stop Words

-In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.
+In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.

 Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data.
@@ -209,4 +209,4 @@ TN: rule-based methods
 ## Reference

 * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
 * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginners Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
+* [Beginners Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
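The Tokenization and Stop Words passages above come down to just a few lines; a hedged sketch assuming `jieba` and `stopwordsiso` are installed:

```python
import jieba
import stopwordsiso

stops = stopwordsiso.stopwords('zh')        # set of Chinese stop words
tokens = jieba.lcut('我们今天去公园散步')     # tokens, e.g. ['我们', '今天', '去', '公园', '散步']
filtered = [t for t in tokens if t not in stops]
print(filtered)                             # stop words such as '我们' are dropped
```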

@@ -29,4 +29,3 @@
 * [Endpoint detection](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247484024&idx=1&sn=12da2ee76347de4a18856274ba6ba61f&chksm=fcaacaaccbdd43ba6b3e996bbf1e2ac6d5f1b449dfd80fcaccfbbe0a240fa1668b931dbf4bd5&scene=21#wechat_redirect)
 * Kaldi: *https://github.com/kaldi-asr/kaldi/blob/6260b27d146e466c7e1e5c60858e8da9fd9c78ae/src/online2/online-endpoint.h#L132-L150*
 * End-to-End Automatic Speech Recognition Integrated with CTC-Based Voice Activity Detection: *https://arxiv.org/pdf/2002.00551.pdf*

@@ -24,7 +24,7 @@ data:
   n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: False
+  use_dB_normalization: True
   target_dB: -20
   random_seed: 0
   keep_transcription_text: False
@@ -76,7 +76,7 @@ model:
 training:
   n_epoch: 240
   accum_grad: 2
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
   optim: adam
   optim_conf:
     lr: 0.002

@@ -1,18 +1,24 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py

 # A parser for CC-CEDICT. Converts the Chinese-English dictionary into a list of python dictionaries with "traditional", "simplified", "pinyin", and "english" keys.
 # Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
 # Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
 # Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
 # This code was written by Franki Allegra in February 2020.

-import sys
+import json
+import sys

 # usage: bin ccedict dump.json
@@ -50,9 +56,10 @@ with open(sys.argv[1], 'rt') as file:
             list_of_dicts.append(parsed)

     def remove_surnames():
-        for x in range(len(list_of_dicts)-1, -1, -1):
+        for x in range(len(list_of_dicts) - 1, -1, -1):
             if "surname " in list_of_dicts[x]['english']:
-                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
+                        'traditional']:
                     list_of_dicts.pop(x)

     def main():
@@ -60,13 +67,12 @@ with open(sys.argv[1], 'rt') as file:
         #make each line into a dictionary
         print("Parsing dictionary . . .")
         for line in dict_lines:
-            parse_line(line)
+            parse_line(line)

         #remove entries for surnames from the data (optional):
         print("Removing Surnames . . .")
         remove_surnames()

         print("Saving to database (this may take a few minutes) . . .")
         with open(sys.argv[2], 'wt') as fout:
             for one_dict in list_of_dicts:
@@ -74,5 +80,6 @@ with open(sys.argv[1], 'rt') as file:
                 fout.write(json_str + "\n")
         print('Done!')

     list_of_dicts = []
     parsed_dict = main()
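Assuming the script is saved as `parser.py` and invoked as the usage comment suggests (e.g. `python parser.py cedict_ts.u8 dump.json`), the dump holds one JSON object per line with the four keys named in the header comments, and can be read back like this:

```python
import json

# Read the parser's output: one JSON object per line.
with open('dump.json', 'rt') as fin:
    entries = [json.loads(line) for line in fin]

first = entries[0]
print(first['traditional'], first['simplified'], first['pinyin'], first['english'])
```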

@@ -1,5 +1,6 @@
 coverage
 pre-commit
+pybind11
 resampy==0.2.2
 scipy==1.2.1
 sentencepiece
@@ -7,7 +8,6 @@ snakeviz
 SoundFile==0.9.0.post1
 sox
 tensorboardX
+textgrid
 typeguard
 yacs
-pybind11
-textgrid

@@ -18,7 +18,6 @@ import paddle

 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.mask import make_pad_mask
-from deepspeech.modules.mask import sequence_mask


 class TestU2Model(unittest.TestCase):
@@ -36,16 +35,10 @@ class TestU2Model(unittest.TestCase):
             [False, False, True, True, True],
         ])

-    def test_sequence_mask(self):
-        res = sequence_mask(self.lengths, dtype='bool')
-        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
-
     def test_make_non_pad_mask(self):
         res = make_non_pad_mask(self.lengths)
-        res1 = sequence_mask(self.lengths, dtype='bool')
         res2 = make_pad_mask(self.lengths).logical_not()
         self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
-        self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
         self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())

     def test_make_pad_mask(self):

@@ -39,7 +39,7 @@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 Punctuation:
 ! ? . , ; : " # ( )

-Note: !=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#=#   |(=([{{【<《|)=)]}}】>》
+Note: !=!|?=?|.=.。|,=,,、|;=;|:=:|"="“|#=#   |(=([{{【<《|)=)]}}】>》

 Reserved:
 w y 0 6 7 8 9
