From c09b0e894019d7de78bbc0bece1b90b44b7aff28 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 19 Aug 2021 03:35:07 +0000 Subject: [PATCH] fix specaug --- README.md | 5 +++-- README_cn.md | 5 +++-- deepspeech/frontend/augmentor/base.py | 6 +++--- deepspeech/frontend/augmentor/spec_augment.py | 7 +++++-- examples/librispeech/s2/conf/augmentation.json | 17 ----------------- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index f7d1e0882..d10fd5d59 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ [中文版](README_cn.md) -# PaddlePaddle ASR toolkit +# PaddlePaddle Speech to Any toolkit ![License](https://img.shields.io/badge/license-Apache%202-red.svg) ![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![support os](https://img.shields.io/badge/os-linux-yellow.svg) -*PaddleASR* is an open-source implementation of end-to-end Automatic Speech Recognition (ASR) engine, with [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, samller and scalable implementation, including training, inference & testing module, and deployment. +*DeepSpeech* is an open-source implementation of end-to-end Automatic Speech Recognition engine, with [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, smaller and scalable implementation, including training, inference & testing module, and deployment. 
## Features @@ -15,6 +15,7 @@ ## Setup +* Ubuntu 16.04 * python>=3.7 * paddlepaddle>=2.1.2 diff --git a/README_cn.md b/README_cn.md index 019b38c15..90a65c440 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,12 +1,12 @@ [English](README.md) -# PaddlePaddle ASR toolkit +# PaddlePaddle Speech to Any toolkit ![License](https://img.shields.io/badge/license-Apache%202-red.svg) ![python version](https://img.shields.io/badge/python-3.7+-orange.svg) ![support os](https://img.shields.io/badge/os-linux-yellow.svg) -*PaddleASR*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别(ASR)引擎的开源项目, +*DeepSpeech*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别引擎的开源项目, 我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效、小型化和可扩展的工具,包括训练,推理,以及 部署。 ## 特性 @@ -16,6 +16,7 @@ ## 安装 +* Ubuntu 16.04 * python>=3.7 * paddlepaddle>=2.1.2 diff --git a/deepspeech/frontend/augmentor/base.py b/deepspeech/frontend/augmentor/base.py index 87cb4ef72..18d003c0b 100644 --- a/deepspeech/frontend/augmentor/base.py +++ b/deepspeech/frontend/augmentor/base.py @@ -30,7 +30,7 @@ class AugmentorBase(): @abstractmethod def __call__(self, xs): - raise NotImplementedError + raise NotImplementedError("AugmentorBase: Not impl __call__") @abstractmethod def transform_audio(self, audio_segment): @@ -44,7 +44,7 @@ class AugmentorBase(): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - raise NotImplementedError + raise NotImplementedError("AugmentorBase: Not impl transform_audio") @abstractmethod def transform_feature(self, spec_segment): @@ -56,4 +56,4 @@ class AugmentorBase(): Args: spec_segment (Spectrogram): Spectrogram segment to add effects to. 
""" - raise NotImplementedError + raise NotImplementedError("AugmentorBase: Not impl transform_feature") diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 94d23bf46..1786099c8 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -64,7 +64,7 @@ class SpecAugmentor(AugmentorBase): self.n_freq_masks = n_freq_masks self.n_time_masks = n_time_masks self.p = p - #logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}") + # adaptive SpecAugment self.adaptive_number_ratio = adaptive_number_ratio @@ -120,6 +120,9 @@ class SpecAugmentor(AugmentorBase): @property def time_mask(self): return self._time_mask + + def __repr__(self): + return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}" def time_warp(xs, W=40): raise NotImplementedError @@ -160,7 +163,7 @@ class SpecAugmentor(AugmentorBase): def __call__(self, x, train=True): if not train: return - self.transform_audio(x) + self.transform_feature(x) def transform_feature(self, xs: np.ndarray): """ diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/s2/conf/augmentation.json index c1078393d..49fe333ec 100644 --- a/examples/librispeech/s2/conf/augmentation.json +++ b/examples/librispeech/s2/conf/augmentation.json @@ -1,21 +1,4 @@ [ - { - "type": "shift", - "params": { - "min_shift_ms": -5, - "max_shift_ms": 5 - }, - "prob": 1.0 - }, - { - "type": "speed", - "params": { - "min_speed_rate": 0.9, - "max_speed_rate": 1.1, - "num_rates": 3 - }, - "prob": 0.0 - }, { "type": "specaug", "params": {