diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py index d1ddfc8a..eb39636c 100644 --- a/deepspeech/decoders/recog.py +++ b/deepspeech/decoders/recog.py @@ -24,11 +24,7 @@ from .utils import add_results_to_json from deepspeech.exps import dynamic_import_tester from deepspeech.io.reader import LoadInputsAndTargets from deepspeech.models.asr_interface import ASRInterface -from deepspeech.models.lm.transformer import TransformerLM from deepspeech.utils.log import Log -# from espnet.asr.asr_utils import get_model_conf -# from espnet.asr.asr_utils import torch_load -# from espnet.nets.lm_interface import dynamic_import_lm logger = Log(__name__).getlog() @@ -49,21 +45,24 @@ def load_trained_model(args): model = exp.model return model, char_list, exp, confs + def get_config(config_path): stream = open(config_path, mode='r', encoding="utf-8") config = yaml.load(stream, Loader=yaml.FullLoader) stream.close() return config + def load_trained_lm(args): - lm_args = get_config(args.rnnlm_conf) - # NOTE: for a compatibility with less than 0.5.0 version models - lm_model_module = getattr(lm_args, "model_module", "default") - lm_class = dynamic_import_lm(lm_model_module) - lm = lm_class(lm_args.model) - model_dict = paddle.load(args.rnnlm) - lm.set_state_dict(model_dict) - return lm + lm_args = get_config(args.rnnlm_conf) + # NOTE: for a compatibility with less than 0.5.0 version models + lm_model_module = getattr(lm_args, "model_module", "default") + lm_class = dynamic_import_lm(lm_model_module) + lm = lm_class(lm_args.model) + model_dict = paddle.load(args.rnnlm) + lm.set_state_dict(model_dict) + return lm + def recog_v2(args): """Decode with custom models that implements ScorerInterface. diff --git a/deepspeech/decoders/recog_bin.py b/deepspeech/decoders/recog_bin.py index cb3d5757..7c866648 100644 --- a/deepspeech/decoders/recog_bin.py +++ b/deepspeech/decoders/recog_bin.py @@ -21,6 +21,7 @@ from distutils.util import strtobool import configargparse import numpy as np + def get_parser(): """Get default arguments.""" parser = configargparse.ArgumentParser( diff --git a/deepspeech/models/lm/transformer.py b/deepspeech/models/lm/transformer.py index 28371ae2..35ecf678 100644 --- a/deepspeech/models/lm/transformer.py +++ b/deepspeech/models/lm/transformer.py @@ -30,20 +30,19 @@ logger = Log(__name__).getlog() class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): - def __init__( - self, - n_vocab: int, - pos_enc: str=None, - embed_unit: int=128, - att_unit: int=256, - head: int=2, - unit: int=1024, - layer: int=4, - dropout_rate: float=0.5, - emb_dropout_rate: float=0.0, - att_dropout_rate: float=0.0, - tie_weights: bool=False, - **kwargs): + def __init__(self, + n_vocab: int, + pos_enc: str=None, + embed_unit: int=128, + att_unit: int=256, + head: int=2, + unit: int=1024, + layer: int=4, + dropout_rate: float=0.5, + emb_dropout_rate: float=0.0, + att_dropout_rate: float=0.0, + tie_weights: bool=False, + **kwargs): nn.Layer.__init__(self) if pos_enc == "sinusoidal": diff --git a/deepspeech/transform/add_deltas.py b/deepspeech/transform/add_deltas.py index 68f44d41..4cab0084 100644 --- a/deepspeech/transform/add_deltas.py +++ b/deepspeech/transform/add_deltas.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import numpy as np @@ -9,7 +22,7 @@ def delta(feat, window): delta_feat[i:] += -i * feat[:-i] delta_feat[-i:] += i * feat[-1] delta_feat[:i] += -i * feat[0] - delta_feat /= 2 * sum(i ** 2 for i in range(1, window + 1)) + delta_feat /= 2 * sum(i**2 for i in range(1, window + 1)) return delta_feat @@ -34,8 +47,7 @@ class AddDeltas(): def __repr__(self): return "{name}(window={window}, order={order}".format( - name=self.__class__.__name__, window=self.window, order=self.order - ) + name=self.__class__.__name__, window=self.window, order=self.order) def __call__(self, x): return add_deltas(x, window=self.window, order=self.order) diff --git a/deepspeech/transform/channel_selector.py b/deepspeech/transform/channel_selector.py index 1ac9e350..d985b482 100644 --- a/deepspeech/transform/channel_selector.py +++ b/deepspeech/transform/channel_selector.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import numpy @@ -10,15 +23,12 @@ class ChannelSelector(): self.axis = axis def __repr__(self): - return ( - "{name}(train_channel={train_channel}, " - "eval_channel={eval_channel}, axis={axis})".format( - name=self.__class__.__name__, - train_channel=self.train_channel, - eval_channel=self.eval_channel, - axis=self.axis, - ) - ) + return ("{name}(train_channel={train_channel}, " + "eval_channel={eval_channel}, axis={axis})".format( + name=self.__class__.__name__, + train_channel=self.train_channel, + eval_channel=self.eval_channel, + axis=self.axis, )) def __call__(self, x, train=True): # Assuming x: [Time, Channel] by default @@ -27,8 +37,8 @@ class ChannelSelector(): # If the dimension is insufficient, then unsqueeze # (e.g [Time] -> [Time, 1]) ind = tuple( - slice(None) if i < x.ndim else None for i in range(self.axis + 1) - ) + slice(None) if i < x.ndim else None + for i in range(self.axis + 1)) x = x[ind] if train: @@ -41,5 +51,6 @@ class ChannelSelector(): else: ch = channel - ind = tuple(slice(None) if i != self.axis else ch for i in range(x.ndim)) + ind = tuple( + slice(None) if i != self.axis else ch for i in range(x.ndim)) return x[ind] diff --git a/deepspeech/transform/functional.py b/deepspeech/transform/functional.py index 5eec6cc1..914e484e 100644 --- a/deepspeech/transform/functional.py +++ b/deepspeech/transform/functional.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import inspect from deepspeech.transform.transform_interface import TransformInterface @@ -57,7 +70,8 @@ class FuncTrans(TransformInterface): except ValueError: d = dict() return { - k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty + k: v.default + for k, v in d.items() if v.default != inspect.Parameter.empty } def __repr__(self): diff --git a/deepspeech/transform/perturb.py b/deepspeech/transform/perturb.py index 05766cad..e425fd2e 100644 --- a/deepspeech/transform/perturb.py +++ b/deepspeech/transform/perturb.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import librosa import numpy import scipy @@ -5,6 +18,7 @@ import soundfile from deepspeech.io.reader import SoundHDF5File + class SpeedPerturbation(): """SpeedPerturbation @@ -22,14 +36,13 @@ class SpeedPerturbation(): """ def __init__( - self, - lower=0.9, - upper=1.1, - utt2ratio=None, - keep_length=True, - res_type="kaiser_best", - seed=None, - ): + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + res_type="kaiser_best", + seed=None, ): self.res_type = res_type self.keep_length = keep_length self.state = numpy.random.RandomState(seed) @@ -60,12 +73,10 @@ class SpeedPerturbation(): self.lower, self.upper, self.keep_length, - self.res_type, - ) + self.res_type, ) else: return "{}({}, res_type={})".format( - self.__class__.__name__, self.utt2ratio_file, self.res_type - ) + self.__class__.__name__, self.utt2ratio_file, self.res_type) def __call__(self, x, uttid=None, train=True): if not train: @@ -85,15 +96,14 @@ class SpeedPerturbation(): diff = abs(len(x) - len(y)) if len(y) > len(x): # Truncate noise - y = y[diff // 2 : -((diff + 1) // 2)] + y = y[diff // 2:-((diff + 1) // 2)] elif len(y) < len(x): # Assume the time-axis is the first: (Time, Channel) pad_width = [(diff // 2, (diff + 1) // 2)] + [ (0, 0) for _ in range(y.ndim - 1) ] y = numpy.pad( - y, pad_width=pad_width, constant_values=0, mode="constant" - ) + y, pad_width=pad_width, constant_values=0, mode="constant") return y @@ -111,7 +121,7 @@ class BandpassPerturbation(): """ - def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1,)): + def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1, )): self.lower = lower self.upper = upper self.state = numpy.random.RandomState(seed) @@ -119,18 +129,16 @@ class BandpassPerturbation(): self.axes = axes def __repr__(self): - return "{}(lower={}, upper={})".format( - self.__class__.__name__, self.lower, self.upper - ) + return "{}(lower={}, upper={})".format(self.__class__.__name__, + self.lower, self.upper) def __call__(self, x_stft, uttid=None, train=True): if not train: return x_stft if x_stft.ndim == 1: - raise RuntimeError( - "Input in time-freq domain: " "(Time, Channel, Freq) or (Time, Freq)" - ) + raise RuntimeError("Input in time-freq domain: " + "(Time, Channel, Freq) or (Time, Freq)") ratio = self.state.uniform(self.lower, self.upper) axes = [i if i >= 0 else x_stft.ndim - i for i in self.axes] @@ -142,7 +150,12 @@ class BandpassPerturbation(): class VolumePerturbation(): - def __init__(self, lower=-1.6, upper=1.6, utt2ratio=None, dbunit=True, seed=None): + def __init__(self, + lower=-1.6, + upper=1.6, + utt2ratio=None, + dbunit=True, + seed=None): self.dbunit = dbunit self.utt2ratio_file = utt2ratio self.lower = lower @@ -168,12 +181,10 @@ class VolumePerturbation(): def __repr__(self): if self.utt2ratio is None: return "{}(lower={}, upper={}, dbunit={})".format( - self.__class__.__name__, self.lower, self.upper, self.dbunit - ) + self.__class__.__name__, self.lower, self.upper, self.dbunit) else: return '{}("{}", dbunit={})'.format( - self.__class__.__name__, self.utt2ratio_file, self.dbunit - ) + self.__class__.__name__, self.utt2ratio_file, self.dbunit) def __call__(self, x, uttid=None, train=True): if not train: @@ -186,7 +197,7 @@ class VolumePerturbation(): else: ratio = self.state.uniform(self.lower, self.upper) if self.dbunit: - ratio = 10 ** (ratio / 20) + ratio = 10**(ratio / 20) return x * ratio @@ -194,15 +205,14 @@ class NoiseInjection(): """Add isotropic noise""" def __init__( - self, - utt2noise=None, - lower=-20, - upper=-5, - utt2ratio=None, - filetype="list", - dbunit=True, - seed=None, - ): + self, + utt2noise=None, + lower=-20, + upper=-5, + utt2ratio=None, + filetype="list", + dbunit=True, + seed=None, ): self.utt2noise_file = utt2noise self.utt2ratio_file = utt2ratio self.filetype = filetype @@ -242,19 +252,16 @@ class NoiseInjection(): if utt2noise is not None and utt2ratio is not None: if set(self.utt2ratio) != set(self.utt2noise): - raise RuntimeError( - "The uttids mismatch between {} and {}".format(utt2ratio, utt2noise) - ) + raise RuntimeError("The uttids mismatch between {} and {}". + format(utt2ratio, utt2noise)) def __repr__(self): if self.utt2ratio is None: return "{}(lower={}, upper={}, dbunit={})".format( - self.__class__.__name__, self.lower, self.upper, self.dbunit - ) + self.__class__.__name__, self.lower, self.upper, self.dbunit) else: return '{}("{}", dbunit={})'.format( - self.__class__.__name__, self.utt2ratio_file, self.dbunit - ) + self.__class__.__name__, self.utt2ratio_file, self.dbunit) def __call__(self, x, uttid=None, train=True): if not train: @@ -268,8 +275,8 @@ class NoiseInjection(): ratio = self.state.uniform(self.lower, self.upper) if self.dbunit: - ratio = 10 ** (ratio / 20) - scale = ratio * numpy.sqrt((x ** 2).mean()) + ratio = 10**(ratio / 20) + scale = ratio * numpy.sqrt((x**2).mean()) # 2. Get noise if self.utt2noise is not None: @@ -280,16 +287,17 @@ class NoiseInjection(): # Randomly select the noise source noise = self.state.choice(list(self.utt2noise.values())) # Normalize the level - noise /= numpy.sqrt((noise ** 2).mean()) + noise /= numpy.sqrt((noise**2).mean()) # Adjust the noise length diff = abs(len(x) - len(noise)) offset = self.state.randint(0, diff) if len(noise) > len(x): # Truncate noise - noise = noise[offset : -(diff - offset)] + noise = noise[offset:-(diff - offset)] else: - noise = numpy.pad(noise, pad_width=[offset, diff - offset], mode="wrap") + noise = numpy.pad( + noise, pad_width=[offset, diff - offset], mode="wrap") else: # Generate white noise @@ -329,15 +337,14 @@ class RIRConvolve(): if x.ndim != 1: # Must be single channel raise RuntimeError( - "Input x must be one dimensional array, but got {}".format(x.shape) - ) + "Input x must be one dimensional array, but got {}".format( + x.shape)) rir, rate = self.utt2rir[uttid] if rir.ndim == 2: # FIXME(kamo): Use chainer.convolution_1d? # return [Time, Channel] return numpy.stack( - [scipy.convolve(x, r, mode="same") for r in rir], axis=-1 - ) + [scipy.convolve(x, r, mode="same") for r in rir], axis=-1) else: return scipy.convolve(x, rir, mode="same") diff --git a/deepspeech/transform/spec_augment.py b/deepspeech/transform/spec_augment.py index feb712df..0e5324e7 100644 --- a/deepspeech/transform/spec_augment.py +++ b/deepspeech/transform/spec_augment.py @@ -1,5 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Spec Augment module for preprocessing i.e., data augmentation""" - import random import numpy @@ -27,10 +39,12 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): return x # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1 center = random.randrange(window, t - window) - warped = random.randrange(center - window, center + window) + 1 # 1 ... t - 1 + warped = random.randrange(center - window, center + + window) + 1 # 1 ... t - 1 left = Image.fromarray(x[:center]).resize((x.shape[1], warped), BICUBIC) - right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), BICUBIC) + right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), + BICUBIC) if inplace: x[:warped] = left x[warped:] = right @@ -44,11 +58,8 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): # TODO(karita): make this differentiable again return spec_augment.time_warp(paddle.to_tensor(x), window).numpy() else: - raise NotImplementedError( - "unknown resize mode: " - + mode - + ", choose one from (PIL, sparse_image_warp)." - ) + raise NotImplementedError("unknown resize mode: " + mode + + ", choose one from (PIL, sparse_image_warp).") class TimeWarp(FuncTrans): @@ -145,16 +156,15 @@ class TimeMask(FuncTrans): def spec_augment( - x, - resize_mode="PIL", - max_time_warp=80, - max_freq_width=27, - n_freq_mask=2, - max_time_width=100, - n_time_mask=2, - inplace=True, - replace_with_zero=True, -): + x, + resize_mode="PIL", + max_time_warp=80, + max_freq_width=27, + n_freq_mask=2, + max_time_width=100, + n_time_mask=2, + inplace=True, + replace_with_zero=True, ): """spec agument apply random time warping and time/freq masking @@ -180,15 +190,13 @@ def spec_augment( max_freq_width, n_freq_mask, inplace=inplace, - replace_with_zero=replace_with_zero, - ) + replace_with_zero=replace_with_zero, ) x = time_mask( x, max_time_width, n_time_mask, inplace=inplace, - replace_with_zero=replace_with_zero, - ) + replace_with_zero=replace_with_zero, ) return x diff --git a/deepspeech/transform/spectrogram.py b/deepspeech/transform/spectrogram.py index 68d47627..e63bd680 100644 --- a/deepspeech/transform/spectrogram.py +++ b/deepspeech/transform/spectrogram.py @@ -1,10 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import librosa import numpy as np -def stft( - x, n_fft, n_shift, win_length=None, window="hann", center=True, pad_mode="reflect" -): +def stft(x, + n_fft, + n_shift, + win_length=None, + window="hann", + center=True, + pad_mode="reflect"): # x: [Time, Channel] if x.ndim == 1: single_channel = True @@ -25,12 +42,9 @@ def stft( win_length=win_length, window=window, center=center, - pad_mode=pad_mode, - ).T - for ch in range(x.shape[1]) + pad_mode=pad_mode, ).T for ch in range(x.shape[1]) ], - axis=1, - ) + axis=1, ) if single_channel: # x: [Time, Channel, Freq] -> [Time, Freq] @@ -55,12 +69,9 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): hop_length=n_shift, win_length=win_length, window=window, - center=center, - ) - for ch in range(x.shape[1]) + center=center, ) for ch in range(x.shape[1]) ], - axis=1, - ) + axis=1, ) if single_channel: # x: [Time, Channel] -> [Time] @@ -68,7 +79,13 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): return x -def stft2logmelspectrogram(x_stft, fs, n_mels, n_fft, fmin=None, fmax=None, eps=1e-10): +def stft2logmelspectrogram(x_stft, + fs, + n_mels, + n_fft, + fmin=None, + fmax=None, + eps=1e-10): # x_stft: (Time, Channel, Freq) or (Time, Freq) fmin = 0 if fmin is None else fmin fmax = fs / 2 if fmax is None else fmax @@ -90,18 +107,17 @@ def spectrogram(x, n_fft, n_shift, win_length=None, window="hann"): def logmelspectrogram( - x, - fs, - n_mels, - n_fft, - n_shift, - win_length=None, - window="hann", - fmin=None, - fmax=None, - eps=1e-10, - pad_mode="reflect", -): + x, + fs, + n_mels, + n_fft, + n_shift, + win_length=None, + window="hann", + fmin=None, + fmax=None, + eps=1e-10, + pad_mode="reflect", ): # stft: (Time, Channel, Freq) or (Time, Freq) x_stft = stft( x, @@ -109,12 +125,16 @@ def logmelspectrogram( n_shift=n_shift, win_length=win_length, window=window, - pad_mode=pad_mode, - ) + pad_mode=pad_mode, ) return stft2logmelspectrogram( - x_stft, fs=fs, n_mels=n_mels, n_fft=n_fft, fmin=fmin, fmax=fmax, eps=eps - ) + x_stft, + fs=fs, + n_mels=n_mels, + n_fft=n_fft, + fmin=fmin, + fmax=fmax, + eps=eps) class Spectrogram(): @@ -125,16 +145,13 @@ class Spectrogram(): self.window = window def __repr__(self): - return ( - "{name}(n_fft={n_fft}, n_shift={n_shift}, " - "win_length={win_length}, window={window})".format( - name=self.__class__.__name__, - n_fft=self.n_fft, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - ) - ) + return ("{name}(n_fft={n_fft}, n_shift={n_shift}, " + "win_length={win_length}, window={window})".format( + name=self.__class__.__name__, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, )) def __call__(self, x): return spectrogram( @@ -142,23 +159,21 @@ class Spectrogram(): n_fft=self.n_fft, n_shift=self.n_shift, win_length=self.win_length, - window=self.window, - ) + window=self.window, ) class LogMelSpectrogram(): def __init__( - self, - fs, - n_mels, - n_fft, - n_shift, - win_length=None, - window="hann", - fmin=None, - fmax=None, - eps=1e-10, - ): + self, + fs, + n_mels, + n_fft, + n_shift, + win_length=None, + window="hann", + fmin=None, + fmax=None, + eps=1e-10, ): self.fs = fs self.n_mels = n_mels self.n_fft = n_fft @@ -170,22 +185,19 @@ class LogMelSpectrogram(): self.eps = eps def __repr__(self): - return ( - "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " - "n_shift={n_shift}, win_length={win_length}, window={window}, " - "fmin={fmin}, fmax={fmax}, eps={eps}))".format( - name=self.__class__.__name__, - fs=self.fs, - n_mels=self.n_mels, - n_fft=self.n_fft, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - fmin=self.fmin, - fmax=self.fmax, - eps=self.eps, - ) - ) + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) def __call__(self, x): return logmelspectrogram( @@ -195,8 +207,7 @@ class LogMelSpectrogram(): n_fft=self.n_fft, n_shift=self.n_shift, win_length=self.win_length, - window=self.window, - ) + window=self.window, ) class Stft2LogMelSpectrogram(): @@ -209,18 +220,15 @@ class Stft2LogMelSpectrogram(): self.eps = eps def __repr__(self): - return ( - "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " - "fmin={fmin}, fmax={fmax}, eps={eps}))".format( - name=self.__class__.__name__, - fs=self.fs, - n_mels=self.n_mels, - n_fft=self.n_fft, - fmin=self.fmin, - fmax=self.fmax, - eps=self.eps, - ) - ) + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) def __call__(self, x): return stft2logmelspectrogram( @@ -229,20 +237,18 @@ class Stft2LogMelSpectrogram(): n_mels=self.n_mels, n_fft=self.n_fft, fmin=self.fmin, - fmax=self.fmax, - ) + fmax=self.fmax, ) class Stft(): def __init__( - self, - n_fft, - n_shift, - win_length=None, - window="hann", - center=True, - pad_mode="reflect", - ): + self, + n_fft, + n_shift, + win_length=None, + window="hann", + center=True, + pad_mode="reflect", ): self.n_fft = n_fft self.n_shift = n_shift self.win_length = win_length @@ -251,19 +257,16 @@ class Stft(): self.pad_mode = pad_mode def __repr__(self): - return ( - "{name}(n_fft={n_fft}, n_shift={n_shift}, " - "win_length={win_length}, window={window}," - "center={center}, pad_mode={pad_mode})".format( - name=self.__class__.__name__, - n_fft=self.n_fft, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - center=self.center, - pad_mode=self.pad_mode, - ) - ) + return ("{name}(n_fft={n_fft}, n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center}, pad_mode={pad_mode})".format( + name=self.__class__.__name__, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, )) def __call__(self, x): return stft( @@ -273,8 +276,7 @@ class Stft(): win_length=self.win_length, window=self.window, center=self.center, - pad_mode=self.pad_mode, - ) + pad_mode=self.pad_mode, ) class IStft(): @@ -285,17 +287,14 @@ class IStft(): self.center = center def __repr__(self): - return ( - "{name}(n_shift={n_shift}, " - "win_length={win_length}, window={window}," - "center={center})".format( - name=self.__class__.__name__, - n_shift=self.n_shift, - win_length=self.win_length, - window=self.window, - center=self.center, - ) - ) + return ("{name}(n_shift={n_shift}, " + "win_length={win_length}, window={window}," + "center={center})".format( + name=self.__class__.__name__, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + center=self.center, )) def __call__(self, x): return istft( @@ -303,5 +302,4 @@ class IStft(): self.n_shift, win_length=self.win_length, window=self.window, - center=self.center, - ) + center=self.center, ) diff --git a/deepspeech/transform/transform_interface.py b/deepspeech/transform/transform_interface.py index 8a6aba45..7ab29554 100644 --- a/deepspeech/transform/transform_interface.py +++ b/deepspeech/transform/transform_interface.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # TODO(karita): add this to all the transform impl. class TransformInterface: """Transform Interface""" diff --git a/deepspeech/transform/transformation.py b/deepspeech/transform/transformation.py index 0f8c39bb..afb1db28 100644 --- a/deepspeech/transform/transformation.py +++ b/deepspeech/transform/transformation.py @@ -1,16 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Transformation module.""" -from collections.abc import Sequence -from collections import OrderedDict import copy -from inspect import signature import io import logging +from collections import OrderedDict +from collections.abc import Sequence +from inspect import signature import yaml from deepspeech.utils.dynamic_import import dynamic_import - # TODO(karita): inherit TransformInterface # TODO(karita): register cmd arguments in asr_train.py import_alias = dict( @@ -33,8 +45,7 @@ import_alias = dict( istft="deepspeech.transform.spectrogram:IStft", stft2fbank="deepspeech.transform.spectrogram:Stft2LogMelSpectrogram", wpe="deepspeech.transform.wpe:WPE", - channel_selector="deepspeech.transform.channel_selector:ChannelSelector", -) + channel_selector="deepspeech.transform.channel_selector:ChannelSelector", ) class Transformation(): @@ -83,21 +94,16 @@ class Transformation(): # Some function, e.g. built-in function, are failed pass else: - logging.error( - "Expected signature: {}({})".format( - class_obj.__name__, signa - ) - ) + logging.error("Expected signature: {}({})".format( + class_obj.__name__, signa)) raise else: raise NotImplementedError( - "Not supporting mode={}".format(self.conf["mode"]) - ) + "Not supporting mode={}".format(self.conf["mode"])) def __repr__(self): - rep = "\n" + "\n".join( - " {}: {}".format(k, v) for k, v in self.functions.items() - ) + rep = "\n" + "\n".join(" {}: {}".format(k, v) + for k, v in self.functions.items()) return "{}({})".format(self.__class__.__name__, rep) def __call__(self, xs, uttid_list=None, **kwargs): @@ -130,18 +136,19 @@ class Transformation(): _kwargs = {k: v for k, v in kwargs.items() if k in param} try: if uttid_list is not None and "uttid" in param: - xs = [func(x, u, **_kwargs) for x, u in zip(xs, uttid_list)] + xs = [ + func(x, u, **_kwargs) + for x, u in zip(xs, uttid_list) + ] else: xs = [func(x, **_kwargs) for x in xs] except Exception: - logging.fatal( - "Catch a exception from {}th func: {}".format(idx, func) - ) + logging.fatal("Catch a exception from {}th func: {}".format( + idx, func)) raise else: raise NotImplementedError( - "Not supporting mode={}".format(self.conf["mode"]) - ) + "Not supporting mode={}".format(self.conf["mode"])) if is_batch: return xs diff --git a/deepspeech/transform/wpe.py b/deepspeech/transform/wpe.py index 8aed97e6..d82005f6 100644 --- a/deepspeech/transform/wpe.py +++ b/deepspeech/transform/wpe.py @@ -1,10 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from nara_wpe.wpe import wpe class WPE(object): - def __init__( - self, taps=10, delay=3, iterations=3, psd_context=0, statistics_mode="full" - ): + def __init__(self, + taps=10, + delay=3, + iterations=3, + psd_context=0, + statistics_mode="full"): self.taps = taps self.delay = delay self.iterations = iterations @@ -12,18 +28,15 @@ class WPE(object): self.statistics_mode = statistics_mode def __repr__(self): - return ( - "{name}(taps={taps}, delay={delay}" - "iterations={iterations}, psd_context={psd_context}, " - "statistics_mode={statistics_mode})".format( - name=self.__class__.__name__, - taps=self.taps, - delay=self.delay, - iterations=self.iterations, - psd_context=self.psd_context, - statistics_mode=self.statistics_mode, - ) - ) + return ("{name}(taps={taps}, delay={delay}" + "iterations={iterations}, psd_context={psd_context}, " + "statistics_mode={statistics_mode})".format( + name=self.__class__.__name__, + taps=self.taps, + delay=self.delay, + iterations=self.iterations, + psd_context=self.psd_context, + statistics_mode=self.statistics_mode, )) def __call__(self, xs): """Return enhanced @@ -40,6 +53,5 @@ class WPE(object): delay=self.delay, iterations=self.iterations, psd_context=self.psd_context, - statistics_mode=self.statistics_mode, - ) + statistics_mode=self.statistics_mode, ) return xs.transpose(2, 1, 0) diff --git a/deepspeech/utils/check_kwargs.py b/deepspeech/utils/check_kwargs.py index 593bfa24..1ee7329b 100644 --- a/deepspeech/utils/check_kwargs.py +++ b/deepspeech/utils/check_kwargs.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import inspect @@ -17,4 +30,5 @@ def check_kwargs(func, kwargs, name=None): name = func.__name__ for k in kwargs.keys(): if k not in params: - raise TypeError(f"{name}() got an unexpected keyword argument '{k}'") + raise TypeError( + f"{name}() got an unexpected keyword argument '{k}'") diff --git a/deepspeech/utils/spec_augment.py b/deepspeech/utils/spec_augment.py index e69de29b..185a92b8 100644 --- a/deepspeech/utils/spec_augment.py +++ b/deepspeech/utils/spec_augment.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md index fc634ff6..9285a183 100644 --- a/examples/librispeech/s2/README.md +++ b/examples/librispeech/s2/README.md @@ -23,5 +23,5 @@ | test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 | | test-clean | join_ctc_w_lm | 2620 | 52576 | 97.9 | 1.8 | 0.2 | 0.3 | 2.4 | 27.8 | -Compare with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus) +Compare with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus) we using 8gpu, but model size (aheads4-adim256) small than it. diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 515074d1..5e7ac399 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -53,8 +53,8 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): peek_example = minibatch[0] assert len(peek_example.shape) == 1, "text example is an 1D tensor" - lengths = [example.shape[0] for example in minibatch - ] # assume (channel, n_samples) or (n_samples, ) + lengths = [example.shape[0] for example in + minibatch] # assume (channel, n_samples) or (n_samples, ) max_len = np.max(lengths) batch = [] diff --git a/parakeet/exps/tacotron2/ljspeech.py b/parakeet/exps/tacotron2/ljspeech.py index 20dc29d3..59c855eb 100644 --- a/parakeet/exps/tacotron2/ljspeech.py +++ b/parakeet/exps/tacotron2/ljspeech.py @@ -67,19 +67,16 @@ class LJSpeechCollector(object): # Sort by text_len in descending order texts = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(texts, text_lens), key=lambda x: x[1], reverse=True) ] mels = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(mels, text_lens), key=lambda x: x[1], reverse=True) ] mel_lens = [ - i - for i, _ in sorted( + i for i, _ in sorted( zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) ] diff --git a/requirements.txt b/requirements.txt index 4878dbe3..1ead8609 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ librosa llvmlite loguru matplotlib -nltk +nara_wpenltk numba numpy==1.20.0 pandas @@ -42,4 +42,3 @@ visualdl==2.2.0 webrtcvad yacs yq -nara_wpe \ No newline at end of file diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py index 911bf5cf..7b36b7e5 100755 --- a/utils/feat-to-shape.py +++ b/utils/feat-to-shape.py @@ -12,33 +12,32 @@ from deepspeech.utils.cli_utils import is_scipy_wav_style def get_parser(): parser = argparse.ArgumentParser( description="convert feature to its shape", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--filetype", type=str, default="mat", choices=["mat", "hdf5", "sound.hdf5", "sound"], help="Specify the file format for the rspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--preprocess-conf", type=str, default=None, - help="The configuration file for the pre-processing", - ) + help="The configuration file for the pre-processing", ) parser.add_argument( - "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark" - ) + "rspecifier", + type=str, + help="Read specifier for feats. e.g. ark:some.ark") parser.add_argument( "out", nargs="?", type=argparse.FileType("w"), default=sys.stdout, - help="The output filename. " "If omitted, then output to sys.stdout", - ) + help="The output filename. " + "If omitted, then output to sys.stdout", ) return parser @@ -64,8 +63,7 @@ def main(): # so change to file_reader_helper to return shape. # This make sense only with filetype="hdf5". for utt, mat in file_reader_helper( - args.rspecifier, args.filetype, return_shape=preprocessing is None - ): + args.rspecifier, args.filetype, return_shape=preprocessing is None): if preprocessing is not None: if is_scipy_wav_style(mat): # If data is sound file, then got as Tuple[int, ndarray]