format code

3 years ago · b878027c9a
parent e8bc9a2a08
commit b878027c9a
19 changed files with 404 additions and 302 deletions
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@ -24,11 +24,7 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.models.lm.transformer import TransformerLM
 from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
 logger = Log(__name__).getlog()
@ -49,21 +45,24 @@ def load_trained_model(args):
    model = exp.model
    return model, char_list, exp, confs
 def get_config(config_path):
    stream = open(config_path, mode='r', encoding="utf-8")
    config = yaml.load(stream, Loader=yaml.FullLoader)
    stream.close()
    return config
 def load_trained_lm(args):
-        lm_args = get_config(args.rnnlm_conf)
+    lm_args = get_config(args.rnnlm_conf)
-        # NOTE: for a compatibility with less than 0.5.0 version models
+    # NOTE: for a compatibility with less than 0.5.0 version models
-        lm_model_module = getattr(lm_args, "model_module", "default")
+    lm_model_module = getattr(lm_args, "model_module", "default")
-        lm_class = dynamic_import_lm(lm_model_module)
+    lm_class = dynamic_import_lm(lm_model_module)
-        lm = lm_class(lm_args.model)
+    lm = lm_class(lm_args.model)
-        model_dict = paddle.load(args.rnnlm)
+    model_dict = paddle.load(args.rnnlm)
-        lm.set_state_dict(model_dict)
+    lm.set_state_dict(model_dict)
-        return lm
+    return lm
 def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.
--- a/deepspeech/decoders/recog_bin.py
+++ b/deepspeech/decoders/recog_bin.py
@ -21,6 +21,7 @@ from distutils.util import strtobool
 import configargparse
 import numpy as np
 def get_parser():
    """Get default arguments."""
    parser = configargparse.ArgumentParser(
--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
@ -30,20 +30,19 @@ logger = Log(__name__).getlog()
 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
-    def __init__(
+    def __init__(self,
-            self,
+                 n_vocab: int,
-            n_vocab: int,
+                 pos_enc: str=None,
-            pos_enc: str=None,
+                 embed_unit: int=128,
-            embed_unit: int=128,
+                 att_unit: int=256,
-            att_unit: int=256,
+                 head: int=2,
-            head: int=2,
+                 unit: int=1024,
-            unit: int=1024,
+                 layer: int=4,
-            layer: int=4,
+                 dropout_rate: float=0.5,
-            dropout_rate: float=0.5,
+                 emb_dropout_rate: float=0.0,
-            emb_dropout_rate: float=0.0,
+                 att_dropout_rate: float=0.0,
-            att_dropout_rate: float=0.0,
+                 tie_weights: bool=False,
-            tie_weights: bool=False, 
+                 **kwargs):
            **kwargs):
        nn.Layer.__init__(self)
        if pos_enc == "sinusoidal":
--- a/deepspeech/transform/add_deltas.py
+++ b/deepspeech/transform/add_deltas.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
@ -9,7 +22,7 @@ def delta(feat, window):
        delta_feat[i:] += -i * feat[:-i]
        delta_feat[-i:] += i * feat[-1]
        delta_feat[:i] += -i * feat[0]
-    delta_feat /= 2 * sum(i ** 2 for i in range(1, window + 1))
+    delta_feat /= 2 * sum(i**2 for i in range(1, window + 1))
    return delta_feat
@ -34,8 +47,7 @@ class AddDeltas():
    def __repr__(self):
        return "{name}(window={window}, order={order}".format(
-            name=self.__class__.__name__, window=self.window, order=self.order
+            name=self.__class__.__name__, window=self.window, order=self.order)
        )
    def __call__(self, x):
        return add_deltas(x, window=self.window, order=self.order)
--- a/deepspeech/transform/channel_selector.py
+++ b/deepspeech/transform/channel_selector.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy
@ -10,15 +23,12 @@ class ChannelSelector():
        self.axis = axis
    def __repr__(self):
-        return (
+        return ("{name}(train_channel={train_channel}, "
-            "{name}(train_channel={train_channel}, "
+                "eval_channel={eval_channel}, axis={axis})".format(
-            "eval_channel={eval_channel}, axis={axis})".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    train_channel=self.train_channel,
-                train_channel=self.train_channel,
+                    eval_channel=self.eval_channel,
-                eval_channel=self.eval_channel,
+                    axis=self.axis, ))
                axis=self.axis,
            )
        )
    def __call__(self, x, train=True):
        # Assuming x: [Time, Channel] by default
@ -27,8 +37,8 @@ class ChannelSelector():
            # If the dimension is insufficient, then unsqueeze
            # (e.g [Time] -> [Time, 1])
            ind = tuple(
-                slice(None) if i < x.ndim else None for i in range(self.axis + 1)
+                slice(None) if i < x.ndim else None
-            )
+                for i in range(self.axis + 1))
            x = x[ind]
        if train:
@ -41,5 +51,6 @@ class ChannelSelector():
        else:
            ch = channel
-        ind = tuple(slice(None) if i != self.axis else ch for i in range(x.ndim))
+        ind = tuple(
            slice(None) if i != self.axis else ch for i in range(x.ndim))
        return x[ind]
--- a/deepspeech/transform/functional.py
+++ b/deepspeech/transform/functional.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
 from deepspeech.transform.transform_interface import TransformInterface
@ -57,7 +70,8 @@ class FuncTrans(TransformInterface):
        except ValueError:
            d = dict()
        return {
-            k: v.default for k, v in d.items() if v.default != inspect.Parameter.empty
+            k: v.default
            for k, v in d.items() if v.default != inspect.Parameter.empty
        }
    def __repr__(self):
--- a/deepspeech/transform/perturb.py
+++ b/deepspeech/transform/perturb.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import librosa
 import numpy
 import scipy
@ -5,6 +18,7 @@ import soundfile
 from deepspeech.io.reader import SoundHDF5File
 class SpeedPerturbation():
    """SpeedPerturbation
@ -22,14 +36,13 @@ class SpeedPerturbation():
    """
    def __init__(
-        self,
+            self,
-        lower=0.9,
+            lower=0.9,
-        upper=1.1,
+            upper=1.1,
-        utt2ratio=None,
+            utt2ratio=None,
-        keep_length=True,
+            keep_length=True,
-        res_type="kaiser_best",
+            res_type="kaiser_best",
-        seed=None,
+            seed=None, ):
    ):
        self.res_type = res_type
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)
@ -60,12 +73,10 @@ class SpeedPerturbation():
                self.lower,
                self.upper,
                self.keep_length,
-                self.res_type,
+                self.res_type, )
            )
        else:
            return "{}({}, res_type={})".format(
-                self.__class__.__name__, self.utt2ratio_file, self.res_type
+                self.__class__.__name__, self.utt2ratio_file, self.res_type)
            )
    def __call__(self, x, uttid=None, train=True):
        if not train:
@ -85,15 +96,14 @@ class SpeedPerturbation():
            diff = abs(len(x) - len(y))
            if len(y) > len(x):
                # Truncate noise
-                y = y[diff // 2 : -((diff + 1) // 2)]
+                y = y[diff // 2:-((diff + 1) // 2)]
            elif len(y) < len(x):
                # Assume the time-axis is the first: (Time, Channel)
                pad_width = [(diff // 2, (diff + 1) // 2)] + [
                    (0, 0) for _ in range(y.ndim - 1)
                ]
                y = numpy.pad(
-                    y, pad_width=pad_width, constant_values=0, mode="constant"
+                    y, pad_width=pad_width, constant_values=0, mode="constant")
                )
        return y
@ -111,7 +121,7 @@ class BandpassPerturbation():
    """
-    def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1,)):
+    def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1, )):
        self.lower = lower
        self.upper = upper
        self.state = numpy.random.RandomState(seed)
@ -119,18 +129,16 @@ class BandpassPerturbation():
        self.axes = axes
    def __repr__(self):
-        return "{}(lower={}, upper={})".format(
+        return "{}(lower={}, upper={})".format(self.__class__.__name__,
-            self.__class__.__name__, self.lower, self.upper
+                                               self.lower, self.upper)
        )
    def __call__(self, x_stft, uttid=None, train=True):
        if not train:
            return x_stft
        if x_stft.ndim == 1:
-            raise RuntimeError(
+            raise RuntimeError("Input in time-freq domain: "
-                "Input in time-freq domain: " "(Time, Channel, Freq) or (Time, Freq)"
+                               "(Time, Channel, Freq) or (Time, Freq)")
            )
        ratio = self.state.uniform(self.lower, self.upper)
        axes = [i if i >= 0 else x_stft.ndim - i for i in self.axes]
@ -142,7 +150,12 @@ class BandpassPerturbation():
 class VolumePerturbation():
-    def __init__(self, lower=-1.6, upper=1.6, utt2ratio=None, dbunit=True, seed=None):
+    def __init__(self,
                 lower=-1.6,
                 upper=1.6,
                 utt2ratio=None,
                 dbunit=True,
                 seed=None):
        self.dbunit = dbunit
        self.utt2ratio_file = utt2ratio
        self.lower = lower
@ -168,12 +181,10 @@ class VolumePerturbation():
    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
-                self.__class__.__name__, self.lower, self.upper, self.dbunit
+                self.__class__.__name__, self.lower, self.upper, self.dbunit)
            )
        else:
            return '{}("{}", dbunit={})'.format(
-                self.__class__.__name__, self.utt2ratio_file, self.dbunit
+                self.__class__.__name__, self.utt2ratio_file, self.dbunit)
            )
    def __call__(self, x, uttid=None, train=True):
        if not train:
@ -186,7 +197,7 @@ class VolumePerturbation():
        else:
            ratio = self.state.uniform(self.lower, self.upper)
        if self.dbunit:
-            ratio = 10 ** (ratio / 20)
+            ratio = 10**(ratio / 20)
        return x * ratio
@ -194,15 +205,14 @@ class NoiseInjection():
    """Add isotropic noise"""
    def __init__(
-        self,
+            self,
-        utt2noise=None,
+            utt2noise=None,
-        lower=-20,
+            lower=-20,
-        upper=-5,
+            upper=-5,
-        utt2ratio=None,
+            utt2ratio=None,
-        filetype="list",
+            filetype="list",
-        dbunit=True,
+            dbunit=True,
-        seed=None,
+            seed=None, ):
    ):
        self.utt2noise_file = utt2noise
        self.utt2ratio_file = utt2ratio
        self.filetype = filetype
@ -242,19 +252,16 @@ class NoiseInjection():
        if utt2noise is not None and utt2ratio is not None:
            if set(self.utt2ratio) != set(self.utt2noise):
-                raise RuntimeError(
+                raise RuntimeError("The uttids mismatch between {} and {}".
-                    "The uttids mismatch between {} and {}".format(utt2ratio, utt2noise)
+                                   format(utt2ratio, utt2noise))
                )
    def __repr__(self):
        if self.utt2ratio is None:
            return "{}(lower={}, upper={}, dbunit={})".format(
-                self.__class__.__name__, self.lower, self.upper, self.dbunit
+                self.__class__.__name__, self.lower, self.upper, self.dbunit)
            )
        else:
            return '{}("{}", dbunit={})'.format(
-                self.__class__.__name__, self.utt2ratio_file, self.dbunit
+                self.__class__.__name__, self.utt2ratio_file, self.dbunit)
            )
    def __call__(self, x, uttid=None, train=True):
        if not train:
@ -268,8 +275,8 @@ class NoiseInjection():
            ratio = self.state.uniform(self.lower, self.upper)
        if self.dbunit:
-            ratio = 10 ** (ratio / 20)
+            ratio = 10**(ratio / 20)
-        scale = ratio * numpy.sqrt((x ** 2).mean())
+        scale = ratio * numpy.sqrt((x**2).mean())
        # 2. Get noise
        if self.utt2noise is not None:
@ -280,16 +287,17 @@ class NoiseInjection():
                # Randomly select the noise source
                noise = self.state.choice(list(self.utt2noise.values()))
            # Normalize the level
-            noise /= numpy.sqrt((noise ** 2).mean())
+            noise /= numpy.sqrt((noise**2).mean())
            # Adjust the noise length
            diff = abs(len(x) - len(noise))
            offset = self.state.randint(0, diff)
            if len(noise) > len(x):
                # Truncate noise
-                noise = noise[offset : -(diff - offset)]
+                noise = noise[offset:-(diff - offset)]
            else:
-                noise = numpy.pad(noise, pad_width=[offset, diff - offset], mode="wrap")
+                noise = numpy.pad(
                    noise, pad_width=[offset, diff - offset], mode="wrap")
        else:
            # Generate white noise
@ -329,15 +337,14 @@ class RIRConvolve():
        if x.ndim != 1:
            # Must be single channel
            raise RuntimeError(
-                "Input x must be one dimensional array, but got {}".format(x.shape)
+                "Input x must be one dimensional array, but got {}".format(
-            )
+                    x.shape))
        rir, rate = self.utt2rir[uttid]
        if rir.ndim == 2:
            # FIXME(kamo): Use chainer.convolution_1d?
            # return [Time, Channel]
            return numpy.stack(
-                [scipy.convolve(x, r, mode="same") for r in rir], axis=-1
+                [scipy.convolve(x, r, mode="same") for r in rir], axis=-1)
            )
        else:
            return scipy.convolve(x, rir, mode="same")
--- a/deepspeech/transform/spec_augment.py
+++ b/deepspeech/transform/spec_augment.py
@ -1,5 +1,17 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Spec Augment module for preprocessing i.e., data augmentation"""
 import random
 import numpy
@ -27,10 +39,12 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
            return x
        # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
        center = random.randrange(window, t - window)
-        warped = random.randrange(center - window, center + window) + 1  # 1 ... t - 1
+        warped = random.randrange(center - window, center +
                                  window) + 1  # 1 ... t - 1
        left = Image.fromarray(x[:center]).resize((x.shape[1], warped), BICUBIC)
-        right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped), BICUBIC)
+        right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped),
                                                   BICUBIC)
        if inplace:
            x[:warped] = left
            x[warped:] = right
@ -44,11 +58,8 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
        # TODO(karita): make this differentiable again
        return spec_augment.time_warp(paddle.to_tensor(x), window).numpy()
    else:
-        raise NotImplementedError(
+        raise NotImplementedError("unknown resize mode: " + mode +
-            "unknown resize mode: "
+                                  ", choose one from (PIL, sparse_image_warp).")
            + mode
            + ", choose one from (PIL, sparse_image_warp)."
        )
 class TimeWarp(FuncTrans):
@ -145,16 +156,15 @@ class TimeMask(FuncTrans):
 def spec_augment(
-    x,
+        x,
-    resize_mode="PIL",
+        resize_mode="PIL",
-    max_time_warp=80,
+        max_time_warp=80,
-    max_freq_width=27,
+        max_freq_width=27,
-    n_freq_mask=2,
+        n_freq_mask=2,
-    max_time_width=100,
+        max_time_width=100,
-    n_time_mask=2,
+        n_time_mask=2,
-    inplace=True,
+        inplace=True,
-    replace_with_zero=True,
+        replace_with_zero=True, ):
 ):
    """spec agument
    apply random time warping and time/freq masking
@ -180,15 +190,13 @@ def spec_augment(
        max_freq_width,
        n_freq_mask,
        inplace=inplace,
-        replace_with_zero=replace_with_zero,
+        replace_with_zero=replace_with_zero, )
    )
    x = time_mask(
        x,
        max_time_width,
        n_time_mask,
        inplace=inplace,
-        replace_with_zero=replace_with_zero,
+        replace_with_zero=replace_with_zero, )
    )
    return x
--- a/deepspeech/transform/spectrogram.py
+++ b/deepspeech/transform/spectrogram.py
@ -1,10 +1,27 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import librosa
 import numpy as np
-def stft(
+def stft(x,
-    x, n_fft, n_shift, win_length=None, window="hann", center=True, pad_mode="reflect"
+         n_fft,
-):
+         n_shift,
         win_length=None,
         window="hann",
         center=True,
         pad_mode="reflect"):
    # x: [Time, Channel]
    if x.ndim == 1:
        single_channel = True
@ -25,12 +42,9 @@ def stft(
                win_length=win_length,
                window=window,
                center=center,
-                pad_mode=pad_mode,
+                pad_mode=pad_mode, ).T for ch in range(x.shape[1])
            ).T
            for ch in range(x.shape[1])
        ],
-        axis=1,
+        axis=1, )
    )
    if single_channel:
        # x: [Time, Channel, Freq] -> [Time, Freq]
@ -55,12 +69,9 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
                hop_length=n_shift,
                win_length=win_length,
                window=window,
-                center=center,
+                center=center, ) for ch in range(x.shape[1])
            )
            for ch in range(x.shape[1])
        ],
-        axis=1,
+        axis=1, )
    )
    if single_channel:
        # x: [Time, Channel] -> [Time]
@ -68,7 +79,13 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
    return x
-def stft2logmelspectrogram(x_stft, fs, n_mels, n_fft, fmin=None, fmax=None, eps=1e-10):
+def stft2logmelspectrogram(x_stft,
                           fs,
                           n_mels,
                           n_fft,
                           fmin=None,
                           fmax=None,
                           eps=1e-10):
    # x_stft: (Time, Channel, Freq) or (Time, Freq)
    fmin = 0 if fmin is None else fmin
    fmax = fs / 2 if fmax is None else fmax
@ -90,18 +107,17 @@ def spectrogram(x, n_fft, n_shift, win_length=None, window="hann"):
 def logmelspectrogram(
-    x,
+        x,
-    fs,
+        fs,
-    n_mels,
+        n_mels,
-    n_fft,
+        n_fft,
-    n_shift,
+        n_shift,
-    win_length=None,
+        win_length=None,
-    window="hann",
+        window="hann",
-    fmin=None,
+        fmin=None,
-    fmax=None,
+        fmax=None,
-    eps=1e-10,
+        eps=1e-10,
-    pad_mode="reflect",
+        pad_mode="reflect", ):
 ):
    # stft: (Time, Channel, Freq) or (Time, Freq)
    x_stft = stft(
        x,
@ -109,12 +125,16 @@ def logmelspectrogram(
        n_shift=n_shift,
        win_length=win_length,
        window=window,
-        pad_mode=pad_mode,
+        pad_mode=pad_mode, )
    )
    return stft2logmelspectrogram(
-        x_stft, fs=fs, n_mels=n_mels, n_fft=n_fft, fmin=fmin, fmax=fmax, eps=eps
+        x_stft,
-    )
+        fs=fs,
        n_mels=n_mels,
        n_fft=n_fft,
        fmin=fmin,
        fmax=fmax,
        eps=eps)
 class Spectrogram():
@ -125,16 +145,13 @@ class Spectrogram():
        self.window = window
    def __repr__(self):
-        return (
+        return ("{name}(n_fft={n_fft}, n_shift={n_shift}, "
-            "{name}(n_fft={n_fft}, n_shift={n_shift}, "
+                "win_length={win_length}, window={window})".format(
-            "win_length={win_length}, window={window})".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    n_fft=self.n_fft,
-                n_fft=self.n_fft,
+                    n_shift=self.n_shift,
-                n_shift=self.n_shift,
+                    win_length=self.win_length,
-                win_length=self.win_length,
+                    window=self.window, ))
                window=self.window,
            )
        )
    def __call__(self, x):
        return spectrogram(
@ -142,23 +159,21 @@ class Spectrogram():
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
-            window=self.window,
+            window=self.window, )
        )
 class LogMelSpectrogram():
    def __init__(
-        self,
+            self,
-        fs,
+            fs,
-        n_mels,
+            n_mels,
-        n_fft,
+            n_fft,
-        n_shift,
+            n_shift,
-        win_length=None,
+            win_length=None,
-        window="hann",
+            window="hann",
-        fmin=None,
+            fmin=None,
-        fmax=None,
+            fmax=None,
-        eps=1e-10,
+            eps=1e-10, ):
    ):
        self.fs = fs
        self.n_mels = n_mels
        self.n_fft = n_fft
@ -170,22 +185,19 @@ class LogMelSpectrogram():
        self.eps = eps
    def __repr__(self):
-        return (
+        return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
-            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
+                "n_shift={n_shift}, win_length={win_length}, window={window}, "
-            "n_shift={n_shift}, win_length={win_length}, window={window}, "
+                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
-            "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    fs=self.fs,
-                fs=self.fs,
+                    n_mels=self.n_mels,
-                n_mels=self.n_mels,
+                    n_fft=self.n_fft,
-                n_fft=self.n_fft,
+                    n_shift=self.n_shift,
-                n_shift=self.n_shift,
+                    win_length=self.win_length,
-                win_length=self.win_length,
+                    window=self.window,
-                window=self.window,
+                    fmin=self.fmin,
-                fmin=self.fmin,
+                    fmax=self.fmax,
-                fmax=self.fmax,
+                    eps=self.eps, ))
                eps=self.eps,
            )
        )
    def __call__(self, x):
        return logmelspectrogram(
@ -195,8 +207,7 @@ class LogMelSpectrogram():
            n_fft=self.n_fft,
            n_shift=self.n_shift,
            win_length=self.win_length,
-            window=self.window,
+            window=self.window, )
        )
 class Stft2LogMelSpectrogram():
@ -209,18 +220,15 @@ class Stft2LogMelSpectrogram():
        self.eps = eps
    def __repr__(self):
-        return (
+        return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
-            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
+                "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
-            "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    fs=self.fs,
-                fs=self.fs,
+                    n_mels=self.n_mels,
-                n_mels=self.n_mels,
+                    n_fft=self.n_fft,
-                n_fft=self.n_fft,
+                    fmin=self.fmin,
-                fmin=self.fmin,
+                    fmax=self.fmax,
-                fmax=self.fmax,
+                    eps=self.eps, ))
                eps=self.eps,
            )
        )
    def __call__(self, x):
        return stft2logmelspectrogram(
@ -229,20 +237,18 @@ class Stft2LogMelSpectrogram():
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            fmin=self.fmin,
-            fmax=self.fmax,
+            fmax=self.fmax, )
        )
 class Stft():
    def __init__(
-        self,
+            self,
-        n_fft,
+            n_fft,
-        n_shift,
+            n_shift,
-        win_length=None,
+            win_length=None,
-        window="hann",
+            window="hann",
-        center=True,
+            center=True,
-        pad_mode="reflect",
+            pad_mode="reflect", ):
    ):
        self.n_fft = n_fft
        self.n_shift = n_shift
        self.win_length = win_length
@ -251,19 +257,16 @@ class Stft():
        self.pad_mode = pad_mode
    def __repr__(self):
-        return (
+        return ("{name}(n_fft={n_fft}, n_shift={n_shift}, "
-            "{name}(n_fft={n_fft}, n_shift={n_shift}, "
+                "win_length={win_length}, window={window},"
-            "win_length={win_length}, window={window},"
+                "center={center}, pad_mode={pad_mode})".format(
-            "center={center}, pad_mode={pad_mode})".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    n_fft=self.n_fft,
-                n_fft=self.n_fft,
+                    n_shift=self.n_shift,
-                n_shift=self.n_shift,
+                    win_length=self.win_length,
-                win_length=self.win_length,
+                    window=self.window,
-                window=self.window,
+                    center=self.center,
-                center=self.center,
+                    pad_mode=self.pad_mode, ))
                pad_mode=self.pad_mode,
            )
        )
    def __call__(self, x):
        return stft(
@ -273,8 +276,7 @@ class Stft():
            win_length=self.win_length,
            window=self.window,
            center=self.center,
-            pad_mode=self.pad_mode,
+            pad_mode=self.pad_mode, )
        )
 class IStft():
@ -285,17 +287,14 @@ class IStft():
        self.center = center
    def __repr__(self):
-        return (
+        return ("{name}(n_shift={n_shift}, "
-            "{name}(n_shift={n_shift}, "
+                "win_length={win_length}, window={window},"
-            "win_length={win_length}, window={window},"
+                "center={center})".format(
-            "center={center})".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    n_shift=self.n_shift,
-                n_shift=self.n_shift,
+                    win_length=self.win_length,
-                win_length=self.win_length,
+                    window=self.window,
-                window=self.window,
+                    center=self.center, ))
                center=self.center,
            )
        )
    def __call__(self, x):
        return istft(
@ -303,5 +302,4 @@ class IStft():
            self.n_shift,
            win_length=self.win_length,
            window=self.window,
-            center=self.center,
+            center=self.center, )
        )
--- a/deepspeech/transform/transform_interface.py
+++ b/deepspeech/transform/transform_interface.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # TODO(karita): add this to all the transform impl.
 class TransformInterface:
    """Transform Interface"""
--- a/deepspeech/transform/transformation.py
+++ b/deepspeech/transform/transformation.py
@ -1,16 +1,28 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Transformation module."""
 from collections.abc import Sequence
 from collections import OrderedDict
 import copy
 from inspect import signature
 import io
 import logging
 from collections import OrderedDict
 from collections.abc import Sequence
 from inspect import signature
 import yaml
 from deepspeech.utils.dynamic_import import dynamic_import
 # TODO(karita): inherit TransformInterface
 # TODO(karita): register cmd arguments in asr_train.py
 import_alias = dict(
@ -33,8 +45,7 @@ import_alias = dict(
    istft="deepspeech.transform.spectrogram:IStft",
    stft2fbank="deepspeech.transform.spectrogram:Stft2LogMelSpectrogram",
    wpe="deepspeech.transform.wpe:WPE",
-    channel_selector="deepspeech.transform.channel_selector:ChannelSelector",
+    channel_selector="deepspeech.transform.channel_selector:ChannelSelector", )
 )
 class Transformation():
@ -83,21 +94,16 @@ class Transformation():
                        # Some functions, e.g. built-in functions, fail here
                        pass
                    else:
-                        logging.error(
+                        logging.error("Expected signature: {}({})".format(
-                            "Expected signature: {}({})".format(
+                            class_obj.__name__, signa))
                                class_obj.__name__, signa
                            )
                        )
                    raise
        else:
            raise NotImplementedError(
-                "Not supporting mode={}".format(self.conf["mode"])
+                "Not supporting mode={}".format(self.conf["mode"]))
            )
    def __repr__(self):
-        rep = "\n" + "\n".join(
+        rep = "\n" + "\n".join("    {}: {}".format(k, v)
-            "    {}: {}".format(k, v) for k, v in self.functions.items()
+                               for k, v in self.functions.items())
        )
        return "{}({})".format(self.__class__.__name__, rep)
    def __call__(self, xs, uttid_list=None, **kwargs):
@ -130,18 +136,19 @@ class Transformation():
                _kwargs = {k: v for k, v in kwargs.items() if k in param}
                try:
                    if uttid_list is not None and "uttid" in param:
-                        xs = [func(x, u, **_kwargs) for x, u in zip(xs, uttid_list)]
+                        xs = [
                            func(x, u, **_kwargs)
                            for x, u in zip(xs, uttid_list)
                        ]
                    else:
                        xs = [func(x, **_kwargs) for x in xs]
                except Exception:
-                    logging.fatal(
+                    logging.fatal("Catch a exception from {}th func: {}".format(
-                        "Catch a exception from {}th func: {}".format(idx, func)
+                        idx, func))
                    )
                    raise
        else:
            raise NotImplementedError(
-                "Not supporting mode={}".format(self.conf["mode"])
+                "Not supporting mode={}".format(self.conf["mode"]))
            )
        if is_batch:
            return xs
--- a/deepspeech/transform/wpe.py
+++ b/deepspeech/transform/wpe.py
@ -1,10 +1,26 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from nara_wpe.wpe import wpe
 class WPE(object):
-    def __init__(
+    def __init__(self,
-        self, taps=10, delay=3, iterations=3, psd_context=0, statistics_mode="full"
+                 taps=10,
-    ):
+                 delay=3,
                 iterations=3,
                 psd_context=0,
                 statistics_mode="full"):
        self.taps = taps
        self.delay = delay
        self.iterations = iterations
@ -12,18 +28,15 @@ class WPE(object):
        self.statistics_mode = statistics_mode
    def __repr__(self):
-        return (
+        return ("{name}(taps={taps}, delay={delay}"
-            "{name}(taps={taps}, delay={delay}"
+                "iterations={iterations}, psd_context={psd_context}, "
-            "iterations={iterations}, psd_context={psd_context}, "
+                "statistics_mode={statistics_mode})".format(
-            "statistics_mode={statistics_mode})".format(
+                    name=self.__class__.__name__,
-                name=self.__class__.__name__,
+                    taps=self.taps,
-                taps=self.taps,
+                    delay=self.delay,
-                delay=self.delay,
+                    iterations=self.iterations,
-                iterations=self.iterations,
+                    psd_context=self.psd_context,
-                psd_context=self.psd_context,
+                    statistics_mode=self.statistics_mode, ))
                statistics_mode=self.statistics_mode,
            )
        )
    def __call__(self, xs):
        """Return enhanced
@ -40,6 +53,5 @@ class WPE(object):
            delay=self.delay,
            iterations=self.iterations,
            psd_context=self.psd_context,
-            statistics_mode=self.statistics_mode,
+            statistics_mode=self.statistics_mode, )
        )
        return xs.transpose(2, 1, 0)
--- a/deepspeech/utils/check_kwargs.py
+++ b/deepspeech/utils/check_kwargs.py
@ -1,3 +1,16 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
@ -17,4 +30,5 @@ def check_kwargs(func, kwargs, name=None):
        name = func.__name__
    for k in kwargs.keys():
        if k not in params:
-            raise TypeError(f"{name}() got an unexpected keyword argument '{k}'")
+            raise TypeError(
                f"{name}() got an unexpected keyword argument '{k}'")
--- a/deepspeech/utils/spec_augment.py
+++ b/deepspeech/utils/spec_augment.py
@ -0,0 +1,13 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@ -23,5 +23,5 @@
 | test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |  
 | test-clean | join_ctc_w_lm | 2620 | 52576 | 97.9 | 1.8 | 0.2 | 0.3 | 2.4 | 27.8 |  
-Compare with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus) 
+Compare with [ESPNET](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--transformer-lm-4-gpus)
 we use 8 GPUs, but our model size (aheads4-adim256) is smaller than theirs.
--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
@ -53,8 +53,8 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    peek_example = minibatch[0]
    assert len(peek_example.shape) == 1, "text example is an 1D tensor"
-    lengths = [example.shape[0] for example in minibatch
+    lengths = [example.shape[0] for example in
-               ]  # assume (channel, n_samples) or (n_samples, )
+               minibatch]  # assume (channel, n_samples) or (n_samples, )
    max_len = np.max(lengths)
    batch = []
--- a/parakeet/exps/tacotron2/ljspeech.py
+++ b/parakeet/exps/tacotron2/ljspeech.py
@ -67,19 +67,16 @@ class LJSpeechCollector(object):
        # Sort by text_len in descending order
        texts = [
-            i
+            i for i, _ in sorted(
            for i, _ in sorted(
                zip(texts, text_lens), key=lambda x: x[1], reverse=True)
        ]
        mels = [
-            i
+            i for i, _ in sorted(
            for i, _ in sorted(
                zip(mels, text_lens), key=lambda x: x[1], reverse=True)
        ]
        mel_lens = [
-            i
+            i for i, _ in sorted(
            for i, _ in sorted(
                zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
        ]
--- a/requirements.txt
+++ b/requirements.txt
@ -13,7 +13,7 @@ librosa
 llvmlite
 loguru
 matplotlib
-nltk
+nara_wpenltk
 numba
 numpy==1.20.0
 pandas
@ -42,4 +42,3 @@ visualdl==2.2.0
 webrtcvad
 yacs
 yq
 nara_wpe
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@ -12,33 +12,32 @@ from deepspeech.utils.cli_utils import is_scipy_wav_style
 def get_parser():
    parser = argparse.ArgumentParser(
        description="convert feature to its shape",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
-    )
+    parser.add_argument(
-    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
+        '"mat" is the matrix format in kaldi', )
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
-        help="The configuration file for the pre-processing",
+        help="The configuration file for the pre-processing", )
    )
    parser.add_argument(
-        "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
+        "rspecifier",
-    )
+        type=str,
        help="Read specifier for feats. e.g. ark:some.ark")
    parser.add_argument(
        "out",
        nargs="?",
        type=argparse.FileType("w"),
        default=sys.stdout,
-        help="The output filename. " "If omitted, then output to sys.stdout",
+        help="The output filename. "
-    )
+        "If omitted, then output to sys.stdout", )
    return parser
@ -64,8 +63,7 @@ def main():
    # so change to file_reader_helper to return shape.
    # This makes sense only with filetype="hdf5".
    for utt, mat in file_reader_helper(
-        args.rspecifier, args.filetype, return_shape=preprocessing is None
+            args.rspecifier, args.filetype, return_shape=preprocessing is None):
    ):
        if preprocessing is not None:
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]