lm embed and format code

4 years ago · 6569ce123d
parent 871fc5b70d
commit 6569ce123d
18 changed files with 312 additions and 311 deletions
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@ -227,4 +227,4 @@ class AugmentationPipeline():
            obj = class_obj(self._rng, **params)
        except Exception:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
-        return obj
+        return obj
--- a/deepspeech/io/reader.py
+++ b/deepspeech/io/reader.py
@ -24,6 +24,7 @@ __all__ = ["LoadInputsAndTargets"]

 logger = Log(__name__).getlog()

+
 class LoadInputsAndTargets():
    """Create a mini-batch from a list of dicts

--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
@ -24,11 +24,11 @@ from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.models.lm_interface import LMInterface
 from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.modules.mask import subsequent_mask
-
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

+
 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
    def __init__(
            self,
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
@ -23,12 +23,14 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding", "RelPositionalEncoding"
+    "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding",
+    "RelPositionalEncoding"
 ]

-class PositionalEncodingInterface:

-    def forward(self, x:paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+class PositionalEncodingInterface:
+    def forward(self, x: paddle.Tensor,
+                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).
@ -37,8 +39,8 @@ class PositionalEncodingInterface:
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
        """
        raise NotImplementedError("forward method is not implemented")
-    
-    def position_encoding(self, offset:int, size:int) -> paddle.Tensor:
+
+    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        """ For getting encoding in a streaming fashion
        Args:
            offset (int): start offset
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@ -32,7 +32,6 @@ from deepspeech.modules.encoder_layer import TransformerEncoderLayer
 from deepspeech.modules.mask import add_optional_chunk_mask
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
-from deepspeech.modules.subsampling import Conv2dSubsampling
 from deepspeech.modules.subsampling import Conv2dSubsampling4
 from deepspeech.modules.subsampling import Conv2dSubsampling6
 from deepspeech.modules.subsampling import Conv2dSubsampling8
@ -394,13 +393,8 @@ class TransformerEncoder(BaseEncoder):
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)

-        if isinstance(self.embed, Conv2dSubsampling):
-            #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-            xs, pos_emb, masks = self.embed(
-                xs, masks.astype(xs.dtype), offset=0)
-        else:
-            xs, pos_emb, masks = self.embed(
-                xs, masks.astype(xs.dtype), offset=0)
+        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
        masks = masks.astype(paddle.bool)

--- a/deepspeech/transform/init.py
+++ b/deepspeech/transform/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/deepspeech/transform/cmvn.py
+++ b/deepspeech/transform/cmvn.py
@ -1,3 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import io

 import h5py
@ -9,16 +22,15 @@ class CMVN():
    "Apply Global/Spk CMVN/iverserCMVN."

    def __init__(
-        self,
-        stats,
-        norm_means=True,
-        norm_vars=False,
-        filetype="mat",
-        utt2spk=None,
-        spk2utt=None,
-        reverse=False,
-        std_floor=1.0e-20,
-    ):
+            self,
+            stats,
+            norm_means=True,
+            norm_vars=False,
+            filetype="mat",
+            utt2spk=None,
+            spk2utt=None,
+            reverse=False,
+            std_floor=1.0e-20, ):
        self.stats_file = stats
        self.norm_means = norm_means
        self.norm_vars = norm_vars
@ -84,17 +96,14 @@ class CMVN():
            self.scale[spk] = 1 / std

    def __repr__(self):
-        return (
-            "{name}(stats_file={stats_file}, "
-            "norm_means={norm_means}, norm_vars={norm_vars}, "
-            "reverse={reverse})".format(
-                name=self.__class__.__name__,
-                stats_file=self.stats_file,
-                norm_means=self.norm_means,
-                norm_vars=self.norm_vars,
-                reverse=self.reverse,
-            )
-        )
+        return ("{name}(stats_file={stats_file}, "
+                "norm_means={norm_means}, norm_vars={norm_vars}, "
+                "reverse={reverse})".format(
+                    name=self.__class__.__name__,
+                    stats_file=self.stats_file,
+                    norm_means=self.norm_means,
+                    norm_vars=self.norm_vars,
+                    reverse=self.reverse, ))

    def __call__(self, x, uttid=None):
        if self.utt2spk is not None:
@ -121,6 +130,7 @@ class CMVN():

 class UtteranceCMVN():
    "Apply Utterance CMVN"
+
    def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20):
        self.norm_means = norm_means
        self.norm_vars = norm_vars
@ -130,20 +140,19 @@ class UtteranceCMVN():
        return "{name}(norm_means={norm_means}, norm_vars={norm_vars})".format(
            name=self.__class__.__name__,
            norm_means=self.norm_means,
-            norm_vars=self.norm_vars,
-        )
+            norm_vars=self.norm_vars, )

    def __call__(self, x, uttid=None):
        # x: [Time, Dim]
-        square_sums = (x ** 2).sum(axis=0)
+        square_sums = (x**2).sum(axis=0)
        mean = x.mean(axis=0)

        if self.norm_means:
            x = np.subtract(x, mean)

        if self.norm_vars:
-            var = square_sums / x.shape[0] - mean ** 2
+            var = square_sums / x.shape[0] - mean**2
            std = np.maximum(np.sqrt(var), self.std_floor)
            x = np.divide(x, std)

-        return x
+        return x
--- a/deepspeech/utils/cli_readers.py
+++ b/deepspeech/utils/cli_readers.py
@ -1,3 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import io
 import logging
 import sys
@ -10,11 +23,10 @@ from deepspeech.io.reader import SoundHDF5File


 def file_reader_helper(
-    rspecifier: str,
-    filetype: str = "mat",
-    return_shape: bool = False,
-    segments: str = None,
-):
+        rspecifier: str,
+        filetype: str="mat",
+        return_shape: bool=False,
+        segments: str=None, ):
    """Read uttid and array in kaldi style

    This function might be a bit confusing as "ark" is used
@ -44,7 +56,8 @@ def file_reader_helper(

    """
    if filetype == "mat":
-        return KaldiReader(rspecifier, return_shape=return_shape, segments=segments)
+        return KaldiReader(
+            rspecifier, return_shape=return_shape, segments=segments)
    elif filetype == "hdf5":
        return HDF5Reader(rspecifier, return_shape=return_shape)
    elif filetype == "sound.hdf5":
@ -62,7 +75,8 @@ class KaldiReader:
        self.segments = segments

    def __iter__(self):
-        with kaldiio.ReadHelper(self.rspecifier, segments=self.segments) as reader:
+        with kaldiio.ReadHelper(
+                self.rspecifier, segments=self.segments) as reader:
            for key, array in reader:
                if self.return_shape:
                    array = array.shape
@ -72,9 +86,8 @@ class KaldiReader:
 class HDF5Reader:
    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
-            raise ValueError(
-                'Give "rspecifier" such as "ark:some.ark: {}"'.format(self.rspecifier)
-            )
+            raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'.
+                             format(self.rspecifier))
        self.rspecifier = rspecifier
        self.ark_or_scp, self.filepath = self.rspecifier.split(":", 1)
        if self.ark_or_scp not in ["ark", "scp"]:
@ -93,9 +106,7 @@ class HDF5Reader:
                        raise RuntimeError(
                            "scp file for hdf5 should be like: "
                            '"uttid filepath.h5:key": {}({})'.format(
-                                line, self.filepath
-                            )
-                        )
+                                line, self.filepath))
                    path, h5_key = value.split(":", 1)

                    hdf5_file = hdf5_dict.get(path)
@ -110,9 +121,8 @@ class HDF5Reader:
                    try:
                        data = hdf5_file[h5_key]
                    except Exception:
-                        logging.error(
-                            "Error when loading {} with key={}".format(path, h5_key)
-                        )
+                        logging.error("Error when loading {} with key={}".
+                                      format(path, h5_key))
                        raise

                    if self.return_shape:
@ -144,9 +154,8 @@ class HDF5Reader:
 class SoundHDF5Reader:
    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
-            raise ValueError(
-                'Give "rspecifier" such as "ark:some.ark: {}"'.format(rspecifier)
-            )
+            raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'.
+                             format(rspecifier))
        self.ark_or_scp, self.filepath = rspecifier.split(":", 1)
        if self.ark_or_scp not in ["ark", "scp"]:
            raise ValueError(f"Must be scp or ark: {self.ark_or_scp}")
@ -163,9 +172,7 @@ class SoundHDF5Reader:
                        raise RuntimeError(
                            "scp file for hdf5 should be like: "
                            '"uttid filepath.h5:key": {}({})'.format(
-                                line, self.filepath
-                            )
-                        )
+                                line, self.filepath))
                    path, h5_key = value.split(":", 1)

                    hdf5_file = hdf5_dict.get(path)
@ -180,9 +187,8 @@ class SoundHDF5Reader:
                    try:
                        data = hdf5_file[h5_key]
                    except Exception:
-                        logging.error(
-                            "Error when loading {} with key={}".format(path, h5_key)
-                        )
+                        logging.error("Error when loading {} with key={}".
+                                      format(path, h5_key))
                        raise

                    # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
@ -214,14 +220,12 @@ class SoundHDF5Reader:
 class SoundReader:
    def __init__(self, rspecifier, return_shape=False):
        if ":" not in rspecifier:
-            raise ValueError(
-                'Give "rspecifier" such as "scp:some.scp: {}"'.format(rspecifier)
-            )
+            raise ValueError('Give "rspecifier" such as "scp:some.scp: {}"'.
+                             format(rspecifier))
        self.ark_or_scp, self.filepath = rspecifier.split(":", 1)
        if self.ark_or_scp != "scp":
-            raise ValueError(
-                'Only supporting "scp" for sound file: {}'.format(self.ark_or_scp)
-            )
+            raise ValueError('Only supporting "scp" for sound file: {}'.format(
+                self.ark_or_scp))
        self.return_shape = return_shape

    def __iter__(self):
--- a/deepspeech/utils/cli_utils.py
+++ b/deepspeech/utils/cli_utils.py
@ -1,6 +1,19 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
 from collections.abc import Sequence
 from distutils.util import strtobool as dist_strtobool
-import sys

 import numpy

@ -36,10 +49,9 @@ def get_commandline_args():

    # Escape the extra characters for shell
    argv = [
-        arg.replace("'", "'\\''")
-        if all(char not in arg for char in extra_chars)
-        else "'" + arg.replace("'", "'\\''") + "'"
-        for arg in sys.argv
+        arg.replace("'", "'\\''") if all(char not in arg
+                                         for char in extra_chars) else
+        "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
    ]

    return sys.executable + " " + " ".join(argv)
@ -47,19 +59,12 @@ def get_commandline_args():

 def is_scipy_wav_style(value):
    # If Tuple[int, numpy.ndarray] or not
-    return (
-        isinstance(value, Sequence)
-        and len(value) == 2
-        and isinstance(value[0], int)
-        and isinstance(value[1], numpy.ndarray)
-    )
+    return (isinstance(value, Sequence) and len(value) == 2 and
+            isinstance(value[0], int) and isinstance(value[1], numpy.ndarray))


 def assert_scipy_wav_style(value):
    assert is_scipy_wav_style(
-        value
-    ), "Must be Tuple[int, numpy.ndarray], but got {}".format(
-        type(value)
-        if not isinstance(value, Sequence)
-        else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value))
-    )
+        value), "Must be Tuple[int, numpy.ndarray], but got {}".format(
+            type(value) if not isinstance(value, Sequence) else "{}[{}]".format(
+                type(value), ", ".join(str(type(v)) for v in value)))
--- a/deepspeech/utils/cli_writers.py
+++ b/deepspeech/utils/cli_writers.py
@ -1,3 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from pathlib import Path
 from typing import Dict

@ -6,18 +19,17 @@ import kaldiio
 import numpy
 import soundfile

-from deepspeech.utils.cli_utils import assert_scipy_wav_style
 from deepspeech.io.reader import SoundHDF5File
+from deepspeech.utils.cli_utils import assert_scipy_wav_style


 def file_writer_helper(
-    wspecifier: str,
-    filetype: str = "mat",
-    write_num_frames: str = None,
-    compress: bool = False,
-    compression_method: int = 2,
-    pcm_format: str = "wav",
-):
+        wspecifier: str,
+        filetype: str="mat",
+        write_num_frames: str=None,
+        compress: bool=False,
+        compression_method: int=2,
+        pcm_format: str="wav", ):
    """Write matrices in kaldi style

    Args:
@ -61,20 +73,20 @@ def file_writer_helper(
            wspecifier,
            write_num_frames=write_num_frames,
            compress=compress,
-            compression_method=compression_method,
-        )
+            compression_method=compression_method, )
    elif filetype == "hdf5":
        return HDF5Writer(
-            wspecifier, write_num_frames=write_num_frames, compress=compress
-        )
+            wspecifier, write_num_frames=write_num_frames, compress=compress)
    elif filetype == "sound.hdf5":
        return SoundHDF5Writer(
-            wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
-        )
+            wspecifier,
+            write_num_frames=write_num_frames,
+            pcm_format=pcm_format)
    elif filetype == "sound":
        return SoundWriter(
-            wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
-        )
+            wspecifier,
+            write_num_frames=write_num_frames,
+            pcm_format=pcm_format)
    else:
        raise NotImplementedError(f"filetype={filetype}")

@ -116,29 +128,27 @@ def get_num_frames_writer(write_num_frames: str):
    """
    if write_num_frames is not None:
        if ":" not in write_num_frames:
-            raise ValueError(
-                'Must include ":", write_num_frames={}'.format(write_num_frames)
-            )
+            raise ValueError('Must include ":", write_num_frames={}'.format(
+                write_num_frames))

        nframes_type, nframes_file = write_num_frames.split(":", 1)
        if nframes_type != "ark,t":
-            raise ValueError(
-                "Only supporting text mode. "
-                "e.g. --write-num-frames=ark,t:foo.txt :"
-                "{}".format(nframes_type)
-            )
+            raise ValueError("Only supporting text mode. "
+                             "e.g. --write-num-frames=ark,t:foo.txt :"
+                             "{}".format(nframes_type))

    return open(nframes_file, "w", encoding="utf-8")


 class KaldiWriter(BaseWriter):
-    def __init__(
-        self, wspecifier, write_num_frames=None, compress=False, compression_method=2
-    ):
+    def __init__(self,
+                 wspecifier,
+                 write_num_frames=None,
+                 compress=False,
+                 compression_method=2):
        if compress:
            self.writer = kaldiio.WriteHelper(
-                wspecifier, compression_method=compression_method
-            )
+                wspecifier, compression_method=compression_method)
        else:
            self.writer = kaldiio.WriteHelper(wspecifier)
        self.writer_scp = None
@ -220,7 +230,8 @@ class SoundHDF5Writer(BaseWriter):
        self.pcm_format = pcm_format
        spec_dict = parse_wspecifier(wspecifier)
        self.filename = spec_dict["ark"]
-        self.writer = SoundHDF5File(spec_dict["ark"], "w", format=self.pcm_format)
+        self.writer = SoundHDF5File(
+            spec_dict["ark"], "w", format=self.pcm_format)
        if "scp" in spec_dict:
            self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8")
        else:
--- a/requirements.txt
+++ b/requirements.txt
@ -23,6 +23,7 @@ praatio~=4.1
 pre-commit
 pybind11
 pypinyin
+python-dateutil
 pyworld
 resampy==0.2.2
 sacrebleu
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import argparse
-from distutils.util import strtobool
 import logging
+from distutils.util import strtobool

 import kaldiio
 import numpy
@ -16,86 +16,81 @@ from deepspeech.utils.cli_writers import file_writer_helper
 def get_parser():
    parser = argparse.ArgumentParser(
        description="apply mean-variance normalization to files",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )

-    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--stats-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "npy"],
        help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help="Specify the file format for the wspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )

    parser.add_argument(
        "--norm-means",
        type=strtobool,
        default=True,
-        help="Do variance normalization or not.",
-    )
+        help="Do variance normalization or not.", )
    parser.add_argument(
        "--norm-vars",
        type=strtobool,
        default=False,
-        help="Do variance normalization or not.",
-    )
+        help="Do variance normalization or not.", )
    parser.add_argument(
-        "--reverse", type=strtobool, default=False, help="Do reverse mode or not"
-    )
+        "--reverse",
+        type=strtobool,
+        default=False,
+        help="Do reverse mode or not")
    parser.add_argument(
        "--spk2utt",
        type=str,
        help="A text file of speaker to utterance-list map. "
        "(Don't give rspecifier format, such as "
-        '"ark:spk2utt")',
-    )
+        '"ark:spk2utt")', )
    parser.add_argument(
        "--utt2spk",
        type=str,
        help="A text file of utterance to speaker map. "
        "(Don't give rspecifier format, such as "
-        '"ark:utt2spk")',
-    )
+        '"ark:utt2spk")', )
    parser.add_argument(
-        "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames"
-    )
+        "--write-num-frames",
+        type=str,
+        help="Specify wspecifer for utt2num_frames")
    parser.add_argument(
-        "--compress", type=strtobool, default=False, help="Save in compressed format"
-    )
+        "--compress",
+        type=strtobool,
+        default=False,
+        help="Save in compressed format")
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
-        help="Specify the method(if mat) or " "gzip-level(if hdf5)",
-    )
+        help="Specify the method(if mat) or "
+        "gzip-level(if hdf5)", )
    parser.add_argument(
        "stats_rspecifier_or_rxfilename",
-        help="Input stats. e.g. ark:stats.ark or stats.mat",
-    )
+        help="Input stats. e.g. ark:stats.ark or stats.mat", )
    parser.add_argument(
-        "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark"
-    )
+        "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark")
    parser.add_argument(
-        "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark"
-    )
+        "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark")
    return parser


@ -118,8 +113,8 @@ def main():
            stats_filetype = args.stats_filetype

        stats_dict = dict(
-            file_reader_helper(args.stats_rspecifier_or_rxfilename, stats_filetype)
-        )
+            file_reader_helper(args.stats_rspecifier_or_rxfilename,
+                               stats_filetype))
    else:
        is_rspcifier = False
        if args.stats_filetype == "mat":
@ -134,16 +129,14 @@ def main():
        norm_vars=args.norm_vars,
        utt2spk=args.utt2spk,
        spk2utt=args.spk2utt,
-        reverse=args.reverse,
-    )
+        reverse=args.reverse, )

    with file_writer_helper(
-        args.wspecifier,
-        filetype=args.out_filetype,
-        write_num_frames=args.write_num_frames,
-        compress=args.compress,
-        compression_method=args.compression_method,
-    ) as writer:
+            args.wspecifier,
+            filetype=args.out_filetype,
+            write_num_frames=args.write_num_frames,
+            compress=args.compress,
+            compression_method=args.compression_method, ) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
--- a/utils/caculate_rtf.py
+++ b/utils/caculate_rtf.py
@ -1,24 +1,23 @@
 #!/usr/bin/env python3
 # encoding: utf-8
-
 # Copyright 2021 Kyoto University (Hirofumi Inaguma)
 #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
-
 import argparse
 import codecs
-from dateutil import parser
 import glob
 import os

+from dateutil import parser
+

 def get_parser():
-    parser = argparse.ArgumentParser(description="calculate real time factor (RTF)")
+    parser = argparse.ArgumentParser(
+        description="calculate real time factor (RTF)")
    parser.add_argument(
        "--log-dir",
        type=str,
        default=None,
-        help="path to logging directory",
-    )
+        help="path to logging directory", )
    return parser


@ -37,23 +36,21 @@ def main():
        with codecs.open(x, "r", "utf-8") as f:
            for line in f:
                x = line.strip()
-                if "INFO: input lengths" in x:
-                    audio_durations += [int(x.split("input lengths: ")[1])]
-                    start_times += [parser.parse(x.split("(")[0])]
-                elif "INFO: prediction" in x:
-                    end_times += [parser.parse(x.split("(")[0])]
-        assert len(audio_durations) == len(end_times), (
-            len(audio_durations),
-            len(end_times),
-        )
-        assert len(start_times) == len(end_times), (len(start_times), len(end_times))
+                # 2021-10-25 08:22:04.052 | INFO | xxx:recog_v2:188 - feat: (1570, 83)
+                if "feat:" in x:
+                    dur = int(x.split("(")[1].split(',')[0])
+                    audio_durations += [dur]
+                    start_times += [parser.parse(x.split("|")[0])]
+                elif "total log probability:" in x:
+                    end_times += [parser.parse(x.split("|")[0])]
+        assert len(audio_durations) == len(end_times), (len(audio_durations),
+                                                        len(end_times), )
+        assert len(start_times) == len(end_times), (len(start_times),
+                                                    len(end_times))
+
        audio_sec += sum(audio_durations) / 100  # [sec]
-        decode_sec += sum(
-            [
-                (end - start).total_seconds()
-                for start, end in zip(start_times, end_times)
-            ]
-        )
+        decode_sec += sum([(end - start).total_seconds()
+                           for start, end in zip(start_times, end_times)])
        n_utt += len(audio_durations)

    print("Total audio duration: %.3f [sec]" % audio_sec)
--- a/utils/compute-cmvn-stats.py
+++ b/utils/compute-cmvn-stats.py
@ -19,44 +19,42 @@ def get_parser():
        "If wspecifier provided: per-utterance by default, "
        "or per-speaker if"
        "spk2utt option provided; if wxfilename: global",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--spk2utt",
        type=str,
        help="A text file of speaker to utterance-list map. "
        "(Don't give rspecifier format, such as "
-        '"ark:utt2spk")',
-    )
-    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+        '"ark:utt2spk")', )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "npy"],
        help="Specify the file format for the wspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
-        help="The configuration file for the pre-processing",
-    )
+        help="The configuration file for the pre-processing", )
    parser.add_argument(
-        "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
-    )
+        "rspecifier",
+        type=str,
+        help="Read specifier for feats. e.g. ark:some.ark")
    parser.add_argument(
-        "wspecifier_or_wxfilename", type=str, help="Write specifier. e.g. ark:some.ark"
-    )
+        "wspecifier_or_wxfilename",
+        type=str,
+        help="Write specifier. e.g. ark:some.ark")
    return parser


@ -92,10 +90,8 @@ def main():
                return x

        if args.out_filetype == "npy":
-            logging.warning(
-                "--out-filetype npy is allowed only for "
-                "Global CMVN mode, changing to hdf5"
-            )
+            logging.warning("--out-filetype npy is allowed only for "
+                            "Global CMVN mode, changing to hdf5")
            args.out_filetype = "hdf5"

    else:
@ -107,10 +103,8 @@ def main():
            return None

        if args.out_filetype == "hdf5":
-            logging.warning(
-                "--out-filetype hdf5 is not allowed for "
-                "Global CMVN mode, changing to npy"
-            )
+            logging.warning("--out-filetype hdf5 is not allowed for "
+                            "Global CMVN mode, changing to npy")
            args.out_filetype = "npy"

    if args.preprocess_conf is not None:
@ -126,8 +120,7 @@ def main():

    idx = 0
    for idx, (utt, matrix) in enumerate(
-        file_reader_helper(args.rspecifier, args.in_filetype), 1
-    ):
+            file_reader_helper(args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
@ -146,7 +139,7 @@ def main():

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
-        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
+        square_sum_feats[spk] += (matrix**2).sum(axis=0)
    logging.info("Processed {} utterances".format(idx))
    assert idx > 0, idx

@ -171,8 +164,8 @@ def main():
    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(
-            args.wspecifier_or_wxfilename, filetype=args.out_filetype
-        ) as writer:
+                args.wspecifier_or_wxfilename,
+                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

@ -186,8 +179,7 @@ def main():
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError(
-                "Not supporting: --out-filetype {}".format(args.out_filetype)
-            )
+                "Not supporting: --out-filetype {}".format(args.out_filetype))


 if __name__ == "__main__":
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import argparse
-from distutils.util import strtobool
 import logging
+from distutils.util import strtobool

 from deepspeech.transform.transformation import Transformation
 from deepspeech.utils.cli_readers import file_reader_helper
@ -13,50 +13,50 @@ from deepspeech.utils.cli_writers import file_writer_helper
 def get_parser():
    parser = argparse.ArgumentParser(
        description="copy feature with preprocessing",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )

-    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the wspecifier. "
-        '"mat" is the matrix format in kaldi',
-    )
+        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
-        "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames"
-    )
+        "--write-num-frames",
+        type=str,
+        help="Specify wspecifer for utt2num_frames")
    parser.add_argument(
-        "--compress", type=strtobool, default=False, help="Save in compressed format"
-    )
+        "--compress",
+        type=strtobool,
+        default=False,
+        help="Save in compressed format")
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
-        help="Specify the method(if mat) or " "gzip-level(if hdf5)",
-    )
+        help="Specify the method(if mat) or "
+        "gzip-level(if hdf5)", )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
-        help="The configuration file for the pre-processing",
-    )
+        help="The configuration file for the pre-processing", )
    parser.add_argument(
-        "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
-    )
+        "rspecifier",
+        type=str,
+        help="Read specifier for feats. e.g. ark:some.ark")
    parser.add_argument(
-        "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark"
-    )
+        "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark")
    return parser


@ -79,12 +79,11 @@ def main():
        preprocessing = None

    with file_writer_helper(
-        args.wspecifier,
-        filetype=args.out_filetype,
-        write_num_frames=args.write_num_frames,
-        compress=args.compress,
-        compression_method=args.compression_method,
-    ) as writer:
+            args.wspecifier,
+            filetype=args.out_filetype,
+            write_num_frames=args.write_num_frames,
+            compress=args.compress,
+            compression_method=args.compression_method, ) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
--- a/utils/data2json.sh
+++ b/utils/data2json.sh
--- a/utils/merge_scp2json.py
+++ b/utils/merge_scp2json.py
@ -1,14 +1,12 @@
 #!/usr/bin/env python3
 # encoding: utf-8
-
-
 import argparse
 import codecs
-from distutils.util import strtobool
-from io import open
 import json
 import logging
 import sys
+from distutils.util import strtobool
+from io import open

 from deepspeech.utils.cli_utils import get_commandline_args

@ -47,45 +45,41 @@ def get_parser():
        "--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape "
        "--output-scps text:data/text shape:data/utt2text_shape:shape "
        "--scps utt2spk:data/utt2spk".format(sys.argv[0]),
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--input-scps",
        type=str,
        nargs="*",
        action="append",
        default=[],
-        help="Json files for the inputs",
-    )
+        help="Json files for the inputs", )
    parser.add_argument(
        "--output-scps",
        type=str,
        nargs="*",
        action="append",
        default=[],
-        help="Json files for the outputs",
-    )
+        help="Json files for the outputs", )
    parser.add_argument(
        "--scps",
        type=str,
        nargs="+",
        default=[],
-        help="The json files except for the input and outputs",
-    )
-    parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
+        help="The json files except for the input and outputs", )
+    parser.add_argument(
+        "--verbose", "-V", default=1, type=int, help="Verbose option")
    parser.add_argument(
        "--allow-one-column",
        type=strtobool,
        default=False,
        help="Allow one column in input scp files. "
-        "In this case, the value will be empty string.",
-    )
+        "In this case, the value will be empty string.", )
    parser.add_argument(
        "--out",
        "-O",
        type=str,
-        help="The output filename. " "If omitted, then output to sys.stdout",
-    )
+        help="The output filename. "
+        "If omitted, then output to sys.stdout", )
    return parser


@ -128,37 +122,33 @@ if __name__ == "__main__":
                        # e.g. type_func_str = "int" -> type_func = int
                        type_func = eval(type_func_str)
                    except Exception:
-                        raise RuntimeError("Unknown type: {}".format(type_func_str))
+                        raise RuntimeError(
+                            "Unknown type: {}".format(type_func_str))

                    if not callable(type_func):
-                        raise RuntimeError("Unknown type: {}".format(type_func_str))
+                        raise RuntimeError(
+                            "Unknown type: {}".format(type_func_str))

                else:
                    raise RuntimeError(
                        "Format <key>:<filepath> "
                        "or <key>:<filepath>:<type>  "
                        "e.g. feat:data/feat.scp "
-                        "or shape:data/feat.scp:shape: {}".format(key_scp)
-                    )
+                        "or shape:data/feat.scp:shape: {}".format(key_scp))

                for item in lis:
                    if key == item[0]:
-                        raise RuntimeError(
-                            'The key "{}" is duplicated: {} {}'.format(
-                                key, item[3], key_scp
-                            )
-                        )
+                        raise RuntimeError('The key "{}" is duplicated: {} {}'.
+                                           format(key, item[3], key_scp))

                lis.append((key, scp, type_func, key_scp, type_func_str))
            lis_list.append(lis)

    # Open  scp files
-    input_fscps = [
-        [open(i[1], "r", encoding="utf-8") for i in il] for il in input_infos
-    ]
-    output_fscps = [
-        [open(i[1], "r", encoding="utf-8") for i in il] for il in output_infos
-    ]
+    input_fscps = [[open(i[1], "r", encoding="utf-8") for i in il]
+                   for il in input_infos]
+    output_fscps = [[open(i[1], "r", encoding="utf-8") for i in il]
+                    for il in output_infos]
    fscps = [[open(i[1], "r", encoding="utf-8") for i in il] for il in infos]

    # Note(kamo): What is done here?
@ -200,12 +190,10 @@ if __name__ == "__main__":
                    if line == "" or first == "":
                        if line != first:
                            concat = sum(input_infos + output_infos + infos, [])
-                            raise RuntimeError(
-                                "The number of lines mismatch "
-                                'between: "{}" and "{}"'.format(
-                                    concat[0][1], concat[count][1]
-                                )
-                            )
+                            raise RuntimeError("The number of lines mismatch "
+                                               'between: "{}" and "{}"'.format(
+                                                   concat[0][1],
+                                                   concat[count][1]))

                    elif line.split()[0] != first.split()[0]:
                        concat = sum(input_infos + output_infos + infos, [])
@ -216,9 +204,7 @@ if __name__ == "__main__":
                                concat[0][1],
                                concat[count][1],
                                first.rstrip(),
-                                line.rstrip(),
-                            )
-                        )
+                                line.rstrip(), ))
                    count += 1

        # The end of file
@ -237,7 +223,8 @@ if __name__ == "__main__":
        ]:

            lis = []
-            for idx, (line_list, info_list) in enumerate(zip(_lines, _infos), 1):
+            for idx, (line_list, info_list) in enumerate(
+                    zip(_lines, _infos), 1):
                if inout == "input":
                    d = {"name": "input{}".format(idx)}
                elif inout == "output":
@ -254,9 +241,7 @@ if __name__ == "__main__":
                            raise RuntimeError(
                                "Format error {}th line in {}: "
                                ' Expecting "<key> <value>":\n>>> {}'.format(
-                                    nutt, info[1], line
-                                )
-                            )
+                                    nutt, info[1], line))
                        uttid = sps[0]
                        value = ""
                    else:
@ -274,9 +259,7 @@ if __name__ == "__main__":
                            logging.error(
                                '"{}" is an invalid function '
                                "for the {} th line in {}: \n>>> {}".format(
-                                    info[4], nutt, info[1], line
-                                )
-                            )
+                                    info[4], nutt, info[1], line))
                            raise

                    d[key] = value
@ -289,8 +272,11 @@ if __name__ == "__main__":
                entry.update(lis[0])

        entry = json.dumps(
-            entry, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
-        )
+            entry,
+            indent=4,
+            ensure_ascii=False,
+            sort_keys=True,
+            separators=(",", ": "))
        # Add indent
        indent = "    " * 2
        entry = ("\n" + indent).join(entry.split("\n"))
--- a/utils/text2token.py
+++ b/utils/text2token.py
@ -1,9 +1,6 @@
 #!/usr/bin/env python3
-
 # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
-
-
 import argparse
 import codecs
 import re
@ -27,28 +24,26 @@ def exist_or_not(i, match_pos):
 def get_parser():
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--nchar",
        "-n",
        default=1,
        type=int,
        help="number of characters to split, i.e., \
-                        aabb -> a a b b with -n 1 and aa bb with -n 2",
-    )
+                        aabb -> a a b b with -n 1 and aa bb with -n 2", )
    parser.add_argument(
-        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
-    )
-    parser.add_argument("--space", default="<space>", type=str, help="space symbol")
+        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns")
+    parser.add_argument(
+        "--space", default="<space>", type=str, help="space symbol")
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
-        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
-    )
-    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
+        help="list of non-linguistic symobles, e.g., <NOISE> etc.", )
+    parser.add_argument(
+        "text", type=str, default=False, nargs="?", help="input text")
    parser.add_argument(
        "--trans_type",
        "-t",
@ -60,8 +55,7 @@ def get_parser():
                        read from SI1279.WRD file -> "bricks are an alternative"
                        Else if trans_type is phn,
                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
-                        sil t er n ih sil t ih v sil" """,
-    )
+                        sil t er n ih sil t ih v sil" """, )
    return parser


@ -78,17 +72,17 @@ def main():
    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
-        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
+        f = codecs.getreader("utf-8")(sys.stdin
+                                      if is_python2 else sys.stdin.buffer)

-    sys.stdout = codecs.getwriter("utf-8")(
-        sys.stdout if is_python2 else sys.stdout.buffer
-    )
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
+                                           if is_python2 else sys.stdout.buffer)
    line = f.readline()
    n = args.nchar
    while line:
        x = line.split()
-        print(" ".join(x[: args.skip_ncols]), end=" ")
-        a = " ".join(x[args.skip_ncols :])
+        print(" ".join(x[:args.skip_ncols]), end=" ")
+        a = " ".join(x[args.skip_ncols:])

        # get all matched positions
        match_pos = []
@ -118,7 +112,7 @@ def main():
                        i += 1
                a = chars

-            a = [a[j : j + n] for j in range(0, len(a), n)]
+            a = [a[j:j + n] for j in range(0, len(a), n)]

        a_flat = []
        for z in a: