lm embed and format code

pull/929/head
Hui Zhang 3 years ago
parent 871fc5b70d
commit 6569ce123d

@ -227,4 +227,4 @@ class AugmentationPipeline():
obj = class_obj(self._rng, **params)
except Exception:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
return obj
return obj

@ -24,6 +24,7 @@ __all__ = ["LoadInputsAndTargets"]
logger = Log(__name__).getlog()
class LoadInputsAndTargets():
"""Create a mini-batch from a list of dicts

@ -24,11 +24,11 @@ from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
from deepspeech.models.lm_interface import LMInterface
from deepspeech.modules.encoder import TransformerEncoder
from deepspeech.modules.mask import subsequent_mask
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
def __init__(
self,

@ -23,12 +23,14 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding", "RelPositionalEncoding"
"PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding",
"RelPositionalEncoding"
]
class PositionalEncodingInterface:
def forward(self, x:paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
class PositionalEncodingInterface:
def forward(self, x: paddle.Tensor,
offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute positional encoding.
Args:
x (paddle.Tensor): Input tensor (batch, time, `*`).
@ -37,8 +39,8 @@ class PositionalEncodingInterface:
paddle.Tensor: Positional embedding tensor (1, time, `*`).
"""
raise NotImplementedError("forward method is not implemented")
def position_encoding(self, offset:int, size:int) -> paddle.Tensor:
def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
""" For getting encoding in a streaming fashion
Args:
offset (int): start offset

@ -32,7 +32,6 @@ from deepspeech.modules.encoder_layer import TransformerEncoderLayer
from deepspeech.modules.mask import add_optional_chunk_mask
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
from deepspeech.modules.subsampling import Conv2dSubsampling
from deepspeech.modules.subsampling import Conv2dSubsampling4
from deepspeech.modules.subsampling import Conv2dSubsampling6
from deepspeech.modules.subsampling import Conv2dSubsampling8
@ -394,13 +393,8 @@ class TransformerEncoder(BaseEncoder):
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
if isinstance(self.embed, Conv2dSubsampling):
#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
xs, pos_emb, masks = self.embed(
xs, masks.astype(xs.dtype), offset=0)
else:
xs, pos_emb, masks = self.embed(
xs, masks.astype(xs.dtype), offset=0)
#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
#TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
masks = masks.astype(paddle.bool)

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import h5py
@ -9,16 +22,15 @@ class CMVN():
"Apply Global/Spk CMVN/iverserCMVN."
def __init__(
self,
stats,
norm_means=True,
norm_vars=False,
filetype="mat",
utt2spk=None,
spk2utt=None,
reverse=False,
std_floor=1.0e-20,
):
self,
stats,
norm_means=True,
norm_vars=False,
filetype="mat",
utt2spk=None,
spk2utt=None,
reverse=False,
std_floor=1.0e-20, ):
self.stats_file = stats
self.norm_means = norm_means
self.norm_vars = norm_vars
@ -84,17 +96,14 @@ class CMVN():
self.scale[spk] = 1 / std
def __repr__(self):
return (
"{name}(stats_file={stats_file}, "
"norm_means={norm_means}, norm_vars={norm_vars}, "
"reverse={reverse})".format(
name=self.__class__.__name__,
stats_file=self.stats_file,
norm_means=self.norm_means,
norm_vars=self.norm_vars,
reverse=self.reverse,
)
)
return ("{name}(stats_file={stats_file}, "
"norm_means={norm_means}, norm_vars={norm_vars}, "
"reverse={reverse})".format(
name=self.__class__.__name__,
stats_file=self.stats_file,
norm_means=self.norm_means,
norm_vars=self.norm_vars,
reverse=self.reverse, ))
def __call__(self, x, uttid=None):
if self.utt2spk is not None:
@ -121,6 +130,7 @@ class CMVN():
class UtteranceCMVN():
"Apply Utterance CMVN"
def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20):
self.norm_means = norm_means
self.norm_vars = norm_vars
@ -130,20 +140,19 @@ class UtteranceCMVN():
return "{name}(norm_means={norm_means}, norm_vars={norm_vars})".format(
name=self.__class__.__name__,
norm_means=self.norm_means,
norm_vars=self.norm_vars,
)
norm_vars=self.norm_vars, )
def __call__(self, x, uttid=None):
# x: [Time, Dim]
square_sums = (x ** 2).sum(axis=0)
square_sums = (x**2).sum(axis=0)
mean = x.mean(axis=0)
if self.norm_means:
x = np.subtract(x, mean)
if self.norm_vars:
var = square_sums / x.shape[0] - mean ** 2
var = square_sums / x.shape[0] - mean**2
std = np.maximum(np.sqrt(var), self.std_floor)
x = np.divide(x, std)
return x
return x

@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import sys
@ -10,11 +23,10 @@ from deepspeech.io.reader import SoundHDF5File
def file_reader_helper(
rspecifier: str,
filetype: str = "mat",
return_shape: bool = False,
segments: str = None,
):
rspecifier: str,
filetype: str="mat",
return_shape: bool=False,
segments: str=None, ):
"""Read uttid and array in kaldi style
This function might be a bit confusing as "ark" is used
@ -44,7 +56,8 @@ def file_reader_helper(
"""
if filetype == "mat":
return KaldiReader(rspecifier, return_shape=return_shape, segments=segments)
return KaldiReader(
rspecifier, return_shape=return_shape, segments=segments)
elif filetype == "hdf5":
return HDF5Reader(rspecifier, return_shape=return_shape)
elif filetype == "sound.hdf5":
@ -62,7 +75,8 @@ class KaldiReader:
self.segments = segments
def __iter__(self):
with kaldiio.ReadHelper(self.rspecifier, segments=self.segments) as reader:
with kaldiio.ReadHelper(
self.rspecifier, segments=self.segments) as reader:
for key, array in reader:
if self.return_shape:
array = array.shape
@ -72,9 +86,8 @@ class KaldiReader:
class HDF5Reader:
def __init__(self, rspecifier, return_shape=False):
if ":" not in rspecifier:
raise ValueError(
'Give "rspecifier" such as "ark:some.ark: {}"'.format(self.rspecifier)
)
raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'.
format(self.rspecifier))
self.rspecifier = rspecifier
self.ark_or_scp, self.filepath = self.rspecifier.split(":", 1)
if self.ark_or_scp not in ["ark", "scp"]:
@ -93,9 +106,7 @@ class HDF5Reader:
raise RuntimeError(
"scp file for hdf5 should be like: "
'"uttid filepath.h5:key": {}({})'.format(
line, self.filepath
)
)
line, self.filepath))
path, h5_key = value.split(":", 1)
hdf5_file = hdf5_dict.get(path)
@ -110,9 +121,8 @@ class HDF5Reader:
try:
data = hdf5_file[h5_key]
except Exception:
logging.error(
"Error when loading {} with key={}".format(path, h5_key)
)
logging.error("Error when loading {} with key={}".
format(path, h5_key))
raise
if self.return_shape:
@ -144,9 +154,8 @@ class HDF5Reader:
class SoundHDF5Reader:
def __init__(self, rspecifier, return_shape=False):
if ":" not in rspecifier:
raise ValueError(
'Give "rspecifier" such as "ark:some.ark: {}"'.format(rspecifier)
)
raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'.
format(rspecifier))
self.ark_or_scp, self.filepath = rspecifier.split(":", 1)
if self.ark_or_scp not in ["ark", "scp"]:
raise ValueError(f"Must be scp or ark: {self.ark_or_scp}")
@ -163,9 +172,7 @@ class SoundHDF5Reader:
raise RuntimeError(
"scp file for hdf5 should be like: "
'"uttid filepath.h5:key": {}({})'.format(
line, self.filepath
)
)
line, self.filepath))
path, h5_key = value.split(":", 1)
hdf5_file = hdf5_dict.get(path)
@ -180,9 +187,8 @@ class SoundHDF5Reader:
try:
data = hdf5_file[h5_key]
except Exception:
logging.error(
"Error when loading {} with key={}".format(path, h5_key)
)
logging.error("Error when loading {} with key={}".
format(path, h5_key))
raise
# Change Tuple[ndarray, int] -> Tuple[int, ndarray]
@ -214,14 +220,12 @@ class SoundHDF5Reader:
class SoundReader:
def __init__(self, rspecifier, return_shape=False):
if ":" not in rspecifier:
raise ValueError(
'Give "rspecifier" such as "scp:some.scp: {}"'.format(rspecifier)
)
raise ValueError('Give "rspecifier" such as "scp:some.scp: {}"'.
format(rspecifier))
self.ark_or_scp, self.filepath = rspecifier.split(":", 1)
if self.ark_or_scp != "scp":
raise ValueError(
'Only supporting "scp" for sound file: {}'.format(self.ark_or_scp)
)
raise ValueError('Only supporting "scp" for sound file: {}'.format(
self.ark_or_scp))
self.return_shape = return_shape
def __iter__(self):

@ -1,6 +1,19 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections.abc import Sequence
from distutils.util import strtobool as dist_strtobool
import sys
import numpy
@ -36,10 +49,9 @@ def get_commandline_args():
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''")
if all(char not in arg for char in extra_chars)
else "'" + arg.replace("'", "'\\''") + "'"
for arg in sys.argv
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
@ -47,19 +59,12 @@ def get_commandline_args():
def is_scipy_wav_style(value):
    """Tell whether ``value`` has the ``Tuple[int, numpy.ndarray]`` layout.

    Args:
        value: Any object.

    Returns:
        bool: True only when ``value`` is a ``Sequence`` of exactly two
        items whose first item is an ``int`` and whose second item is a
        ``numpy.ndarray``.
    """
    # If Tuple[int, numpy.ndarray] or not
    return (isinstance(value, Sequence) and len(value) == 2 and
            isinstance(value[0], int) and isinstance(value[1], numpy.ndarray))
def assert_scipy_wav_style(value):
    """Assert that ``value`` is ``Tuple[int, numpy.ndarray]``.

    Args:
        value: Object to validate with ``is_scipy_wav_style``.

    Raises:
        AssertionError: If the check fails. The message reports
            ``type(value)`` for non-sequences, or the container type
            followed by the type of every element for sequences.
    """
    # The message expression is only evaluated when the assertion fails, so
    # a valid value is never iterated here.
    assert is_scipy_wav_style(
        value), "Must be Tuple[int, numpy.ndarray], but got {}".format(
            type(value) if not isinstance(value, Sequence) else "{}[{}]".format(
                type(value), ", ".join(str(type(v)) for v in value)))

@ -1,3 +1,16 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Dict
@ -6,18 +19,17 @@ import kaldiio
import numpy
import soundfile
from deepspeech.utils.cli_utils import assert_scipy_wav_style
from deepspeech.io.reader import SoundHDF5File
from deepspeech.utils.cli_utils import assert_scipy_wav_style
def file_writer_helper(
wspecifier: str,
filetype: str = "mat",
write_num_frames: str = None,
compress: bool = False,
compression_method: int = 2,
pcm_format: str = "wav",
):
wspecifier: str,
filetype: str="mat",
write_num_frames: str=None,
compress: bool=False,
compression_method: int=2,
pcm_format: str="wav", ):
"""Write matrices in kaldi style
Args:
@ -61,20 +73,20 @@ def file_writer_helper(
wspecifier,
write_num_frames=write_num_frames,
compress=compress,
compression_method=compression_method,
)
compression_method=compression_method, )
elif filetype == "hdf5":
return HDF5Writer(
wspecifier, write_num_frames=write_num_frames, compress=compress
)
wspecifier, write_num_frames=write_num_frames, compress=compress)
elif filetype == "sound.hdf5":
return SoundHDF5Writer(
wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
)
wspecifier,
write_num_frames=write_num_frames,
pcm_format=pcm_format)
elif filetype == "sound":
return SoundWriter(
wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format
)
wspecifier,
write_num_frames=write_num_frames,
pcm_format=pcm_format)
else:
raise NotImplementedError(f"filetype={filetype}")
@ -116,29 +128,27 @@ def get_num_frames_writer(write_num_frames: str):
"""
if write_num_frames is not None:
if ":" not in write_num_frames:
raise ValueError(
'Must include ":", write_num_frames={}'.format(write_num_frames)
)
raise ValueError('Must include ":", write_num_frames={}'.format(
write_num_frames))
nframes_type, nframes_file = write_num_frames.split(":", 1)
if nframes_type != "ark,t":
raise ValueError(
"Only supporting text mode. "
"e.g. --write-num-frames=ark,t:foo.txt :"
"{}".format(nframes_type)
)
raise ValueError("Only supporting text mode. "
"e.g. --write-num-frames=ark,t:foo.txt :"
"{}".format(nframes_type))
return open(nframes_file, "w", encoding="utf-8")
class KaldiWriter(BaseWriter):
def __init__(
self, wspecifier, write_num_frames=None, compress=False, compression_method=2
):
def __init__(self,
wspecifier,
write_num_frames=None,
compress=False,
compression_method=2):
if compress:
self.writer = kaldiio.WriteHelper(
wspecifier, compression_method=compression_method
)
wspecifier, compression_method=compression_method)
else:
self.writer = kaldiio.WriteHelper(wspecifier)
self.writer_scp = None
@ -220,7 +230,8 @@ class SoundHDF5Writer(BaseWriter):
self.pcm_format = pcm_format
spec_dict = parse_wspecifier(wspecifier)
self.filename = spec_dict["ark"]
self.writer = SoundHDF5File(spec_dict["ark"], "w", format=self.pcm_format)
self.writer = SoundHDF5File(
spec_dict["ark"], "w", format=self.pcm_format)
if "scp" in spec_dict:
self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8")
else:

@ -23,6 +23,7 @@ praatio~=4.1
pre-commit
pybind11
pypinyin
python-dateutil
pyworld
resampy==0.2.2
sacrebleu

@ -1,7 +1,7 @@
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging
from distutils.util import strtobool
import kaldiio
import numpy
@ -16,86 +16,81 @@ from deepspeech.utils.cli_writers import file_writer_helper
def get_parser():
    """Build the command-line parser for applying mean-variance normalization.

    Returns:
        argparse.ArgumentParser: Parser with the optional flags below and
        three positionals, in order: ``stats_rspecifier_or_rxfilename``,
        ``rspecifier`` and ``wspecifier``.
    """
    parser = argparse.ArgumentParser(
        description="apply mean-variance normalization to files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )

    parser.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--stats-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "npy"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5"],
        help="Specify the file format for the wspecifier. "
        '"mat" is the matrix format in kaldi', )

    # Boolean switches are parsed with strtobool, so they take an explicit
    # value on the command line (e.g. "--norm-vars true").
    parser.add_argument(
        "--norm-means",
        type=strtobool,
        default=True,
        help="Do mean normalization or not.", )
    parser.add_argument(
        "--norm-vars",
        type=strtobool,
        default=False,
        help="Do variance normalization or not.", )
    parser.add_argument(
        "--reverse",
        type=strtobool,
        default=False,
        help="Do reverse mode or not")
    parser.add_argument(
        "--spk2utt",
        type=str,
        help="A text file of speaker to utterance-list map. "
        "(Don't give rspecifier format, such as "
        '"ark:spk2utt")', )
    parser.add_argument(
        "--utt2spk",
        type=str,
        help="A text file of utterance to speaker map. "
        "(Don't give rspecifier format, such as "
        '"ark:utt2spk")', )
    parser.add_argument(
        "--write-num-frames",
        type=str,
        help="Specify wspecifier for utt2num_frames")
    parser.add_argument(
        "--compress",
        type=strtobool,
        default=False,
        help="Save in compressed format")
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or "
        "gzip-level(if hdf5)", )

    parser.add_argument(
        "stats_rspecifier_or_rxfilename",
        help="Input stats. e.g. ark:stats.ark or stats.mat", )
    parser.add_argument(
        "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark")
    parser.add_argument(
        "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark")
    return parser
@ -118,8 +113,8 @@ def main():
stats_filetype = args.stats_filetype
stats_dict = dict(
file_reader_helper(args.stats_rspecifier_or_rxfilename, stats_filetype)
)
file_reader_helper(args.stats_rspecifier_or_rxfilename,
stats_filetype))
else:
is_rspcifier = False
if args.stats_filetype == "mat":
@ -134,16 +129,14 @@ def main():
norm_vars=args.norm_vars,
utt2spk=args.utt2spk,
spk2utt=args.spk2utt,
reverse=args.reverse,
)
reverse=args.reverse, )
with file_writer_helper(
args.wspecifier,
filetype=args.out_filetype,
write_num_frames=args.write_num_frames,
compress=args.compress,
compression_method=args.compression_method,
) as writer:
args.wspecifier,
filetype=args.out_filetype,
write_num_frames=args.write_num_frames,
compress=args.compress,
compression_method=args.compression_method, ) as writer:
for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
if is_scipy_wav_style(mat):
# If data is sound file, then got as Tuple[int, ndarray]

@ -1,24 +1,23 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2021 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import codecs
from dateutil import parser
import glob
import os
from dateutil import parser
def get_parser():
    """Build the argument parser for the real time factor (RTF) calculator.

    Returns:
        argparse.ArgumentParser: Parser exposing a single ``--log-dir``
        option (default ``None``).
    """
    # Deliberately not named `parser`: this module imports `dateutil.parser`
    # under that name, and reusing it for a local is easy to misread.
    arg_parser = argparse.ArgumentParser(
        description="calculate real time factor (RTF)")
    arg_parser.add_argument(
        "--log-dir",
        type=str,
        default=None,
        help="path to logging directory", )
    return arg_parser
@ -37,23 +36,21 @@ def main():
with codecs.open(x, "r", "utf-8") as f:
for line in f:
x = line.strip()
if "INFO: input lengths" in x:
audio_durations += [int(x.split("input lengths: ")[1])]
start_times += [parser.parse(x.split("(")[0])]
elif "INFO: prediction" in x:
end_times += [parser.parse(x.split("(")[0])]
assert len(audio_durations) == len(end_times), (
len(audio_durations),
len(end_times),
)
assert len(start_times) == len(end_times), (len(start_times), len(end_times))
# 2021-10-25 08:22:04.052 | INFO | xxx:recog_v2:188 - feat: (1570, 83)
if "feat:" in x:
dur = int(x.split("(")[1].split(',')[0])
audio_durations += [dur]
start_times += [parser.parse(x.split("|")[0])]
elif "total log probability:" in x:
end_times += [parser.parse(x.split("|")[0])]
assert len(audio_durations) == len(end_times), (len(audio_durations),
len(end_times), )
assert len(start_times) == len(end_times), (len(start_times),
len(end_times))
audio_sec += sum(audio_durations) / 100 # [sec]
decode_sec += sum(
[
(end - start).total_seconds()
for start, end in zip(start_times, end_times)
]
)
decode_sec += sum([(end - start).total_seconds()
for start, end in zip(start_times, end_times)])
n_utt += len(audio_durations)
print("Total audio duration: %.3f [sec]" % audio_sec)

@ -19,44 +19,42 @@ def get_parser():
"If wspecifier provided: per-utterance by default, "
"or per-speaker if"
"spk2utt option provided; if wxfilename: global",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument(
"--spk2utt",
type=str,
help="A text file of speaker to utterance-list map. "
"(Don't give rspecifier format, such as "
'"ark:utt2spk")',
)
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
'"ark:utt2spk")', )
parser.add_argument(
"--verbose", "-V", default=0, type=int, help="Verbose option")
parser.add_argument(
"--in-filetype",
type=str,
default="mat",
choices=["mat", "hdf5", "sound.hdf5", "sound"],
help="Specify the file format for the rspecifier. "
'"mat" is the matrix format in kaldi',
)
'"mat" is the matrix format in kaldi', )
parser.add_argument(
"--out-filetype",
type=str,
default="mat",
choices=["mat", "hdf5", "npy"],
help="Specify the file format for the wspecifier. "
'"mat" is the matrix format in kaldi',
)
'"mat" is the matrix format in kaldi', )
parser.add_argument(
"--preprocess-conf",
type=str,
default=None,
help="The configuration file for the pre-processing",
)
help="The configuration file for the pre-processing", )
parser.add_argument(
"rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark"
)
"rspecifier",
type=str,
help="Read specifier for feats. e.g. ark:some.ark")
parser.add_argument(
"wspecifier_or_wxfilename", type=str, help="Write specifier. e.g. ark:some.ark"
)
"wspecifier_or_wxfilename",
type=str,
help="Write specifier. e.g. ark:some.ark")
return parser
@ -92,10 +90,8 @@ def main():
return x
if args.out_filetype == "npy":
logging.warning(
"--out-filetype npy is allowed only for "
"Global CMVN mode, changing to hdf5"
)
logging.warning("--out-filetype npy is allowed only for "
"Global CMVN mode, changing to hdf5")
args.out_filetype = "hdf5"
else:
@ -107,10 +103,8 @@ def main():
return None
if args.out_filetype == "hdf5":
logging.warning(
"--out-filetype hdf5 is not allowed for "
"Global CMVN mode, changing to npy"
)
logging.warning("--out-filetype hdf5 is not allowed for "
"Global CMVN mode, changing to npy")
args.out_filetype = "npy"
if args.preprocess_conf is not None:
@ -126,8 +120,7 @@ def main():
idx = 0
for idx, (utt, matrix) in enumerate(
file_reader_helper(args.rspecifier, args.in_filetype), 1
):
file_reader_helper(args.rspecifier, args.in_filetype), 1):
if is_scipy_wav_style(matrix):
# If data is sound file, then got as Tuple[int, ndarray]
rate, matrix = matrix
@ -146,7 +139,7 @@ def main():
counts[spk] += matrix.shape[0]
sum_feats[spk] += matrix.sum(axis=0)
square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
square_sum_feats[spk] += (matrix**2).sum(axis=0)
logging.info("Processed {} utterances".format(idx))
assert idx > 0, idx
@ -171,8 +164,8 @@ def main():
# Per utterance or speaker CMVN
if is_wspecifier:
with file_writer_helper(
args.wspecifier_or_wxfilename, filetype=args.out_filetype
) as writer:
args.wspecifier_or_wxfilename,
filetype=args.out_filetype) as writer:
for spk, mat in cmvn_stats.items():
writer[spk] = mat
@ -186,8 +179,7 @@ def main():
kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
else:
raise RuntimeError(
"Not supporting: --out-filetype {}".format(args.out_filetype)
)
"Not supporting: --out-filetype {}".format(args.out_filetype))
if __name__ == "__main__":

@ -1,7 +1,7 @@
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging
from distutils.util import strtobool
from deepspeech.transform.transformation import Transformation
from deepspeech.utils.cli_readers import file_reader_helper
@ -13,50 +13,50 @@ from deepspeech.utils.cli_writers import file_writer_helper
def get_parser():
    """Build the command-line parser for copying features with preprocessing.

    Returns:
        argparse.ArgumentParser: Parser with the optional flags below and
        two positionals, in order: ``rspecifier`` and ``wspecifier``.
    """
    parser = argparse.ArgumentParser(
        description="copy feature with preprocessing",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )

    parser.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--in-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the rspecifier. "
        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--out-filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for the wspecifier. "
        '"mat" is the matrix format in kaldi', )
    parser.add_argument(
        "--write-num-frames",
        type=str,
        help="Specify wspecifier for utt2num_frames")
    # strtobool-typed switch: takes an explicit value, e.g. "--compress true".
    parser.add_argument(
        "--compress",
        type=strtobool,
        default=False,
        help="Save in compressed format")
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or "
        "gzip-level(if hdf5)", )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing", )

    parser.add_argument(
        "rspecifier",
        type=str,
        help="Read specifier for feats. e.g. ark:some.ark")
    parser.add_argument(
        "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark")
    return parser
@ -79,12 +79,11 @@ def main():
preprocessing = None
with file_writer_helper(
args.wspecifier,
filetype=args.out_filetype,
write_num_frames=args.write_num_frames,
compress=args.compress,
compression_method=args.compression_method,
) as writer:
args.wspecifier,
filetype=args.out_filetype,
write_num_frames=args.write_num_frames,
compress=args.compress,
compression_method=args.compression_method, ) as writer:
for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
if is_scipy_wav_style(mat):
# If data is sound file, then got as Tuple[int, ndarray]

@ -1,14 +1,12 @@
#!/usr/bin/env python3
# encoding: utf-8
import argparse
import codecs
from distutils.util import strtobool
from io import open
import json
import logging
import sys
from distutils.util import strtobool
from io import open
from deepspeech.utils.cli_utils import get_commandline_args
@ -47,45 +45,41 @@ def get_parser():
"--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape "
"--output-scps text:data/text shape:data/utt2text_shape:shape "
"--scps utt2spk:data/utt2spk".format(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument(
"--input-scps",
type=str,
nargs="*",
action="append",
default=[],
help="Json files for the inputs",
)
help="Json files for the inputs", )
parser.add_argument(
"--output-scps",
type=str,
nargs="*",
action="append",
default=[],
help="Json files for the outputs",
)
help="Json files for the outputs", )
parser.add_argument(
"--scps",
type=str,
nargs="+",
default=[],
help="The json files except for the input and outputs",
)
parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
help="The json files except for the input and outputs", )
parser.add_argument(
"--verbose", "-V", default=1, type=int, help="Verbose option")
parser.add_argument(
"--allow-one-column",
type=strtobool,
default=False,
help="Allow one column in input scp files. "
"In this case, the value will be empty string.",
)
"In this case, the value will be empty string.", )
parser.add_argument(
"--out",
"-O",
type=str,
help="The output filename. " "If omitted, then output to sys.stdout",
)
help="The output filename. "
"If omitted, then output to sys.stdout", )
return parser
@ -128,37 +122,33 @@ if __name__ == "__main__":
# e.g. type_func_str = "int" -> type_func = int
type_func = eval(type_func_str)
except Exception:
raise RuntimeError("Unknown type: {}".format(type_func_str))
raise RuntimeError(
"Unknown type: {}".format(type_func_str))
if not callable(type_func):
raise RuntimeError("Unknown type: {}".format(type_func_str))
raise RuntimeError(
"Unknown type: {}".format(type_func_str))
else:
raise RuntimeError(
"Format <key>:<filepath> "
"or <key>:<filepath>:<type> "
"e.g. feat:data/feat.scp "
"or shape:data/feat.scp:shape: {}".format(key_scp)
)
"or shape:data/feat.scp:shape: {}".format(key_scp))
for item in lis:
if key == item[0]:
raise RuntimeError(
'The key "{}" is duplicated: {} {}'.format(
key, item[3], key_scp
)
)
raise RuntimeError('The key "{}" is duplicated: {} {}'.
format(key, item[3], key_scp))
lis.append((key, scp, type_func, key_scp, type_func_str))
lis_list.append(lis)
# Open scp files
input_fscps = [
[open(i[1], "r", encoding="utf-8") for i in il] for il in input_infos
]
output_fscps = [
[open(i[1], "r", encoding="utf-8") for i in il] for il in output_infos
]
input_fscps = [[open(i[1], "r", encoding="utf-8") for i in il]
for il in input_infos]
output_fscps = [[open(i[1], "r", encoding="utf-8") for i in il]
for il in output_infos]
fscps = [[open(i[1], "r", encoding="utf-8") for i in il] for il in infos]
# Note(kamo): What is done here?
@ -200,12 +190,10 @@ if __name__ == "__main__":
if line == "" or first == "":
if line != first:
concat = sum(input_infos + output_infos + infos, [])
raise RuntimeError(
"The number of lines mismatch "
'between: "{}" and "{}"'.format(
concat[0][1], concat[count][1]
)
)
raise RuntimeError("The number of lines mismatch "
'between: "{}" and "{}"'.format(
concat[0][1],
concat[count][1]))
elif line.split()[0] != first.split()[0]:
concat = sum(input_infos + output_infos + infos, [])
@ -216,9 +204,7 @@ if __name__ == "__main__":
concat[0][1],
concat[count][1],
first.rstrip(),
line.rstrip(),
)
)
line.rstrip(), ))
count += 1
# The end of file
@ -237,7 +223,8 @@ if __name__ == "__main__":
]:
lis = []
for idx, (line_list, info_list) in enumerate(zip(_lines, _infos), 1):
for idx, (line_list, info_list) in enumerate(
zip(_lines, _infos), 1):
if inout == "input":
d = {"name": "input{}".format(idx)}
elif inout == "output":
@ -254,9 +241,7 @@ if __name__ == "__main__":
raise RuntimeError(
"Format error {}th line in {}: "
' Expecting "<key> <value>":\n>>> {}'.format(
nutt, info[1], line
)
)
nutt, info[1], line))
uttid = sps[0]
value = ""
else:
@ -274,9 +259,7 @@ if __name__ == "__main__":
logging.error(
'"{}" is an invalid function '
"for the {} th line in {}: \n>>> {}".format(
info[4], nutt, info[1], line
)
)
info[4], nutt, info[1], line))
raise
d[key] = value
@ -289,8 +272,11 @@ if __name__ == "__main__":
entry.update(lis[0])
entry = json.dumps(
entry, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
)
entry,
indent=4,
ensure_ascii=False,
sort_keys=True,
separators=(",", ": "))
# Add indent
indent = " " * 2
entry = ("\n" + indent).join(entry.split("\n"))

@ -1,9 +1,6 @@
#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import codecs
import re
@ -27,28 +24,26 @@ def exist_or_not(i, match_pos):
def get_parser():
parser = argparse.ArgumentParser(
description="convert raw text to tokenized text",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument(
"--nchar",
"-n",
default=1,
type=int,
help="number of characters to split, i.e., \
aabb -> a a b b with -n 1 and aa bb with -n 2",
)
aabb -> a a b b with -n 1 and aa bb with -n 2", )
parser.add_argument(
"--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
)
parser.add_argument("--space", default="<space>", type=str, help="space symbol")
"--skip-ncols", "-s", default=0, type=int, help="skip first n columns")
parser.add_argument(
"--space", default="<space>", type=str, help="space symbol")
parser.add_argument(
"--non-lang-syms",
"-l",
default=None,
type=str,
help="list of non-linguistic symobles, e.g., <NOISE> etc.",
)
parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
help="list of non-linguistic symobles, e.g., <NOISE> etc.", )
parser.add_argument(
"text", type=str, default=False, nargs="?", help="input text")
parser.add_argument(
"--trans_type",
"-t",
@ -60,8 +55,7 @@ def get_parser():
read from SI1279.WRD file -> "bricks are an alternative"
Else if trans_type is phn,
read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
sil t er n ih sil t ih v sil" """,
)
sil t er n ih sil t ih v sil" """, )
return parser
@ -78,17 +72,17 @@ def main():
if args.text:
f = codecs.open(args.text, encoding="utf-8")
else:
f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
f = codecs.getreader("utf-8")(sys.stdin
if is_python2 else sys.stdin.buffer)
sys.stdout = codecs.getwriter("utf-8")(
sys.stdout if is_python2 else sys.stdout.buffer
)
sys.stdout = codecs.getwriter("utf-8")(sys.stdout
if is_python2 else sys.stdout.buffer)
line = f.readline()
n = args.nchar
while line:
x = line.split()
print(" ".join(x[: args.skip_ncols]), end=" ")
a = " ".join(x[args.skip_ncols :])
print(" ".join(x[:args.skip_ncols]), end=" ")
a = " ".join(x[args.skip_ncols:])
# get all matched positions
match_pos = []
@ -118,7 +112,7 @@ def main():
i += 1
a = chars
a = [a[j : j + n] for j in range(0, len(a), n)]
a = [a[j:j + n] for j in range(0, len(a), n)]
a_flat = []
for z in a:

Loading…
Cancel
Save