From d75cf8963030648fc682c849002d379797e9103d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sun, 26 Sep 2021 21:13:30 +0800 Subject: [PATCH 01/11] Update released_model.md --- docs/src/released_model.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/src/released_model.md b/docs/src/released_model.md index 61fd1560..50670aaf 100644 --- a/docs/src/released_model.md +++ b/docs/src/released_model.md @@ -1,21 +1,21 @@ # Released Models ## Acoustic Model Released in paddle 2.X -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :--------- -[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h -[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h -[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h -[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h -[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h -[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech +:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- +[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h +[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h +[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h +[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h +[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | 
Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h +[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h ## Acoustic Model Transformed from paddle 1.8 -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :--------- -[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h| -[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h| -[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h| +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech +:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :--------- +[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h| +[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h| +[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h| From 9c37d10992a2237730f5c3f8fc13ec584dc87b09 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 27 Sep 2021 06:53:57 +0000 Subject: [PATCH 02/11] optimize the log --- deepspeech/exps/u2/model.py | 1 + deepspeech/training/trainer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 5cb0962a..1afd9b10 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -216,6 +216,7 @@ class U2Trainer(Trainer): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += "," + msg = msg[:-1] # remove the last "," if (batch_index + 1 ) % self.config.training.log_interval == 0: logger.info(msg) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 15259f0e..35b1690b 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -263,6 +263,7 @@ class Trainer(): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += "," + msg = msg[:-1] # remove the last "," logger.info(msg) data_start_time = time.time() except Exception as e: From f7d7e70cb24338e61e921240c18c20bc88456150 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 27 Sep 2021 10:50:20 +0000 Subject: [PATCH 03/11] more ctc check; valid dataloader with num workers --- deepspeech/exps/u2/model.py | 4 +++- deepspeech/modules/ctc.py | 2 +- deepspeech/modules/loss.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/u2/model.py 
b/deepspeech/exps/u2/model.py index 5cb0962a..5cf8866c 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -243,6 +243,7 @@ class U2Trainer(Trainer): self.visualizer.add_scalars( 'epoch', {'cv_loss': cv_loss, 'lr': self.lr_scheduler()}, self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() @@ -291,7 +292,8 @@ class U2Trainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py index 11ce871f..551bbf67 100644 --- a/deepspeech/modules/ctc.py +++ b/deepspeech/modules/ctc.py @@ -49,7 +49,7 @@ class CTCDecoder(nn.Layer): dropout_rate (float): dropout rate (0.0 ~ 1.0) reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' batch_average (bool): do batch dim wise average. - grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None. + grad_norm_type (str): one of 'instance', 'batch', 'frame', None. """ assert check_argument_types() super().__init__() diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 7d24e170..1f33e512 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -49,6 +49,8 @@ class CTCLoss(nn.Layer): self.norm_by_batchsize = True elif grad_norm_type == 'frame': self.norm_by_total_logits_len = True + else: + raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}") def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. From 1a46125175b9428bd4c481f79117598330bad535 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 28 Sep 2021 05:46:29 +0000 Subject: [PATCH 04/11] add bin for hub --- deepspeech/exps/deepspeech2/bin/test_hub.py | 191 ++++++++++++++++++++ examples/aishell/s0/local/test_hub.sh | 36 ++++ examples/aishell/s0/run.sh | 8 + 3 files changed, 235 insertions(+) create mode 100644 deepspeech/exps/deepspeech2/bin/test_hub.py create mode 100755 examples/aishell/s0/local/test_hub.sh diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py new file mode 100644 index 00000000..cbda3b4c --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/test_hub.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for DeepSpeech2 model.""" +import os +import sys +from pathlib import Path + +import paddle + +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer +from deepspeech.io.collator import SpeechCollator +from deepspeech.models.ds2 import DeepSpeech2Model +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils import mp_tools +from deepspeech.utils.checkpoint import Checkpoint +from deepspeech.utils.log import Log +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class DeepSpeech2Tester_hub(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.collator.unit_type, vocab_filepath=None) + + def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + #replace the '' with ' ' + result_transcripts = [ + self._text_featurizer.detokenize(sentence) + for sentence in result_transcripts + ] + + return result_transcripts + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + self.model.eval() + cfg = self.config + audio_file = self.audio_file + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, cfg.decoding) + logger.info("result_transcripts: " + result_transcripts[0]) + + def run_test(self): + self.resume() + try: + self.test() + except KeyboardInterrupt: + exit(-1) + + def setup(self): + """Setup the experiment. + """ + paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + + self.setup_output_dir() + self.setup_checkpointer() + + self.setup_model() + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + if self.args.output: + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + output_dir.mkdir(parents=True, exist_ok=True) + self.output_dir = output_dir + + def setup_model(self): + config = self.config.clone() + with UpdateConfig(config): + config.model.feat_size = self.collate_fn_test.feature_size + config.model.dict_size = self.collate_fn_test.vocab_size + + if self.args.model_type == 'offline': + model = DeepSpeech2Model.from_config(config.model) + elif self.args.model_type == 'online': + model = DeepSpeech2ModelOnline.from_config(config.model) + else: + raise Exception("wrong model type") + + self.model = model + + def setup_checkpointer(self): + """Create a directory used to save checkpoints into. 
+ + It is "checkpoints" inside the output directory. + """ + # checkpoint dir + checkpoint_dir = self.output_dir / "checkpoints" + checkpoint_dir.mkdir(exist_ok=True) + + self.checkpoint_dir = checkpoint_dir + + self.checkpoint = Checkpoint( + kbest_n=self.config.training.checkpoint.kbest_n, + latest_n=self.config.training.checkpoint.latest_n) + + def resume(self): + """Resume from the checkpoint at checkpoints in the output + directory or load a specified checkpoint. + """ + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + +def main_sp(config, args): + exp = DeepSpeech2Tester_hub(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument("--model_type") + parser.add_argument("--audio_file") + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + if args.model_type is None: + args.model_type = 'offline' + if not os.path.isfile(args.audio_file): + print("Please input the audio file path") + sys.exit(-1) + print("model_type:{}".format(args.model_type)) + + # https://yaml.org/type/float.html + config = get_cfg_defaults(args.model_type) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/s0/local/test_hub.sh new file mode 100755 index 00000000..d01496c4 --- /dev/null +++ b/examples/aishell/s0/local/test_hub.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# != 4 ];then + echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 +model_type=$3 +audio_file=$4 + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/test_hub.py \ +--nproc ${ngpu} \ +--config ${config_path} \ +--result_file ${ckpt_prefix}.rsl \ +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} \ +--audio_file ${audio_file} + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 71191c3a..83846ada 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -15,6 +15,8 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" +audio_file="data/tmp.wav" + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 @@ -44,3 +46,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi + +# Optionally, you can add LM and test it with runtime. 
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 +fi From f628e218167a17af501d4e84c3a20d5ad804f629 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 10:33:35 +0000 Subject: [PATCH 05/11] refactor kaldi/tarfile loader, st and asr collator --- deepspeech/exps/u2_st/model.py | 23 +- deepspeech/frontend/audio.py | 20 +- .../frontend/featurizer/speech_featurizer.py | 77 +-- deepspeech/frontend/speech.py | 11 +- deepspeech/frontend/utility.py | 46 ++ deepspeech/io/collator.py | 359 +++++----- deepspeech/io/collator_st.py | 631 ------------------ deepspeech/io/reader.py | 16 + 8 files changed, 322 insertions(+), 861 deletions(-) delete mode 100644 deepspeech/io/collator_st.py diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index e4e70292..f5a514c7 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -28,10 +28,8 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode -from deepspeech.io.collator_st import KaldiPrePorocessedCollator -from deepspeech.io.collator_st import SpeechCollator -from deepspeech.io.collator_st import TripletKaldiPrePorocessedCollator -from deepspeech.io.collator_st import TripletSpeechCollator +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler @@ -258,22 +256,13 @@ class U2STTrainer(Trainer): config.data.manifest = config.data.dev_manifest dev_dataset = Dataset.from_config(config) - if config.collator.raw_wav: - if config.model.model_conf.asr_weight > 0.: - Collator = TripletSpeechCollator - TestCollator = SpeechCollator - else: - TestCollator = Collator = SpeechCollator - # Not yet implement the mtl loader for raw_wav. + if config.model.model_conf.asr_weight > 0.: + Collator = TripletSpeechCollator + TestCollator = SpeechCollator else: - if config.model.model_conf.asr_weight > 0.: - Collator = TripletKaldiPrePorocessedCollator - TestCollator = KaldiPrePorocessedCollator - else: - TestCollator = Collator = KaldiPrePorocessedCollator + TestCollator = Collator = SpeechCollator collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = Collator.from_config(config) diff --git a/deepspeech/frontend/audio.py b/deepspeech/frontend/audio.py index ffdcd4b3..13dc3a44 100644 --- a/deepspeech/frontend/audio.py +++ b/deepspeech/frontend/audio.py @@ -24,8 +24,10 @@ import soundfile import soxbindings as sox from scipy import signal +from .utility import subfile_from_tar -class AudioSegment(object): + +class AudioSegment(): """Monaural audio segment abstraction. :param samples: Audio samples [num_samples x num_channels]. @@ -68,16 +70,20 @@ class AudioSegment(object): self.duration, self.rms_db)) @classmethod - def from_file(cls, file): + def from_file(cls, file, infos=None): """Create audio segment from audio file. - - :param filepath: Filepath or file object to audio file. - :type filepath: str|file - :return: Audio segment instance. - :rtype: AudioSegment + + Args: + filepath (str|file): Filepath or file object to audio file. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. 
+ + Returns: + AudioSegment: Audio segment instance. """ if isinstance(file, str) and re.findall(r".seqbin_\d+$", file): return cls.from_sequence_file(file) + elif isinstance(file, str) and file.startswith('tar:'): + return cls.from_file(subfile_from_tar(file, infos)) else: samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 5082850d..f9f7d7c2 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -64,8 +64,12 @@ class SpeechFeaturizer(): target_sample_rate=16000, use_dB_normalization=True, target_dB=-20, - dither=1.0): - self._audio_featurizer = AudioFeaturizer( + dither=1.0, + maskctc=False): + self.stride_ms = stride_ms + self.window_ms = window_ms + + self.audio_feature = AudioFeaturizer( specgram_type=specgram_type, feat_dim=feat_dim, delta_delta=delta_delta, @@ -77,8 +81,12 @@ class SpeechFeaturizer(): use_dB_normalization=use_dB_normalization, target_dB=target_dB, dither=dither) - self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, - spm_model_prefix) + + self.text_feature = TextFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + maskctc=maskctc) def featurize(self, speech_segment, keep_transcription_text): """Extract features for speech segment. @@ -94,60 +102,33 @@ class SpeechFeaturizer(): Returns: tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices. """ - spec_feature = self._audio_featurizer.featurize(speech_segment) + spec_feature = self.audio_feature.featurize(speech_segment) + if keep_transcription_text: return spec_feature, speech_segment.transcript + if speech_segment.has_token: text_ids = speech_segment.token_ids else: - text_ids = self._text_featurizer.featurize( - speech_segment.transcript) + text_ids = self.text_feature.featurize(speech_segment.transcript) return spec_feature, text_ids - @property - def vocab_size(self): - """Return the vocabulary size. - Returns: - int: Vocabulary size. - """ - return self._text_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - Returns: - List[str]: - """ - return self._text_featurizer.vocab_list + def text_featurize(self, text, keep_transcription_text): + """Extract features for speech segment. - @property - def vocab_dict(self): - """Return the vocabulary in dict. - Returns: - Dict[str, int]: - """ - return self._text_featurizer.vocab_dict + 1. For audio parts, extract the audio features. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. - @property - def feature_size(self): - """Return the audio feature size. - Returns: - int: audio feature size. - """ - return self._audio_featurizer.feature_size + Args: + text (str): text. + keep_transcription_text (bool): True, keep transcript text, False, token ids - @property - def stride_ms(self): - """time length in `ms` unit per frame Returns: - float: time(ms)/frame + (str|List[int]): text, or list of token indices. """ - return self._audio_featurizer.stride_ms + if keep_transcription_text: + return text - @property - def text_feature(self): - """Return the text feature object. - Returns: - TextFeaturizer: object. 
- """ - return self._text_featurizer + text_ids = self.text_feature.featurize(text) + return text_ids diff --git a/deepspeech/frontend/speech.py b/deepspeech/frontend/speech.py index e58795c0..9eed9725 100644 --- a/deepspeech/frontend/speech.py +++ b/deepspeech/frontend/speech.py @@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment): return not self.__eq__(other) @classmethod - def from_file(cls, filepath, transcript, tokens=None, token_ids=None): + def from_file(cls, + filepath, + transcript, + tokens=None, + token_ids=None, + infos=None): """Create speech segment from audio file and corresponding transcript. Args: @@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment): transcript (str): Transcript text for the speech. tokens (List[str], optional): text tokens. Defaults to None. token_ids (List[int], optional): text token ids. Defaults to None. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. Returns: SpeechSegment: Speech segment instance. """ - - audio = AudioSegment.from_file(filepath) + audio = AudioSegment.from_file(filepath, infos) return cls(audio.samples, audio.sample_rate, transcript, tokens, token_ids) diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 3a972b50..2a581232 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -14,6 +14,7 @@ """Contains data helper functions.""" import json import math +import tarfile from typing import List from typing import Optional from typing import Text @@ -112,6 +113,51 @@ def read_manifest( return manifest +# Tar File read +TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + + +def parse_tar(file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + +def subfile_from_tar(file, local_data=None): + """Get subfile object from tar. + + tar:tarpath#filename + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + + if local_data is None: + local_data = TarLocalData(tar2info={}, tar2object={}) + + assert isinstance(local_data, TarLocalData) + + if 'tar2info' not in local_data.__dict__: + local_data.tar2info = {} + if 'tar2object' not in local_data.__dict__: + local_data.tar2object = {} + + if tarpath not in local_data.tar2info: + fobj, infos = parse_tar(tarpath) + local_data.tar2info[tarpath] = infos + local_data.tar2object[tarpath] = fobj + else: + fobj = local_data.tar2object[tarpath] + infos = local_data.tar2info[tarpath] + return fobj.extractfile(infos[filename]) + + def rms_to_db(rms: float): """Root Mean Square to dB. diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 15b89ab9..c5c0a414 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import io -from collections import namedtuple from typing import Optional import numpy as np @@ -23,96 +22,17 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment from deepspeech.frontend.utility import IGNORE_ID +from deepspeech.frontend.utility import TarLocalData +from deepspeech.io.reader import LoadInputsAndTargets from deepspeech.io.utility import pad_list from deepspeech.utils.log import Log -__all__ = ["SpeechCollator"] +__all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() -# namedtupe need global for pickle. -TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - - -class SpeechCollator(): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. - """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'specgram_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - specgram_type=config.collator.specgram_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator +class SpeechCollatorBase(): def __init__( self, aug_file, @@ -121,7 +41,7 @@ class SpeechCollator(): spm_model_prefix, random_seed=0, unit_type="char", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' + spectrum_type='linear', # 'linear', 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank' delta_delta=False, # 'mfcc', 'fbank' 
stride_ms=10.0, # ms @@ -146,7 +66,7 @@ class SpeechCollator(): n_fft (int, optional): fft points for rfft. Defaults to None. max_freq (int, optional): max cut freq. Defaults to None. target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. use_dB_normalization (bool, optional): do dB normalization. Defaults to True. @@ -159,23 +79,27 @@ class SpeechCollator(): Padding audio features with zeros to make them have the same shape (or a user-defined shape) within one batch. """ - self._keep_transcription_text = keep_transcription_text + self.keep_transcription_text = keep_transcription_text + self.stride_ms = stride_ms + self.window_ms = window_ms + self.feat_dim = feat_dim + + self.loader = LoadInputsAndTargets() + # only for tar filetype self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( + + self.augmentation = AugmentationPipeline( augmentation_config=aug_file.read(), random_seed=random_seed) self._normalizer = FeatureNormalizer( mean_std_filepath) if mean_std_filepath else None - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, @@ -187,33 +111,11 @@ class SpeechCollator(): target_dB=target_dB, dither=dither) - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - - It will return a subfile object from tar file - and cached tar file info for next reading request. - """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) + self.feature_size = self._speech_featurizer.audio_feature.feature_size + self.text_feature = self._speech_featurizer.text_feature + self.vocab_dict = self.text_feature.vocab_dict + self.vocab_list = self.text_feature.vocab_list + self.vocab_size = self.text_feature.vocab_size def process_utterance(self, audio_file, transcript): """Load, augment, featurize and normalize for speech data. @@ -226,23 +128,36 @@ class SpeechCollator(): where transcription part could be token ids or text. 
:rtype: tuple of (2darray, list) """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) + filetype = self.loader.file_type(audio_file) + + if filetype != 'sound': + spectrum = self.loader._get_from_loader(audio_file, filetype) + feat_dim = spectrum.shape[1] + assert feat_dim == self.feat_dim, f"expect feat dim {self.feat_dim}, but got {feat_dim}" + + if self.keep_transcription_text: + transcript_part = transcript + else: + text_ids = self.text_feature.featurize(transcript) + transcript_part = text_ids else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) + # read audio + speech_segment = SpeechSegment.from_file( + audio_file, transcript, infos=self._local_data) + # audio augment + self.augmentation.transform_audio(speech_segment) - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) + # extract speech feature + spectrum, transcript_part = self._speech_featurizer.featurize( + speech_segment, self.keep_transcription_text) - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) + # CMVN spectrum + if self._normalizer: + spectrum = self._normalizer.apply(spectrum) - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, transcript_part + # spectrum augment + spectrum = self.augmentation.transform_feature(spectrum) + return spectrum, transcript_part def __call__(self, batch): """batch examples @@ -272,16 +187,14 @@ class SpeechCollator(): audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) # text - # for training, text is token ids - # else text is string, convert to unicode ord + # for training, text is token ids, else text is string, convert to unicode ord tokens = [] - if self._keep_transcription_text: + if self.keep_transcription_text: assert isinstance(text, str), (type(text), text) tokens = [ord(t) for t in text] else: tokens = text # token ids - tokens = tokens if isinstance(tokens, np.ndarray) else np.array( - tokens, dtype=np.int64) + tokens = np.array(tokens, dtype=np.int64) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -292,26 +205,162 @@ class SpeechCollator(): olens = np.array(text_lens).astype(np.int64) return utts, xs_pad, ilens, ys_pad, olens - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list +class SpeechCollator(SpeechCollatorBase): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + spectrum_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=False)) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. 
+ + Returns: + SpeechCollator: collator object. + """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator + assert 'spectrum_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + spectrum_type=config.collator.spectrum_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + +class TripletSpeechCollator(SpeechCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. - @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + spectrum, translation_part = super().process_utterance(audio_file, + translation) + transcript_part = self._speech_featurizer.text_featurize( + transcript, self.keep_transcription_text) + return spectrum, translation_part, transcript_part - @property - def text_feature(self): - return self._speech_featurizer.text_feature + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) - @property - def feature_size(self): - return self._speech_featurizer.feature_size + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. 
+ audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] - @property - def stride_ms(self): - return self._speech_featurizer.stride_ms + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self.keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = np.array(tokens[idx], dtype=np.int64) + + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py deleted file mode 100644 index 28573366..00000000 --- a/deepspeech/io/collator_st.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import io -from collections import namedtuple -from typing import Optional - -import kaldiio -import numpy as np -from yacs.config import CfgNode - -from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline -from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer -from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer -from deepspeech.frontend.normalizer import FeatureNormalizer -from deepspeech.frontend.speech import SpeechSegment -from deepspeech.frontend.utility import IGNORE_ID -from deepspeech.io.utility import pad_sequence -from deepspeech.utils.log import Log - -__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"] - -logger = Log(__name__).getlog() - -# namedtupe need global for pickle. 
-TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - - -class SpeechCollator(): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. - """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'specgram_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - specgram_type=config.collator.specgram_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator - - def __init__( - self, - aug_file, - mean_std_filepath, - vocab_filepath, - spm_model_prefix, - random_seed=0, - unit_type="char", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, - keep_transcription_text=True): - """SpeechCollator Collator - - Args: - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. 
- n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. - target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. - if ``keep_transcription_text`` is False, text is token ids else is raw string. - - Do augmentations - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one batch. - """ - self._keep_transcription_text = keep_transcription_text - - self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=aug_file.read(), random_seed=random_seed) - - self._normalizer = FeatureNormalizer( - mean_std_filepath) if mean_std_filepath else None - - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - - self._speech_featurizer = SpeechFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - stride_ms=stride_ms, - window_ms=window_ms, - n_fft=n_fft, - max_freq=max_freq, - target_sample_rate=target_sample_rate, - use_dB_normalization=use_dB_normalization, - target_dB=target_dB, - dither=dither) - - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - - It will return a subfile object from tar file - and cached tar file info for next reading request. - """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - @property - def manifest(self): - return self._manifest - - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list - - @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict - - @property - def text_feature(self): - return self._speech_featurizer.text_feature - - @property - def feature_size(self): - return self._speech_featurizer.feature_size - - @property - def stride_ms(self): - return self._speech_featurizer.stride_ms - - def process_utterance(self, audio_file, translation): - """Load, augment, featurize and normalize for speech data. 
- - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param translation: translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), translation) - else: - speech_segment = SpeechSegment.from_file(audio_file, translation) - - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) - - specgram, translation_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, translation_part - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - text (List[int] or str): shape (U,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) - """ - audios = [] - audio_lens = [] - texts = [] - text_lens = [] - utts = [] - for utt, audio, text in batch: - audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [] - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = tokens if isinstance(tokens, np.ndarray) else np.array( - tokens, dtype=np.int64) - texts.append(tokens) - text_lens.append(tokens.shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_texts = pad_sequence( - texts, padding_value=IGNORE_ID).astype(np.int64) - text_lens = np.array(text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, padded_texts, text_lens - - -class TripletSpeechCollator(SpeechCollator): - def process_utterance(self, audio_file, translation, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param translation: translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. 
- :rtype: tuple of (2darray, list) - """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), translation) - else: - speech_segment = SpeechSegment.from_file(audio_file, translation) - - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) - - specgram, translation_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - transcript_part = self._speech_featurizer._text_featurizer.featurize( - transcript) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, translation_part, transcript_part - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - text (List[int] or str): shape (U,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) - """ - audios = [] - audio_lens = [] - translation_text = [] - translation_text_lens = [] - transcription_text = [] - transcription_text_lens = [] - - utts = [] - for utt, audio, translation, transcription in batch: - audio, translation, transcription = self.process_utterance( - audio, translation, transcription) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [[], []] - for idx, text in enumerate([translation, transcription]): - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = tokens[idx] if isinstance( - tokens[idx], np.ndarray) else np.array( - tokens[idx], dtype=np.int64) - translation_text.append(tokens[0]) - translation_text_lens.append(tokens[0].shape[0]) - transcription_text.append(tokens[1]) - transcription_text_lens.append(tokens[1].shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) - translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) - transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) - - -class KaldiPrePorocessedCollator(SpeechCollator): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - feat_dim=0, - stride_ms=10.0, - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. 
- """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'vocab_filepath' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - feat_dim=config.collator.feat_dim, - stride_ms=config.collator.stride_ms, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator - - def __init__(self, - aug_file, - vocab_filepath, - spm_model_prefix, - random_seed=0, - unit_type="char", - feat_dim=0, - stride_ms=10.0, - keep_transcription_text=True): - """SpeechCollator Collator - - Args: - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. - if ``keep_transcription_text`` is False, text is token ids else is raw string. - - Do augmentations - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one batch. - """ - self._keep_transcription_text = keep_transcription_text - self._feat_dim = feat_dim - self._stride_ms = stride_ms - - self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=aug_file.read(), random_seed=random_seed) - - self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, - spm_model_prefix) - - def process_utterance(self, audio_file, translation): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of kaldi processed feature. - :type audio_file: str | file - :param translation: Translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - specgram = kaldiio.load_mat(audio_file) - assert specgram.shape[ - 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( - self._feat_dim, specgram.shape[1]) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - - if self._keep_transcription_text: - return specgram, translation - else: - text_ids = self._text_featurizer.featurize(translation) - return specgram, text_ids - - -class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator): - def process_utterance(self, audio_file, translation, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of kali processed feature. - :type audio_file: str | file - :param translation: Translation text. - :type translation: str - :param transcript: Transcription text. 
- :type transcript: str - :return: Tuple of audio feature tensor and data of translation and transcription parts, - where translation and transcription parts could be token ids or text. - :rtype: tuple of (2darray, (list, list)) - """ - specgram = kaldiio.load_mat(audio_file) - assert specgram.shape[ - 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( - self._feat_dim, specgram.shape[1]) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - - if self._keep_transcription_text: - return specgram, translation, transcript - else: - translation_text_ids = self._text_featurizer.featurize(translation) - transcript_text_ids = self._text_featurizer.featurize(transcript) - return specgram, translation_text_ids, transcript_text_ids - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - translation (List[int] or str): shape (U,) - transcription (List[int] or str): shape (V,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - translation_text : (B, Umax) - translation_text_lens: (B) - transcription_text : (B, Vmax) - transcription_text_lens: (B) - """ - audios = [] - audio_lens = [] - translation_text = [] - translation_text_lens = [] - transcription_text = [] - transcription_text_lens = [] - - utts = [] - for utt, audio, translation, transcription in batch: - audio, translation, transcription = self.process_utterance( - audio, translation, transcription) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [[], []] - for idx, text in enumerate([translation, transcription]): - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = tokens[idx] if isinstance( - tokens[idx], np.ndarray) else np.array( - tokens[idx], dtype=np.int64) - translation_text.append(tokens[0]) - translation_text_lens.append(tokens[0].shape[0]) - transcription_text.append(tokens[1]) - transcription_text_lens.append(tokens[1].shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) - translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) - transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 95cdbb95..30ae98f0 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -321,6 +321,22 @@ class LoadInputsAndTargets(): raise NotImplementedError( "Not supported: loader_type={}".format(filetype)) + def file_type(self, filepath): + suffix = filepath.split(":")[0].split('.')[1] + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support 
filetype: {suffix}") + class SoundHDF5File(): """Collecting sound files to a HDF5 file From 35cbbc8a389b32408d75ef6637c232cfd36841a3 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 28 Sep 2021 10:29:49 +0000 Subject: [PATCH 06/11] add requirements for hub --- hub/requirements.txt | 26 +++++++++++++++++ hub/setup_hub.sh | 66 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 hub/requirements.txt create mode 100644 hub/setup_hub.sh diff --git a/hub/requirements.txt b/hub/requirements.txt new file mode 100644 index 00000000..c4c7d022 --- /dev/null +++ b/hub/requirements.txt @@ -0,0 +1,26 @@ +coverage +gpustat +jsonlines +kaldiio +llvmlite==0.31.0 +loguru +numba==0.47.0 +numpy==1.18.5 +Pillow +pre-commit +pybind11 +python-speech-features +resampy==0.2.2 +sacrebleu +scipy==1.2.1 +sentencepiece +snakeviz +SoundFile==0.9.0.post1 +sox +soxbindings +tensorboardX +textgrid +tqdm +typeguard +visualdl==2.2.0 +yacs diff --git a/hub/setup_hub.sh b/hub/setup_hub.sh new file mode 100644 index 00000000..f2d43ad1 --- /dev/null +++ b/hub/setup_hub.sh @@ -0,0 +1,66 @@ +#! /usr/bin/env bash +cd .. >> /dev/null +source utils/log.sh + + +SUDO='sudo' +if [ $(id -u) -eq 0 ]; then + SUDO='' +fi + +if [ -e /etc/lsb-release ];then + ${SUDO} apt-get update -y + ${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev + if [ $? != 0 ]; then + error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user." + exit -1 + fi +fi + + +source tools/venv/bin/activate + +cd - +#install python dependencies +if [ -f "requirements.txt" ]; then + pip3 install -r requirements.txt +fi +if [ $? != 0 ]; then + error_msg "Install python dependencies failed !!!" + exit 1 +fi +cd .. >> /dev/null + +# install package libsndfile +python3 -c "import soundfile" +if [ $? != 0 ]; then + info_msg "Install package libsndfile into default system path." + wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" + if [ $? != 0 ]; then + error_msg "Download libsndfile-1.0.28.tar.gz failed !!!" + exit 1 + fi + tar -zxvf libsndfile-1.0.28.tar.gz + cd libsndfile-1.0.28 + ./configure > /dev/null && make > /dev/null && make install > /dev/null + cd .. + rm -rf libsndfile-1.0.28 + rm libsndfile-1.0.28.tar.gz +fi + + +# install decoders +python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" +if [ $? != 0 ]; then + cd deepspeech/decoders/swig > /dev/null + sh setup.sh + cd - > /dev/null +fi +python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" +if [ $? != 0 ]; then + error_msg "Please check why decoder install error!" + exit -1 +fi + + +info_msg "Install all dependencies successfully." 
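
The hub setup above pins its Python dependencies in hub/requirements.txt and relies on shell checks (for libsndfile and the swig_decoders==1.1 package) to verify the environment. As a rough illustration of the same idea in Python — not part of the patch; the requirement strings below are only a sample copied from the pinned list — a small pre-flight check could look like this:

```python
# Hypothetical pre-flight check mirroring the pkg_resources checks in hub/setup_hub.sh.
# The requirement strings are a sample of hub/requirements.txt; adjust as needed.
import pkg_resources

PINNED = [
    "numpy==1.18.5",
    "resampy==0.2.2",
    "SoundFile==0.9.0.post1",
    "visualdl==2.2.0",
    "swig_decoders==1.1",  # installed separately by deepspeech/decoders/swig/setup.sh
]

def check_requirements(requirements):
    """Return the requirement strings that are missing or at the wrong version."""
    problems = []
    for req in requirements:
        try:
            pkg_resources.require(req)
        except Exception as exc:  # DistributionNotFound, VersionConflict, ...
            problems.append(f"{req}: {exc}")
    return problems

if __name__ == "__main__":
    missing = check_requirements(PINNED)
    if missing:
        print("Install/upgrade the following before running the hub demo:")
        print("\n".join(missing))
    else:
        print("All pinned dependencies satisfied.")
```
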
From b7b1bda34f920ef457486fd7a494464c31540a4a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:47:53 +0000 Subject: [PATCH 07/11] test refactor collator --- deepspeech/exps/u2_st/model.py | 6 +- .../frontend/featurizer/audio_featurizer.py | 32 +++--- .../frontend/featurizer/speech_featurizer.py | 36 +----- deepspeech/frontend/utility.py | 1 + deepspeech/io/collator.py | 108 ++++++++++-------- deepspeech/io/dataset.py | 21 +--- deepspeech/io/reader.py | 2 +- docs/src/data_preparation.md | 2 +- docs/src/deepspeech_architecture.md | 2 +- examples/1xt2x/aishell/conf/deepspeech2.yaml | 2 +- .../1xt2x/baidu_en8k/conf/deepspeech2.yaml | 2 +- .../1xt2x/librispeech/conf/deepspeech2.yaml | 2 +- examples/aishell/s0/conf/deepspeech2.yaml | 2 +- .../aishell/s0/conf/deepspeech2_online.yaml | 2 +- examples/aishell/s0/local/data.sh | 2 +- examples/aishell/s1/conf/chunk_conformer.yaml | 2 +- examples/aishell/s1/conf/conformer.yaml | 2 +- examples/aishell/s1/local/data.sh | 2 +- .../callcenter/s1/conf/chunk_conformer.yaml | 2 +- examples/callcenter/s1/conf/conformer.yaml | 2 +- examples/callcenter/s1/local/data.sh | 2 +- examples/librispeech/s0/conf/deepspeech2.yaml | 2 +- .../s0/conf/deepspeech2_online.yaml | 2 +- examples/librispeech/s0/local/data.sh | 2 +- .../librispeech/s1/conf/chunk_conformer.yaml | 2 +- .../s1/conf/chunk_transformer.yaml | 2 +- examples/librispeech/s1/conf/conformer.yaml | 2 +- examples/librispeech/s1/conf/transformer.yaml | 2 +- examples/librispeech/s1/local/data.sh | 2 +- .../librispeech/s2/conf/chunk_conformer.yaml | 2 +- .../s2/conf/chunk_transformer.yaml | 2 +- examples/librispeech/s2/conf/conformer.yaml | 2 +- examples/librispeech/s2/local/data.sh | 2 +- examples/ted_en_zh/t0/conf/transformer.yaml | 2 +- .../t0/conf/transformer_joint_noam.yaml | 2 +- examples/ted_en_zh/t0/local/data.sh | 2 +- examples/timit/s1/conf/transformer.yaml | 2 +- examples/timit/s1/local/data.sh | 2 +- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s0/conf/deepspeech2_online.yaml | 2 +- examples/tiny/s0/local/data.sh | 2 +- examples/tiny/s1/conf/chunk_confermer.yaml | 2 +- examples/tiny/s1/conf/chunk_transformer.yaml | 2 +- examples/tiny/s1/conf/conformer.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- examples/tiny/s1/local/data.sh | 2 +- utils/compute_mean_std.py | 4 +- 47 files changed, 125 insertions(+), 163 deletions(-) diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index f5a514c7..9a34cbdc 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -31,7 +31,6 @@ from yacs.config import CfgNode from deepspeech.io.collator import SpeechCollator from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2_st import U2STModel @@ -249,12 +248,11 @@ class U2STTrainer(Trainer): config.collator.keep_transcription_text = False # train/valid dataset, return token ids - Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. 
else ManifestDataset config.data.manifest = config.data.train_manifest - train_dataset = Dataset.from_config(config) + train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - dev_dataset = Dataset.from_config(config) + dev_dataset = ManifestDataset.from_config(config) if config.model.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index 4c40c847..6f3b646c 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -24,15 +24,15 @@ class AudioFeaturizer(): Currently, it supports feature types of linear spectrogram and mfcc. - :param specgram_type: Specgram feature type. Options: 'linear'. - :type specgram_type: str + :param spectrum_type: Specgram feature type. Options: 'linear'. + :type spectrum_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins + :param max_freq: When spectrum_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_feq is the + returned; when spectrum_type is 'mfcc', max_feq is the highest band edge of mel filters. :types max_freq: None|float :param target_sample_rate: Audio are resampled (if upsampling or @@ -47,7 +47,7 @@ class AudioFeaturizer(): """ def __init__(self, - specgram_type: str='linear', + spectrum_type: str='linear', feat_dim: int=None, delta_delta: bool=False, stride_ms=10.0, @@ -58,7 +58,7 @@ class AudioFeaturizer(): use_dB_normalization=True, target_dB=-20, dither=1.0): - self._specgram_type = specgram_type + self._spectrum_type = spectrum_type # mfcc and fbank using `feat_dim` self._feat_dim = feat_dim # mfcc and fbank using `delta-delta` @@ -113,27 +113,27 @@ class AudioFeaturizer(): def feature_size(self): """audio feature size""" feat_dim = 0 - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': fft_point = self._window_ms if self._fft_point is None else self._fft_point feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': # mfcc, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': # fbank, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." 
% self._spectrum_type) return feat_dim def _compute_specgram(self, audio_segment): """Extract various audio features.""" sample_rate = audio_segment.sample_rate - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': samples = audio_segment.samples return self._compute_linear_specgram( samples, @@ -141,7 +141,7 @@ class AudioFeaturizer(): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': samples = audio_segment.to('int16') return self._compute_mfcc( samples, @@ -152,7 +152,7 @@ class AudioFeaturizer(): max_freq=self._max_freq, dither=self._dither, delta_delta=self._delta_delta) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': samples = audio_segment.to('int16') return self._compute_fbank( samples, @@ -164,8 +164,8 @@ class AudioFeaturizer(): dither=self._dither, delta_delta=self._delta_delta) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index f9f7d7c2..7471d164 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(): - """Speech featurizer, for extracting features from both audio and transcript - contents of SpeechSegment. - - Currently, for audio parts, it supports feature types of linear - spectrogram and mfcc; for transcript parts, it only supports char-level - tokenizing and conversion into a list of token indices. Note that the - token indexing order follows the given vocabulary file. - - :param vocab_filepath: Filepath to load vocabulary for token indices - conversion. - :type specgram_type: str - :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. - :type specgram_type: str - :param stride_ms: Striding size (in milliseconds) for generating frames. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for generating frames. - :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins - corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_freq is the - highest band edge of mel filters. - :types max_freq: None|float - :param target_sample_rate: Speech are resampled (if upsampling or - downsampling is allowed) to this before - extracting spectrogram features. - :type target_sample_rate: float - :param use_dB_normalization: Whether to normalize the audio to a certain - decibels before extracting the features. - :type use_dB_normalization: bool - :param target_dB: Target audio decibels for normalization. - :type target_dB: float + """Speech and Text feature extraction. 
""" def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None, - specgram_type='linear', + spectrum_type='linear', feat_dim=None, delta_delta=False, stride_ms=10.0, @@ -70,7 +40,7 @@ class SpeechFeaturizer(): self.window_ms = window_ms self.audio_feature = AudioFeaturizer( - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 2a581232..f5fc3097 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -15,6 +15,7 @@ import json import math import tarfile +from collections import namedtuple from typing import List from typing import Optional from typing import Text diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index c5c0a414..553ffcb5 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() +def tokenids(text, keep_transcription_text): + # for training text is token ids + tokens = text # token ids + + if keep_transcription_text: + # text is string, convert to unicode ord + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + + tokens = np.array(tokens, dtype=np.int64) + return tokens + + class SpeechCollatorBase(): def __init__( self, @@ -150,7 +163,6 @@ class SpeechCollatorBase(): # extract speech feature spectrum, transcript_part = self._speech_featurizer.featurize( speech_segment, self.keep_transcription_text) - # CMVN spectrum if self._normalizer: spectrum = self._normalizer.apply(spectrum) @@ -163,38 +175,35 @@ class SpeechCollatorBase(): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. + utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : (B, Umax) + olens: (B,) """ audios = [] audio_lens = [] texts = [] text_lens = [] utts = [] - for utt, audio, text in batch: + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + text = item['text'] audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids, else text is string, convert to unicode ord - tokens = [] - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = np.array(tokens, dtype=np.int64) + + tokens = tokenids(text, self.keep_transcription_text) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : [(B, Umax), (B, Umax)] + olens: [(B,), (B,)] """ + utts = [] audios = [] audio_lens = [] translation_text = [] @@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator): transcription_text = [] transcription_text_lens = [] - utts = [] - for utt, audio, translation, transcription in batch: + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + translation = item['text'] + transcription = item['text1'] audio, translation, transcription = self.process_utterance( audio, translation, transcription) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord + tokens = [[], []] for idx, text in enumerate([translation, transcription]): - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = np.array(tokens[idx], dtype=np.int64) + tokens[idx] = tokenids(text, self.keep_transcription_text) translation_text.append(tokens[0]) translation_text_lens.append(tokens[0].shape[0]) transcription_text.append(tokens[1]) transcription_text_lens.append(tokens[1].shape[0]) - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) + xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D] + ilens = np.array(audio_lens).astype(np.int64) + + padded_translation = pad_list(translation_text, + IGNORE_ID).astype(np.int64) translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) + + padded_transcription = pad_list(transcription_text, + IGNORE_ID).astype(np.int64) transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) + + ys_pad = (padded_translation, padded_transcription) + olens = (translation_lens, transcription_lens) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 56e53475..1945c5f7 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"] +__all__ = ["ManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() @@ -107,21 +107,7 @@ class ManifestDataset(Dataset): return len(self._manifest) def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"] - - -class TripletManifestDataset(ManifestDataset): - """ - For Joint Training of Speech Translation and ASR. - text: translation, - text1: transcript. 
- """ - - def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"], instance[ - "text1"] + return self._manifest[idx] class TransformDataset(Dataset): @@ -273,5 +259,4 @@ class AudioDataset(Dataset): return len(self.minibatch) def __getitem__(self, idx): - instance = self.minibatch[idx] - return instance["utt"], instance["feat"], instance["text"] + return self.minibatch[idx] diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 30ae98f0..e7c43a78 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -322,7 +322,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[1] + suffix = filepath.split(":")[0].split('.')[-1] if suffix == 'ark': return 'mat' elif suffix == 'scp': diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md index a3d1b3eb..34d2a835 100644 --- a/docs/src/data_preparation.md +++ b/docs/src/data_preparation.md @@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w ```bash python3 utils/compute_mean_std.py \ --num_samples 2000 \ ---specgram_type linear \ +--spectrum_type linear \ --manifest_path examples/librispeech/data/manifest.train \ --output_path examples/librispeech/data/mean_std.npz ``` diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md index b9344122..5a6ca886 100644 --- a/docs/src/deepspeech_architecture.md +++ b/docs/src/deepspeech_architecture.md @@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute cd examples/aishell/s0 python3 ../../../utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml index 6e745e9d..c2d69226 100644 --- a/examples/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml index fbc7466f..be51a9b9 100644 --- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml index edef0797..ad7fb2c1 100644 --- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 9560930a..ffefaeb3 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: 
conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index 7e87594c..cac599dc 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear #linear, mfcc, fbank + spectrum_type: linear #linear, mfcc, fbank feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index b106f3f2..1312a12f 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 6f8ae135..9b563da2 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index a4248459..dfa9a4b0 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8d5ac4d5..c05c3ea2 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml index f79b8eaa..a853658a 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml index 3b08cc7a..bd4f4578 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank 
+ spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index e2640ead..b2a495b4 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index 3f1a376f..47ef9421 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml index 180a6205..e2f91094 100644 --- a/examples/librispeech/s0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index b7180986..e3f7b325 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=2000 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 92db20f6..872b560b 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index e0bc3135..132a4f9d 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 78be249c..769ed5f5 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank 
feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index e4a06767..c9dc1413 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 4ad476d3..2b6af229 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml index 92db20f6..872b560b 100644 --- a/examples/librispeech/s2/conf/chunk_conformer.yaml +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml index e0bc3135..132a4f9d 100644 --- a/examples/librispeech/s2/conf/chunk_transformer.yaml +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml index 9a727413..bc87466e 100644 --- a/examples/librispeech/s2/conf/conformer.yaml +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh index 4ad476d3..2b6af229 100755 --- a/examples/librispeech/s2/local/data.sh +++ b/examples/librispeech/s2/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml index 1aad86d2..8c03e328 100644 --- a/examples/ted_en_zh/t0/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + 
spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index 0144c40d..cbfae93e 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 32cfd9d7..43911c34 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index c3b51996..1ae9acd0 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -17,7 +17,7 @@ collator: augmentation_config: "" batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 1d16f454..f4be9048 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 40899655..a7940cb2 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml index 0098a226..7e30409f 100644 --- a/examples/tiny/s0/conf/deepspeech2_online.yaml +++ b/examples/tiny/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 02fdb706..fabf2e40 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index be2e82f9..f3c7e1dd 
100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 93439a85..83005754 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 9bb67c44..628e3b77 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index fcbe1da4..27ffcae4 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 2aea250b..b5dbd581 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index a468153d..0f63715a 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('num_samples', int, 2000, "# of samples to for statistics.") -add_arg('specgram_type', str, +add_arg('spectrum_type', str, 'linear', "Audio feature type. 
Options: linear, mfcc, fbank.", choices=['linear', 'mfcc', 'fbank']) @@ -58,7 +58,7 @@ def main(): augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer( - specgram_type=args.specgram_type, + spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, stride_ms=args.stride_ms, From 856d641c9ce748766ae53c1939fc995dea6aec9a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:48:21 +0000 Subject: [PATCH 08/11] multi worker for dataloader --- deepspeech/exps/deepspeech2/model.py | 8 +++++--- deepspeech/exps/u2/model.py | 9 ++++++--- deepspeech/exps/u2_st/model.py | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index b854a996..e84de615 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -235,16 +235,18 @@ class DeepSpeech2Trainer(Trainer): num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size / 4), + batch_size=int(config.collator.batch_size), shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers) self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_test) + collate_fn=collate_fn_test, + num_workers=config.collator.num_workers) logger.info("Setup train/valid/test Dataloader!") diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 1afd9b10..c30f324b 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -292,7 +292,8 @@ class U2Trainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest @@ -314,7 +315,8 @@ class U2Trainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator.from_config(config)) + collate_fn=SpeechCollator.from_config(config), + num_workers=config.collator.num_workers, ) # return text token id config.collator.keep_transcription_text = False self.align_loader = DataLoader( @@ -322,7 +324,8 @@ class U2Trainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator.from_config(config)) + collate_fn=SpeechCollator.from_config(config), + num_workers=config.collator.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 9a34cbdc..c480499c 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -292,7 +292,8 @@ class U2STTrainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest @@ -313,7 +314,8 @@ class U2STTrainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=TestCollator.from_config(config)) + collate_fn=TestCollator.from_config(config), + num_workers=config.collator.num_workers, ) # return text token id config.collator.keep_transcription_text = False 
self.align_loader = DataLoader( @@ -321,7 +323,8 @@ class U2STTrainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=TestCollator.from_config(config)) + collate_fn=TestCollator.from_config(config), + num_workers=config.collator.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): From 3e37cef8e18e7ae7ec65223d147e50d0d091b1d2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:53:00 +0000 Subject: [PATCH 09/11] fix test.sh opts --- examples/aishell/s1/local/test.sh | 6 ++++-- examples/callcenter/s1/local/test.sh | 6 ++++-- examples/ted_en_zh/t0/local/test.sh | 3 ++- examples/timit/s1/local/test.sh | 6 ++++-- examples/tiny/s1/local/test.sh | 6 ++++-- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/s1/local/test.sh index c87412c9..47bd2f63 100755 --- a/examples/aishell/s1/local/test.sh +++ b/examples/aishell/s1/local/test.sh @@ -38,7 +38,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -56,7 +57,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/s1/local/test.sh index dca3137d..0aa99e19 100755 --- a/examples/callcenter/s1/local/test.sh +++ b/examples/callcenter/s1/local/test.sh @@ -32,7 +32,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,7 +51,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/t0/local/test.sh index 34475085..7235c6f9 100755 --- a/examples/ted_en_zh/t0/local/test.sh +++ b/examples/ted_en_zh/t0/local/test.sh @@ -19,7 +19,8 @@ for type in fullsentence; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
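
The test.sh edits in this commit split a single `--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}` argument into two separate `--opts` flags, one key/value pair each. The Python side of the CLI is not shown in the patch, so the snippet below is only a hedged sketch of how repeated `--opts KEY VALUE` pairs can be collected with argparse and merged into a yacs CfgNode; the repository's actual parser may differ.

```python
# Illustrative only: a parser accepting repeated "--opts KEY VALUE" pairs,
# as the updated test.sh scripts now pass them. Not the repository's real CLI.
import argparse
from yacs.config import CfgNode

parser = argparse.ArgumentParser()
parser.add_argument("--opts", nargs=2, action="append", default=[],
                    metavar=("KEY", "VALUE"),
                    help="override a config entry; may be given multiple times")
args = parser.parse_args(
    ["--opts", "decoding.decoding_method", "attention",
     "--opts", "decoding.batch_size", "64"])

config = CfgNode({"decoding": {"decoding_method": "ctc_greedy_search",
                               "batch_size": 128}})
# merge_from_list expects a flat [key, value, key, value, ...] list.
flat = [token for pair in args.opts for token in pair]
config.merge_from_list(flat)
print(config.decoding.decoding_method, config.decoding.batch_size)
```
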
diff --git a/examples/timit/s1/local/test.sh b/examples/timit/s1/local/test.sh index 868c8fda..05813179 100755 --- a/examples/timit/s1/local/test.sh +++ b/examples/timit/s1/local/test.sh @@ -36,7 +36,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -52,7 +53,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/s1/local/test.sh index 19872bb3..34088ce9 100755 --- a/examples/tiny/s1/local/test.sh +++ b/examples/tiny/s1/local/test.sh @@ -35,7 +35,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -51,7 +52,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
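
Patches 07 and 08 above also move the valid/test/align loaders onto the same multi-worker path as training by passing `num_workers=config.collator.num_workers`, and the valid loader now uses the full `batch_size` instead of a quarter of it. A minimal, self-contained sketch of that pattern with `paddle.io` — using a toy dataset and collate function as stand-ins for the project's `ManifestDataset`/`SpeechCollator` — would be:

```python
# Toy illustration of a validation DataLoader with worker processes, echoing the
# num_workers change in deepspeech/exps/*/model.py. Dataset and collate_fn are stand-ins.
import numpy as np
from paddle.io import DataLoader, Dataset

class ToyManifest(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        # mimics a manifest entry: variable-length feature plus token ids
        return {"feat": np.random.rand(10 + idx, 4).astype("float32"),
                "text": np.array([1, 2, 3], dtype="int64")}

def collate(batch):
    # pad features to the longest utterance in the batch
    lens = np.array([item["feat"].shape[0] for item in batch], dtype="int64")
    feats = np.zeros((len(batch), int(lens.max()), 4), dtype="float32")
    for i, item in enumerate(batch):
        feats[i, :item["feat"].shape[0]] = item["feat"]
    texts = np.stack([item["text"] for item in batch])
    return feats, lens, texts

if __name__ == "__main__":
    valid_loader = DataLoader(
        ToyManifest(),
        batch_size=4,       # full batch size, no longer batch_size / 4
        shuffle=False,
        drop_last=False,
        collate_fn=collate,
        num_workers=2)      # worker processes, as in the patched loaders
    for feats, lens, texts in valid_loader:
        print(feats.shape, lens)
```
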
From 4745e15ece1aa8a5917ee85fb655cdb4899c8c0b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:55:46 +0000 Subject: [PATCH 10/11] tiny run w cpu --- examples/tiny/s1/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index d288e31a..6580afed 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -30,12 +30,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then From e7b0f5ed4aa572ba7245e6f6fc66972702bfd502 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 12:17:04 +0000 Subject: [PATCH 11/11] reader default type is mat, sound need explicitlyc specify --- utils/format_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/format_data.py b/utils/format_data.py index e7dcfd23..682dbfdb 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -26,7 +26,7 @@ from deepspeech.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi") +add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -76,6 +76,7 @@ def main(): assert isinstance(feat_shape, (list, tuple)), type(feat_shape) if args.feat_type == 'raw': feat_shape.append(feat_dim) + line_json['filetype'] = 'sound' else: # kaldi raise NotImplementedError('no support kaldi feat now!') fout.write(json.dumps(line_json) + '\n')
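
The final commit makes `format_data.py` tag raw wav/flac entries with `"filetype": "sound"`, so the reader's `file_type()` helper (added earlier in this series, and fixed in PATCH 07 to split on the last `.`) can keep `mat` as the implicit default for Kaldi ark features while sound files are named explicitly. The `[-1]` fix matters for paths such as `feats.1.ark:2077`, where the old `[1]` index would have returned `1` instead of `ark`. Below is a standalone, hedged re-implementation of that suffix mapping, handy for sanity-checking manifest paths outside the repository:

```python
# Standalone mirror of the file_type() helper added in this patch series,
# for checking which loader a manifest "feat" path would select. Illustrative only.
def file_type(filepath: str) -> str:
    # strip a trailing ":offset" (Kaldi-style ark specifier), then take the last suffix
    suffix = filepath.split(":")[0].split(".")[-1]
    if suffix == "ark":
        return "mat"
    elif suffix == "scp":
        return "scp"
    elif suffix in ("npy", "npz"):
        return suffix
    elif suffix in ("wav", "flac"):  # PCM16 sound files
        return "sound"
    raise ValueError(f"Not supported filetype: {suffix}")

if __name__ == "__main__":
    for path in ["data/feats.1.ark:2077", "data/feats.scp",
                 "wavs/BAC009S0002W0122.wav", "wavs/speech.flac"]:
        print(path, "->", file_type(path))
```
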