From 34c29c5da58cc27297452133a3d48ef39df35db9 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 9 Oct 2021 10:11:51 +0000 Subject: [PATCH] more log; refactor ctc decoders; rm useless code --- .../decoders/swig/ctc_beam_search_decoder.cpp | 1 - deepspeech/decoders/swig/decoder_utils.h | 2 + deepspeech/decoders/swig/scorer.cpp | 1 - deepspeech/exps/deepspeech2/bin/export.py | 2 +- deepspeech/exps/deepspeech2/bin/test_hub.py | 2 +- deepspeech/frontend/augmentor/augmentation.py | 11 ++- deepspeech/frontend/augmentor/spec_augment.py | 8 +- .../frontend/featurizer/speech_featurizer.py | 2 + .../frontend/featurizer/text_featurizer.py | 23 ++++- deepspeech/frontend/utility.py | 6 +- deepspeech/models/ds2/deepspeech2.py | 58 +++++------ deepspeech/models/ds2_online/deepspeech2.py | 96 ++++++++----------- deepspeech/utils/log.py | 1 + 13 files changed, 110 insertions(+), 103 deletions(-) diff --git a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp index 1a37dd1ce..8469a194d 100644 --- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp +++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp @@ -28,7 +28,6 @@ #include "path_trie.h" using FSTMATCH = fst::SortedMatcher; -const std::string kSPACE = ""; std::vector> ctc_beam_search_decoder( const std::vector> &probs_seq, diff --git a/deepspeech/decoders/swig/decoder_utils.h b/deepspeech/decoders/swig/decoder_utils.h index a874e439f..96399c778 100644 --- a/deepspeech/decoders/swig/decoder_utils.h +++ b/deepspeech/decoders/swig/decoder_utils.h @@ -15,10 +15,12 @@ #ifndef DECODER_UTILS_H_ #define DECODER_UTILS_H_ +#include #include #include "fst/log.h" #include "path_trie.h" +const std::string kSPACE = ""; const float NUM_FLT_INF = std::numeric_limits::max(); const float NUM_FLT_MIN = std::numeric_limits::min(); diff --git a/deepspeech/decoders/swig/scorer.cpp b/deepspeech/decoders/swig/scorer.cpp index ebb9e448d..7bd6542df 100644 --- 
a/deepspeech/decoders/swig/scorer.cpp +++ b/deepspeech/decoders/swig/scorer.cpp @@ -26,7 +26,6 @@ #include "decoder_utils.h" using namespace lm::ngram; -const std::string kSPACE = ""; Scorer::Scorer(double alpha, double beta, diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py index d92ed4def..ab5251d55 100644 --- a/deepspeech/exps/deepspeech2/bin/export.py +++ b/deepspeech/exps/deepspeech2/bin/export.py @@ -34,7 +34,7 @@ if __name__ == "__main__": parser.add_argument( "--export_path", type=str, help="path of the jit model to save") parser.add_argument( - "--model_type", type=str, default='offline', help='offline/online') + "--model_type", type=str, default='offline', help="offline/online") args = parser.parse_args() print("model_type:{}".format(args.model_type)) print_arguments(args) diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py index 181e4ac31..1cf24bb03 100644 --- a/deepspeech/exps/deepspeech2/bin/test_hub.py +++ b/deepspeech/exps/deepspeech2/bin/test_hub.py @@ -179,7 +179,7 @@ if __name__ == "__main__": parser = default_argument_parser() parser.add_argument( "--model_type", type=str, default='offline', help='offline/online') - parser.add_argument("--audio_file", type=str, help='audio file path.') + parser.add_argument("--audio_file", type=str, help='audio file path') # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index 17abcf605..0de81333e 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -15,6 +15,7 @@ import json from collections.abc import Sequence from inspect import signature +from pprint import pformat import numpy as np @@ -22,10 +23,10 @@ from deepspeech.frontend.augmentor.base import AugmentorBase from 
deepspeech.utils.dynamic_import import dynamic_import from deepspeech.utils.log import Log -__all__ = ["AugmentationPipeline"] - logger = Log(__name__).getlog() +__all__ = ["AugmentationPipeline"] + import_alias = dict( volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor", shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor", @@ -111,6 +112,8 @@ class AugmentationPipeline(): 'audio') self._spec_augmentors, self._spec_rates = self._parse_pipeline_from( 'feature') + logger.info( + f"Augmentation: {pformat(list(zip(self._augmentors, self._rates)))}") def __call__(self, xs, uttid_list=None, **kwargs): if not isinstance(xs, Sequence): @@ -197,8 +200,10 @@ class AugmentationPipeline(): aug_confs = audio_confs elif aug_type == 'feature': aug_confs = feature_confs - else: + elif aug_type == 'all': aug_confs = all_confs + else: + raise ValueError(f"Not support: {aug_type}") augmentors = [ self._get_augmentor(config["type"], config["params"]) diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 26c94d416..e78f6f6ad 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -29,10 +29,10 @@ class SpecAugmentor(AugmentorBase): SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition https://arxiv.org/abs/1904.08779 - + SpecAugment on Large Scale Datasets https://arxiv.org/abs/1912.05533 - + """ def __init__(self, @@ -61,7 +61,7 @@ class SpecAugmentor(AugmentorBase): adaptive_size_ratio (float): adaptive size ratio for time masking max_n_time_masks (int): maximum number of time masking replace_with_zero (bool): pad zero on mask if true else use mean - warp_mode (str): "PIL" (default, fast, not differentiable) + warp_mode (str): "PIL" (default, fast, not differentiable) or "sparse_image_warp" (slow, differentiable) """ super().__init__() @@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase): 
return self._time_mask def __repr__(self): - return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}" + return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}" def time_warp(self, x, mode='PIL'): """time warp for spec augment diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 7471d164a..256871408 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -51,12 +51,14 @@ class SpeechFeaturizer(): use_dB_normalization=use_dB_normalization, target_dB=target_dB, dither=dither) + self.feature_size = self.audio_feature.feature_size self.text_feature = TextFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, spm_model_prefix=spm_model_prefix, maskctc=maskctc) + self.vocab_size = self.text_feature.vocab_size def featurize(self, speech_segment, keep_transcription_text): """Extract features for speech segment. diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index 10ea69244..ac129b0f7 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -12,12 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains the text featurizer class.""" +from pprint import pformat + import sentencepiece as spm +from ..utility import BLANK from ..utility import EOS from ..utility import load_dict +from ..utility import MASKCTC +from ..utility import SOS from ..utility import SPACE from ..utility import UNK +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() __all__ = ["TextFeaturizer"] @@ -76,7 +84,7 @@ class TextFeaturizer(): """Convert text string to a list of token indices. Args: - text (str): Text. + text (str): Text to process. Returns: List[int]: List of token indices. 
@@ -199,13 +207,24 @@ class TextFeaturizer(): """Load vocabulary from file.""" vocab_list = load_dict(vocab_filepath, maskctc) assert vocab_list is not None + logger.info(f"Vocab: {pformat(vocab_list)}") id2token = dict( [(idx, token) for (idx, token) in enumerate(vocab_list)]) token2id = dict( [(token, idx) for (idx, token) in enumerate(vocab_list)]) + blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1 + maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1 unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1 eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1 - + sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1 + space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1 + + logger.info(f"BLANK id: {blank_id}") + logger.info(f"UNK id: {unk_id}") + logger.info(f"EOS id: {eos_id}") + logger.info(f"SOS id: {sos_id}") + logger.info(f"SPACE id: {space_id}") + logger.info(f"MASKCTC id: {maskctc_id}") return token2id, id2token, vocab_list, unk_id, eos_id diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index f5fc3097e..f83f1d4e1 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -49,7 +49,11 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: with open(dict_path, "r") as f: dictionary = f.readlines() - char_list = [entry.strip().split(" ")[0] for entry in dictionary] + # first token is `<blank>` + # multi line: `<blank> 0\n` + # one line: `<blank>` + # space is replaced with `<space>` + char_list = [entry[:-1].split(" ")[0] for entry in dictionary] if BLANK not in char_list: char_list.insert(0, BLANK) if EOS not in char_list: diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py index dda26358b..a2aa31f7f 100644 --- a/deepspeech/models/ds2/deepspeech2.py +++ b/deepspeech/models/ds2/deepspeech2.py @@ -218,14 +218,18 @@ class DeepSpeech2Model(nn.Layer): DeepSpeech2Model The model built from pretrained result. 
""" - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id) + model = cls( + #feat_size=dataloader.collate_fn.feature_size, + feat_size=dataloader.dataset.feature_size, + #dict_size=dataloader.collate_fn.vocab_size, + dict_size=dataloader.dataset.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights, + blank_id=config.model.blank_id, + ctc_grad_norm_type=config.model.ctc_grad_norm_type, ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -244,36 +248,22 @@ class DeepSpeech2Model(nn.Layer): DeepSpeech2Model The model built from config. 
""" - model = cls(feat_size=config.feat_size, - dict_size=config.dict_size, - num_conv_layers=config.num_conv_layers, - num_rnn_layers=config.num_rnn_layers, - rnn_size=config.rnn_layer_size, - use_gru=config.use_gru, - share_rnn_weights=config.share_rnn_weights, - blank_id=config.blank_id) + model = cls( + feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, + ctc_grad_norm_type=config.ctc_grad_norm_type, ) return model class DeepSpeech2InferModel(DeepSpeech2Model): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True, - blank_id=0): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - blank_id=blank_id) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def forward(self, audio, audio_len): """export model function diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 29d207c44..52e0c7b17 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -255,22 +255,24 @@ class DeepSpeech2ModelOnline(nn.Layer): fc_layers_size_list=[512, 256], use_gru=True, #Use gru if set True. Use simple rnn if set False. 
blank_id=0, # index of blank in vocob.txt - )) + ctc_grad_norm_type='instance', )) if config is not None: config.merge_from_other_cfg(default) return default - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=4, - rnn_size=1024, - rnn_direction='forward', - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=False, - blank_id=0): + def __init__( + self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=4, + rnn_size=1024, + rnn_direction='forward', + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False, + blank_id=0, + ctc_grad_norm_type='instance', ): super().__init__() self.encoder = CRNNEncoder( feat_size=feat_size, @@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer): dropout_rate=0.0, reduction=True, # sum batch_average=True, # sum / batch_size - grad_norm_type='instance') + grad_norm_type=ctc_grad_norm_type) def forward(self, audio, audio_len, text, text_len): """Compute Model loss @@ -348,16 +350,18 @@ class DeepSpeech2ModelOnline(nn.Layer): DeepSpeech2ModelOnline The model built from pretrained result. 
""" - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id) + model = cls( + feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + rnn_direction=config.model.rnn_direction, + num_fc_layers=config.model.num_fc_layers, + fc_layers_size_list=config.model.fc_layers_size_list, + use_gru=config.model.use_gru, + blank_id=config.model.blank_id, + ctc_grad_norm_type=config.model.ctc_grad_norm_type, ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -376,42 +380,24 @@ class DeepSpeech2ModelOnline(nn.Layer): DeepSpeech2ModelOnline The model built from config. 
""" - model = cls(feat_size=config.feat_size, - dict_size=config.dict_size, - num_conv_layers=config.num_conv_layers, - num_rnn_layers=config.num_rnn_layers, - rnn_size=config.rnn_layer_size, - rnn_direction=config.rnn_direction, - num_fc_layers=config.num_fc_layers, - fc_layers_size_list=config.fc_layers_size_list, - use_gru=config.use_gru, - blank_id=config.blank_id) + model = cls( + feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, + ctc_grad_norm_type=config.ctc_grad_norm_type, ) return model class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=4, - rnn_size=1024, - rnn_direction='forward', - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=False, - blank_id=0): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - rnn_direction=rnn_direction, - num_fc_layers=num_fc_layers, - fc_layers_size_list=fc_layers_size_list, - use_gru=use_gru, - blank_id=blank_id) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box): diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py index 0b5088544..1790efdb1 100644 --- a/deepspeech/utils/log.py +++ b/deepspeech/utils/log.py @@ -127,6 +127,7 @@ class Autolog: else: gpu_id = None infer_config = inference.Config() + self.autolog = auto_log.AutoLogger( model_name=model_name, model_precision=model_precision,