more log; refactor ctc decoders; rm useless code

pull/879/head
Hui Zhang 3 years ago
parent 2691013232
commit eb4b389262

@ -28,7 +28,6 @@
#include "path_trie.h"
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
const std::string kSPACE = "<space>";
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,

@ -15,10 +15,12 @@
#ifndef DECODER_UTILS_H_
#define DECODER_UTILS_H_
#include <string>
#include <utility>
#include "fst/log.h"
#include "path_trie.h"
const std::string kSPACE = "<space>";
const float NUM_FLT_INF = std::numeric_limits<float>::max();
const float NUM_FLT_MIN = std::numeric_limits<float>::min();

@ -26,7 +26,6 @@
#include "decoder_utils.h"
using namespace lm::ngram;
const std::string kSPACE = "<space>";
Scorer::Scorer(double alpha,
double beta,

@ -34,7 +34,7 @@ if __name__ == "__main__":
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
parser.add_argument(
"--model_type", type=str, default='offline', help='offline/online')
"--model_type", type=str, default='offline', help="offline/online")
args = parser.parse_args()
print("model_type:{}".format(args.model_type))
print_arguments(args)

@ -179,7 +179,7 @@ if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument(
"--model_type", type=str, default='offline', help='offline/online')
parser.add_argument("--audio_file", type=str, help='audio file path.')
parser.add_argument("--audio_file", type=str, help='audio file path')
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")

@ -15,6 +15,7 @@
import json
from collections.abc import Sequence
from inspect import signature
from pprint import pformat
import numpy as np
@ -22,10 +23,10 @@ from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.log import Log
__all__ = ["AugmentationPipeline"]
logger = Log(__name__).getlog()
__all__ = ["AugmentationPipeline"]
import_alias = dict(
volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor",
shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
@ -111,6 +112,8 @@ class AugmentationPipeline():
'audio')
self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
'feature')
logger.info(
f"Augmentation: {pformat(list(zip(self._augmentors, self._rates)))}")
def __call__(self, xs, uttid_list=None, **kwargs):
if not isinstance(xs, Sequence):
@ -197,8 +200,10 @@ class AugmentationPipeline():
aug_confs = audio_confs
elif aug_type == 'feature':
aug_confs = feature_confs
else:
elif aug_type == 'all':
aug_confs = all_confs
else:
raise ValueError(f"Not support: {aug_type}")
augmentors = [
self._get_augmentor(config["type"], config["params"])

@ -29,10 +29,10 @@ class SpecAugmentor(AugmentorBase):
SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
https://arxiv.org/abs/1904.08779
SpecAugment on Large Scale Datasets
https://arxiv.org/abs/1912.05533
"""
def __init__(self,
@ -61,7 +61,7 @@ class SpecAugmentor(AugmentorBase):
adaptive_size_ratio (float): adaptive size ratio for time masking
max_n_time_masks (int): maximum number of time masking
replace_with_zero (bool): pad zero on mask if true else use mean
warp_mode (str): "PIL" (default, fast, not differentiable)
warp_mode (str): "PIL" (default, fast, not differentiable)
or "sparse_image_warp" (slow, differentiable)
"""
super().__init__()
@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
return self._time_mask
def __repr__(self):
return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}"
return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
def time_warp(self, x, mode='PIL'):
"""time warp for spec augment

@ -51,12 +51,14 @@ class SpeechFeaturizer():
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=dither)
self.feature_size = self.audio_feature.feature_size
self.text_feature = TextFeaturizer(
unit_type=unit_type,
vocab_filepath=vocab_filepath,
spm_model_prefix=spm_model_prefix,
maskctc=maskctc)
self.vocab_size = self.text_feature.vocab_size
def featurize(self, speech_segment, keep_transcription_text):
"""Extract features for speech segment.

@ -12,12 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the text featurizer class."""
from pprint import pformat
import sentencepiece as spm
from ..utility import BLANK
from ..utility import EOS
from ..utility import load_dict
from ..utility import MASKCTC
from ..utility import SOS
from ..utility import SPACE
from ..utility import UNK
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["TextFeaturizer"]
@ -76,7 +84,7 @@ class TextFeaturizer():
"""Convert text string to a list of token indices.
Args:
text (str): Text.
text (str): Text to process.
Returns:
List[int]: List of token indices.
@ -199,13 +207,24 @@ class TextFeaturizer():
"""Load vocabulary from file."""
vocab_list = load_dict(vocab_filepath, maskctc)
assert vocab_list is not None
logger.info(f"Vocab: {pformat(vocab_list)}")
id2token = dict(
[(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)])
blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
logger.info(f"BLANK id: {blank_id}")
logger.info(f"UNK id: {unk_id}")
logger.info(f"EOS id: {eos_id}")
logger.info(f"SOS id: {sos_id}")
logger.info(f"SPACE id: {space_id}")
logger.info(f"MASKCTC id: {maskctc_id}")
return token2id, id2token, vocab_list, unk_id, eos_id

@ -49,7 +49,11 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
with open(dict_path, "r") as f:
dictionary = f.readlines()
char_list = [entry.strip().split(" ")[0] for entry in dictionary]
# first token is `<blank>`
# multi line: `<blank> 0\n`
# one line: `<blank>`
# space is replaced with <space>
char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
if BLANK not in char_list:
char_list.insert(0, BLANK)
if EOS not in char_list:

@ -218,14 +218,18 @@ class DeepSpeech2Model(nn.Layer):
DeepSpeech2Model
The model built from pretrained result.
"""
model = cls(feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights,
blank_id=config.model.blank_id)
model = cls(
#feat_size=dataloader.collate_fn.feature_size,
feat_size=dataloader.dataset.feature_size,
#dict_size=dataloader.collate_fn.vocab_size,
dict_size=dataloader.dataset.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights,
blank_id=config.model.blank_id,
ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -244,36 +248,22 @@ class DeepSpeech2Model(nn.Layer):
DeepSpeech2Model
The model built from config.
"""
model = cls(feat_size=config.feat_size,
dict_size=config.dict_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id)
model = cls(
feat_size=config.feat_size,
dict_size=config.dict_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id,
ctc_grad_norm_type=config.ctc_grad_norm_type, )
return model
class DeepSpeech2InferModel(DeepSpeech2Model):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights,
blank_id=blank_id)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, audio, audio_len):
"""export model function

@ -255,22 +255,24 @@ class DeepSpeech2ModelOnline(nn.Layer):
fc_layers_size_list=[512, 256],
use_gru=True, #Use gru if set True. Use simple rnn if set False.
blank_id=0, # index of blank in vocab.txt
))
ctc_grad_norm_type='instance', ))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0):
def __init__(
self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0,
ctc_grad_norm_type='instance', ):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
dropout_rate=0.0,
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type='instance')
grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
@ -348,16 +350,18 @@ class DeepSpeech2ModelOnline(nn.Layer):
DeepSpeech2ModelOnline
The model built from pretrained result.
"""
model = cls(feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
rnn_direction=config.model.rnn_direction,
num_fc_layers=config.model.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list,
use_gru=config.model.use_gru,
blank_id=config.model.blank_id)
model = cls(
feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
rnn_direction=config.model.rnn_direction,
num_fc_layers=config.model.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list,
use_gru=config.model.use_gru,
blank_id=config.model.blank_id,
ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -376,42 +380,24 @@ class DeepSpeech2ModelOnline(nn.Layer):
DeepSpeech2ModelOnline
The model built from config.
"""
model = cls(feat_size=config.feat_size,
dict_size=config.dict_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id)
model = cls(
feat_size=config.feat_size,
dict_size=config.dict_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id,
ctc_grad_norm_type=config.ctc_grad_norm_type, )
return model
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
use_gru=use_gru,
blank_id=blank_id)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box):

@ -127,6 +127,7 @@ class Autolog:
else:
gpu_id = None
infer_config = inference.Config()
self.autolog = auto_log.AutoLogger(
model_name=model_name,
model_precision=model_precision,

Loading…
Cancel
Save