From 279348d7860cc3ba45a80c86f3d2c9194972db53 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Tue, 8 Jun 2021 10:32:05 +0000
Subject: [PATCH 01/14] move process utt to collator

---
 deepspeech/exps/deepspeech2/model.py | 2 +-
 deepspeech/io/collator.py | 117 ++++++++++++++++++++++++-
 deepspeech/io/dataset.py | 82 +----------------
 examples/tiny/s0/conf/deepspeech2.yaml | 4 +-
 4 files changed, 120 insertions(+), 85 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 468bc652..50ff3c17 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)
 
-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(config, keep_transcription_text=False)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 3bec9875..d725b0b1 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -16,14 +16,22 @@ import numpy as np
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.io.utility import pad_sequence
 from deepspeech.utils.log import Log
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+import io
+import time
 
 __all__ = ["SpeechCollator"]
 
 logger = Log(__name__).getlog()
 
+# namedtuple needs to be global for pickling.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
 
 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    def __init__(self, config, keep_transcription_text=True):
         """
         Padding audio features with zeros to make them have the same shape (or
         a user-defined shape) within one batch.
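The `TarLocalData` namedtuple above is declared at module scope on purpose: when a `DataLoader` runs with worker processes, the collator object is pickled and shipped to each worker, and pickle can only resolve classes that are importable under their module-qualified name. A minimal, standalone illustration of the constraint (the `Point` names are illustrative, not from this patch):

```python
import pickle
from collections import namedtuple

# Module-level namedtuple: pickle can locate it by qualified name.
Point = namedtuple('Point', ['x', 'y'])

def make_local_point():
    # Defined inside a function: pickle cannot look this class up later.
    LocalPoint = namedtuple('LocalPoint', ['x', 'y'])
    return LocalPoint(1, 2)

print(pickle.loads(pickle.dumps(Point(1, 2))))   # Point(x=1, y=2)
try:
    pickle.dumps(make_local_point())
except pickle.PicklingError as err:
    print('local namedtuple is not picklable:', err)
```

(Note that this first patch uses `namedtuple` before importing it; the missing `from collections import namedtuple` arrives in PATCH 02 below.)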
@@ -32,6 +40,112 @@ class SpeechCollator(): """ self._keep_transcription_text = keep_transcription_text + if isinstance(config.data.augmentation_config, (str, bytes)): + if config.data.augmentation_config: + aug_file = io.open( + config.data.augmentation_config, mode='r', encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.data.augmentation_config + assert isinstance(aug_file, io.StringIO) + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), + random_seed=config.data.random_seed) + + self._normalizer = FeatureNormalizer( + config.data.mean_std_filepath) if config.data.mean_std_filepath else None + + self._stride_ms = config.data.stride_ms + self._target_sample_rate = config.data.target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=config.data.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.data.spm_model_prefix, + specgram_type=config.data.specgram_type, + feat_dim=config.data.feat_dim, + delta_delta=config.data.delta_delta, + stride_ms=config.data.stride_ms, + window_ms=config.data.window_ms, + n_fft=config.data.n_fft, + max_freq=config.data.max_freq, + target_sample_rate=config.data.target_sample_rate, + use_dB_normalization=config.data.use_dB_normalization, + target_dB=config.data.target_dB, + dither=config.data.dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. 
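The `_subfile_from_tar` method above resolves locators of the form `tar:<archive path>#<member name>`, opening each archive once and caching both the open handle and a name-to-`TarInfo` index, so repeated reads from the same archive skip the member scan. The same idea in a self-contained sketch (module-level dicts stand in for `TarLocalData`):

```python
import tarfile

_tar2info = {}    # archive path -> {member name: TarInfo}
_tar2object = {}  # archive path -> open TarFile handle

def subfile_from_tar(locator: str):
    """Return a file object for 'tar:/path/archive.tar#member.wav' locators."""
    tarpath, member = locator.split(':', 1)[1].split('#', 1)
    if tarpath not in _tar2info:
        f = tarfile.open(tarpath)
        _tar2info[tarpath] = {info.name: info for info in f.getmembers()}
        _tar2object[tarpath] = f
    return _tar2object[tarpath].extractfile(_tar2info[tarpath][member])
```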
+ :rtype: tuple of (2darray, list) + """ + start_time = time.time() + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + load_wav_time = time.time() - start_time + #logger.debug(f"load wav time: {load_wav_time}") + + # audio augment + start_time = time.time() + self._augmentation_pipeline.transform_audio(speech_segment) + audio_aug_time = time.time() - start_time + #logger.debug(f"audio augmentation time: {audio_aug_time}") + + start_time = time.time() + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + feature_time = time.time() - start_time + #logger.debug(f"audio & test feature time: {feature_time}") + + # specgram augment + start_time = time.time() + specgram = self._augmentation_pipeline.transform_feature(specgram) + feature_aug_time = time.time() - start_time + #logger.debug(f"audio feature augmentation time: {feature_aug_time}") + return specgram, transcript_part + def __call__(self, batch): """batch examples @@ -53,6 +167,7 @@ class SpeechCollator(): text_lens = [] utts = [] for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) #utt utts.append(utt) # audio diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index eaa57a4e..fc687902 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -34,9 +34,6 @@ __all__ = [ logger = Log(__name__).getlog() -# namedtupe need global for pickle. -TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - class ManifestDataset(Dataset): @classmethod @@ -192,10 +189,6 @@ class ManifestDataset(Dataset): self._stride_ms = stride_ms self._target_sample_rate = target_sample_rate - self._normalizer = FeatureNormalizer( - mean_std_filepath) if mean_std_filepath else None - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=augmentation_config, random_seed=random_seed) self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, @@ -214,8 +207,6 @@ class ManifestDataset(Dataset): self._rng = np.random.RandomState(random_seed) self._keep_transcription_text = keep_transcription_text - # for caching tar files info - self._local_data = TarLocalData(tar2info={}, tar2object={}) # read manifest self._manifest = read_manifest( @@ -256,74 +247,7 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - It will return a subfile object from tar file - and cached tar file info for next reading request. 
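With `process_utterance` relocated, the `__call__` above now performs the expensive per-utterance work (load, audio augmentation, featurization, normalization, feature augmentation) inside the collator, which `DataLoader` executes in its worker processes; the dataset, as the removals below show, is reduced to handing back raw manifest fields. A schematic of the resulting collate flow (placeholder feature logic, not the real featurizer):

```python
import numpy as np

class CollateSketch:
    """Schematic of SpeechCollator.__call__ after this patch."""

    def process_utterance(self, audio_file, transcript):
        # real code: load wav -> augment audio -> featurize ->
        # normalize -> augment spectrogram
        specgram = np.zeros((161, 50), dtype='float32')  # placeholder (D, T)
        tokens = [1, 2, 3]                               # placeholder ids
        return specgram, tokens

    def __call__(self, batch):
        utts, audios, audio_lens, texts, text_lens = [], [], [], [], []
        for utt, audio, text in batch:
            audio, text = self.process_utterance(audio, text)
            utts.append(utt)
            audios.append(audio.T)           # time-major for padding
            audio_lens.append(audio.shape[1])
            texts.append(text)
            text_lens.append(len(text))
        # the real implementation pads audios/texts with pad_sequence here
        return utts, audios, audio_lens, texts, text_lens
```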
- """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - def process_utterance(self, utt, audio_file, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param transcript: Transcription text. - :type transcript: str - :return: Tuple of audio feature tensor and data of transcription part, - where transcription part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - start_time = time.time() - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) - else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) - load_wav_time = time.time() - start_time - #logger.debug(f"load wav time: {load_wav_time}") - - # audio augment - start_time = time.time() - self._augmentation_pipeline.transform_audio(speech_segment) - audio_aug_time = time.time() - start_time - #logger.debug(f"audio augmentation time: {audio_aug_time}") - - start_time = time.time() - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - feature_time = time.time() - start_time - #logger.debug(f"audio & test feature time: {feature_time}") - - # specgram augment - start_time = time.time() - specgram = self._augmentation_pipeline.transform_feature(specgram) - feature_aug_time = time.time() - start_time - #logger.debug(f"audio feature augmentation time: {feature_aug_time}") - return utt, specgram, transcript_part def _instance_reader_creator(self, manifest): """ @@ -336,8 +260,6 @@ class ManifestDataset(Dataset): def reader(): for instance in manifest: - # inst = self.process_utterance(instance["feat"], - # instance["text"]) inst = self.process_utterance(instance["utt"], instance["feat"], instance["text"]) yield inst @@ -349,6 +271,4 @@ class ManifestDataset(Dataset): def __getitem__(self, idx): instance = self._manifest[idx] - return self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - # return self.process_utterance(instance["feat"], instance["text"]) + return(instance["utt"], instance["feat"], instance["text"]) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index dd9ce51f..aeb4f099 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,7 @@ data: mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 4 + batch_size: 2 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -37,7 +37,7 @@ model: share_rnn_weights: True training: - n_epoch: 20 + n_epoch: 10 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From c706dfec2ab292c91fe95cc1947330772c3bc493 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 12:54:01 +0000 Subject: [PATCH 02/14] fix bug --- 
deepspeech/exps/deepspeech2/model.py | 4 ++-- deepspeech/io/collator.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 50ff3c17..bcd66d19 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config, keep_transcription_text=False) + collate_fn = SpeechCollator(config=config, keep_transcription_text=False) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, @@ -342,7 +342,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index d725b0b1..0f86b8e7 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -23,6 +23,8 @@ from deepspeech.frontend.speech import SpeechSegment import io import time +from collections import namedtuple + __all__ = ["SpeechCollator"] logger = Log(__name__).getlog() @@ -50,7 +52,7 @@ class SpeechCollator(): aug_file = config.data.augmentation_config assert isinstance(aug_file, io.StringIO) - self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), random_seed=config.data.random_seed) From 2b51d612dd64653bb407f76b648a48ad71b090de Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 9 Jun 2021 13:42:19 +0000 Subject: [PATCH 03/14] delete _instance_reader_creator func in dataset --- deepspeech/io/dataset.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index fc687902..929a6cf8 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -249,22 +249,22 @@ class ManifestDataset(Dataset): - def _instance_reader_creator(self, manifest): - """ - Instance reader creator. Create a callable function to produce - instances of data. - - Instance: a tuple of ndarray of audio spectrogram and a list of - token indices for transcript. - """ - - def reader(): - for instance in manifest: - inst = self.process_utterance(instance["utt"], instance["feat"], - instance["text"]) - yield inst - - return reader + # def _instance_reader_creator(self, manifest): + # """ + # Instance reader creator. Create a callable function to produce + # instances of data. + + # Instance: a tuple of ndarray of audio spectrogram and a list of + # token indices for transcript. 
+ # """ + + # def reader(): + # for instance in manifest: + # inst = self.process_utterance(instance["utt"], instance["feat"], + # instance["text"]) + # yield inst + + # return reader def __len__(self): return len(self._manifest) From 3d5f294363ebc3a732b5f29714f9b057431ed52c Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 03:13:35 +0000 Subject: [PATCH 04/14] dataset --- deepspeech/io/dataset.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 929a6cf8..6083d7ec 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -247,25 +247,6 @@ class ManifestDataset(Dataset): def stride_ms(self): return self._speech_featurizer.stride_ms - - - # def _instance_reader_creator(self, manifest): - # """ - # Instance reader creator. Create a callable function to produce - # instances of data. - - # Instance: a tuple of ndarray of audio spectrogram and a list of - # token indices for transcript. - # """ - - # def reader(): - # for instance in manifest: - # inst = self.process_utterance(instance["utt"], instance["feat"], - # instance["text"]) - # yield inst - - # return reader - def __len__(self): return len(self._manifest) From 3855522ee3b43bc5726eb7f37a0dd8bd0e9355a2 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 10 Jun 2021 11:37:25 +0000 Subject: [PATCH 05/14] config --- deepspeech/exps/deepspeech2/config.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index a8d452a9..37b00086 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -38,8 +38,6 @@ _C.data = CN( target_sample_rate=16000, # target sample rate use_dB_normalization=True, target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -55,6 +53,28 @@ _C.model = CN( share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
)) +_C.collator =CN( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) + DeepSpeech2Model.params(_C.model) _C.training = CN( From b9110af9d340caf4e3e32e0eafa2fca6946d7296 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 11 Jun 2021 02:44:02 +0000 Subject: [PATCH 06/14] feat_dim, vocab_size --- deepspeech/exps/deepspeech2/model.py | 4 +- .../frontend/featurizer/speech_featurizer.py | 43 ----- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 166 +++++++++++++++--- deepspeech/io/dataset.py | 128 +++++++------- examples/tiny/s0/conf/deepspeech2.yaml | 23 ++- 6 files changed, 227 insertions(+), 139 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index bcd66d19..679261cf 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -137,7 +137,7 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.data.keep_transcription_text = False + config.collator.keep_transcription_text = False config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) @@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.data.sortagrad, shuffle_method=config.data.shuffle_method) - collate_fn = SpeechCollator(config=config, keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index e6761cb5..bcb8e3f4 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,50 +104,7 @@ class SpeechFeaturizer(object): speech_segment.transcript) return spec_feature, text_ids - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. - """ - return self._text_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - Returns: - List[str]: - """ - return self._text_featurizer.vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - return self._text_featurizer.vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. 
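PATCH 05 above splits the configuration into a `data` group (manifest selection and filtering) and a `collator` group (featurization parameters), which is straightforward with nested yacs `CfgNode` trees. A minimal demonstration of the mechanism these configs rely on (assuming the `yacs` package):

```python
from yacs.config import CfgNode as CN

_C = CN()
_C.collator = CN(dict(
    specgram_type='linear',   # 'linear', 'mfcc', 'fbank'
    stride_ms=10.0,
    window_ms=20.0,
    keep_transcription_text=False))

config = _C.clone()
config.merge_from_list(['collator.specgram_type', 'fbank'])  # override one key
config.freeze()   # trainers freeze before use; defrost() re-enables edits
print(config.collator.specgram_type)   # -> fbank
```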
- """ - return self._audio_featurizer.feature_size - - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms @property def text_feature(self): diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index b2dd9601..610104f9 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest + return manifest, json_data["feat_shape"][-1] def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 0f86b8e7..4efc69a0 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -22,6 +22,8 @@ from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment import io import time +from yacs.config import CfgNode +from typing import Optional from collections import namedtuple @@ -33,51 +35,134 @@ logger = Log(__name__).getlog() TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) class SpeechCollator(): - def __init__(self, config, keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=True + )) - if ``keep_transcription_text`` is False, text is token ids else is raw string. + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. 
""" - self._keep_transcription_text = keep_transcription_text + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.data + assert 'specgram_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') + config.collator.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.data.augmentation_config + aug_file = config.collator.augmentation_config assert isinstance(aug_file, io.StringIO) + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.data.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text + ) + return speech_collator + + def __init__(self, aug_file, mean_std_filepath, + vocab_filepath, spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + if ``keep_transcription_text`` is False, text is token ids else is raw string. 
+ """ + self._keep_transcription_text = keep_transcription_text + self._local_data = TarLocalData(tar2info={}, tar2object={}) self._augmentation_pipeline = AugmentationPipeline( augmentation_config=aug_file.read(), - random_seed=config.data.random_seed) + random_seed=random_seed) self._normalizer = FeatureNormalizer( - config.data.mean_std_filepath) if config.data.mean_std_filepath else None + mean_std_filepath) if mean_std_filepath else None - self._stride_ms = config.data.stride_ms - self._target_sample_rate = config.data.target_sample_rate + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate self._speech_featurizer = SpeechFeaturizer( - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - spm_model_prefix=config.data.spm_model_prefix, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - stride_ms=config.data.stride_ms, - window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - dither=config.data.dither) + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) def _parse_tar(self, file): """Parse a tar file to get a tarfile object @@ -196,3 +281,28 @@ class SpeechCollator(): texts, padding_value=IGNORE_ID).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._text_featurizer + self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index aa5b29ed..1e3bbcd3 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,20 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - raw_wav=True, # use raw_wav or kaldi feature - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - dither=1.0, # feature dither - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False, batch_size=32, # batch size num_workers=0, # data loader workers sortagrad=False, # sorted in first epoch when True @@ -116,21 +102,19 @@ class ManifestDataset(Dataset): min_output_len=config.data.min_output_len, max_output_input_ratio=config.data.max_output_input_ratio, min_output_input_ratio=config.data.min_output_input_ratio, - stride_ms=config.data.stride_ms, - 
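Inside `process_utterance`, `self._normalizer.apply(specgram)` performs mean-variance normalization with statistics precomputed over the training corpus (the `mean_std.json` file referenced by the configs). Conceptually it is nothing more than the sketch below (a hypothetical stand-in for `FeatureNormalizer`, whose real implementation may differ):

```python
import numpy as np

class MeanStdNormalizer:
    """Sketch of mean-variance normalization with precomputed corpus stats."""

    def __init__(self, mean: np.ndarray, std: np.ndarray, eps: float=1e-20):
        self.mean = mean  # per-dimension mean over the training corpus
        self.std = std    # per-dimension standard deviation
        self.eps = eps    # guard against division by zero

    def apply(self, features: np.ndarray) -> np.ndarray:
        return (features - self.mean) / (self.std + self.eps)
```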
window_ms=config.data.window_ms, - n_fft=config.data.n_fft, - max_freq=config.data.max_freq, - target_sample_rate=config.data.target_sample_rate, - specgram_type=config.data.specgram_type, - feat_dim=config.data.feat_dim, - delta_delta=config.data.delta_delta, - dither=config.data.dither, - use_dB_normalization=config.data.use_dB_normalization, - target_dB=config.data.target_dB, - random_seed=config.data.random_seed, - keep_transcription_text=config.data.keep_transcription_text) + ) return dataset + + def _read_vocab(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with open(vocab_filepath, 'r', encoding='utf-8') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + return vocab_list + + def __init__(self, manifest_path, unit_type, @@ -143,20 +127,7 @@ class ManifestDataset(Dataset): max_output_len=float('inf'), min_output_len=0.0, max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, - stride_ms=10.0, - window_ms=20.0, - n_fft=None, - max_freq=None, - target_sample_rate=16000, - specgram_type='linear', - feat_dim=None, - delta_delta=False, - dither=1.0, - use_dB_normalization=True, - target_dB=-20, - random_seed=0, - keep_transcription_text=False): + min_output_input_ratio=0.0): """Manifest Dataset Args: @@ -186,30 +157,11 @@ class ManifestDataset(Dataset): keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. """ super().__init__() - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - - self._speech_featurizer = SpeechFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - stride_ms=stride_ms, - window_ms=window_ms, - n_fft=n_fft, - max_freq=max_freq, - target_sample_rate=target_sample_rate, - use_dB_normalization=use_dB_normalization, - target_dB=target_dB, - dither=dither) - - self._rng = np.random.RandomState(random_seed) - self._keep_transcription_text = keep_transcription_text + + # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest = read_manifest( + self._manifest, self._feature_size = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -219,9 +171,59 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._vocab_list = self._read_vocab(vocab_filepath) + @property def manifest(self): return self._manifest + + @property + def vocab_size(self): + """Return the vocabulary size. + + Returns: + int: Vocabulary size. + """ + return len(self._vocab_list) + + @property + def vocab_list(self): + """Return the vocabulary in list. + + Returns: + List[str]: + """ + return self._vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + + Returns: + Dict[str, int]: + """ + vocab_dict = dict( + [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + return vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + + Returns: + int: audio feature size. 
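The `_read_vocab` helper above strips each vocab line's trailing newline with `line[:-1]`, and `vocab_dict` enumerates the resulting list into a token-to-id map. A compact equivalent (using `rstrip('\n')`, which is slightly safer when the final line has no newline; that adjustment is a suggestion, not something this patch does):

```python
def read_vocab(vocab_filepath: str):
    """Return (token list, token -> id dict) from a one-token-per-line file."""
    with open(vocab_filepath, 'r', encoding='utf-8') as f:
        vocab_list = [line.rstrip('\n') for line in f]
    vocab_dict = {token: idx for idx, token in enumerate(vocab_list)}
    return vocab_list, vocab_dict
```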
+ """ + return self._feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms + def __len__(self): return len(self._manifest) diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index aeb4f099..eda7c3cb 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -4,9 +4,10 @@ data: dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny mean_std_filepath: data/mean_std.json + unit_type: char vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json - batch_size: 2 + batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -28,6 +29,24 @@ data: sortagrad: True shuffle_method: batch_shuffle num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + mean_std_filepath: data/mean_std.json + spm_model_prefix: + specgram_type: linear + feat_dim: + delta_delta: False + stride_ms: 10.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 + use_dB_normalization: True + target_dB: -20 + dither: 1.0 + keep_transcription_text: True model: num_conv_layers: 2 @@ -37,7 +56,7 @@ model: share_rnn_weights: True training: - n_epoch: 10 + n_epoch: 21 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 From 7bae32f3844166d549d0180da70b13bd10ef4cf7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Tue, 15 Jun 2021 03:05:22 +0000 Subject: [PATCH 07/14] revise example/ting/s1 --- deepspeech/exps/deepspeech2/config.py | 2 +- deepspeech/exps/deepspeech2/model.py | 3 ++- deepspeech/exps/u2/config.py | 7 +++++++ deepspeech/exps/u2/model.py | 9 +++++---- deepspeech/frontend/utility.py | 2 +- deepspeech/io/collator.py | 23 ++++----------------- deepspeech/io/dataset.py | 12 ++--------- examples/tiny/s0/conf/deepspeech2.yaml | 16 +-------------- examples/tiny/s1/conf/transformer.yaml | 28 ++++++++++++++------------ 9 files changed, 38 insertions(+), 64 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 37b00086..1ce5346f 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -72,7 +72,7 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 679261cf..7769c377 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -336,13 +336,14 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) + config.collator.keep_transcription_text = True # return text ord id self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(config=config, keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup test Dataloader!") def setup_output_dir(self): diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 5a0b53f9..19080be7 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -22,6 +22,13 @@ _C = CfgNode() _C.data = ManifestDataset.params() +_C.collator =CfgNode( + dict( + augmentation_config="", + 
unit_type="char", + keep_transcription_text=False + )) + _C.model = U2Model.params() _C.training = U2Trainer.params() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 334d6bc8..89527087 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -221,7 +221,7 @@ class U2Trainer(Trainer): config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) - collate_fn = SpeechCollator(keep_transcription_text=False) + collate_fn = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, @@ -266,12 +266,13 @@ class U2Trainer(Trainer): # config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id + config.collator.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) + collate_fn=SpeechCollator.from_config(config)) logger.info("Setup train/valid/test Dataloader!") def setup_model(self): @@ -375,7 +376,7 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.dataset.text_feature + text_feature = self.test_loader.collate_fn.text_feature target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( audio, @@ -423,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.dataset.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 610104f9..b2dd9601 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -82,7 +82,7 @@ def read_manifest( ] if all(conditions): manifest.append(json_data) - return manifest, json_data["feat_shape"][-1] + return manifest def rms_to_db(rms: float): diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 4efc69a0..51384ec4 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -56,7 +56,7 @@ class SpeechCollator(): use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=True + keep_transcription_text=False )) if config is not None: @@ -75,7 +75,7 @@ class SpeechCollator(): """ assert 'augmentation_config' in config.collator assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator + assert 'mean_std_filepath' in config.data assert 'vocab_filepath' in config.data assert 'specgram_type' in config.collator assert 'n_fft' in config.collator @@ -94,7 +94,7 @@ class SpeechCollator(): speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, + mean_std_filepath=config.data.mean_std_filepath, unit_type=config.collator.unit_type, vocab_filepath=config.data.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix, @@ -282,26 +282,11 @@ class SpeechCollator(): text_lens = np.array(text_lens).astype(np.int64) return utts, padded_audios, audio_lens, padded_texts, text_lens - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list - 
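A recurring edit in these patches is redirecting lookups from `loader.dataset` to `loader.collate_fn`: once featurization lives in the collator, only the collator knows the text featurizer, the feature dimension, and the vocabulary size. The trainers' usage reduces to the sketch below (assuming, as the patches do, that the `DataLoader` exposes the attached collator as `collate_fn`):

```python
def model_dims(train_loader):
    """Read model input/output sizes off the collator attached to the loader."""
    feat_size = train_loader.collate_fn.feature_size  # input feature dim
    vocab_size = train_loader.collate_fn.vocab_size   # output (token) dim
    return feat_size, vocab_size
```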
- @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict @property def text_feature(self): - return self._text_featurizer - self._speech_featurizer.text_feature + return self._speech_featurizer.text_feature - @property - def feature_size(self): - return self._speech_featurizer.feature_size @property def stride_ms(self): diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 1e3bbcd3..0da347f3 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -161,7 +161,7 @@ class ManifestDataset(Dataset): # self._rng = np.random.RandomState(random_seed) # read manifest - self._manifest, self._feature_size = read_manifest( + self._manifest = read_manifest( manifest_path=manifest_path, max_input_len=max_input_len, min_input_len=min_input_len, @@ -213,16 +213,8 @@ class ManifestDataset(Dataset): Returns: int: audio feature size. """ - return self._feature_size + return self._manifest[0]["feat_shape"][-1] - @property - def stride_ms(self): - """time length in `ms` unit per frame - - Returns: - float: time(ms)/frame - """ - return self._audio_featurizer.stride_ms def __len__(self): diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index eda7c3cb..bfed8d59 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -6,7 +6,6 @@ data: mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.0 max_input_len: 27.0 @@ -14,18 +13,6 @@ data: max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle num_workers: 0 @@ -33,7 +20,6 @@ data: collator: augmentation_config: conf/augmentation.json random_seed: 0 - mean_std_filepath: data/mean_std.json spm_model_prefix: specgram_type: linear feat_dim: @@ -46,7 +32,7 @@ collator: use_dB_normalization: True target_dB: -20 dither: 1.0 - keep_transcription_text: True + keep_transcription_text: False model: num_conv_layers: 2 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 0a7cf3be..cc172585 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -7,7 +7,6 @@ data: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' mean_std_filepath: "" - augmentation_config: conf/augmentation.json batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second @@ -16,23 +15,26 @@ data: min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 #2 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: fbank feat_dim: 80 delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None stride_ms: 10.0 - window_ms: 25.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - # network architecture model: @@ 
-70,7 +72,7 @@ model: training: - n_epoch: 2 + n_epoch: 3 accum_grad: 1 global_grad_clip: 5.0 optim: adam From 6ee3033cc4561ab3109ee036c3c8db9101d1c2b7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 16 Jun 2021 14:39:00 +0000 Subject: [PATCH 08/14] finish aishell/s0 --- deepspeech/exps/deepspeech2/model.py | 12 +-- deepspeech/exps/u2/model.py | 6 +- .../frontend/featurizer/speech_featurizer.py | 49 +++++++++- deepspeech/io/collator.py | 32 ++++++- deepspeech/io/dataset.py | 90 +++++++++---------- examples/aishell/s0/conf/deepspeech2.yaml | 24 ++--- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- 8 files changed, 147 insertions(+), 70 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 7769c377..5833382a 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -102,8 +102,8 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.train_loader.dataset.feature_size, - dict_size=self.train_loader.dataset.vocab_size, + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, @@ -199,7 +199,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer - vocab_list = self.test_loader.dataset.vocab_list + vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode( @@ -272,7 +272,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): infer_model = DeepSpeech2InferModel.from_pretrained( self.test_loader.dataset, self.config, self.args.checkpoint_path) infer_model.eval() - feat_dim = self.test_loader.dataset.feature_size + feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( infer_model, input_spec=[ @@ -308,8 +308,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config model = DeepSpeech2Model( - feat_size=self.test_loader.dataset.feature_size, - dict_size=self.test_loader.dataset.vocab_size, + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 89527087..676768ce 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -279,8 +279,8 @@ class U2Trainer(Trainer): config = self.config model_conf = config.model model_conf.defrost() - model_conf.input_dim = self.train_loader.dataset.feature_size - model_conf.output_dim = self.train_loader.dataset.vocab_size + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.freeze() model = U2Model.from_config(model_conf) @@ -497,7 +497,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.dataset.feature_size + feat_dim = 
self.test_loader.collate_fn.feature_size input_spec = [ paddle.static.InputSpec( shape=[None, feat_dim, None], diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index bcb8e3f4..852d26c9 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -104,13 +104,60 @@ class SpeechFeaturizer(object): speech_segment.transcript) return spec_feature, text_ids + @property + def vocab_size(self): + """Return the vocabulary size. + Returns: + int: Vocabulary size. + """ + return self._text_featurizer.vocab_size + @property + def vocab_list(self): + """Return the vocabulary in list. + Returns: + List[str]: + """ + return self._text_featurizer.vocab_list + + @property + def vocab_dict(self): + """Return the vocabulary in dict. + Returns: + Dict[str, int]: + """ + return self._text_featurizer.vocab_dict + + @property + def feature_size(self): + """Return the audio feature size. + Returns: + int: audio feature size. + """ + return self._audio_featurizer.feature_size + + @property + def stride_ms(self): + """time length in `ms` unit per frame + Returns: + float: time(ms)/frame + """ + return self._audio_featurizer.stride_ms @property def text_feature(self): """Return the text feature object. - Returns: TextFeaturizer: object. """ return self._text_featurizer + + + # @property + # def text_feature(self): + # """Return the text feature object. + + # Returns: + # TextFeaturizer: object. + # """ + # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 51384ec4..8b8575db 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -283,11 +283,41 @@ class SpeechCollator(): return utts, padded_audios, audio_lens, padded_texts, text_lens + # @property + # def text_feature(self): + # return self._speech_featurizer.text_feature + + + # @property + # def stride_ms(self): + # return self._speech_featurizer.stride_ms + +########### + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + @property def text_feature(self): return self._speech_featurizer.text_feature + @property + def feature_size(self): + return self._speech_featurizer.feature_size @property def stride_ms(self): - return self._speech_featurizer.stride_ms + return self._speech_featurizer.stride_ms \ No newline at end of file diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 0da347f3..24d8486a 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -55,10 +55,6 @@ class ManifestDataset(Dataset): min_output_len=0.0, max_output_input_ratio=float('inf'), min_output_input_ratio=0.0, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) if config is not None: @@ -77,7 +73,7 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.data + assert 'keep_transcription_text' in config.collator if isinstance(config.data.augmentation_config, (str, bytes)): if config.data.augmentation_config: @@ -171,51 +167,51 @@ class ManifestDataset(Dataset): 
min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - self._vocab_list = self._read_vocab(vocab_filepath) + # self._vocab_list = self._read_vocab(vocab_filepath) - @property - def manifest(self): - return self._manifest - - @property - def vocab_size(self): - """Return the vocabulary size. - - Returns: - int: Vocabulary size. - """ - return len(self._vocab_list) - - @property - def vocab_list(self): - """Return the vocabulary in list. - - Returns: - List[str]: - """ - return self._vocab_list - - @property - def vocab_dict(self): - """Return the vocabulary in dict. - - Returns: - Dict[str, int]: - """ - vocab_dict = dict( - [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - return vocab_dict - - @property - def feature_size(self): - """Return the audio feature size. - - Returns: - int: audio feature size. - """ - return self._manifest[0]["feat_shape"][-1] + # @property + # def manifest(self): + # return self._manifest + # @property + # def vocab_size(self): + # """Return the vocabulary size. + + # Returns: + # int: Vocabulary size. + # """ + # return len(self._vocab_list) + + # @property + # def vocab_list(self): + # """Return the vocabulary in list. + + # Returns: + # List[str]: + # """ + # return self._vocab_list + + # @property + # def vocab_dict(self): + # """Return the vocabulary in dict. + + # Returns: + # Dict[str, int]: + # """ + # vocab_dict = dict( + # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) + # return vocab_dict + + # @property + # def feature_size(self): + # """Return the audio feature size. + + # Returns: + # int: audio feature size. + # """ + # return self._manifest[0]["feat_shape"][-1] + def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 8b08ee30..e5ab8e04 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -5,7 +5,6 @@ data: test_manifest: data/manifest.test mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json batch_size: 64 # one gpu min_input_len: 0.0 max_input_len: 27.0 # second @@ -13,21 +12,26 @@ data: max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + +collator: + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: specgram_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None + feat_dim: + delta_delta: False stride_ms: 10.0 window_ms: 20.0 - delta_delta: False - dither: 1.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 use_dB_normalization: True target_dB: -20 - random_seed: 0 + dither: 1.0 keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 model: num_conv_layers: 2 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index bfed8d59..6680e568 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -42,7 +42,7 @@ model: share_rnn_weights: True training: - n_epoch: 21 + n_epoch: 23 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index cc172585..5e28e4e8 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -72,7 +72,7 @@ model: training: - n_epoch: 3 + 
n_epoch: 21 accum_grad: 1 global_grad_clip: 5.0 optim: adam From 89a00eabeb6aaf0512be2283a563d087423c23bd Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 00:36:57 +0000 Subject: [PATCH 09/14] revise deepspeech/exps/u2/model.py --- deepspeech/exps/u2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 676768ce..164903e6 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -424,7 +424,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.test_loader.collate_fn.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 From 698d7a9bdb3de1a763ed8ba7a71b68241e3eea17 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 07:16:52 +0000 Subject: [PATCH 10/14] move batch_size, work_nums, shuffle_method, sortagrad to collator --- deepspeech/exps/deepspeech2/config.py | 20 +++++------------ deepspeech/exps/deepspeech2/model.py | 18 +++++++-------- deepspeech/exps/u2/config.py | 6 ++++- .../frontend/featurizer/speech_featurizer.py | 10 --------- deepspeech/io/collator.py | 22 ------------------- examples/aishell/s0/conf/deepspeech2.yaml | 9 ++++---- examples/tiny/s0/conf/deepspeech2.yaml | 9 ++++---- 7 files changed, 29 insertions(+), 65 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 1ce5346f..faaff1aa 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -28,20 +28,6 @@ _C.data = CN( augmentation_config="", max_duration=float('inf'), min_duration=0.0, - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delat_delta=False, # 'mfcc', 'fbank' - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) _C.model = CN( @@ -72,7 +58,11 @@ _C.collator =CN( use_dB_normalization=True, target_dB=-20, dither=1.0, # feature dither - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) DeepSpeech2Model.params(_C.model) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 5833382a..b54192dd 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -55,7 +55,7 @@ class DeepSpeech2Trainer(Trainer): 'train_loss': float(loss), } msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.data.batch_size) + msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) logger.info(msg) @@ -149,31 +149,31 @@ class DeepSpeech2Trainer(Trainer): if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, 
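The `sortagrad` flag consumed by these samplers trains the first epoch in length-sorted order (the dataset is already sorted by `feat_shape[0]`), which keeps early batches uniform in length, and then switches to batch-level shuffling. A simplified schematic of that sampling policy (a hypothetical helper, not the actual `SortagradBatchSampler`):

```python
import random

def epoch_indices(num_samples, batch_size, epoch, sortagrad=True, seed=0):
    """First epoch in sorted (duration) order; later epochs batch-shuffled."""
    indices = list(range(num_samples))  # dataset is pre-sorted by duration
    batches = [indices[i:i + batch_size]
               for i in range(0, num_samples, batch_size)]
    if not (sortagrad and epoch == 0):
        random.Random(seed + epoch).shuffle(batches)  # 'batch_shuffle' method
    return [i for batch in batches for i in batch]
```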
num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) collate_fn = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, - num_workers=config.data.num_workers) + num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn) diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 19080be7..42725c74 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -26,7 +26,11 @@ _C.collator =CfgNode( dict( augmentation_config="", unit_type="char", - keep_transcription_text=False + keep_transcription_text=False, + batch_size=32, # batch size + num_workers=0, # data loader workers + sortagrad=False, # sorted in first epoch when True + shuffle_method="batch_shuffle" # 'batch_shuffle', 'instance_shuffle' )) _C.model = U2Model.params() diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 852d26c9..0fbbc564 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -151,13 +151,3 @@ class SpeechFeaturizer(object): TextFeaturizer: object. """ return self._text_featurizer - - - # @property - # def text_feature(self): - # """Return the text feature object. - - # Returns: - # TextFeaturizer: object. - # """ - # return self._text_featurizer diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 8b8575db..ac817a19 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -203,34 +203,22 @@ class SpeechCollator(): where transcription part could be token ids or text. 
         :rtype: tuple of (2darray, list)
         """
-        start_time = time.time()
         if isinstance(audio_file, str) and audio_file.startswith('tar:'):
             speech_segment = SpeechSegment.from_file(
                 self._subfile_from_tar(audio_file), transcript)
         else:
             speech_segment = SpeechSegment.from_file(audio_file, transcript)
-        load_wav_time = time.time() - start_time
-        #logger.debug(f"load wav time: {load_wav_time}")

         # audio augment
-        start_time = time.time()
         self._augmentation_pipeline.transform_audio(speech_segment)
-        audio_aug_time = time.time() - start_time
-        #logger.debug(f"audio augmentation time: {audio_aug_time}")

-        start_time = time.time()
         specgram, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self._keep_transcription_text)
         if self._normalizer:
             specgram = self._normalizer.apply(specgram)
-        feature_time = time.time() - start_time
-        #logger.debug(f"audio & test feature time: {feature_time}")

         # specgram augment
-        start_time = time.time()
         specgram = self._augmentation_pipeline.transform_feature(specgram)
-        feature_aug_time = time.time() - start_time
-        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
         return specgram, transcript_part

     def __call__(self, batch):
@@ -283,16 +271,6 @@ class SpeechCollator():

         return utts, padded_audios, audio_lens, padded_texts, text_lens

-    # @property
-    # def text_feature(self):
-    #     return self._speech_featurizer.text_feature
-
-
-    # @property
-    # def stride_ms(self):
-    #     return self._speech_featurizer.stride_ms
-
-###########
     @property
     def manifest(self):

diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index e5ab8e04..54ce240e 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -5,16 +5,13 @@ data:
   test_manifest: data/manifest.test
   mean_std_filepath: data/mean_std.json
   vocab_filepath: data/vocab.txt
-  batch_size: 64 # one gpu
   min_input_len: 0.0
   max_input_len: 27.0 # second
   min_output_len: 0.0
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+

 collator:
   augmentation_config: conf/augmentation.json
@@ -32,6 +29,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 64 # one gpu

 model:
   num_conv_layers: 2

diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 6680e568..434cf264 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -6,16 +6,13 @@ data:
   mean_std_filepath: data/mean_std.json
   unit_type: char
   vocab_filepath: data/vocab.txt
-  batch_size: 4
   min_input_len: 0.0
   max_input_len: 27.0
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+

 collator:
   augmentation_config: conf/augmentation.json
@@ -33,6 +30,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 4

 model:
   num_conv_layers: 2

From 557427736e9f2fba6715cc3ce18b3175a3c42cd8 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 06:41:28 +0000
Subject: [PATCH 11/14] move redundant params

---
 deepspeech/exps/deepspeech2/config.py   |  30 +++----
 deepspeech/exps/deepspeech2/model.py    |  14 ++--
 deepspeech/exps/u2/config.py            |  12 +--
 deepspeech/exps/u2/model.py             |  35 ++++----
 deepspeech/io/collator.py               |  36 ++++++--
 deepspeech/io/dataset.py                | 105 +-----------------------
 examples/aishell/s1/conf/conformer.yaml |  14 ++--
 examples/tiny/s0/conf/deepspeech2.yaml  |  10 +--
 examples/tiny/s1/conf/transformer.yaml  |  22 ++---
 9 files changed, 96 insertions(+), 182 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index faaff1aa..050a50b0 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -21,32 +21,18 @@ _C.data = CN(
         train_manifest="",
         dev_manifest="",
         test_manifest="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
         max_duration=float('inf'),
         min_duration=0.0,
     ))

-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
-
 _C.collator =CN(
     dict(
-        augmentation_config="",
-        random_seed=0,
-        mean_std_filepath="",
         unit_type="char",
         vocab_filepath="",
         spm_model_prefix="",
+        mean_std_filepath="",
+        augmentation_config="",
+        random_seed=0,
         specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
         feat_dim=0,  # 'mfcc', 'fbank'
         delta_delta=False,  # 'mfcc', 'fbank'
@@ -65,6 +51,16 @@ _C.collator =CN(
         shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
     ))

+_C.model = CN(
+    dict(
+        num_conv_layers=2,  #Number of stacking convolution layers.
+        num_rnn_layers=3,  #Number of stacking RNN layers.
+        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
+        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+    ))
+
+
 DeepSpeech2Model.params(_C.model)

 _C.training = CN(

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index b54192dd..1eefc871 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer):
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

         if self.parallel:
@@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.collator.sortagrad,
             shuffle_method=config.collator.shuffle_method)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
+            collate_fn=collate_fn_train,
             num_workers=config.collator.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
             batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
         logger.info("Setup train/valid Dataloader!")

@@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # return raw text

         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):

         test_dataset = ManifestDataset.from_config(config)

         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         # return text ord id
         self.test_loader = DataLoader(
             test_dataset,

diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py
index 42725c74..d8735453 100644
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
+from deepspeech.io.collator import SpeechCollator

 _C = CfgNode()

 _C.data = ManifestDataset.params()

-_C.collator =CfgNode(
-    dict(
-        augmentation_config="",
-        unit_type="char",
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle"  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.collator = SpeechCollator.params()

 _C.model = U2Model.params()

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 164903e6..836afa36 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -100,7 +100,7 @@ class U2Trainer(Trainer):

             if (batch_index + 1) % train_conf.log_interval == 0:
                 msg += "train time: {:>.3f}s, ".format(iteration_time)
-                msg += "batch size: {}, ".format(self.config.data.batch_size)
+                msg += "batch size: {}, ".format(self.config.collator.batch_size)
                 msg += "accum: {}, ".format(train_conf.accum_grad)
                 msg += ', '.join('{}: {:>.6f}'.format(k, v)
                                  for k, v in losses_np.items())
@@ -211,51 +211,52 @@ class U2Trainer(Trainer):
     def setup_dataloader(self):
         config = self.config.clone()
         config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
         config.data.manifest = config.data.train_manifest
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
         if self.parallel:
             batch_sampler = SortagradDistributedBatchSampler(
                 train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         else:
             batch_sampler = SortagradBatchSampler(
                 train_dataset,
                 shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)

         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -264,9 +265,11 @@ class U2Trainer(Trainer):
         # config.data.max_output_len = float('inf')  # tokens
         # config.data.min_output_input_ratio = 0.00
         # config.data.max_output_input_ratio = float('inf')
+
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ac817a19..ab1e9165 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -75,8 +75,8 @@ class SpeechCollator():
         """
         assert 'augmentation_config' in config.collator
         assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.data
-        assert 'vocab_filepath' in config.data
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
         assert 'specgram_type' in config.collator
         assert 'n_fft' in config.collator
         assert config.collator
@@ -94,9 +94,9 @@ class SpeechCollator():
         speech_collator = cls(
             aug_file=aug_file,
             random_seed=0,
-            mean_std_filepath=config.data.mean_std_filepath,
+            mean_std_filepath=config.collator.mean_std_filepath,
             unit_type=config.collator.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
+            vocab_filepath=config.collator.vocab_filepath,
             spm_model_prefix=config.collator.spm_model_prefix,
             specgram_type=config.collator.specgram_type,
             feat_dim=config.collator.feat_dim,
             delta_delta=config.collator.delta_delta,
             stride_ms=config.collator.stride_ms,
             window_ms=config.collator.window_ms,
             n_fft=config.collator.n_fft,
             max_freq=config.collator.max_freq,
             target_sample_rate=config.collator.target_sample_rate,
             use_dB_normalization=config.collator.use_dB_normalization,
             target_dB=config.collator.target_dB,
             dither=config.collator.dither,
             keep_transcription_text=config.collator.keep_transcription_text
         )
@@ -129,11 +129,31 @@ class SpeechCollator():
                  target_dB=-20,
                  dither=1.0,
                  keep_transcription_text=True):
-        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one bach.
-
-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
+        """SpeechCollator Collator
+
+        Args:
+            unit_type(str): token unit type, e.g. char, word, spm
+            vocab_filepath (str): vocab file path.
+            mean_std_filepath (str): mean and std file path, which suffix is *.npy
+            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
+            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
+            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+            window_ms (float, optional): window size in ms. Defaults to 20.0.
+            n_fft (int, optional): fft points for rfft. Defaults to None.
+            max_freq (int, optional): max cut freq. Defaults to None.
+            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
+            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
+            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
+            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
+            target_dB (int, optional): target dB. Defaults to -20.
+            random_seed (int, optional): for random generator. Defaults to 0.
+            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
+            if ``keep_transcription_text`` is False, text is token ids else is raw string.
+
+        Do augmentations
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
""" self._keep_transcription_text = keep_transcription_text diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 24d8486a..70383b4d 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -40,15 +40,7 @@ class ManifestDataset(Dataset): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: default = CfgNode( dict( - train_manifest="", - dev_manifest="", - test_manifest="", manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_input_len=27.0, min_input_len=0.0, max_output_len=float('inf'), @@ -73,25 +65,10 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.collator - - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: - aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.data.augmentation_config - assert isinstance(aug_file, io.StringIO) + dataset = cls( manifest_path=config.data.manifest, - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - mean_std_filepath=config.data.mean_std_filepath, - spm_model_prefix=config.data.spm_model_prefix, - augmentation_config=aug_file.read(), max_input_len=config.data.max_input_len, min_input_len=config.data.min_input_len, max_output_len=config.data.max_output_len, @@ -101,23 +78,8 @@ class ManifestDataset(Dataset): ) return dataset - - def _read_vocab(self, vocab_filepath): - """Load vocabulary from file.""" - vocab_lines = [] - with open(vocab_filepath, 'r', encoding='utf-8') as file: - vocab_lines.extend(file.readlines()) - vocab_list = [line[:-1] for line in vocab_lines] - return vocab_list - - def __init__(self, manifest_path, - unit_type, - vocab_filepath, - mean_std_filepath, - spm_model_prefix=None, - augmentation_config='{}', max_input_len=float('inf'), min_input_len=0.0, max_output_len=float('inf'), @@ -128,34 +90,16 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. - n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. 
Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. - target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + """ super().__init__() - # self._rng = np.random.RandomState(random_seed) - # read manifest self._manifest = read_manifest( manifest_path=manifest_path, @@ -167,51 +111,6 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - # self._vocab_list = self._read_vocab(vocab_filepath) - - - # @property - # def manifest(self): - # return self._manifest - - # @property - # def vocab_size(self): - # """Return the vocabulary size. - - # Returns: - # int: Vocabulary size. - # """ - # return len(self._vocab_list) - - # @property - # def vocab_list(self): - # """Return the vocabulary in list. - - # Returns: - # List[str]: - # """ - # return self._vocab_list - - # @property - # def vocab_dict(self): - # """Return the vocabulary in dict. - - # Returns: - # Dict[str, int]: - # """ - # vocab_dict = dict( - # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - # return vocab_dict - - # @property - # def feature_size(self): - # """Return the audio feature size. - - # Returns: - # int: audio feature size. - # """ - # return self._manifest[0]["feat_shape"][-1] - def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index b880f858..116c9192 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -3,17 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 max_input_len: 20.0 # second min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -32,7 +35,6 @@ data: shuffle_method: batch_shuffle num_workers: 2 - # network architecture model: cmvn_file: "data/mean_std.json" diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 434cf264..6737d1b7 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -2,10 +2,7 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/vocab.txt + test_manifest: data/manifest.tiny min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -15,6 +12,9 @@ data: collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json 
   random_seed: 0
   spm_model_prefix:
@@ -43,7 +43,7 @@ model:
   share_rnn_weights: True

 training:
-  n_epoch: 23
+  n_epoch: 24
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06

diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 5e28e4e8..250995fa 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -3,26 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
-  batch_size: 4
   min_input_len: 0.5  # second
   max_input_len: 20.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  raw_wav: True  # use raw_wav or kaldi feature
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0 #2
-
+

 collator:
+  vocab_filepath: data/vocab.txt
+  mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
   random_seed: 0
-  spm_model_prefix:
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
   specgram_type: fbank
   feat_dim: 80
   delta_delta: False
@@ -35,6 +29,12 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  batch_size: 4
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0 #2
+  raw_wav: True  # use raw_wav or kaldi feature
+

 # network architecture
 model:

From 089a8ed602721acf43c676b37249987ebd8bfa3b Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 09:47:53 +0000
Subject: [PATCH 12/14] fix deepspeech2/model.py and deepspeech2/config.py

---
 deepspeech/exps/deepspeech2/config.py | 76 ++++-----------------
 deepspeech/exps/deepspeech2/model.py  | 39 ++++++++++++
 2 files changed, 50 insertions(+), 65 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 050a50b0..7d2250fc 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -11,80 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from yacs.config import CfgNode as CN
+from yacs.config import CfgNode

 from deepspeech.models.deepspeech2 import DeepSpeech2Model
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester

-_C = CN()
+_C = CfgNode()

-_C.data = CN(
-    dict(
-        train_manifest="",
-        dev_manifest="",
-        test_manifest="",
-        max_duration=float('inf'),
-        min_duration=0.0,
-    ))
+_C.data = ManifestDataset.params()

-_C.collator =CN(
-    dict(
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
-        random_seed=0,
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-        feat_dim=0,  # 'mfcc', 'fbank'
-        delta_delta=False,  # 'mfcc', 'fbank'
-        stride_ms=10.0,  # ms
-        window_ms=20.0,  # ms
-        n_fft=None,  # fft points
-        max_freq=None,  # None for samplerate/2
-        target_sample_rate=16000,  # target sample rate
-        use_dB_normalization=True,
-        target_dB=-20,
-        dither=1.0,  # feature dither
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.collator = SpeechCollator.params()

-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
+_C.model = DeepSpeech2Model.params()

-DeepSpeech2Model.params(_C.model)
-
-_C.training = CN(
-    dict(
-        lr=5e-4,  # learning rate
-        lr_decay=1.0,  # learning rate decay
-        weight_decay=1e-6,  # the coeff of weight decay
-        global_grad_clip=5.0,  # the global norm clip
-        n_epoch=50,  # train epochs
-    ))
+_C.training = DeepSpeech2Trainer.params()

-_C.decoding = CN(
-    dict(
-        alpha=2.5,  # Coef of LM for beam search.
-        beta=0.3,  # Coef of WC for beam search.
-        cutoff_prob=1.0,  # Cutoff probability for pruning.
-        cutoff_top_n=40,  # Cutoff number for pruning.
-        lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-        decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-        error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-        num_proc_bsearch=8,  # # of CPUs for beam search.
-        beam_size=500,  # Beam search width.
-        batch_size=128,  # decoding batch size
-    ))
+_C.decoding = DeepSpeech2Tester.params()


 def get_cfg_defaults():

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 1eefc871..c11d1e25 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -34,10 +34,28 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log

+from typing import Optional
+from yacs.config import CfgNode
 logger = Log(__name__).getlog()


 class DeepSpeech2Trainer(Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # training config
+        default = CfgNode(
+                dict(
+                    lr=5e-4,  # learning rate
+                    lr_decay=1.0,  # learning rate decay
+                    weight_decay=1e-6,  # the coeff of weight decay
+                    global_grad_clip=5.0,  # the global norm clip
+                    n_epoch=50,  # train epochs
+                ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)

@@ -184,6 +202,27 @@ class DeepSpeech2Trainer(Trainer):


 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+                dict(
+                    alpha=2.5,  # Coef of LM for beam search.
+                    beta=0.3,  # Coef of WC for beam search.
+                    cutoff_prob=1.0,  # Cutoff probability for pruning.
+                    cutoff_top_n=40,  # Cutoff number for pruning.
+                    lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                    decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                    error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                    num_proc_bsearch=8,  # # of CPUs for beam search.
+                    beam_size=500,  # Beam search width.
+                    batch_size=128,  # decoding batch size
+                ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)

From 3652b87f33877d4b64b75398f9f99c34b1e5b02e Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 10:09:35 +0000
Subject: [PATCH 13/14] fix pre-commit

---
 deepspeech/exps/deepspeech2/config.py |   9 +--
 deepspeech/exps/deepspeech2/model.py  |  58 +++++++-------
 deepspeech/exps/u2/config.py          |   2 +-
 deepspeech/exps/u2/model.py           |  19 +++--
 deepspeech/io/collator.py             | 108 +++++++++++++-------------
 deepspeech/io/dataset.py              |  16 +---
 deepspeech/models/u2.py               |   1 -
 7 files changed, 108 insertions(+), 105 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 7d2250fc..2f0f5c24 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -13,12 +13,11 @@
 # limitations under the License.
 from yacs.config import CfgNode

-from deepspeech.models.deepspeech2 import DeepSpeech2Model
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
 from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
-
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.models.deepspeech2 import DeepSpeech2Model

 _C = CfgNode()

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index c11d1e25..deb8752b 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -15,11 +15,13 @@
 import time
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional

 import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle.io import DataLoader
+from yacs.config import CfgNode

 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
@@ -33,9 +35,6 @@ from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
-
-from typing import Optional
-from yacs.config import CfgNode
 logger = Log(__name__).getlog()


@@ -44,13 +43,13 @@ class DeepSpeech2Trainer(Trainer):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         # training config
         default = CfgNode(
-                dict(
-                    lr=5e-4,  # learning rate
-                    lr_decay=1.0,  # learning rate decay
-                    weight_decay=1e-6,  # the coeff of weight decay
-                    global_grad_clip=5.0,  # the global norm clip
-                    n_epoch=50,  # train epochs
-                ))
+            dict(
+                lr=5e-4,  # learning rate
+                lr_decay=1.0,  # learning rate decay
+                weight_decay=1e-6,  # the coeff of weight decay
+                global_grad_clip=5.0,  # the global norm clip
+                n_epoch=50,  # train epochs
+            ))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -184,7 +183,6 @@ class DeepSpeech2Trainer(Trainer):

         collate_fn_train = SpeechCollator.from_config(config)

-
         config.collator.augmentation_config = ""
         collate_fn_dev = SpeechCollator.from_config(config)

         self.train_loader = DataLoader(
@@ -206,18 +204,18 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         # testing config
         default = CfgNode(
-                dict(
-                    alpha=2.5,  # Coef of LM for beam search.
-                    beta=0.3,  # Coef of WC for beam search.
-                    cutoff_prob=1.0,  # Cutoff probability for pruning.
-                    cutoff_top_n=40,  # Cutoff number for pruning.
-                    lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                    decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-                    error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-                    num_proc_bsearch=8,  # # of CPUs for beam search.
-                    beam_size=500,  # Beam search width.
-                    batch_size=128,  # decoding batch size
-                ))
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))

         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -235,7 +233,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -257,7 +261,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
@@ -287,7 +292,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         with open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
                 utts, audio, audio_len, texts, texts_len = batch
-                metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout)
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
                 errors_sum += metrics['errors_sum']
                 len_refs += metrics['len_refs']
                 num_ins += metrics['num_ins']

diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py
index d8735453..4ec7bd19 100644
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -15,9 +15,9 @@ from yacs.config import CfgNode

 from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
-from deepspeech.io.collator import SpeechCollator

 _C = CfgNode()

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 836afa36..05551875 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -78,7 +78,8 @@ class U2Trainer(Trainer):
         start = time.time()

         utt, audio, audio_len, text, text_len = batch_data
-        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                    text_len)
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         loss.backward()
@@ -121,7 +122,8 @@ class U2Trainer(Trainer):
             total_loss = 0.0
             for i, batch in enumerate(self.valid_loader):
                 utt, audio, audio_len, text, text_len = batch
-                loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+                loss, attention_loss, ctc_loss = self.model(audio, audio_len,
+                                                            text, text_len)
                 if paddle.isfinite(loss):
                     num_utts = batch[1].shape[0]
                     num_seen_utts += num_utts
@@ -211,7 +213,7 @@ class U2Trainer(Trainer):
         dev_dataset = ManifestDataset.from_config(config)

         collate_fn_train = SpeechCollator.from_config(config)
-
+
         config.collator.augmentation_config = ""
         collate_fn_dev = SpeechCollator.from_config(config)

@@ -372,7 +374,13 @@ class U2Tester(U2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -399,7 +407,8 @@ class U2Tester(U2Trainer):
             simulate_streaming=cfg.simulate_streaming)
         decode_time = time.time() - start_time

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ab1e9165..ecf7024c 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -11,21 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import io
+import time
+from collections import namedtuple
+from typing import Optional
+
 import numpy as np
+from yacs.config import CfgNode

-from deepspeech.frontend.utility import IGNORE_ID
-from deepspeech.io.utility import pad_sequence
-from deepspeech.utils.log import Log
 from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
-import io
-import time
-from yacs.config import CfgNode
-from typing import Optional
-
-from collections import namedtuple
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.io.utility import pad_sequence
+from deepspeech.utils.log import Log

 __all__ = ["SpeechCollator"]

@@ -34,6 +34,7 @@ logger = Log(__name__).getlog()
 # namedtupe need global for pickle.
 TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])

+
 class SpeechCollator():
     @classmethod
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
@@ -56,8 +57,7 @@ class SpeechCollator():
                 use_dB_normalization=True,
                 target_dB=-20,
                 dither=1.0,  # feature dither
-                keep_transcription_text=False
-            ))
+                keep_transcription_text=False))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -84,7 +84,9 @@ class SpeechCollator():
         if isinstance(config.collator.augmentation_config, (str, bytes)):
             if config.collator.augmentation_config:
                 aug_file = io.open(
-                    config.collator.augmentation_config, mode='r', encoding='utf8')
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
             else:
                 aug_file = io.StringIO(initial_value='{}', newline='')
         else:
@@ -92,43 +94,46 @@ class SpeechCollator():
         assert isinstance(aug_file, io.StringIO)

         speech_collator = cls(
-            aug_file=aug_file,
-            random_seed=0,
-            mean_std_filepath=config.collator.mean_std_filepath,
-            unit_type=config.collator.unit_type,
-            vocab_filepath=config.collator.vocab_filepath,
-            spm_model_prefix=config.collator.spm_model_prefix,
-            specgram_type=config.collator.specgram_type,
-            feat_dim=config.collator.feat_dim,
-            delta_delta=config.collator.delta_delta,
-            stride_ms=config.collator.stride_ms,
-            window_ms=config.collator.window_ms,
-            n_fft=config.collator.n_fft,
-            max_freq=config.collator.max_freq,
-            target_sample_rate=config.collator.target_sample_rate,
-            use_dB_normalization=config.collator.use_dB_normalization,
-            target_dB=config.collator.target_dB,
-            dither=config.collator.dither,
-            keep_transcription_text=config.collator.keep_transcription_text
-        )
+            aug_file=aug_file,
+            random_seed=0,
+            mean_std_filepath=config.collator.mean_std_filepath,
+            unit_type=config.collator.unit_type,
+            vocab_filepath=config.collator.vocab_filepath,
+            spm_model_prefix=config.collator.spm_model_prefix,
+            specgram_type=config.collator.specgram_type,
+            feat_dim=config.collator.feat_dim,
+            delta_delta=config.collator.delta_delta,
+            stride_ms=config.collator.stride_ms,
+            window_ms=config.collator.window_ms,
+            n_fft=config.collator.n_fft,
+            max_freq=config.collator.max_freq,
+            target_sample_rate=config.collator.target_sample_rate,
+            use_dB_normalization=config.collator.use_dB_normalization,
+            target_dB=config.collator.target_dB,
+            dither=config.collator.dither,
+            keep_transcription_text=config.collator.keep_transcription_text)
         return speech_collator

-    def __init__(self, aug_file, mean_std_filepath,
-                 vocab_filepath, spm_model_prefix,
-                 random_seed=0,
-                 unit_type="char",
-                 specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-                 feat_dim=0,  # 'mfcc', 'fbank'
-                 delta_delta=False,  # 'mfcc', 'fbank'
-                 stride_ms=10.0,  # ms
-                 window_ms=20.0,  # ms
-                 n_fft=None,  # fft points
-                 max_freq=None,  # None for samplerate/2
-                 target_sample_rate=16000,  # target sample rate
-                 use_dB_normalization=True,
-                 target_dB=-20,
-                 dither=1.0,
-                 keep_transcription_text=True):
+    def __init__(
+            self,
+            aug_file,
+            mean_std_filepath,
+            vocab_filepath,
+            spm_model_prefix,
+            random_seed=0,
+            unit_type="char",
+            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            feat_dim=0,  # 'mfcc', 'fbank'
+            delta_delta=False,  # 'mfcc', 'fbank'
+            stride_ms=10.0,  # ms
+            window_ms=20.0,  # ms
+            n_fft=None,  # fft points
+            max_freq=None,  # None for samplerate/2
+            target_sample_rate=16000,  # target sample rate
+            use_dB_normalization=True,
+            target_dB=-20,
+            dither=1.0,
+            keep_transcription_text=True):
         """SpeechCollator Collator

         Args:
@@ -159,9 +164,8 @@ class SpeechCollator():

         self._local_data = TarLocalData(tar2info={}, tar2object={})
         self._augmentation_pipeline = AugmentationPipeline(
-            augmentation_config=aug_file.read(),
-            random_seed=random_seed)
-
+            augmentation_config=aug_file.read(), random_seed=random_seed)
+
         self._normalizer = FeatureNormalizer(
             mean_std_filepath) if mean_std_filepath else None

@@ -290,8 +294,6 @@ class SpeechCollator():
         text_lens = np.array(text_lens).astype(np.int64)
         return utts, padded_audios, audio_lens, padded_texts, text_lens

-
-
     @property
     def manifest(self):
         return self._manifest
@@ -318,4 +320,4 @@ class SpeechCollator():

     @property
     def stride_ms(self):
-        return self._speech_featurizer.stride_ms
\ No newline at end of file
+        return self._speech_featurizer.stride_ms

diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 70383b4d..92c60f35 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -12,19 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
-import tarfile
-import time
-from collections import namedtuple
 from typing import Optional

-import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode

-from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
-from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
-from deepspeech.frontend.normalizer import FeatureNormalizer
-from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
@@ -46,8 +38,7 @@ class ManifestDataset(Dataset):
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
-                min_output_input_ratio=0.0,
-            ))
+                min_output_input_ratio=0.0, ))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -66,7 +57,6 @@ class ManifestDataset(Dataset):
         assert 'manifest' in config.data
         assert config.data.manifest
-
         dataset = cls(
             manifest_path=config.data.manifest,
@@ -74,8 +64,7 @@ class ManifestDataset(Dataset):
             max_output_len=config.data.max_output_len,
             min_output_len=config.data.min_output_len,
             max_output_input_ratio=config.data.max_output_input_ratio,
-            min_output_input_ratio=config.data.min_output_input_ratio,
-        )
+            min_output_input_ratio=config.data.min_output_input_ratio, )
         return dataset

     def __init__(self,
@@ -111,7 +100,6 @@ class ManifestDataset(Dataset):
             min_output_input_ratio=min_output_input_ratio)
         self._manifest.sort(key=lambda x: x["feat_shape"][0])
-
     def __len__(self):
         return len(self._manifest)

diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index bcfddaef..238e2d35 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -905,7 +905,6 @@ class U2InferModel(U2Model):
     def __init__(self, configs: dict):
         super().__init__(configs)
-
     def forward(self,
                 feats,
                 feats_lengths,

From 3652b87f33877d4b64b75398f9f99c34b1e5b02e Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 10:11:17 +0000
Subject: [PATCH 14/14] fix unused imports

---
 deepspeech/io/collator.py | 1 -
 deepspeech/io/dataset.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ecf7024c..1061f97c 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
-import time
 from collections import namedtuple
 from typing import Optional

diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 92c60f35..3fc4e988 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io
 from typing import Optional

 from paddle.io import Dataset
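
PATCH 11 through PATCH 13 converge on one convention: each pipeline component
(ManifestDataset, SpeechCollator, DeepSpeech2Trainer, DeepSpeech2Tester) owns
its default hyperparameters in a `params()` classmethod and is built through a
`from_config()` classmethod that reads only its own config section, so the
experiment config is just the components' nodes hung off a single root
CfgNode. The snippet below is a minimal, self-contained sketch of that
convention using yacs; `ToyCollator` and its keys are hypothetical stand-ins
for illustration, not code from the repository.

    from typing import Optional

    from yacs.config import CfgNode


    class ToyCollator():
        """Hypothetical component following the params()/from_config() convention."""

        @classmethod
        def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
            # Each component declares its own defaults; the experiment config
            # is assembled by hanging these nodes off one root CfgNode.
            default = CfgNode(
                dict(
                    batch_size=32,  # batch size
                    num_workers=0,  # data loader workers
                    keep_transcription_text=False, ))
            if config is not None:
                config.merge_from_other_cfg(default)
            return default

        @classmethod
        def from_config(cls, config):
            # Constructors read only their own section (config.collator here),
            # which is the boundary PATCH 10/11 enforce for SpeechCollator.
            assert 'batch_size' in config.collator
            return cls(
                batch_size=config.collator.batch_size,
                keep_transcription_text=config.collator.keep_transcription_text)

        def __init__(self, batch_size=32, keep_transcription_text=False):
            self.batch_size = batch_size
            self.keep_transcription_text = keep_transcription_text


    _C = CfgNode()
    _C.collator = ToyCollator.params()

    if __name__ == '__main__':
        config = _C.clone()
        train_collator = ToyCollator.from_config(config)

        # The dev/test loaders in the patches clone the config and override a
        # collator field (e.g. blanking augmentation_config, or setting
        # keep_transcription_text) before building a second instance.
        config.collator.keep_transcription_text = True
        test_collator = ToyCollator.from_config(config)
        print(train_collator.batch_size, test_collator.keep_transcription_text)

This mirrors why setup_dataloader builds `collate_fn_train` before mutating
`config.collator.augmentation_config = ""` and building `collate_fn_dev`: the
two collators share one config object, so order of construction matters.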