From 22fce1910190309aef0e422b55a7624e6b707903 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 12 Apr 2021 09:58:23 +0000
Subject: [PATCH] can train

---
 .../exps/deepspeech2/bin/deploy/runtime.py |  4 +--
 .../exps/deepspeech2/bin/deploy/server.py  |  4 +--
 deepspeech/exps/deepspeech2/bin/tune.py    |  5 ++-
 deepspeech/exps/deepspeech2/model.py       | 22 ++++++------
 deepspeech/exps/u2/config.py               | 13 +++----
 deepspeech/exps/u2/model.py                | 35 +++++++++++--------
 deepspeech/frontend/normalizer.py          |  4 +--
 deepspeech/frontend/utility.py             |  4 +--
 deepspeech/io/dataset.py                   | 13 ++++---
 deepspeech/models/u2.py                    | 16 +++------
 deepspeech/training/scheduler.py           |  2 ++
 examples/tiny/s1/conf/augmentation.config  |  8 +++++
 examples/tiny/s1/conf/conformer.yaml       |  4 +--
 13 files changed, 71 insertions(+), 63 deletions(-)
 create mode 100644 examples/tiny/s1/conf/augmentation.config

diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
index ed0338870..737d6432a 100644
--- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
@@ -79,9 +79,9 @@ def inference(config, args):
 
 def start_server(config, args):
     """Start the ASR server"""
+    config.defrost()
     config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = io.StringIO(
-        initial_value='{}', newline='')
+    config.data.augmentation_config = ""
     config.data.keep_transcription_text = True
     dataset = ManifestDataset.from_config(config)
 
diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py
index f5b0a7d51..ff545b196 100644
--- a/deepspeech/exps/deepspeech2/bin/deploy/server.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py
@@ -31,9 +31,9 @@ from deepspeech.io.dataset import ManifestDataset
 
 def start_server(config, args):
     """Start the ASR server"""
+    config.defrost()
     config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = io.StringIO(
-        initial_value='{}', newline='')
+    config.data.augmentation_config = ""
     config.data.keep_transcription_text = True
     dataset = ManifestDataset.from_config(config)
 
diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py
index 3a2907a52..dafa6e041 100644
--- a/deepspeech/exps/deepspeech2/bin/tune.py
+++ b/deepspeech/exps/deepspeech2/bin/tune.py
@@ -36,10 +36,9 @@ def tune(config, args):
         raise ValueError("num_alphas must be non-negative!")
     if not args.num_betas >= 0:
         raise ValueError("num_betas must be non-negative!")
-
+    config.defrost()
    config.data.manfiest = config.data.dev_manifest
-    config.data.augmentation_config = io.StringIO(
-        initial_value='{}', newline='')
+    config.data.augmentation_config = ""
     config.data.keep_transcription_text = True
     dev_dataset = ManifestDataset.from_config(config)
 
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 170d47d9c..75335d318 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -13,7 +13,6 @@
 # limitations under the License.
"""Contains DeepSpeech2 model.""" -import io import time import logging import numpy as np @@ -24,7 +23,7 @@ import paddle from paddle import distributed as dist from paddle.io import DataLoader -from deepspeech.training import Trainer +from deepspeech.training.trainer import Trainer from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.utils import mp_tools @@ -140,15 +139,15 @@ class DeepSpeech2Trainer(Trainer): self.logger.info("Setup model/optimizer/lr_scheduler!") def setup_dataloader(self): - config = self.config + config = self.config.clone() + config.defrost() config.data.keep_transcription_text = False - config.data.manfiest = config.data.train_manifest + config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manfiest = config.data.dev_manifest - config.data.augmentation_config = io.StringIO( - initial_value='{}', newline='') + config.data.manifest = config.data.dev_manifest + config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) if self.parallel: @@ -324,13 +323,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.logger.info("Setup model!") def setup_dataloader(self): - config = self.config + config = self.config.clone() + config.defrost() # return raw text - config.data.manfiest = config.data.test_manifest - config.data.augmentation_config = io.StringIO( - initial_value='{}', newline='') - config.data.keep_transcription_text = True + config.data.manifest = config.data.test_manifest + config.data.augmentation_config = "" test_dataset = ManifestDataset.from_config(config) # return text ord id diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 48ec05efb..34228cb93 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -14,23 +14,20 @@ from yacs.config import CfgNode +from deepspeech.io.dataset import ManifestDataset from deepspeech.models.u2 import U2Model from deepspeech.exps.u2.model import U2Trainer from deepspeech.exps.u2.model import U2Tester _C = CfgNode() -_C.data = CfgNode() -ManifestDataset.params(_C.data) +_C.data = ManifestDataset.params() -_C.model = CfgNode() -U2Model.params(_C.model) +_C.model = U2Model.params() -_C.training = CfgNode() -U2Trainer.params(_C.training) +_C.training = U2Trainer.params() -_C.decoding = CfgNode() -U2Tester.params(_C.training) +_C.decoding = U2Tester.params() def get_cfg_defaults(): diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 29f7f03cd..0a09eeda1 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -18,12 +18,14 @@ import logging import numpy as np from collections import defaultdict from pathlib import Path +from typing import Optional +from yacs.config import CfgNode import paddle from paddle import distributed as dist from paddle.io import DataLoader -from deepspeech.training import Trainer +from deepspeech.training.trainer import Trainer from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.scheduler import WarmupLR @@ -77,7 +79,7 @@ class U2Trainer(Trainer): self.model.train() start = time.time() - loss = self.model(*batch_data) + loss, attention_loss, ctc_loss = self.model(*batch_data) loss.backward() layer_tools.print_grads(self.model, print_func=None) if self.iteration % train_conf.accum_grad == 0: @@ -88,13 +90,15 @@ class U2Trainer(Trainer): losses_np = { 'train_loss': float(loss), - 'train_loss_div_batchsize': - float(loss) / self.config.data.batch_size + 
'train_att_loss': float(attention_loss), + 'train_ctc_loss': float(ctc_loss), } msg = "Train: Rank: {}, ".format(dist.get_rank()) msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) msg += "time: {:>.3f}s, ".format(iteration_time) + msg += f"batch size: {self.config.data.batch_size}, " + msg += f"accum: {train_config.accum_grad}, " msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) if self.iteration % train_conf.log_interval == 0: @@ -113,11 +117,11 @@ class U2Trainer(Trainer): f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) for i, batch in enumerate(self.valid_loader): - loss = self.model(*batch) + total_loss, attention_loss, ctc_loss = self.model(*batch) - valid_losses['val_loss'].append(float(loss)) - valid_losses['val_loss_div_batchsize'].append( - float(loss) / self.config.data.batch_size) + valid_losses['val_loss'].append(float(total_loss)) + valid_losses['val_att_loss'].append(float(attention_loss)) + valid_losses['val_ctc_loss'].append(float(ctc_loss)) # write visual log valid_losses = {k: np.mean(v) for k, v in valid_losses.items()} @@ -137,13 +141,14 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() + config.defrost() config.data.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manfiest = config.data.train_manifest + config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manfiest = config.data.dev_manifest + config.data.manifest = config.data.dev_manifest config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) @@ -181,7 +186,7 @@ class U2Trainer(Trainer): # test dataset, return raw text config.data.keep_transcription_text = True config.data.augmentation_config = "" - config.data.manfiest = config.data.test_manifest + config.data.manifest = config.data.test_manifest test_dataset = ManifestDataset.from_config(config) # return text ord id self.test_loader = DataLoader( @@ -193,10 +198,12 @@ class U2Trainer(Trainer): self.logger.info("Setup train/valid/test Dataloader!") def setup_model(self): - config = self.config.clone() + config = self.config model_conf = config.model + model_conf.defrost() model_conf.input_dim = self.train_loader.dataset.feature_size model_conf.output_dim = self.train_loader.dataset.vocab_size + model_conf.freeze() model = U2Model.from_config(model_conf) if self.parallel: @@ -206,12 +213,12 @@ class U2Trainer(Trainer): train_config = config.training optim_type = train_config.optim - optim_conf = train_config.train_config + optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler scheduler_conf = train_config.scheduler_conf grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - weight_decay = paddle.regularizer.L2Decay(train_config.weight_decay) + weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) if scheduler_type == 'expdecaylr': lr_scheduler = paddle.optimizer.lr.ExponentialDecay( diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index a57b247ad..7be421fb1 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -93,5 +93,5 @@ class FeatureNormalizer(object): features.append( featurize_func(AudioSegment.from_file(instance["feat"]))) features = np.hstack(features) #(D, T) - self._mean = np.mean(features, axis=1).reshape([-1, 1]) #(D, 1) - self._std = np.std(features, 
axis=1).reshape([-1, 1]) #(D, 1) + self._mean = np.mean(features, axis=1).reshape([1, -1]) #(1, D) + self._std = np.std(features, axis=1).reshape([1, -1]) #(1, D) diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index a8529c30a..8c5581eb0 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -235,8 +235,8 @@ def _load_kaldi_cmvn(kaldi_cmvn_file): def _load_npz_cmvn(npz_cmvn_file, eps=1e-20): npzfile = np.load(npz_cmvn_file) - means = npzfile["mean"] #(D, 1) - std = npzfile["std"] #(D, 1) + means = npzfile["mean"] #(1, D) + std = npzfile["std"] #(1, D) std = np.clip(std, eps, None) variance = 1.0 / std cmvn = np.array([means, variance]) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index f783d8273..4875929eb 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -16,8 +16,9 @@ import io import random import tarfile import logging -from collections import namedtuple +from typing import Optional from yacs.config import CfgNode +from collections import namedtuple from paddle.io import Dataset @@ -42,6 +43,7 @@ class ManifestDataset(Dataset): train_manifest="", dev_manifest="", test_manifest="", + manifest="", unit_type="char", vocab_filepath="", spm_model_prefix="", @@ -60,7 +62,7 @@ class ManifestDataset(Dataset): raw_wav=True, # use raw_wav or kaldi feature specgram_type='linear', # 'linear', 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank' - delat_delta=False, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' target_sample_rate=16000, # target sample rate use_dB_normalization=True, target_dB=-20, @@ -86,8 +88,9 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. """ - assert manifest in config.data - assert keep_transcription_text in config.data + assert 'manifest' in config.data + assert config.data.manifest + assert 'keep_transcription_text' in config.data if isinstance(config.data.augmentation_config, (str, bytes)): if config.data.augmentation_config: @@ -119,7 +122,7 @@ class ManifestDataset(Dataset): target_sample_rate=config.data.target_sample_rate, specgram_type=config.data.specgram_type, feat_dim=config.data.feat_dim, - delta_delta=config.data.delat_delta, + delta_delta=config.data.delta_delta, use_dB_normalization=config.data.use_dB_normalization, target_dB=config.data.target_dB, random_seed=config.data.random_seed, diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index 38a781e19..c875a5ab5 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -75,8 +75,8 @@ class U2BaseModel(nn.Module): dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, - input_layer=conv2d, # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=true, + input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before=True, cnn_module_kernel=15, use_cnn_module=True, activation_type='swish', @@ -98,7 +98,7 @@ class U2BaseModel(nn.Module): dict( ctc_weight=0.3, lsm_weight=0.1, # label smoothing option - length_normalized_loss=false, )) + length_normalized_loss=False, )) if config is not None: config.merge_from_other_cfg(default) @@ -744,15 +744,9 @@ class U2Model(U2BaseModel): ValueError: raise when using not support encoder type. 
 
         Returns:
-            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
+            nn.Layer: U2Model
         """
-        vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
-
-        model = cls(vocab_size=vocab_size,
-                    encoder=encoder,
-                    decoder=decoder,
-                    ctc=ctc,
-                    **configs['model_conf'])
+        model = cls(configs)
         return model
 
     @classmethod
diff --git a/deepspeech/training/scheduler.py b/deepspeech/training/scheduler.py
index a45279505..54103e061 100644
--- a/deepspeech/training/scheduler.py
+++ b/deepspeech/training/scheduler.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import logging
+from typing import Union
+from typeguard import check_argument_types
 
 from paddle.optimizer.lr import LRScheduler
 
diff --git a/examples/tiny/s1/conf/augmentation.config b/examples/tiny/s1/conf/augmentation.config
new file mode 100644
index 000000000..6c24da549
--- /dev/null
+++ b/examples/tiny/s1/conf/augmentation.config
@@ -0,0 +1,8 @@
+[
+    {
+        "type": "shift",
+        "params": {"min_shift_ms": -5,
+                   "max_shift_ms": 5},
+        "prob": 1.0
+    }
+]
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 6ec976f74..af26f0291 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -5,7 +5,7 @@ data:
   test_manifest: data/manifest.tiny
   vocab_filepath: data/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: 'bpe_unigram_200'
+  spm_model_prefix: 'data/bpe_unigram_200'
   mean_std_filepath: ""
   augmentation_config: conf/augmentation.config
   batch_size: 4
@@ -119,11 +119,11 @@ training:
   optim: adam
   optim_conf:
     lr: 0.002
-    lr_decay: 1.0
     weight_decay: 1e-06
   scheduler: warmuplr  # pytorch v1.1.0+ required
   scheduler_conf:
     warmup_steps: 25000
+    lr_decay: 1.0
   log_interval: 100
 
 decoding:
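
Note on the recurring pattern in this patch: every dataloader-setup site now
clones the shared yacs config, defrosts it, points `data.manifest` at the
split-specific manifest, and clears `data.augmentation_config` outside
training. Below is a minimal sketch of that per-split flow, illustrative only
and not part of the patch; it assumes `get_cfg_defaults()` in
deepspeech/exps/u2/config.py returns an unfrozen copy of the defaults that the
entry scripts merge and then freeze, so `defrost()` is a no-op here but mirrors
the trainer code, where `self.config` arrives frozen.

    from deepspeech.exps.u2.config import get_cfg_defaults
    from deepspeech.io.dataset import ManifestDataset

    config = get_cfg_defaults()
    config.merge_from_file("examples/tiny/s1/conf/conformer.yaml")

    # Clone so per-split overrides never leak back into the shared config,
    # then defrost because yacs CfgNodes reject attribute assignment once frozen.
    dev_config = config.clone()
    dev_config.defrost()
    dev_config.data.manifest = dev_config.data.dev_manifest
    dev_config.data.augmentation_config = ""  # no augmentation on dev/test
    dev_dataset = ManifestDataset.from_config(dev_config)

The same clone/defrost/override sequence is what DeepSpeech2Trainer,
DeepSpeech2Tester, and U2Trainer apply in their setup_dataloader methods above.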