pull/578/head
Hui Zhang 5 years ago
parent dee672a753
commit 22fce19101

@@ -79,9 +79,9 @@ def inference(config, args):
def start_server(config, args):
"""Start the ASR server"""
config.defrost()
config.data.manfiest = config.data.test_manifest
config.data.augmentation_config = io.StringIO(
initial_value='{}', newline='')
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config)

@@ -31,9 +31,9 @@ from deepspeech.io.dataset import ManifestDataset
def start_server(config, args):
"""Start the ASR server"""
config.defrost()
config.data.manfiest = config.data.test_manifest
config.data.augmentation_config = io.StringIO(
initial_value='{}', newline='')
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config)

@@ -36,10 +36,9 @@ def tune(config, args):
raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!")
config.defrost()
config.data.manfiest = config.data.dev_manifest
config.data.augmentation_config = io.StringIO(
initial_value='{}', newline='')
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dev_dataset = ManifestDataset.from_config(config)

@@ -13,7 +13,6 @@
# limitations under the License.
"""Contains DeepSpeech2 model."""
import io
import time
import logging
import numpy as np
@@ -24,7 +23,7 @@ import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.training import Trainer
from deepspeech.training.trainer import Trainer
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.utils import mp_tools
@@ -140,15 +139,15 @@ class DeepSpeech2Trainer(Trainer):
self.logger.info("Setup model/optimizer/lr_scheduler!")
def setup_dataloader(self):
config = self.config
config = self.config.clone()
config.defrost()
config.data.keep_transcription_text = False
config.data.manfiest = config.data.train_manifest
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manfiest = config.data.dev_manifest
config.data.augmentation_config = io.StringIO(
initial_value='{}', newline='')
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
dev_dataset = ManifestDataset.from_config(config)
if self.parallel:
@@ -324,13 +323,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.logger.info("Setup model!")
def setup_dataloader(self):
config = self.config
config = self.config.clone()
config.defrost()
# return raw text
config.data.manfiest = config.data.test_manifest
config.data.augmentation_config = io.StringIO(
initial_value='{}', newline='')
config.data.keep_transcription_text = True
config.data.manifest = config.data.test_manifest
config.data.augmentation_config = ""
test_dataset = ManifestDataset.from_config(config)
# return text ord id

@@ -14,23 +14,20 @@
from yacs.config import CfgNode
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.u2 import U2Model
from deepspeech.exps.u2.model import U2Trainer
from deepspeech.exps.u2.model import U2Tester
_C = CfgNode()
_C.data = CfgNode()
ManifestDataset.params(_C.data)
_C.data = ManifestDataset.params()
_C.model = CfgNode()
U2Model.params(_C.model)
_C.model = U2Model.params()
_C.training = CfgNode()
U2Trainer.params(_C.training)
_C.training = U2Trainer.params()
_C.decoding = CfgNode()
U2Tester.params(_C.training)
_C.decoding = U2Tester.params()
def get_cfg_defaults():
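
A minimal sketch of the params() / get_cfg_defaults() pattern used above, assuming yacs CfgNode and borrowing a few default keys from the ManifestDataset hunk further down; the stub class and its defaults here are illustrative, not the project's actual code:

    from yacs.config import CfgNode


    class ManifestDataset:  # illustrative stub, not the real deepspeech.io.dataset class
        @classmethod
        def params(cls, config: CfgNode = None) -> CfgNode:
            # Each component owns its defaults and hands back a ready-made CfgNode.
            default = CfgNode(
                dict(
                    train_manifest="",
                    dev_manifest="",
                    test_manifest="",
                    manifest="",
                    unit_type="char",
                ))
            if config is not None:
                config.merge_from_other_cfg(default)
            return default


    _C = CfgNode()
    _C.data = ManifestDataset.params()  # assign the returned node instead of mutating one in place


    def get_cfg_defaults():
        # Return a clone so callers can edit it without touching the module-level defaults.
        return _C.clone()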

@@ -18,12 +18,14 @@ import logging
import numpy as np
from collections import defaultdict
from pathlib import Path
from typing import Optional
from yacs.config import CfgNode
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.training import Trainer
from deepspeech.training.trainer import Trainer
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.scheduler import WarmupLR
@@ -77,7 +79,7 @@ class U2Trainer(Trainer):
self.model.train()
start = time.time()
loss = self.model(*batch_data)
loss, attention_loss, ctc_loss = self.model(*batch_data)
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
if self.iteration % train_conf.accum_grad == 0:
@@ -88,13 +90,15 @@ class U2Trainer(Trainer):
losses_np = {
'train_loss': float(loss),
'train_loss_div_batchsize':
float(loss) / self.config.data.batch_size
'train_att_loss': float(attention_loss),
'train_ctc_loss': float(ctc_loss),
}
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "time: {:>.3f}s, ".format(iteration_time)
msg += f"batch size: {self.config.data.batch_size}, "
msg += f"accum: {train_config.accum_grad}, "
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
if self.iteration % train_conf.log_interval == 0:
@@ -113,11 +117,11 @@ class U2Trainer(Trainer):
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
for i, batch in enumerate(self.valid_loader):
loss = self.model(*batch)
total_loss, attention_loss, ctc_loss = self.model(*batch)
valid_losses['val_loss'].append(float(loss))
valid_losses['val_loss_div_batchsize'].append(
float(loss) / self.config.data.batch_size)
valid_losses['val_loss'].append(float(total_loss))
valid_losses['val_att_loss'].append(float(attention_loss))
valid_losses['val_ctc_loss'].append(float(ctc_loss))
# write visual log
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
@@ -137,13 +141,14 @@ class U2Trainer(Trainer):
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.data.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manfiest = config.data.train_manifest
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manfiest = config.data.dev_manifest
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
dev_dataset = ManifestDataset.from_config(config)
@@ -181,7 +186,7 @@ class U2Trainer(Trainer):
# test dataset, return raw text
config.data.keep_transcription_text = True
config.data.augmentation_config = ""
config.data.manfiest = config.data.test_manifest
config.data.manifest = config.data.test_manifest
test_dataset = ManifestDataset.from_config(config)
# return text ord id
self.test_loader = DataLoader(
@@ -193,10 +198,12 @@ class U2Trainer(Trainer):
self.logger.info("Setup train/valid/test Dataloader!")
def setup_model(self):
config = self.config.clone()
config = self.config
model_conf = config.model
model_conf.defrost()
model_conf.input_dim = self.train_loader.dataset.feature_size
model_conf.output_dim = self.train_loader.dataset.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf)
if self.parallel:
@@ -206,12 +213,12 @@ class U2Trainer(Trainer):
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.train_config
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
weight_decay = paddle.regularizer.L2Decay(train_config.weight_decay)
weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
if scheduler_type == 'expdecaylr':
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(

@@ -93,5 +93,5 @@ class FeatureNormalizer(object):
features.append(
featurize_func(AudioSegment.from_file(instance["feat"])))
features = np.hstack(features) #(D, T)
self._mean = np.mean(features, axis=1).reshape([-1, 1]) #(D, 1)
self._std = np.std(features, axis=1).reshape([-1, 1]) #(D, 1)
self._mean = np.mean(features, axis=1).reshape([1, -1]) #(1, D)
self._std = np.std(features, axis=1).reshape([1, -1]) #(1, D)

@@ -235,8 +235,8 @@ def _load_kaldi_cmvn(kaldi_cmvn_file):
def _load_npz_cmvn(npz_cmvn_file, eps=1e-20):
npzfile = np.load(npz_cmvn_file)
means = npzfile["mean"] #(D, 1)
std = npzfile["std"] #(D, 1)
means = npzfile["mean"] #(1, D)
std = npzfile["std"] #(1, D)
std = np.clip(std, eps, None)
variance = 1.0 / std
cmvn = np.array([means, variance])
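
A minimal numpy sketch of how (1, D) CMVN statistics broadcast over features, assuming frames are stacked along the first axis as (T, D); the shapes and the epsilon clip mirror the hunks above, the rest is illustrative:

    import numpy as np

    feats = np.random.randn(100, 80)              # T=100 frames, D=80 feature bins (illustrative)
    mean = np.mean(feats, axis=0, keepdims=True)  # (1, D) row vector
    std = np.std(feats, axis=0, keepdims=True)    # (1, D) row vector
    # (1, D) stats broadcast across the time axis of a (T, D) feature matrix.
    normalized = (feats - mean) / np.clip(std, 1e-20, None)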

@@ -16,8 +16,9 @@ import io
import random
import tarfile
import logging
from collections import namedtuple
from typing import Optional
from yacs.config import CfgNode
from collections import namedtuple
from paddle.io import Dataset
@@ -42,6 +43,7 @@ class ManifestDataset(Dataset):
train_manifest="",
dev_manifest="",
test_manifest="",
manifest="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
@@ -60,7 +62,7 @@ class ManifestDataset(Dataset):
raw_wav=True, # use raw_wav or kaldi feature
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delat_delta=False, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
@@ -86,8 +88,9 @@ class ManifestDataset(Dataset):
Returns:
ManifestDataset: dataet object.
"""
assert manifest in config.data
assert keep_transcription_text in config.data
assert 'manifest' in config.data
assert config.data.manifest
assert 'keep_transcription_text' in config.data
if isinstance(config.data.augmentation_config, (str, bytes)):
if config.data.augmentation_config:
@@ -119,7 +122,7 @@ class ManifestDataset(Dataset):
target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type,
feat_dim=config.data.feat_dim,
delta_delta=config.data.delat_delta,
delta_delta=config.data.delta_delta,
use_dB_normalization=config.data.use_dB_normalization,
target_dB=config.data.target_dB,
random_seed=config.data.random_seed,

@@ -75,8 +75,8 @@ class U2BaseModel(nn.Module):
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer=conv2d, # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before=true,
input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before=True,
cnn_module_kernel=15,
use_cnn_module=True,
activation_type='swish',
@@ -98,7 +98,7 @@ class U2BaseModel(nn.Module):
dict(
ctc_weight=0.3,
lsm_weight=0.1, # label smoothing option
length_normalized_loss=false, ))
length_normalized_loss=False, ))
if config is not None:
config.merge_from_other_cfg(default)
@@ -744,15 +744,9 @@ class U2Model(U2BaseModel):
ValueError: raise when using not support encoder type.
Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
nn.Layer: U2Model
"""
vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
model = cls(vocab_size=vocab_size,
encoder=encoder,
decoder=decoder,
ctc=ctc,
**configs['model_conf'])
model = cls(configs)
return model
@classmethod

@@ -13,6 +13,8 @@
# limitations under the License.
import logging
from typing import Union
from typeguard import check_argument_types
from paddle.optimizer.lr import LRScheduler

@@ -0,0 +1,8 @@
[
{
"type": "shift",
"params": {"min_shift_ms": -5,
"max_shift_ms": 5},
"prob": 1.0
}
]
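
A hedged sketch of consuming an augmentation spec like the JSON above: parse the list, then apply each augmentor with its prob. The shift helper, variable names, and padding behaviour are illustrative, not the project's AugmentationPipeline API:

    import json
    import random
    import numpy as np

    specs = json.loads(
        '[{"type": "shift", "params": {"min_shift_ms": -5, "max_shift_ms": 5}, "prob": 1.0}]')

    def shift(samples, sample_rate, min_shift_ms, max_shift_ms):
        # Randomly shift the waveform in time, zero-padding to keep its length (illustrative).
        shift_ms = random.uniform(min_shift_ms, max_shift_ms)
        n = int(sample_rate * shift_ms / 1000.0)
        out = np.zeros_like(samples)
        if n >= 0:
            out[:len(samples) - n] = samples[n:]
        else:
            out[-n:] = samples[:len(samples) + n]
        return out

    samples, sample_rate = np.random.randn(16000).astype("float32"), 16000  # 1 s of fake audio
    for spec in specs:
        if spec["type"] == "shift" and random.random() < spec["prob"]:
            samples = shift(samples, sample_rate, **spec["params"])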

@@ -5,7 +5,7 @@ data:
test_manifest: data/manifest.tiny
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'bpe_unigram_200'
spm_model_prefix: 'data/bpe_unigram_200'
mean_std_filepath: ""
augmentation_config: conf/augmentation.config
batch_size: 4
@@ -119,11 +119,11 @@ training:
optim: adam
optim_conf:
lr: 0.002
lr_decay: 1.0
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
decoding:
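
For orientation, warmuplr schedulers driven by a warmup_steps setting like the one above typically follow the Noam-style rule sketched below; this is an assumption about the general scheme, not the project's WarmupLR implementation:

    def warmup_lr(step, base_lr=0.002, warmup_steps=25000):
        # Ramp the learning rate up linearly for warmup_steps, then decay with step ** -0.5.
        step = max(step, 1)
        return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)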
