Merge pull request #820 from PaddlePaddle/ctc

more ctc config: read ctc_dropoutrate / ctc_grad_norm_type from model_conf, accept grad_norm_type 'batch' instead of 'batchsize', time epoch train/eval with Timer, and comment out @jit.to_static export decorators
Hui Zhang 3 years ago committed by GitHub
commit 91bc5959a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -34,6 +34,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
 from deepspeech.training.scheduler import LRSchedulerFactory
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
@@ -184,40 +185,42 @@ class U2Trainer(Trainer):
             self.save(tag='init')
             self.lr_scheduler.step(self.iteration)
-        if self.parallel:
+        if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
             self.train_loader.batch_sampler.set_epoch(self.epoch)
 
         logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
         while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
-                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
+                    data_start_time = time.time()
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
 
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts
 
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
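Timer, imported above, wraps the train and eval phases and logs their wall-clock cost. A minimal sketch of such a context manager, assuming the real class in deepspeech/training/timer.py (which may log through the project logger and format durations differently):

```python
import time


class Timer(object):
    """Context manager that reports elapsed wall-clock time on exit.

    `message` is a format template with one `{}` placeholder, e.g.
    Timer("Epoch-Train Time Cost: {}").
    """

    def __init__(self, message: str):
        self.message = message

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.time() - self.start
        print(self.message.format(f"{elapsed:.3f}s"))
        # Returning None (falsy) lets exceptions from the block propagate,
        # so the try/except inside the `with` body still sees them.
```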

@@ -32,6 +32,7 @@ from deepspeech.io.dataloader import BatchDataLoader
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
 from deepspeech.training.scheduler import LRSchedulerFactory
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
@@ -190,35 +191,37 @@ class U2Trainer(Trainer):
         logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
         while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
-                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
+                    data_start_time = time.time()
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
 
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts
 
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@@ -38,6 +38,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.scheduler import WarmupLR
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import bleu_score
 from deepspeech.utils import ctc_utils
@@ -207,35 +208,37 @@ class U2STTrainer(Trainer):
         logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
         while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
-                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
+                    data_start_time = time.time()
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
 
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts
 
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@@ -115,7 +115,8 @@ class U2BaseModel(nn.Layer):
                  ctc_weight: float=0.5,
                  ignore_id: int=IGNORE_ID,
                  lsm_weight: float=0.0,
-                 length_normalized_loss: bool=False):
+                 length_normalized_loss: bool=False,
+                 **kwargs):
         assert 0.0 <= ctc_weight <= 1.0, ctc_weight
 
         super().__init__()
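The trailing **kwargs makes the constructor tolerant of extra model_conf entries, presumably so the new ctc_dropoutrate and ctc_grad_norm_type keys (consumed during CTC construction, not here) don't raise a TypeError when the conf dict is splatted in. A self-contained illustration of the pattern:

```python
# Illustrative only: unknown keys are absorbed by **kwargs instead of
# raising "unexpected keyword argument" when the dict is unpacked.
class Model:
    def __init__(self, ctc_weight: float = 0.5, **kwargs):
        self.ctc_weight = ctc_weight  # extra keys in kwargs are ignored


conf = {
    'ctc_weight': 0.3,
    'ctc_dropoutrate': 0.0,            # consumed elsewhere (CTCDecoder)
    'ctc_grad_norm_type': 'instance',
}
m = Model(**conf)  # works; without **kwargs this would be a TypeError
```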
@@ -661,9 +662,7 @@
             xs, offset, required_cache_size, subsampling_cache,
             elayers_output_cache, conformer_cnn_cache)
 
-    # @jit.to_static([
-    #     paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
-    # ])
+    # @jit.to_static
     def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
         """ Export interface for c++ call, apply linear transform and log
             softmax before ctc
@@ -830,6 +829,7 @@ class U2Model(U2BaseModel):
         Returns:
             int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
         """
+        # cmvn
         if configs['cmvn_file'] is not None:
             mean, istd = load_cmvn(configs['cmvn_file'],
                                    configs['cmvn_file_type'])
@@ -839,11 +839,13 @@
         else:
             global_cmvn = None
 
+        # input & output dim
         input_dim = configs['input_dim']
         vocab_size = configs['output_dim']
         assert input_dim != 0, input_dim
         assert vocab_size != 0, vocab_size
 
+        # encoder
        encoder_type = configs.get('encoder', 'transformer')
        logger.info(f"U2 Encoder type: {encoder_type}")
        if encoder_type == 'transformer':
@@ -855,17 +857,21 @@
         else:
             raise ValueError(f"not support encoder type:{encoder_type}")
 
+        # decoder
         decoder = TransformerDecoder(vocab_size,
                                      encoder.output_size(),
                                      **configs['decoder_conf'])
 
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropoutrate'],
             reduction=True,  # sum
             batch_average=True,  # sum / batch_size
-            grad_norm_type='instance')
+            grad_norm_type=model_conf['ctc_grad_norm_type'])
+
         return vocab_size, encoder, decoder, ctc
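Because both new keys are read with direct indexing, a config missing them now fails fast with a KeyError at model construction; this is why every recipe YAML below gains the two entries. A tolerant variant (not what this commit does) would fall back to the old hard-coded values:

```python
model_conf = {'ctc_weight': 0.3}  # an old-style config without the new keys

# Hypothetical fallback, NOT the merged behavior: the committed code indexes
# model_conf directly and would raise KeyError on a config like this one.
dropout_rate = model_conf.get('ctc_dropoutrate', 0.0)
grad_norm_type = model_conf.get('ctc_grad_norm_type', 'instance')
```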

@@ -413,26 +413,26 @@ class U2STBaseModel(nn.Layer):
         best_hyps = best_hyps[:, 1:]
         return best_hyps
 
-    @jit.to_static
+    # @jit.to_static
     def subsampling_rate(self) -> int:
         """ Export interface for c++ call, return subsampling_rate of the
             model
         """
         return self.encoder.embed.subsampling_rate
 
-    @jit.to_static
+    # @jit.to_static
     def right_context(self) -> int:
         """ Export interface for c++ call, return right_context of the model
         """
         return self.encoder.embed.right_context
 
-    @jit.to_static
+    # @jit.to_static
     def sos_symbol(self) -> int:
         """ Export interface for c++ call, return sos symbol id of the model
         """
         return self.sos
 
-    @jit.to_static
+    # @jit.to_static
     def eos_symbol(self) -> int:
         """ Export interface for c++ call, return eos symbol id of the model
         """
@@ -468,7 +468,7 @@ class U2STBaseModel(nn.Layer):
             xs, offset, required_cache_size, subsampling_cache,
             elayers_output_cache, conformer_cnn_cache)
 
-    @jit.to_static
+    # @jit.to_static
     def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
         """ Export interface for c++ call, apply linear transform and log
             softmax before ctc
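With the decorators commented out, these export-only helpers stay ordinary dynamic-graph methods; paddle.jit.to_static can still be applied as a plain function call at export time, so the change presumably defers static-graph conversion rather than dropping the C++ export interface.
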
@@ -643,14 +643,16 @@ class U2STModel(U2STBaseModel):
             decoder = TransformerDecoder(vocab_size,
                                          encoder.output_size(),
                                          **configs['decoder_conf'])
+            # ctc decoder and ctc loss
+            model_conf = configs['model_conf']
             ctc = CTCDecoder(
                 odim=vocab_size,
                 enc_n_units=encoder.output_size(),
                 blank_id=0,
-                dropout_rate=0.0,
+                dropout_rate=model_conf['ctc_dropoutrate'],
                 reduction=True,  # sum
                 batch_average=True,  # sum / batch_size
-                grad_norm_type='instance')
+                grad_norm_type=model_conf['ctc_grad_norm_type'])
 
             return vocab_size, encoder, (st_decoder, decoder, ctc)
         else:

@@ -36,16 +36,16 @@ class CTCLoss(nn.Layer):
             f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
 
         # instance for norm_by_times
-        # batchsize for norm_by_batchsize
+        # batch for norm_by_batchsize
         # frame for norm_by_total_logits_len
-        assert grad_norm_type in ('instance', 'batchsize', 'frame', None)
+        assert grad_norm_type in ('instance', 'batch', 'frame', None)
         self.norm_by_times = False
         self.norm_by_batchsize = False
         self.norm_by_total_logits_len = False
         logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
         if grad_norm_type == 'instance':
             self.norm_by_times = True
-        if grad_norm_type == 'batchsize':
+        if grad_norm_type == 'batch':
             self.norm_by_batchsize = True
         if grad_norm_type == 'frame':
             self.norm_by_total_logits_len = True
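The accepted spelling for per-batch normalization changes from 'batchsize' to 'batch', so older configs using 'batchsize' now trip the assert. The flag selection reduces to this mapping (an illustrative helper, not part of the codebase):

```python
def grad_norm_flags(grad_norm_type):
    """Map a grad_norm_type string to the three CTCLoss normalization flags."""
    assert grad_norm_type in ('instance', 'batch', 'frame', None)
    return {
        'norm_by_times': grad_norm_type == 'instance',
        'norm_by_batchsize': grad_norm_type == 'batch',
        'norm_by_total_logits_len': grad_norm_type == 'frame',
    }


# e.g. 'batch' normalizes the CTC gradient by batch size only:
assert grad_norm_flags('batch') == {
    'norm_by_times': False,
    'norm_by_batchsize': True,
    'norm_by_total_logits_len': False,
}
```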

@@ -47,9 +47,10 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
                 sum_square = layers.reduce_sum(square)
                 sum_square_list.append(sum_square)
 
-                # debug log
-                logger.debug(
-                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")
+                # debug log, not dump all since slow down train process
+                if i < 10:
+                    logger.debug(
+                        f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")
 
         # all parameters have been filterd out
         if len(sum_square_list) == 0:
@@ -75,9 +76,10 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
             new_grad = layers.elementwise_mul(x=g, y=clip_var)
             params_and_grads.append((p, new_grad))
 
-            # debug log
-            logger.debug(
-                f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
-            )
+            # debug log, not dump all since slow down train process
+            if i < 10:
+                logger.debug(
+                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
+                )
 
         return params_and_grads
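Both clipping loops now emit the per-parameter norm only for the first ten parameters, since dumping every tensor's norm at debug level slows the train loop. The pattern in isolation (Param here is a stand-in for a paddle parameter with a .name attribute):

```python
import logging
from collections import namedtuple

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("gradclip")

Param = namedtuple("Param", "name")
params_grads = [(Param(f"w{i}"), float(i)) for i in range(30)]

for i, (p, g) in enumerate(params_grads):
    if i < 10:  # cap debug output; logging all params slows training
        logger.debug(f"Grad Before Clip: {p.name}: {g}")
```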

@@ -183,5 +183,3 @@ bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
 cd examples/aishell/s0
 bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
 ```

@@ -76,6 +76,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
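The same two model_conf entries are added across every U2 and U2-ST recipe config below; only the surrounding context (ctc_weight, asr_weight) differs per recipe.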

@@ -71,6 +71,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -76,6 +76,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -69,6 +69,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -72,6 +72,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -33,7 +33,7 @@ collator:
     keep_transcription_text: False
     sortagrad: True
     shuffle_method: batch_shuffle
-    num_workers: 2
+    num_workers: 0
 
 # network architecture
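Note: num_workers: 0 turns off multiprocess data loading for this collator, so batches are produced in the main process.
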
@@ -67,6 +67,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -20,7 +20,7 @@ echo "using ${device}..."
 mkdir -p exp
 
 seed=10086
-if [ ${seed} != 0]; then
+if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
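Without the space, the shell passes `0]` to `[` as a single word, the test exits with a missing-`]` error, and FLAGS_cudnn_deterministic is never exported; `]` must be a separate argument.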

@@ -76,6 +76,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -69,6 +69,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -72,6 +72,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -58,6 +58,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -68,6 +68,8 @@ model:
     model_conf:
         asr_weight: 0.0
         ctc_weight: 0.0
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -68,6 +68,8 @@ model:
     model_conf:
         asr_weight: 0.5
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -66,6 +66,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -76,6 +76,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -69,6 +69,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -72,6 +72,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false

@@ -66,6 +66,8 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: instance
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
