fix some mistakes in doc

pull/827/head
huangyuxin 4 years ago
commit 84020a0471

@ -43,7 +43,7 @@ You are welcome to submit questions in [Github Discussions](https://github.com/P
## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE).
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement

@ -42,7 +42,7 @@
## License
DeepASR follows the [Apache-2.0 License](./LICENSE).
DeepSpeech follows the [Apache-2.0 License](./LICENSE).
## Acknowledgement

@ -80,23 +80,23 @@ def convert_dtype_to_string(tensor_dtype):
if not hasattr(paddle, 'softmax'):
logger.warn("register user softmax to paddle, remove this when fixed!")
logger.debug("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'log_softmax'):
logger.warn("register user log_softmax to paddle, remove this when fixed!")
logger.debug("register user log_softmax to paddle, remove this when fixed!")
setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
if not hasattr(paddle, 'sigmoid'):
logger.warn("register user sigmoid to paddle, remove this when fixed!")
logger.debug("register user sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle, 'log_sigmoid'):
logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
logger.debug("register user log_sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
if not hasattr(paddle, 'relu'):
logger.warn("register user relu to paddle, remove this when fixed!")
logger.debug("register user relu to paddle, remove this when fixed!")
setattr(paddle, 'relu', paddle.nn.functional.relu)
@ -105,7 +105,7 @@ def cat(xs, dim=0):
if not hasattr(paddle, 'cat'):
logger.warn(
logger.debug(
"override cat of paddle if exists or register, remove this when fixed!")
paddle.cat = cat
@ -116,7 +116,7 @@ def item(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'item'):
logger.warn(
logger.debug(
"override item of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.item = item
@ -127,13 +127,13 @@ def func_long(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'long'):
logger.warn(
logger.debug(
"override long of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.long = func_long
if not hasattr(paddle.Tensor, 'numel'):
logger.warn(
logger.debug(
"override numel of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.numel = paddle.numel
@ -147,7 +147,7 @@ def new_full(x: paddle.Tensor,
if not hasattr(paddle.Tensor, 'new_full'):
logger.warn(
logger.debug(
"override new_full of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.new_full = new_full
@ -162,13 +162,13 @@ def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'eq'):
logger.warn(
logger.debug(
"override eq of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.eq = eq
if not hasattr(paddle, 'eq'):
logger.warn(
logger.debug(
"override eq of paddle if exists or register, remove this when fixed!")
paddle.eq = eq
@ -178,7 +178,7 @@ def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'contiguous'):
logger.warn(
logger.debug(
"override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.contiguous = contiguous
@ -195,7 +195,7 @@ def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
# `to_static` does not process the `size` property; some `paddle` api may depend on it.
logger.warn(
logger.debug(
"override size of paddle.Tensor "
"(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
)
@ -207,7 +207,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view'):
logger.warn("register user view to paddle.Tensor, remove this when fixed!")
logger.debug("register user view to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view = view
@ -216,7 +216,7 @@ def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view_as'):
logger.warn(
logger.debug(
"register user view_as to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view_as = view_as
@ -242,7 +242,7 @@ def masked_fill(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill'):
logger.warn(
logger.debug(
"register user masked_fill to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill = masked_fill
@ -260,7 +260,7 @@ def masked_fill_(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill_'):
logger.warn(
logger.debug(
"register user masked_fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill_ = masked_fill_
@ -272,7 +272,8 @@ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'fill_'):
logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
logger.debug(
"register user fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.fill_ = fill_
@ -281,22 +282,22 @@ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'repeat'):
logger.warn(
logger.debug(
"register user repeat to paddle.Tensor, remove this when fixed!")
paddle.Tensor.repeat = repeat
if not hasattr(paddle.Tensor, 'softmax'):
logger.warn(
logger.debug(
"register user softmax to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle.Tensor, 'sigmoid'):
logger.warn(
logger.debug(
"register user sigmoid to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle.Tensor, 'relu'):
logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
logger.debug("register user relu to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
@ -305,7 +306,7 @@ def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'type_as'):
logger.warn(
logger.debug(
"register user type_as to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'type_as', type_as)
@ -321,7 +322,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
logger.warn("register user to to paddle.Tensor, remove this when fixed!")
logger.debug("register user to to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
@ -330,7 +331,8 @@ def func_float(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'float'):
logger.warn("register user float to paddle.Tensor, remove this when fixed!")
logger.debug(
"register user float to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'float', func_float)
@ -339,7 +341,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'int'):
logger.warn("register user int to paddle.Tensor, remove this when fixed!")
logger.debug("register user int to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'int', func_int)
@ -348,23 +350,6 @@ def tolist(x: paddle.Tensor) -> List[Any]:
if not hasattr(paddle.Tensor, 'tolist'):
logger.warn(
logger.debug(
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
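Once the shim above runs, `paddle.nn.GLU` behaves like the functional `F.glu` it wraps. A minimal sketch, assuming a Paddle build that still lacks a native `GLU` (shapes are illustrative):

import paddle

glu = paddle.nn.GLU(dim=-1)    # reachable via the setattr above
xs = paddle.randn([4, 8])      # the gated dimension must be even
ys = glu(xs)                   # split into value/gate halves -> shape [4, 4]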

@ -35,7 +35,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
Scorer *ext_scorer,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
@ -48,7 +49,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
// assign blank id
// size_t blank_id = vocabulary.size();
size_t blank_id = 0;
// size_t blank_id = 0;
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
@ -57,7 +58,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// init prefixes' root
PathTrie root;
root.score = root.log_prob_b_prev = 0.0;
@ -218,7 +218,8 @@ ctc_beam_search_decoder_batch(
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
Scorer *ext_scorer,
size_t blank_id) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be positive!");
// thread pool
ThreadPool pool(num_processes);
@ -234,7 +235,8 @@ ctc_beam_search_decoder_batch(
beam_size,
cutoff_prob,
cutoff_top_n,
ext_scorer));
ext_scorer,
blank_id));
}
// get decoding results

@ -43,7 +43,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/* CTC Beam Search Decoder for batch data
@ -70,6 +71,7 @@ ctc_beam_search_decoder_batch(
size_t num_processes,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
#endif // CTC_BEAM_SEARCH_DECODER_H_

@ -17,17 +17,18 @@
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) {
const std::vector<std::string> &vocabulary,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
vocabulary.size(),
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
size_t blank_id = vocabulary.size();
// size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec;

@ -29,6 +29,7 @@
*/
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary);
const std::vector<std::string>& vocabulary,
size_t blank_id);
#endif // CTC_GREEDY_DECODER_H

@ -85,9 +85,8 @@ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# yapf: disable
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
]
# yapf: enable

@ -32,7 +32,7 @@ class Scorer(swig_decoders.Scorer):
swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
def ctc_greedy_decoder(probs_seq, vocabulary):
def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
"""Wrapper for ctc best path decoder in swig.
:param probs_seq: 2-D list of probability distributions over each time
@ -44,7 +44,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
:return: Decoding result string.
:rtype: str
"""
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary)
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
blank_id)
return result
@ -53,7 +54,8 @@ def ctc_beam_search_decoder(probs_seq,
beam_size,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
ext_scoring_func=None,
blank_id=0):
"""Wrapper for the CTC Beam Search Decoder.
:param probs_seq: 2-D list of probability distributions over each time
@ -81,7 +83,7 @@ def ctc_beam_search_decoder(probs_seq,
"""
beam_results = swig_decoders.ctc_beam_search_decoder(
probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
ext_scoring_func)
ext_scoring_func, blank_id)
beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
return beam_results
@ -92,7 +94,8 @@ def ctc_beam_search_decoder_batch(probs_split,
num_processes,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
ext_scoring_func=None,
blank_id=0):
"""Wrapper for the batched CTC beam search decoder.
:param probs_seq: 3-D list with each element as an instance of 2-D list
@ -125,7 +128,7 @@ def ctc_beam_search_decoder_batch(probs_split,
batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func)
cutoff_top_n, ext_scoring_func, blank_id)
batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results]
return batch_beam_results
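A hypothetical end-to-end call of the updated wrappers, assuming the compiled `swig_decoders` module is importable; `probs` and `vocab` below are toy values, with the blank token at index 0 as in the new default:

import numpy as np

probs = np.array([[0.6, 0.3, 0.1],
                  [0.1, 0.2, 0.7]])   # [T, V] posteriors, V == len(vocab)
vocab = ["<blank>", "a", "b"]         # blank sits at index 0 here
text = ctc_greedy_decoder(probs, vocab, blank_id=0)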

@ -15,6 +15,7 @@
import os
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -65,29 +66,51 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
loss = self.model(audio, audio_len, text, text_len)
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
self.optimizer.step()
self.optimizer.clear_grad()
iteration_time = time.time() - start
losses_np = {
'train_loss': float(loss),
}
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
self.iteration += 1
iteration_time = time.time() - start
msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
if dist.get_rank() == 0 and self.visualizer:
for k, v in losses_np.items():
# `step -1` since we update `step` after optimizer.step().
self.visualizer.add_scalar("train/{}".format(k), v,
self.iteration)
self.iteration += 1
self.iteration - 1)
@paddle.no_grad()
def valid(self):
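The accumulation logic added above, condensed into a standalone sketch; `model` is assumed to be a `paddle.DataParallel` instance, which is what provides `no_sync`:

from contextlib import nullcontext

def backward_with_accum(model, loss, batch_index, accum_grad):
    """Accumulate grads locally; only sync across DDP ranks on boundary steps."""
    boundary = (batch_index + 1) % accum_grad == 0
    context = nullcontext if boundary else model.no_sync
    with context():
        loss.backward()
    return boundary  # caller runs optimizer.step()/clear_grad() when True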

@ -21,6 +21,7 @@ from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.exps.u2.model import U2Trainer as Trainer
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
def main_sp(config, args):

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -33,6 +34,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate
@ -79,21 +81,35 @@ class U2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
utt, audio, audio_len, text, text_len = batch_data
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -169,40 +185,42 @@ class U2Trainer(Trainer):
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -0,0 +1,219 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains U2 model."""
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Evaluator
from deepspeech.models.u2 import U2Model
from deepspeech.models.u2 import U2Updater
from deepspeech.training.extensions.snapshot import Snapshot
from deepspeech.training.extensions.visualizer import VisualDL
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.training.updaters.trainer import Trainer as NewTrainer
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Trainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev)
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples; this yields fewer examples but no mismatch with training,
# and allows a large batch size to save time, so filter the test examples for now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
logger.info("Setup train/valid/test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
model_conf.defrost()
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
model.train()
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.encoder_conf.output_size,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
def optimizer_args(
config,
parameters,
lr_scheduler=None, ):
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
"grad_clip": train_config.global_grad_clip,
"weight_decay": optim_conf.weight_decay,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"parameters": parameters,
"epsilon": 1e-9 if optim_type == 'noam' else None,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
optimizer_kwargs = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimizer_kwargs)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
def setup_updater(self):
output_dir = self.output_dir
config = self.config.training
updater = U2Updater(
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
dataloader=self.train_loader,
output_dir=output_dir,
accum_grad=config.accum_grad)
trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)
evaluator = U2Evaluator(self.model, self.valid_loader)
trainer.extend(evaluator, trigger=(1, "epoch"))
if dist.get_rank() == 0:
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
num_snapshots = config.checkpoint.kbest_n
trainer.extend(
Snapshot(
mode='kbest',
max_size=num_snapshots,
indicator='VALID/LOSS',
less_better=True),
trigger=(1, 'epoch'))
# print(trainer.extensions)
# trainer.run()
self.trainer = trainer
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.setup_updater()
with Timer("Training Done: {}"):
self.trainer.run()

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -31,6 +32,7 @@ from deepspeech.io.dataloader import BatchDataLoader
from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate
@ -83,20 +85,34 @@ class U2Trainer(Trainer):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -175,35 +191,37 @@ class U2Trainer(Trainer):
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -37,6 +38,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.scheduler import WarmupLR
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import bleu_score
from deepspeech.utils import ctc_utils
@ -83,6 +85,7 @@ class U2STTrainer(Trainer):
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
if isinstance(text, list) and isinstance(text_len, list):
# joint training with ASR. Two decoding texts [translation, transcription]
@ -94,18 +97,30 @@ class U2STTrainer(Trainer):
else:
loss, st_loss, attention_loss, ctc_loss = self.model(
audio, audio_len, text, text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
losses_np['st_loss'] = float(st_loss)
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -193,35 +208,37 @@ class U2STTrainer(Trainer):
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -44,7 +44,7 @@ def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
def batch_collate(x):
"""de-tuple.
"""de-minibatch, since user compose batch.
Args:
x (List[Tuple]): [(utts, xs, ilens, ys, olens)]

@ -106,11 +106,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len
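The `x * masks` form relies on the right operand being promoted to the left operand's dtype, per the Paddle PR linked above. A tiny check of that assumption:

import paddle

x = paddle.randn([2, 4])                   # float32 activations
masks = paddle.ones([2, 4], dtype='bool')  # stand-in for the non-pad mask
y = x * masks                              # bool rhs promotes to x.dtype
assert y.dtype == x.dtype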

@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
))
share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
ctc_grad_norm_type='instance', ))
if config is not None:
config.merge_from_other_cfg(default)
return default
@ -141,7 +141,9 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
share_rnn_weights=True,
blank_id=0,
ctc_grad_norm_type='instance'):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
@ -156,10 +158,11 @@ class DeepSpeech2Model(nn.Layer):
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=0, # first token is <blank>
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
@ -221,7 +224,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights)
share_rnn_weights=config.model.share_rnn_weights,
blank_id=config.model.blank_id)
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -246,7 +250,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights)
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id)
return model
@ -258,7 +263,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
share_rnn_weights=True,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
@ -266,7 +272,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
share_rnn_weights=share_rnn_weights,
blank_id=blank_id)
def forward(self, audio, audio_len):
"""export model function

@ -308,7 +308,8 @@ class RNNStack(nn.Layer):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -254,6 +254,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=True, #Use gru if set True. Use simple rnn if set False.
blank_id=0, # index of blank in vocob.txt
))
if config is not None:
config.merge_from_other_cfg(default)
@ -268,7 +269,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
use_gru=False,
blank_id=0):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
@ -284,10 +286,11 @@ class DeepSpeech2ModelOnline(nn.Layer):
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=0, # first token is <blank>
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type='instance')
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
@ -353,7 +356,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction=config.model.rnn_direction,
num_fc_layers=config.model.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list,
use_gru=config.model.use_gru)
use_gru=config.model.use_gru,
blank_id=config.model.blank_id)
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -380,7 +384,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru)
use_gru=config.use_gru,
blank_id=config.blank_id)
return model
@ -394,7 +399,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
use_gru=False,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
@ -404,7 +410,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
use_gru=use_gru)
use_gru=use_gru,
blank_id=blank_id)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box):

@ -0,0 +1,19 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .u2 import U2InferModel
from .u2 import U2Model
from .updater import U2Evaluator
from .updater import U2Updater
__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]

@ -115,7 +115,8 @@ class U2BaseModel(nn.Layer):
ctc_weight: float=0.5,
ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0,
length_normalized_loss: bool=False):
length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__()
@ -661,9 +662,7 @@ class U2BaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
# @jit.to_static([
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
@ -830,6 +829,7 @@ class U2Model(U2BaseModel):
Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
@ -839,11 +839,13 @@ class U2Model(U2BaseModel):
else:
global_cmvn = None
# input & output dim
input_dim = configs['input_dim']
vocab_size = configs['output_dim']
assert input_dim != 0, input_dim
assert vocab_size != 0, vocab_size
# encoder
encoder_type = configs.get('encoder', 'transformer')
logger.info(f"U2 Encoder type: {encoder_type}")
if encoder_type == 'transformer':
@ -855,16 +857,21 @@ class U2Model(U2BaseModel):
else:
raise ValueError(f"not support encoder type:{encoder_type}")
# decoder
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
# ctc decoder and ctc loss
model_conf = configs['model_conf']
ctc = CTCDecoder(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=0.0,
dropout_rate=model_conf['ctc_dropoutrate'],
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
return vocab_size, encoder, decoder, ctc

@ -0,0 +1,149 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import paddle
from paddle import distributed as dist
from deepspeech.training.extensions.evaluator import StandardEvaluator
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.standard_updater import StandardUpdater
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Evaluator(StandardEvaluator):
def __init__(self, model, dataloader):
super().__init__(model, dataloader)
self.msg = ""
self.num_seen_utts = 0
self.total_loss = 0.0
def evaluate_core(self, batch):
self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
losses_dict = {}
loss, attention_loss, ctc_loss = self.model(*batch[1:])
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
self.num_seen_utts += num_utts
self.total_loss += float(loss) * num_utts
losses_dict['loss'] = float(loss)
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
for k, v in losses_dict.items():
report("eval/" + k, v)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
logger.info(self.msg)
return self.total_loss, self.num_seen_utts
class U2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
scheduler,
dataloader,
init_state=None,
accum_grad=1,
**kwargs):
super().__init__(
model, optimizer, scheduler, dataloader, init_state=init_state)
self.accum_grad = accum_grad
self.forward_count = 0
self.msg = ""
def update_core(self, batch):
"""One Step
Args:
batch (List[Object]): utts, xs, xlens, ys, ylens
"""
losses_dict = {}
self.msg = "Rank: {}, ".format(dist.get_rank())
# forward
batch_size = batch[1].shape[0]
loss, attention_loss, ctc_loss = self.model(*batch[1:])
# loss div by `batch_size * accum_grad`
loss /= self.accum_grad
# loss backward
if (self.forward_count + 1) != self.accum_grad:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# loss info
losses_dict['loss'] = float(loss) * self.accum_grad
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
# report loss
for k, v in losses_dict.items():
report("train/" + k, v)
# loss msg
self.msg += "batch size: {}, ".format(batch_size)
self.msg += "accum: {}, ".format(self.accum_grad)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
# Truncate the graph
loss.detach()
# update parameters
self.forward_count += 1
if self.forward_count != self.accum_grad:
return
self.forward_count = 0
self.optimizer.step()
self.optimizer.clear_grad()
self.scheduler.step()
def update(self):
# model is default in train mode
# training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch)
# #iterations with accum_grad > 1
# Ref.: https://github.com/espnet/espnet/issues/777
if self.forward_count == 0:
self.state.iteration += 1
if self.updates_per_epoch is not None:
if self.state.iteration % self.updates_per_epoch == 0:
self.state.epoch += 1

@ -413,26 +413,26 @@ class U2STBaseModel(nn.Layer):
best_hyps = best_hyps[:, 1:]
return best_hyps
@jit.to_static
# @jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
@jit.to_static
# @jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
@jit.to_static
# @jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
@jit.to_static
# @jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
@ -468,7 +468,7 @@ class U2STBaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
@jit.to_static
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
@ -643,13 +643,16 @@ class U2STModel(U2STBaseModel):
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
# ctc decoder and ctc loss
model_conf = configs['model_conf']
ctc = CTCDecoder(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=0.0,
dropout_rate=model_conf['ctc_dropout_rate'],
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
return vocab_size, encoder, (st_decoder, decoder, ctc)
else:

@ -15,12 +15,13 @@ from collections import OrderedDict
import paddle
from paddle import nn
from paddle.nn import functional as F
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"]
def brelu(x, t_min=0.0, t_max=24.0, name=None):
@ -30,6 +31,17 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
return x.maximum(t_min).minimum(t_max)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
class LinearGLUBlock(nn.Layer):
"""A linear Gated Linear Units (GLU) block."""
@ -133,13 +145,18 @@ def get_activation(act):
"""Return activation function."""
# Lazy load to avoid unused import
activation_funcs = {
"hardshrink": paddle.nn.Hardshrink,
"hardswish": paddle.nn.Hardswish,
"hardtanh": paddle.nn.Hardtanh,
"tanh": paddle.nn.Tanh,
"relu": paddle.nn.ReLU,
"relu6": paddle.nn.ReLU6,
"leakyrelu": paddle.nn.LeakyReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"gelu": paddle.nn.GELU,
"brelu": brelu,
"glu": GLU,
"elu": paddle.nn.ELU,
}
return activation_funcs[act]()
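The newly registered entries can then be pulled from the table by name; for example:

act = get_activation("glu")      # instantiates the GLU layer added above
ys = act(paddle.randn([4, 8]))   # GLU halves the gated dimension -> [4, 4]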

@ -113,11 +113,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -39,7 +39,8 @@ class CTCDecoder(nn.Layer):
blank_id=0,
dropout_rate: float=0.0,
reduction: bool=True,
batch_average: bool=True):
batch_average: bool=True,
grad_norm_type: str="instance"):
"""CTC decoder
Args:
@ -48,6 +49,7 @@ class CTCDecoder(nn.Layer):
dropout_rate (float): dropout rate (0.0 ~ 1.0)
reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
batch_average (bool): do batch dim wise average.
grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
"""
assert check_argument_types()
super().__init__()
@ -60,7 +62,8 @@ class CTCDecoder(nn.Layer):
self.criterion = CTCLoss(
blank=self.blank_id,
reduction=reduction_type,
batch_average=batch_average)
batch_average=batch_average,
grad_norm_type=grad_norm_type)
# CTCDecoder LM Score handle
self._ext_scorer = None
@ -136,7 +139,7 @@ class CTCDecoder(nn.Layer):
results = []
for i, probs in enumerate(probs_split):
output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list)
probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
results.append(output_transcription)
return results
@ -216,7 +219,8 @@ class CTCDecoder(nn.Layer):
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n)
cutoff_top_n=cutoff_top_n,
blank_id=self.blank_id)
results = [result[0][1] for result in beam_search_results]
return results

@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]
class CTCLoss(nn.Layer):
def __init__(self, blank=0, reduction='sum', batch_average=False):
def __init__(self,
blank=0,
reduction='sum',
batch_average=False,
grad_norm_type=None):
super().__init__()
# blank id for CTC; defaults to 0 (the first token)
self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
self.batch_average = batch_average
logger.info(
f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
# instance for norm_by_times
# batch for norm_by_batchsize
# frame for norm_by_total_logits_len
assert grad_norm_type in ('instance', 'batch', 'frame', None)
self.norm_by_times = False
self.norm_by_batchsize = False
self.norm_by_total_logits_len = False
logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
if grad_norm_type == 'instance':
self.norm_by_times = True
if grad_norm_type == 'batch':
self.norm_by_batchsize = True
if grad_norm_type == 'frame':
self.norm_by_total_logits_len = True
def forward(self, logits, ys_pad, hlens, ys_lens):
"""Compute CTC loss.
@ -46,10 +67,15 @@ class CTCLoss(nn.Layer):
# warp-ctc need activation with shape [T, B, V + 1]
# logits: (B, L, D) -> (L, B, D)
logits = logits.transpose([1, 0, 2])
# (TODO:Hui Zhang) ctc loss does not support int64 labels
ys_pad = ys_pad.astype(paddle.int32)
loss = self.loss(
logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
logits,
ys_pad,
hlens,
ys_lens,
norm_by_times=self.norm_by_times,
norm_by_batchsize=self.norm_by_batchsize,
norm_by_total_logits_len=self.norm_by_total_logits_len)
if self.batch_average:
# Batch-size average
loss = loss / B
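A sketch of how the new `grad_norm_type` knob maps onto the flags above (any value outside the four allowed ones trips the assert):

# 'instance' -> norm_by_times, 'batch' -> norm_by_batchsize,
# 'frame' -> norm_by_total_logits_len, None -> no normalization.
ctc_loss = CTCLoss(blank=0, reduction='sum', batch_average=True,
                   grad_norm_type='instance')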

@ -308,7 +308,7 @@ class RNNStack(nn.Layer):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -13,14 +13,18 @@
# limitations under the License.
from typing import Dict
import extension
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from . import extension
from ..reporter import DictSummary
from ..reporter import report
from ..reporter import scope
from ..timer import Timer
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class StandardEvaluator(extension.Extension):
@ -43,6 +47,27 @@ class StandardEvaluator(extension.Extension):
def evaluate_core(self, batch):
# compute
self.model(batch) # you may report here
return
def evaluate_sync(self, data):
# dist sync `evaluate_core` outputs
if data is None:
return
numerator, denominator = data
if dist.get_world_size() > 1:
numerator = paddle.to_tensor(numerator)
denominator = paddle.to_tensor(denominator)
# the default operator in all_reduce function is sum.
dist.all_reduce(numerator)
dist.all_reduce(denominator)
value = numerator / denominator
value = float(value)
else:
value = numerator / denominator
# used for `snapshot` to do kbest save.
report("VALID/LOSS", value)
logger.info(f"Valid: all-reduce loss {value}")
def evaluate(self):
# switch to eval mode
@ -56,9 +81,13 @@ class StandardEvaluator(extension.Extension):
with scope(observation):
# main evaluation computation here.
with paddle.no_grad():
self.evaluate_core(batch)
self.evaluate_sync(self.evaluate_core(batch))
summary.add(observation)
summary = summary.compute_mean()
# switch to train mode
for model in self.models.values():
model.train()
return summary
def __call__(self, trainer=None):
@ -66,6 +95,7 @@ class StandardEvaluator(extension.Extension):
# if it is used to extend a trainer, the metrics is reported to
# to observation of the trainer
# or otherwise, you can use your own observation
summary = self.evaluate()
with Timer("Eval Time Cost: {}"):
summary = self.evaluate()
for k, v in summary.items():
report(k, v)
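The same all-reduce averaging in isolation, assuming an initialized `paddle.distributed` process group (`all_reduce` defaults to sum):

import paddle
from paddle import distributed as dist

def all_reduce_mean(numerator, denominator):
    if dist.get_world_size() > 1:
        num = paddle.to_tensor(numerator)
        den = paddle.to_tensor(denominator)
        dist.all_reduce(num)   # sum numerators over ranks
        dist.all_reduce(den)   # sum denominators over ranks
        return float(num / den)
    return numerator / denominator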

@ -20,8 +20,9 @@ from typing import List
import jsonlines
from deepspeech.training.extensions import extension
from deepspeech.training.updaters.trainer import Trainer
from . import extension
from ..reporter import get_observations
from ..updaters.trainer import Trainer
from deepspeech.utils.log import Log
from deepspeech.utils.mp_tools import rank_zero_only
@ -52,8 +53,19 @@ class Snapshot(extension.Extension):
priority = -100
default_name = "snapshot"
def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
def __init__(self,
mode='latest',
max_size: int=5,
indicator=None,
less_better=True,
snapshot_on_error: bool=False):
self.records: List[Dict[str, Any]] = []
assert mode in ('latest', 'kbest'), mode
if mode == 'kbest':
assert indicator is not None
self.mode = mode
self.indicator = indicator
self.less_is_better = less_better
self.max_size = max_size
self._snapshot_on_error = snapshot_on_error
self._save_all = (max_size == -1)
@ -66,16 +78,17 @@ class Snapshot(extension.Extension):
# load existing records
record_path: Path = self.checkpoint_dir / "records.jsonl"
if record_path.exists():
logger.debug("Loading from an existing checkpoint dir")
self.records = load_records(record_path)
trainer.updater.load(self.records[-1]['path'])
ckpt_path = self.records[-1]['path']
logger.info(f"Loading from an existing checkpoint {ckpt_path}")
trainer.updater.load(ckpt_path)
def on_error(self, trainer, exc, tb):
if self._snapshot_on_error:
self.save_checkpoint_and_update(trainer)
self.save_checkpoint_and_update(trainer, 'latest')
def __call__(self, trainer: Trainer):
self.save_checkpoint_and_update(trainer)
self.save_checkpoint_and_update(trainer, self.mode)
def full(self):
"""Whether the number of snapshots it keeps track of is greater
@ -83,7 +96,7 @@ class Snapshot(extension.Extension):
return (not self._save_all) and len(self.records) > self.max_size
@rank_zero_only
def save_checkpoint_and_update(self, trainer: Trainer):
def save_checkpoint_and_update(self, trainer: Trainer, mode: str):
"""Saving new snapshot and remove the oldest snapshot if needed."""
iteration = trainer.updater.state.iteration
epoch = trainer.updater.state.epoch
@ -97,11 +110,17 @@ class Snapshot(extension.Extension):
'path': str(path.resolve()), # use absolute path
'iteration': iteration,
'epoch': epoch,
'indicator': get_observations()[self.indicator]
}
self.records.append(record)
# remove the earliest
if self.full():
if mode == 'kbest':
self.records = sorted(
self.records,
key=lambda record: record['indicator'],
reverse=not self.less_is_better)
earliest_record = self.records[0]
os.remove(earliest_record["path"])
self.records.pop(0)

@ -11,8 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from deepspeech.training.extensions import extension
from deepspeech.training.updaters.trainer import Trainer
from visualdl import LogWriter
from . import extension
from ..updaters.trainer import Trainer
class VisualDL(extension.Extension):
@ -26,8 +28,8 @@ class VisualDL(extension.Extension):
default_name = 'visualdl'
priority = extension.PRIORITY_READER
def __init__(self, writer):
self.writer = writer
def __init__(self, output_dir):
self.writer = LogWriter(str(output_dir))
def __call__(self, trainer: Trainer):
for k, v in trainer.observation.items():

@ -47,7 +47,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square)
# debug log
# debug log, not dump all since slow down train process
if i < 10:
logger.debug(
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
@ -76,7 +76,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
# debug log
# debug log, not dump all since slow down train process
if i < 10:
logger.debug(
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"

@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import time
from deepspeech.utils.log import Log
__all__ = ["Timer"]
logger = Log(__name__).getlog()
class Timer():
"""To be used like this:
with Timer("Message") as value:
do something
"""
def __init__(self, message=None):
self.message = message
def duration(self) -> str:
elapsed_time = time.time() - self.start
time_str = str(datetime.timedelta(seconds=elapsed_time))
return time_str
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, type, value, traceback):
if self.message:
logger.info(self.message.format(self.duration()))
def __call__(self) -> float:
return time.time() - self.start
def __str__(self):
return self.duration()

@ -18,6 +18,7 @@ import paddle
from paddle import distributed as dist
from tensorboardX import SummaryWriter
from deepspeech.training.timer import Timer
from deepspeech.utils import mp_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
@ -170,7 +171,7 @@ class Trainer():
self.iteration = 0
self.epoch = 0
scratch = True
logger.info("Restore/Init checkpoint!")
return scratch
def new_epoch(self):
@ -194,35 +195,37 @@ class Trainer():
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -240,14 +243,14 @@ class Trainer():
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
try:
self.train()
except KeyboardInterrupt:
self.save()
exit(-1)
finally:
self.destory()
logger.info("Training Done.")
with Timer("Training Done: {}"):
try:
self.train()
except KeyboardInterrupt:
self.save()
exit(-1)
finally:
self.destory()
def setup_output_dir(self):
"""Create a directory used for output.

@ -14,12 +14,12 @@
from typing import Dict
from typing import Optional
from paddle import Tensor
import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from timer import timer
from paddle.optimizer.lr import LRScheduler
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.updater import UpdaterBase
@ -39,8 +39,10 @@ class StandardUpdater(UpdaterBase):
def __init__(self,
model: Layer,
optimizer: Optimizer,
scheduler: LRScheduler,
dataloader: DataLoader,
init_state: Optional[UpdaterState]=None):
super().__init__(init_state)
# it is designed to hold multiple models
models = {"main": model}
self.models: Dict[str, Layer] = models
@ -51,15 +53,14 @@ class StandardUpdater(UpdaterBase):
self.optimizer = optimizer
self.optimizers: Dict[str, Optimizer] = optimizers
# it is designed to hold multiple scheduler
schedulers = {"main": scheduler}
self.scheduler = scheduler
self.schedulers: Dict[str, LRScheduler] = schedulers
# dataloaders
self.dataloader = dataloader
# init state
if init_state is None:
self.state = UpdaterState()
else:
self.state = init_state
self.train_iterator = iter(dataloader)
def update(self):
@ -103,8 +104,10 @@ class StandardUpdater(UpdaterBase):
model.train()
# training for a step is implemented here
batch = self.read_batch()
self.update_core(batch)
with Timier("data time cost:{}"):
batch = self.read_batch()
with Timier("step time cost:{}"):
self.update_core(batch)
self.state.iteration += 1
if self.updates_per_epoch is not None:
@ -115,13 +118,14 @@ class StandardUpdater(UpdaterBase):
"""A simple case for a training step. Basic assumptions are:
Single model;
Single optimizer;
Single scheduler, and update learning rate each step;
A batch from the dataloader is just the input of the model;
The model returns a single loss, or a dict containing several losses.
Parameters updates at every batch, no gradient accumulation.
"""
loss = self.model(*batch)
if isinstance(loss, Tensor):
if isinstance(loss, paddle.Tensor):
loss_dict = {"main": loss}
else:
# Dict[str, Tensor]
@ -135,14 +139,15 @@ class StandardUpdater(UpdaterBase):
for name, loss_item in loss_dict.items():
report(name, float(loss_item))
self.optimizer.clear_gradient()
self.optimizer.clear_grad()
loss_dict["main"].backward()
self.optimizer.update()
self.optimizer.step()
self.scheduler.step()
@property
def updates_per_epoch(self):
"""Number of updater per epoch, determined by the length of the
dataloader."""
"""Number of steps per epoch,
determined by the length of the dataloader."""
length_of_dataloader = None
try:
length_of_dataloader = len(self.dataloader)
@ -163,18 +168,16 @@ class StandardUpdater(UpdaterBase):
def read_batch(self):
"""Read a batch from the data loader, auto renew when data is exhausted."""
with timer() as t:
try:
batch = next(self.train_iterator)
except StopIteration:
self.new_epoch()
batch = next(self.train_iterator)
logger.debug(
f"Read a batch takes {t.elapse}s.") # replace it with logger
try:
batch = next(self.train_iterator)
except StopIteration:
self.new_epoch()
batch = next(self.train_iterator)
return batch
def state_dict(self):
"""State dict of a Updater, model, optimizer and updater state are included."""
"""State dict of a Updater, model, optimizers/schedulers
and updater state are included."""
state_dict = super().state_dict()
for name, model in self.models.items():
state_dict[f"{name}_params"] = model.state_dict()
@ -184,7 +187,7 @@ class StandardUpdater(UpdaterBase):
def set_state_dict(self, state_dict):
"""Set state dict for a Updater. Parameters of models, states for
optimizers and UpdaterState are restored."""
optimizers/schedulers and UpdaterState are restored."""
for name, model in self.models.items():
model.set_state_dict(state_dict[f"{name}_params"])
for name, optim in self.optimizers.items():

@ -140,8 +140,8 @@ class Trainer():
try:
while not stop_trigger(self):
self.observation = {}
# set observation as the report target
# you can use report freely in Updater.update()
# set observation as the `report` target
# you can use `report` freely in Updater.update()
# updating parameters and state
with scope(self.observation):

@ -52,6 +52,7 @@ class UpdaterBase():
"""
def __init__(self, init_state=None):
# init state
if init_state is None:
self.state = UpdaterState()
else:

@ -114,13 +114,13 @@ class Checkpoint():
params_path = checkpoint_path + ".pdparams"
model_dict = paddle.load(params_path)
model.set_state_dict(model_dict)
logger.info("Rank {}: loaded model from {}".format(rank, params_path))
logger.info("Rank {}: Restore model from {}".format(rank, params_path))
optimizer_path = checkpoint_path + ".pdopt"
if optimizer and os.path.isfile(optimizer_path):
optimizer_dict = paddle.load(optimizer_path)
optimizer.set_state_dict(optimizer_dict)
logger.info("Rank {}: loaded optimizer state from {}".format(
logger.info("Rank {}: Restore optimizer state from {}".format(
rank, optimizer_path))
info_path = re.sub('.pdparams$', '.json', params_path)

@ -12,19 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
import logging
import os
import socket
import sys
from loguru import logger
from paddle import inference
FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
logging.basicConfig(
level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
def find_log_dir(log_dir=None):
"""Returns the most suitable directory to put log files into.
@ -98,59 +92,28 @@ def find_log_dir_and_names(program_name=None, log_dir=None):
class Log():
log_name = None
def __init__(self, logger=None):
self.logger = logging.getLogger(logger)
self.logger.setLevel(logging.DEBUG)
file_dir = os.getcwd() + '/log'
if not os.path.exists(file_dir):
os.mkdir(file_dir)
self.log_dir = file_dir
actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names(
program_name=None, log_dir=self.log_dir)
basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
filename = os.path.join(actual_log_dir, basename)
if Log.log_name is None:
Log.log_name = filename
# Create a symlink to the log file with a canonical name.
symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
try:
if os.path.islink(symlink):
os.unlink(symlink)
os.symlink(os.path.basename(Log.log_name), symlink)
except EnvironmentError:
# If it fails, we're sad but it's no error. Commonly, this
# fails because the symlink was created by another user and so
# we can't modify it
pass
if not self.logger.hasHandlers():
formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
fh = logging.FileHandler(Log.log_name)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
self.logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
self.logger.addHandler(ch)
# stop propagate for propagating may print
# log multiple times
self.logger.propagate = False
"""Default Logger for all."""
logger.remove()
logger.add(
sys.stdout,
level='INFO',
enqueue=True,
filter=lambda record: record['level'].no >= 20)
_, file_prefix, _ = find_log_dir_and_names()
sink_prefix = os.path.join("exp/log", file_prefix)
sink_path = sink_prefix[:-3] + "{time}.log"
logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB")
def __init__(self, name=None):
pass
def getlog(self):
return self.logger
return logger
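With this rewrite, callers keep the old `Log(__name__).getlog()` interface but receive the shared loguru logger configured above (INFO and above to stdout, everything from DEBUG up to the rotating file sink). A quick sketch of the intended use:

```python
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()
logger.info("shown on stdout and written to the exp/log file sink")
logger.debug("only written to the exp/log file sink (DEBUG level)")
```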
class Autolog:
"""Just used by fullchain project"""
def __init__(self,
batch_size,
model_name="DeepSpeech",

Binary file not shown.

Binary file not shown.

@ -1,16 +0,0 @@
# Benchmarks
## Acceleration with Multi-GPUs
We compare the training time on 1, 2, 4 and 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). The results show a **near-linear** speedup with multiple GPUs. In the following figure, the training time (in seconds) is printed on the blue bars.
<img src="../images/multi_gpu_speedup.png" width=450>
| # of GPU | Acceleration Rate |
| -------- | --------------: |
| 1 | 1.00 X |
| 2 | 1.98 X |
| 4 | 3.73 X |
| 8 | 6.95 X |
`utils/profile.sh` provides such a demo profiling tool; you can adapt it as needed.


@ -1,8 +1,8 @@
# Deepspeech2
## Streaming
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction rnn layers.
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction rnn layers.
To illustrate the model implementation clearly, 3 parts are described in detail.
- Data Preparation
@ -11,10 +11,10 @@ To illustrate the model implementation clearly, 3 parts are described in detail.
In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The Arcitecture of deepspeech2 online model
</p>
@ -28,17 +28,17 @@ For English data, the vocabulary dictionary is composed of 26 English characters
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
# vocabulary for librispeech dataset (English)
vi examples/librispeech/s0/data/vocab.txt
```
#### CMVN
For CMVN, a subset of or the full training set is selected and used to compute the feature mean and std.
For CMVN, a subset of or the full training set is selected and used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
@ -52,16 +52,16 @@ python3 ../../../utils/compute_mean_std.py \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
--output_path="data/mean_std.json"
```
#### Feature Extraction
For feature extraction, three methods are implemented: linear (FFT without a filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
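As a rough illustration of the linear method, here is a minimal numpy sketch of an STFT magnitude spectrogram without any filter bank; the parameter defaults are illustrative, not the exact values used in `audio_featurizer.py`:

```python
import numpy as np

def linear_spectrogram(samples, sample_rate=16000,
                       stride_ms=10.0, window_ms=20.0):
    """Magnitude spectrogram via short-time FFT, no filter bank applied."""
    stride = int(sample_rate * stride_ms / 1000)
    window = int(sample_rate * window_ms / 1000)
    weighting = np.hanning(window)
    frames = [samples[i:i + window] * weighting
              for i in range(0, len(samples) - window + 1, stride)]
    # rows: frequency bins, columns: frames
    return np.abs(np.fft.rfft(np.stack(frames), axis=1)).T
```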
### Encoder
@ -70,7 +70,7 @@ The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
```
### Decoder
To get the per-frame character probabilities, the frame-level feature representations output by the encoder are fed into a projection layer, implemented as a dense layer, whose output dimension equals the vocabulary size. After the projection layer, the softmax function transforms the frame-level representations into character probabilities. At inference time, the per-frame character probabilities are fed into the CTC decoder to obtain the final speech recognition results.
The code of Decoder is in:
@ -80,7 +80,7 @@ vi deepspeech/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
```
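The projection-plus-softmax step described above can be sketched in a few lines of paddle; this is a simplified illustration, not the repository's decoder code, and the sizes are placeholders:

```python
import paddle.nn as nn
import paddle.nn.functional as F

encoder_dim = 1024   # e.g. rnn_layer_size in the configs
vocab_size = 4233    # placeholder; the real value is len(data/vocab.txt)

proj = nn.Linear(encoder_dim, vocab_size)

def frame_probs(encoder_out):          # [batch, time, encoder_dim]
    logits = proj(encoder_out)         # [batch, time, vocab_size]
    # per-frame character probabilities, later consumed by the CTC decoder
    return F.softmax(logits, axis=-1)
```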
## Training Process
Using the command below, you can train the deepspeech2 online model.
```
@ -121,8 +121,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
```
By using the command above, the training process can be started. There are 5 stages in "run.sh", and the first 3 are used for training. Stage 0 is for data preparation: the dataset is downloaded, and the manifest files, vocabulary dictionary and CMVN file are generated in "./data/". Stage 1 trains the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 generates the final model for prediction by averaging the top-k checkpoints ranked by validation loss.
## Testing Process
Using the command below, you can test the deepspeech2 online model.
```
@ -131,7 +132,7 @@ Using the command below, you can test the deepspeech2 online model.
The detail commands are:
```
conf_path=conf/deepspeech2_online.yaml
avg_num=1
avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
@ -152,29 +153,29 @@ fi
```
After training, stages 3, 4 and 5 are used for testing. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph using the "paddle.jit" library. Stage 5 tests the static-graph model.
## Non-Streaming
The deepspeech2 offline model is similar to the deepspeech2 online model. The main difference is that the offline model uses stacked bi-directional rnn layers, while the online model uses single-direction rnn layers and no fc layer. For the stacked bi-directional rnn layers in the offline model, both rnn cells and gru cells are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The Arcitecture of deepspeech2 offline model
</p>
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
The code of encoder and decoder for deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
```
The training and testing processes of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.
Only a few differences should be noted.
For training and testing, the "model_type" and the "conf_path" must be set.
For training and testing, the "model_type" and the "conf_path" must be set.
```
# Training offline
cd examples/aishell/s0
@ -185,5 +186,3 @@ bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deeps
cd examples/aishell/s0
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
```

@ -40,9 +40,12 @@ model:
rnn_layer_size: 1024
use_gru: True
share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -36,17 +36,20 @@ collator:
model:
num_conv_layers: 2
num_rnn_layers: 3
num_rnn_layers: 5
rnn_layer_size: 1024
rnn_direction: forward # [forward, bidirect]
num_fc_layers: 1
fc_layers_size_list: 512,
num_fc_layers: 0
fc_layers_size_list: -1,
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 1
lr: 2e-3
lr_decay: 0.91 # 0.83
lr_decay: 0.9 # 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
@ -59,7 +62,7 @@ decoding:
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
alpha: 2.2 #1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
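For context, `alpha` and `beta` here are the usual DeepSpeech2-style beam-search weights (a general note, not quoted from this repo): the decoder ranks hypotheses by `log P_acoustic(y|x) + alpha * log P_lm(y) + beta * word_count(y)`, so raising `alpha` from 1.9 to 2.2 gives the ngram language model more influence.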

@ -1,20 +0,0 @@
#!/bin/bash
source path.sh
# run on MacOS
# brew install portaudio
# pip install pyaudio
# pip install keyboard
# start demo client
python3 -u ${BIN_DIR}/deploy/client.py \
--host_ip="localhost" \
--host_port=8086 \
if [ $? -ne 0 ]; then
echo "Failed in starting demo client!"
exit 1
fi
exit 0

@ -1,40 +0,0 @@
#!/bin/bash
# TODO: replace the model with a mandarin model
if [[ $# != 1 ]];then
echo "usage: $1 checkpoint_path"
exit -1
fi
source path.sh
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
# download well-trained model
#bash local/download_model.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
# start demo server
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/deploy/server.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--host_ip="localhost" \
--host_port=8086 \
--speech_save_dir="demo_cache" \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in starting demo server!"
exit 1
fi
exit 0

@ -20,7 +20,7 @@ fi
mkdir -p exp
seed=10086
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -32,7 +32,7 @@ python3 -u ${BIN_DIR}/train.py \
--model_type ${model_type} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -1,28 +0,0 @@
#!/bin/bash
# grid-search for hyper-parameters in language model
python3 -u ${BIN_DIR}/tune.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--num_batches=10 \
--batch_size=128 \
--beam_size=300 \
--num_proc_bsearch=8 \
--num_alphas=10 \
--num_betas=10 \
--alpha_from=0.0 \
--alpha_to=5.0 \
--beta_from=-6 \
--beta_to=6 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0

@ -27,7 +27,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -71,6 +71,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -0,0 +1,58 @@
# [CC-CEDICT](https://cc-cedict.org/wiki/)
What is CC-CEDICT?
CC-CEDICT is a continuation of the CEDICT project.
The objective of the CEDICT project was to create an online, downloadable (as opposed to searchable-only) public-domain Chinese-English dictionary.
CEDICT was started by Paul Andrew Denisowski in October 1997.
For the most part, the project is modeled on Jim Breen's highly successful EDICT (Japanese-English dictionary) project and is intended to be a collaborative effort,
with users providing entries and corrections to the main file.
## Parse CC-CEDICT to JSON format
1. Parse to JSON
```
run.sh
```
2. Result
```
exp/
|-- cedict
`-- cedict.json
0 directories, 2 files
```
```
4c4bffc84e24467fe1b2ea9ba37ed6b6 exp/cedict
3adf504dacd13886f88cc9fe3b37c75d exp/cedict.json
```
```
==> exp/cedict <==
# CC-CEDICT
# Community maintained free Chinese-English dictionary.
#
# Published by MDBG
#
# License:
# Creative Commons Attribution-ShareAlike 4.0 International License
# https://creativecommons.org/licenses/by-sa/4.0/
#
# Referenced works:
==> exp/cedict.json <==
{"traditional": "2019\u51a0\u72c0\u75c5\u6bd2\u75c5", "simplified": "2019\u51a0\u72b6\u75c5\u6bd2\u75c5", "pinyin": "er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4", "english": "COVID-19, the coronavirus disease identified in 2019"}
{"traditional": "21\u4e09\u9ad4\u7d9c\u5408\u75c7", "simplified": "21\u4e09\u4f53\u7efc\u5408\u75c7", "pinyin": "er4 shi2 yi1 san1 ti3 zong1 he2 zheng4", "english": "trisomy"}
{"traditional": "3C", "simplified": "3C", "pinyin": "san1 C", "english": "abbr. for computers, communications, and consumer electronics"}
{"traditional": "3P", "simplified": "3P", "pinyin": "san1 P", "english": "(slang) threesome"}
{"traditional": "3Q", "simplified": "3Q", "pinyin": "san1 Q", "english": "(Internet slang) thank you (loanword)"}
{"traditional": "421", "simplified": "421", "pinyin": "si4 er4 yi1", "english": "four grandparents, two parents and an only child"}
{"traditional": "502\u81a0", "simplified": "502\u80f6", "pinyin": "wu3 ling2 er4 jiao1", "english": "cyanoacrylate glue"}
{"traditional": "88", "simplified": "88", "pinyin": "ba1 ba1", "english": "(Internet slang) bye-bye (alternative for \u62dc\u62dc[bai2 bai2])"}
{"traditional": "996", "simplified": "996", "pinyin": "jiu3 jiu3 liu4", "english": "9am-9pm, six days a week (work schedule)"}
{"traditional": "A", "simplified": "A", "pinyin": "A", "english": "(slang) (Tw) to steal"}
```
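Each CC-CEDICT entry has the form `traditional simplified [pinyin] /gloss/.../`. A minimal parsing sketch that produces JSON lines like those above (gloss handling simplified; the actual `run.sh` pipeline may differ):

```python
import json
import re

# entry format: `traditional simplified [pinyin] /gloss/gloss/.../`
LINE_RE = re.compile(r'^(\S+) (\S+) \[([^\]]+)\] /(.+)/$')

def parse_line(line):
    m = LINE_RE.match(line.strip())
    if m is None:
        return None  # comment or malformed entry
    traditional, simplified, pinyin, english = m.groups()
    return {"traditional": traditional, "simplified": simplified,
            "pinyin": pinyin, "english": english.replace("/", "; ")}

with open("exp/cedict", encoding="utf-8") as fin, \
        open("exp/cedict.json", "w", encoding="utf-8") as fout:
    for raw in fin:
        if raw.startswith("#"):
            continue
        entry = parse_line(raw)
        if entry is not None:
            fout.write(json.dumps(entry) + "\n")
```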

@ -1,5 +0,0 @@
# Download Baker dataset
The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.
Download URL https://test.data-baker.com/#/data/index/source.

@ -0,0 +1,3 @@
# G2P
* zh - Chinese G2P

@ -0,0 +1,93 @@
# G2P
* WS
jieba
* G2P
pypinyin
* Tone sandhi
simple
We recommend using the [Parakeet](https://github.com/PaddlePaddle/Parakeet) [TextFrontEnd](https://github.com/PaddlePaddle/Parakeet/blob/develop/parakeet/frontend/__init__.py) to do G2P.
The phoneme set should be changed; you can refer to `examples/thchs30/a0/data/dict/syllable.lexicon`.
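A minimal sketch of the jieba + pypinyin pipeline listed above, using tone-number style to match `trans.jieba.pinyin`; tone sandhi is omitted, so the output can differ from `ref.pinyin`:

```python
import jieba
from pypinyin import Style, lazy_pinyin

def g2p(text):
    """Segment with jieba, then convert each word to tone-number pinyin."""
    syllables = []
    for word in jieba.cut(text):
        syllables.extend(
            lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True))
    return " ".join(syllables)

print(g2p("卡尔普陪外孙玩滑梯"))
# expected (approximately): ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
```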
## Download Baker dataset
The [Baker](https://test.data-baker.com/#/data/index/source) dataset has to be downloaded manually and moved to './data',
because you have to pass a `CAPTCHA` in a browser to download it.
## RUN
```
. path.sh
./run.sh
```
## Result
```
exp/
|-- 000001-010000.txt
|-- ref.pinyin
|-- trans.jieba.pinyin
`-- trans.pinyin
0 directories, 4 files
```
```
4f5a368441eb16aaf43dc1972f8b63dd exp/000001-010000.txt
01707896391c2de9b6fc4a39654be942 exp/ref.pinyin
43380ef160f65a23a3a0544700aa49b8 exp/trans.jieba.pinyin
8e6ff1fc22d8e8584082e804e8bcdeb7 exp/trans.pinyin
```
```
==> exp/000001-010000.txt <==
000001 卡尔普#2陪外孙#1玩滑梯#4。
ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 假语村言#2别再#1拥抱我#4。
jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 宝马#1配挂#1跛骡鞍#3貂蝉#1怨枕#2董翁榻#4。
bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 邓小平#2与#1撒切尔#2会晤#4。
deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 老虎#1幼崽#2与#1宠物犬#1玩耍#4。
lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
==> exp/ref.pinyin <==
000001 ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu2 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan2 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi2 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.jieba.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
```

@ -1,4 +1,4 @@
export MAIN_ROOT=`realpath ${PWD}/../../`
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

@ -6,16 +6,19 @@ stage=-1
stop_stage=100
exp_dir=exp
data_dir=data
data=data
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
mkdir -p ${exp_dir}
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
test -e ${data}/BZNSYP.rar || { echo "Please download BZNSYP.rar and put it in ${data}"; exit -1; }
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "stage 0: Extracting Prosody Labeling"
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi
# convert Chinese transcriptions into pinyin with pypinyin or jieba+pypinyin

@ -1,10 +1,17 @@
# LibriSpeech
## Data
| Data Subset | Duration in Seconds |
| --- | --- |
| data/manifest.train | 0.83s ~ 29.735s |
| data/manifest.dev | 1.065s ~ 35.155s |
| data/manifest.test-clean | 1.285s ~ 34.955s |
## Deepspeech2
| Model | Params | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | 14.49190807 | test-clean | 0.067283 |
| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
| DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 |
| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | test-clean | 15.184467315673828 | 0.072154 |
| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | test-clean | - | 0.073973 |
| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 |

@ -4,14 +4,14 @@ data:
dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean
min_input_len: 0.0
max_input_len: 27.0 # second
max_input_len: 30.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator:
batch_size: 20
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
@ -40,9 +40,12 @@ model:
rnn_layer_size: 2048
use_gru: False
share_rnn_weights: True
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 4
lr: 1e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -4,14 +4,14 @@ data:
dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean
min_input_len: 0.0
max_input_len: 27.0 # second
max_input_len: 30.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator:
batch_size: 20
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
@ -42,9 +42,12 @@ model:
num_fc_layers: 2
fc_layers_size_list: 512, 256
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 4
lr: 1e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -20,8 +20,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -33,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
--model_type ${model_type} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -1,33 +0,0 @@
#!/bin/bash
if [ $# != 1 ];then
echo "usage: tune ckpt_path"
exit 1
fi
# grid-search for hyper-parameters in language model
python3 -u ${BIN_DIR}/tune.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--num_batches=-1 \
--batch_size=128 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_alphas=45 \
--num_betas=8 \
--alpha_from=1.0 \
--alpha_to=3.2 \
--beta_from=0.1 \
--beta_to=0.45 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -69,6 +69,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,6 +72,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -33,7 +33,7 @@ collator:
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
num_workers: 0
# network architecture
@ -67,6 +67,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -24,7 +24,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -69,6 +69,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,6 +72,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -22,7 +22,7 @@ collator:
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/augmentation.json
num_workers: 2
num_workers: 0
subsampling_factor: 1
num_encs: 1
@ -58,6 +58,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -32,7 +32,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -0,0 +1,3 @@
# Ngram LM
* s0 - kenlm ngram lm

@ -2,6 +2,95 @@
Train a Chinese character ngram LM with [kenlm](https://github.com/kpu/kenlm).
## Run
```
. path.sh
bash run.sh
```
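Once `run.sh` has produced the arpa and binary models, they can be loaded with the kenlm Python module for scoring; a small sketch (the file name is taken from the Results listing below):

```python
import kenlm

# binary char-level model produced by run.sh (see Results)
model = kenlm.Model("exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin")

# kenlm scores space-separated tokens; the char LM expects single characters
sentence = " ".join("少先队员因该为老人让坐")
print(model.score(sentence, bos=True, eos=True))  # total log10 probability
```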
## Results
```
exp/
|-- text
|-- text.char.tn
|-- text.word.tn
|-- text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa
|-- text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin
|-- text_zh_word_o3_p0_0_0_a22_q8_b8.arpa
`-- text_zh_word_o3_p0_0_0_a22_q8_b8.arpa.klm.bin
0 directories, 7 files
```
```
3ae083627b9b6cef1a82d574d8483f97 exp/text
d97da252d2a63a662af22f98af30cb8c exp/text.char.tn
c18b03005bd094dbfd9b46442be361fd exp/text.word.tn
73dbf50097896eda33985e11e1ba9a3a exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa
01334e2044c474b99c4f2ffbed790626 exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin
36a42de548045b54662411ae7982c77f exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa
332422803ffd73dd7ffd16cd2b0abcd5 exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa.klm.bin
```
```
==> exp/text <==
少先队员因该为老人让坐
祛痘印可以吗?有效果吗?
不知这款牛奶口感怎样? 小孩子喝行吗!
是转基因油?
我家宝宝13斤用多大码的
会起坨吗?
请问给送上楼吗?
亲是送赁上门吗
送货时候有外包装没有还是直接发货过来
会不会有坏的?
==> exp/text.char.tn <==
少 先 队 员 因 该 为 老 人 让 坐
祛 痘 印 可 以 吗 有 效 果 吗
不 知 这 款 牛 奶 口 感 怎 样 小 孩 子 喝 行 吗
是 转 基 因 油
我 家 宝 宝 十 三 斤 用 多 大 码 的
会 起 坨 吗
请 问 给 送 上 楼 吗
亲 是 送 赁 上 门 吗
送 货 时 候 有 外 包 装 没 有 还 是 直 接 发 货 过 来
会 不 会 有 坏 的
==> exp/text.word.tn <==
少先队员 因该 为 老人 让 坐
祛痘 印 可以 吗 有 效果 吗
不知 这 款 牛奶 口感 怎样 小孩子 喝行 吗
是 转基因 油
我家 宝宝 十三斤 用多大码 的
会起 坨 吗
请问 给 送 上楼 吗
亲是 送赁 上门 吗
送货 时候 有 外包装 没有 还是 直接 发货 过来
会 不会 有坏 的
==> exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa <==
\data\
ngram 1=587
ngram 2=395
ngram 3=100
ngram 4=2
ngram 5=0
\1-grams:
-3.272324 <unk> 0
0 <s> -0.36706257
==> exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa <==
\data\
ngram 1=689
ngram 2=1398
ngram 3=1506
\1-grams:
-3.1755018 <unk> 0
0 <s> -0.23069073
-1.2318869 </s> 0
-3.067262 少先队员 -0.051341705
```
