refactor train

add aishell egs
pull/522/head
Hui Zhang 5 years ago
parent 4926544859
commit 6f5b837e54

@@ -237,9 +237,9 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
-            batch_indices.extend(indices[-res_len:])
+            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
         assert len(indices) == len(
             batch_indices
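
Note (same fix lands in DeepSpeech2BatchSampler below): the old code asserted
res_len != 0 and then unconditionally extended with indices[-res_len:]. The
guard matters because Python slicing makes indices[-0:] the whole list rather
than an empty one, so a zero-length tail would silently duplicate every index.
A minimal standalone sketch of the gotcha and the fix (illustrative only, not
the repo's helper):

    indices = list(range(10))
    res_len = 0
    assert indices[-res_len:] == indices[:]  # -0 slices from the start: whole list!
    # hence the guarded extend: only re-append the tail when one exists
    tail = indices[-res_len:] if res_len != 0 else []
    assert tail == []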
@@ -381,9 +381,9 @@ class DeepSpeech2BatchSampler(BatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
-            batch_indices.extend(indices[-res_len:])
+            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
         assert len(indices) == len(
             batch_indices
@@ -533,107 +533,3 @@ class SpeechCollator():
         texts = np.array(texts).astype('int32')
         text_lens = np.array(text_lens).astype('int64')
         return padded_audios, texts, audio_lens, text_lens
-# def create_dataloader(manifest_path,
-#                       vocab_filepath,
-#                       mean_std_filepath,
-#                       augmentation_config='{}',
-#                       max_duration=float('inf'),
-#                       min_duration=0.0,
-#                       stride_ms=10.0,
-#                       window_ms=20.0,
-#                       max_freq=None,
-#                       specgram_type='linear',
-#                       use_dB_normalization=True,
-#                       random_seed=0,
-#                       keep_transcription_text=False,
-#                       is_training=False,
-#                       batch_size=1,
-#                       num_workers=0,
-#                       sortagrad=False,
-#                       shuffle_method=None,
-#                       dist=False):
-#     dataset = DeepSpeech2Dataset(
-#         manifest_path,
-#         vocab_filepath,
-#         mean_std_filepath,
-#         augmentation_config=augmentation_config,
-#         max_duration=max_duration,
-#         min_duration=min_duration,
-#         stride_ms=stride_ms,
-#         window_ms=window_ms,
-#         max_freq=max_freq,
-#         specgram_type=specgram_type,
-#         use_dB_normalization=use_dB_normalization,
-#         random_seed=random_seed,
-#         keep_transcription_text=keep_transcription_text)
-#     if dist:
-#         batch_sampler = DeepSpeech2DistributedBatchSampler(
-#             dataset,
-#             batch_size,
-#             num_replicas=None,
-#             rank=None,
-#             shuffle=is_training,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     else:
-#         batch_sampler = DeepSpeech2BatchSampler(
-#             dataset,
-#             shuffle=is_training,
-#             batch_size=batch_size,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
-#         """
-#         Padding audio features with zeros to make them have the same shape (or
-#         a user-defined shape) within one batch.
-#         If ``padding_to`` is -1, the maximum shape in the batch will be used
-#         as the target shape for padding. Otherwise, `padding_to` will be the
-#         target shape (only refers to the second axis).
-#         If `flatten` is True, features will be flattened to a 1-D array.
-#         """
-#         new_batch = []
-#         # get target shape
-#         max_length = max([audio.shape[1] for audio, text in batch])
-#         if padding_to != -1:
-#             if padding_to < max_length:
-#                 raise ValueError("If padding_to is not -1, it should be larger "
-#                                  "than any instance's shape in the batch")
-#             max_length = padding_to
-#         max_text_length = max([len(text) for audio, text in batch])
-#         # padding
-#         padded_audios = []
-#         audio_lens = []
-#         texts, text_lens = [], []
-#         for audio, text in batch:
-#             padded_audio = np.zeros([audio.shape[0], max_length])
-#             padded_audio[:, :audio.shape[1]] = audio
-#             if flatten:
-#                 padded_audio = padded_audio.flatten()
-#             padded_audios.append(padded_audio)
-#             audio_lens.append(audio.shape[1])
-#             padded_text = np.zeros([max_text_length])
-#             padded_text[:len(text)] = text
-#             texts.append(padded_text)
-#             text_lens.append(len(text))
-#         padded_audios = np.array(padded_audios).astype('float32')
-#         audio_lens = np.array(audio_lens).astype('int64')
-#         texts = np.array(texts).astype('int32')
-#         text_lens = np.array(text_lens).astype('int64')
-#         return padded_audios, texts, audio_lens, text_lens
-#     loader = DataLoader(
-#         dataset,
-#         batch_sampler=batch_sampler,
-#         collate_fn=partial(padding_batch, is_training=is_training),
-#         num_workers=num_workers, )
-#     return loader
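
Note: the commented-out create_dataloader path was dead code; SpeechCollator
(whose tail is the context above) now owns the padding logic. For reference, a
self-contained sketch of that zero-padding scheme (hypothetical helper name,
mirroring the deleted padding_batch):

    import numpy as np

    def pad_batch(batch):
        """Zero-pad (feat_dim, time) features and token ids to the batch max."""
        max_len = max(audio.shape[1] for audio, text in batch)
        max_text = max(len(text) for _, text in batch)
        audios, audio_lens, texts, text_lens = [], [], [], []
        for audio, text in batch:
            padded = np.zeros([audio.shape[0], max_len], dtype='float32')
            padded[:, :audio.shape[1]] = audio          # copy; the rest stays zero
            audios.append(padded)
            audio_lens.append(audio.shape[1])
            padded_text = np.zeros([max_text], dtype='int32')
            padded_text[:len(text)] = text
            texts.append(padded_text)
            text_lens.append(len(text))
        return (np.stack(audios),                       # (B, feat_dim, max_len) float32
                np.stack(texts),                        # (B, max_text) int32
                np.array(audio_lens, dtype='int64'),
                np.array(text_lens, dtype='int64'))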

@@ -39,13 +39,13 @@ training:
   valid_interval: 1000
 decoding:
   batch_size: 128
-  error_rate_type: wer
+  error_rate_type: cer
   decoding_method: ctc_beam_search
-  lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
+  lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
+  alpha: 2.6
+  beta: 5.0
+  beam_size: 300
+  cutoff_prob: 0.99
   cutoff_top_n: 40
   num_proc_bsearch: 8
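
Note: the aishell eg scores Mandarin, hence CER instead of WER and the
zh_giga KLM instead of the English common_crawl model. In DeepSpeech-style
CTC beam search, alpha and beta are the usual shallow-fusion weights,
roughly score(y) = log p_ctc(y|x) + alpha * log p_lm(y) + beta * |y|, so
alpha: 2.6 / beta: 5.0 leans harder on the LM and rewards longer hypotheses.
A toy scorer under that assumption (not the decoder's actual code):

    def fused_score(log_p_ctc, log_p_lm, length, alpha=2.6, beta=5.0):
        # acoustic score + weighted LM score + length bonus
        return log_p_ctc + alpha * log_p_lm + beta * length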

@@ -9,30 +9,12 @@ fi
 cd - > /dev/null

 # evaluate model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+CUDA_VISIBLE_DEVICES=6 \
 python3 -u ${MAIN_ROOT}/test.py \
---batch_size=128 \
---beam_size=300 \
---num_proc_bsearch=8 \
---num_conv_layers=2 \
---num_rnn_layers=3 \
---rnn_layer_size=1024 \
---alpha=2.6 \
---beta=5.0 \
---cutoff_prob=0.99 \
---cutoff_top_n=40 \
---use_gru=True \
---use_gpu=True \
---share_rnn_weights=False \
---test_manifest="data/manifest.test" \
---mean_std_path="data/mean_std.npz" \
---vocab_path="data/vocab.txt" \
---model_path="checkpoints/step_final" \
---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
---decoding_method="ctc_beam_search" \
---error_rate_type="cer" \
---specgram_type="linear"
+--device 'gpu' \
+--nproc 1 \
+--config conf/deepspeech2.yaml \
+--output ckpt

 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -32,7 +32,7 @@ export FLAGS_sync_nccl_allreduce=0
 #--specgram_type="linear" \
 #--shuffle_method="batch_shuffle_clipped" \
-CUDA_VISIBLE_DEVICES=1,2,6,7 \
+CUDA_VISIBLE_DEVICES=2,3,5,7 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 4 \

@@ -26,7 +26,7 @@ model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
-  use_gru: False
+  use_gru: True
   share_rnn_weights: True
 training:
   n_epoch: 20

@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 cd - > /dev/null
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${MAIN_ROOT}/test.py \
 --device 'gpu' \
 --nproc 1 \

@@ -3,8 +3,7 @@
 export FLAGS_sync_nccl_allreduce=0
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
-#CUDA_VISIBLE_DEVICES=0,4,5,6 \
-CUDA_VISIBLE_DEVICES=0 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 1 \

@@ -68,32 +68,18 @@ class DeepSpeech2Trainer(Trainer):
         loss = self.criterion(logits, texts, logits_len, texts_len)
         return loss

-    def read_batch(self):
-        """Read a batch from the train_loader.
-        Returns
-        -------
-        List[Tensor]
-            A batch.
-        """
-        try:
-            batch = next(self.iterator)
-        except StopIteration as e:
-            raise e
-        return batch
-
-    def train_batch(self):
+    def train_batch(self, batch_data):
         start = time.time()
-        batch = self.read_batch()
-        data_loader_time = time.time() - start
-        self.optimizer.clear_grad()
         self.model.train()
-        audio, text, audio_len, text_len = batch
-        batch_size = audio.shape[0]
+        audio, text, audio_len, text_len = batch_data
         outputs = self.model(audio, text, audio_len, text_len)
-        loss = self.compute_losses(batch, outputs)
+        loss = self.compute_losses(batch_data, outputs)
         loss.backward()
         self.optimizer.step()
+        self.optimizer.clear_grad()
         iteration_time = time.time() - start

         losses_np = {
@@ -104,13 +90,9 @@ class DeepSpeech2Trainer(Trainer):
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
-                                                  iteration_time)
-        msg += f"batch size: {batch_size}, "
+        msg += "time: {:>.3f}s, ".format(iteration_time)
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
-        #if self.iteration % 100 == 0:
         self.logger.info(msg)
         if dist.get_rank() == 0 and self.visualizer:
@@ -118,6 +100,14 @@ class DeepSpeech2Trainer(Trainer):
                 self.visualizer.add_scalar("train/{}".format(k), v,
                                            self.iteration)

+    def new_epoch(self):
+        """Reset the train loader and increment ``epoch``.
+        """
+        if self.parallel:
+            # batch sampler epoch start from 0
+            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.epoch += 1
+
     def train(self):
         """The training process.
@@ -126,21 +116,20 @@ class DeepSpeech2Trainer(Trainer):
         """
         self.new_epoch()
         while self.epoch <= self.config.training.n_epoch:
-            try:
+            for batch in self.train_loader:
                 self.iteration += 1
-                self.train_batch()
+                self.train_batch(batch)
                 # if self.iteration % self.config.training.valid_interval == 0:
                 #     self.valid()
                 # if self.iteration % self.config.training.save_interval == 0:
                 #     self.save()
-            except StopIteration:
-                self.iteration -= 1  #epoch end, iteration ahead 1
-                self.valid()
-                self.save()
-                self.lr_scheduler.step()
-                self.new_epoch()
+            self.valid()
+            self.save()
+            self.lr_scheduler.step()
+            self.new_epoch()

     def compute_metrics(self, inputs, outputs):
         pass
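
Note: with the loader driving the epoch as a plain for loop, the StopIteration
dance (and the iteration counter walk-back) disappears, and new_epoch() seeding
the distributed sampler with the epoch number is what keeps shuffling
deterministic across ranks while still varying per epoch. A sketch of the idea
behind set_epoch (illustrative, not Paddle's implementation):

    import numpy as np

    def epoch_indices(num_samples, epoch):
        rng = np.random.RandomState(epoch)  # epoch-seeded: identical on every rank
        idx = np.arange(num_samples)
        rng.shuffle(idx)
        return idx

    assert (epoch_indices(8, 0) == epoch_indices(8, 0)).all()  # ranks agree
    assert (epoch_indices(8, 0) != epoch_indices(8, 1)).any()  # epochs differ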
@@ -152,14 +141,13 @@ class DeepSpeech2Trainer(Trainer):
         valid_losses = defaultdict(list)
         for i, batch in enumerate(self.valid_loader):
             audio, text, audio_len, text_len = batch
-            batch_size = audio.shape[0]
             outputs = self.model(audio, text, audio_len, text_len)
             loss = self.compute_losses(batch, outputs)
             metrics = self.compute_metrics(batch, outputs)
             valid_losses['val_loss'].append(float(loss))
             valid_losses['val_loss_div_batchsize'].append(
-                float(loss) / batch_size)
+                float(loss) / self.config.data.batch_size)

         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
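
Note: normalizing by self.config.data.batch_size assumes every validation
batch is full; with a short final batch (drop_last=False) the configured
divisor understates that batch's per-sample loss, whereas the dropped
audio.shape[0] was exact. Illustrative numbers only:

    loss_sum = 51.2
    config_bs, actual_bs = 64, 48      # last batch of the set is short
    print(loss_sum / config_bs)        # 0.8   (configured divisor)
    print(loss_sum / actual_bs)        # ~1.07 (actual batch size)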
