refactor train

add aishell egs
pull/522/head
Hui Zhang 5 years ago
parent 4926544859
commit 6f5b837e54

@@ -237,9 +237,9 @@ class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            batch_indices.extend(indices[-res_len:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
             assert len(indices) == len(
                 batch_indices
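Note on this change (repeated verbatim in DeepSpeech2BatchSampler below): a negative-zero slice start does not select an empty tail in Python, so extending with indices[-0:] would duplicate the whole list; the old assert guarded a case that really needs an if. A minimal standalone sketch of the pitfall:

# indices[-res_len:] with res_len == 0 returns the WHOLE list,
# because list[-0:] is the same as list[0:]; extending with it
# would duplicate every index in the shuffled batch.
indices = [0, 1, 2, 3, 4]

res_len = 0
assert indices[-res_len:] == indices        # the whole list, not []

res_len = 2
assert indices[-res_len:] == [3, 4]         # normal case: the tail

# Hence the guard introduced above:
batch_indices = []
if res_len != 0:
    batch_indices.extend(indices[-res_len:])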
@@ -381,9 +381,9 @@ class DeepSpeech2BatchSampler(BatchSampler):
         assert (clipped == False)
         if not clipped:
             res_len = len(indices) - shift_len - len(batch_indices)
-            assert res_len != 0, f"_batch_shuffle clipped {len(indices)} , {shift_len}, {len(batch_indices)}"
             # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            batch_indices.extend(indices[-res_len:])
+            if res_len != 0:
+                batch_indices.extend(indices[-res_len:])
             batch_indices.extend(indices[0:shift_len])
             assert len(indices) == len(
                 batch_indices
@@ -532,108 +532,4 @@ class SpeechCollator():
         audio_lens = np.array(audio_lens).astype('int64')
         texts = np.array(texts).astype('int32')
         text_lens = np.array(text_lens).astype('int64')
         return padded_audios, texts, audio_lens, text_lens
-
-
-# def create_dataloader(manifest_path,
-#                       vocab_filepath,
-#                       mean_std_filepath,
-#                       augmentation_config='{}',
-#                       max_duration=float('inf'),
-#                       min_duration=0.0,
-#                       stride_ms=10.0,
-#                       window_ms=20.0,
-#                       max_freq=None,
-#                       specgram_type='linear',
-#                       use_dB_normalization=True,
-#                       random_seed=0,
-#                       keep_transcription_text=False,
-#                       is_training=False,
-#                       batch_size=1,
-#                       num_workers=0,
-#                       sortagrad=False,
-#                       shuffle_method=None,
-#                       dist=False):
-#     dataset = DeepSpeech2Dataset(
-#         manifest_path,
-#         vocab_filepath,
-#         mean_std_filepath,
-#         augmentation_config=augmentation_config,
-#         max_duration=max_duration,
-#         min_duration=min_duration,
-#         stride_ms=stride_ms,
-#         window_ms=window_ms,
-#         max_freq=max_freq,
-#         specgram_type=specgram_type,
-#         use_dB_normalization=use_dB_normalization,
-#         random_seed=random_seed,
-#         keep_transcription_text=keep_transcription_text)
-
-#     if dist:
-#         batch_sampler = DeepSpeech2DistributedBatchSampler(
-#             dataset,
-#             batch_size,
-#             num_replicas=None,
-#             rank=None,
-#             shuffle=is_training,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-#     else:
-#         batch_sampler = DeepSpeech2BatchSampler(
-#             dataset,
-#             shuffle=is_training,
-#             batch_size=batch_size,
-#             drop_last=is_training,
-#             sortagrad=is_training,
-#             shuffle_method=shuffle_method)
-
-#     def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
-#         """
-#         Padding audio features with zeros to make them have the same shape (or
-#         a user-defined shape) within one batch.
-#
-#         If ``padding_to`` is -1, the maximum shape in the batch will be used
-#         as the target shape for padding. Otherwise, `padding_to` will be the
-#         target shape (only refers to the second axis).
-#
-#         If `flatten` is True, features will be flattened to a 1-D array.
-#         """
-#         new_batch = []
-#         # get target shape
-#         max_length = max([audio.shape[1] for audio, text in batch])
-#         if padding_to != -1:
-#             if padding_to < max_length:
-#                 raise ValueError("If padding_to is not -1, it should be larger "
-#                                  "than any instance's shape in the batch")
-#             max_length = padding_to
-#         max_text_length = max([len(text) for audio, text in batch])
-#         # padding
-#         padded_audios = []
-#         audio_lens = []
-#         texts, text_lens = [], []
-#         for audio, text in batch:
-#             padded_audio = np.zeros([audio.shape[0], max_length])
-#             padded_audio[:, :audio.shape[1]] = audio
-#             if flatten:
-#                 padded_audio = padded_audio.flatten()
-#             padded_audios.append(padded_audio)
-#             audio_lens.append(audio.shape[1])
-#             padded_text = np.zeros([max_text_length])
-#             padded_text[:len(text)] = text
-#             texts.append(padded_text)
-#             text_lens.append(len(text))
-#         padded_audios = np.array(padded_audios).astype('float32')
-#         audio_lens = np.array(audio_lens).astype('int64')
-#         texts = np.array(texts).astype('int32')
-#         text_lens = np.array(text_lens).astype('int64')
-#         return padded_audios, texts, audio_lens, text_lens
-
-#     loader = DataLoader(
-#         dataset,
-#         batch_sampler=batch_sampler,
-#         collate_fn=partial(padding_batch, is_training=is_training),
-#         num_workers=num_workers, )
-#     return loader
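The block deleted above was dead, commented-out code; its zero-padding collation survives in SpeechCollator, whose output appears in the context lines at the top of the hunk. For reference, a standalone numpy sketch of that collation (the name pad_batch is illustrative, not the live API):

import numpy as np

def pad_batch(batch):
    """batch: list of (audio [D, T], text [L]) pairs with varying T and L."""
    max_len = max(audio.shape[1] for audio, _ in batch)
    max_text_len = max(len(text) for _, text in batch)

    padded_audios, audio_lens, texts, text_lens = [], [], [], []
    for audio, text in batch:
        padded = np.zeros([audio.shape[0], max_len])
        padded[:, :audio.shape[1]] = audio      # left-aligned, zero-padded tail
        padded_audios.append(padded)
        audio_lens.append(audio.shape[1])

        padded_text = np.zeros([max_text_len])
        padded_text[:len(text)] = text
        texts.append(padded_text)
        text_lens.append(len(text))

    return (np.array(padded_audios).astype('float32'),
            np.array(texts).astype('int32'),
            np.array(audio_lens).astype('int64'),
            np.array(text_lens).astype('int64'))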

@@ -39,13 +39,13 @@ training:
   valid_interval: 1000
 
 decoding:
   batch_size: 128
-  error_rate_type: wer
+  error_rate_type: cer
   decoding_method: ctc_beam_search
-  lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
+  lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
+  alpha: 2.6
+  beta: 5.0
+  beam_size: 300
+  cutoff_prob: 0.99
   cutoff_top_n: 40
   num_proc_bsearch: 8
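The retuned alpha/beta pair follows the usual DeepSpeech2-style shallow fusion, in which alpha scales the external language model score and beta adds a per-word insertion bonus. A sketch of how the two enter a beam hypothesis score (assumed formulation from the DeepSpeech2 paper, not this project's decoder code):

def beam_score(ctc_log_prob, lm_log_prob, word_count,
               alpha=2.6, beta=5.0):
    # acoustic CTC score, plus the LM score scaled by alpha, plus a
    # length bonus that offsets the LM's preference for short outputs
    return ctc_log_prob + alpha * lm_log_prob + beta * word_count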

@@ -9,30 +9,12 @@ fi
 cd - > /dev/null
 
-# evaluate model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+CUDA_VISIBLE_DEVICES=6 \
 python3 -u ${MAIN_ROOT}/test.py \
---batch_size=128 \
---beam_size=300 \
---num_proc_bsearch=8 \
---num_conv_layers=2 \
---num_rnn_layers=3 \
---rnn_layer_size=1024 \
---alpha=2.6 \
---beta=5.0 \
---cutoff_prob=0.99 \
---cutoff_top_n=40 \
---use_gru=True \
---use_gpu=True \
---share_rnn_weights=False \
---test_manifest="data/manifest.test" \
---mean_std_path="data/mean_std.npz" \
---vocab_path="data/vocab.txt" \
---model_path="checkpoints/step_final" \
---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
---decoding_method="ctc_beam_search" \
---error_rate_type="cer" \
---specgram_type="linear"
+--device 'gpu' \
+--nproc 1 \
+--config conf/deepspeech2.yaml \
+--output ckpt
 
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -32,7 +32,7 @@ export FLAGS_sync_nccl_allreduce=0
 #--specgram_type="linear" \
 #--shuffle_method="batch_shuffle_clipped" \
-CUDA_VISIBLE_DEVICES=1,2,6,7 \
+CUDA_VISIBLE_DEVICES=2,3,5,7 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 4 \

@@ -25,8 +25,8 @@ data:
 model:
   num_conv_layers: 2
   num_rnn_layers: 3
   rnn_layer_size: 2048
-  use_gru: False
+  use_gru: True
   share_rnn_weights: True
 training:
   n_epoch: 20

@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 cd - > /dev/null
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${MAIN_ROOT}/test.py \
 --device 'gpu' \
 --nproc 1 \

@@ -3,8 +3,7 @@
 export FLAGS_sync_nccl_allreduce=0
 
 #CUDA_VISIBLE_DEVICES=0,1,2,3 \
-#CUDA_VISIBLE_DEVICES=0,4,5,6 \
-CUDA_VISIBLE_DEVICES=0 \
+CUDA_VISIBLE_DEVICES=0,1 \
 python3 -u ${MAIN_ROOT}/train.py \
 --device 'gpu' \
 --nproc 1 \

@@ -68,32 +68,18 @@ class DeepSpeech2Trainer(Trainer):
         loss = self.criterion(logits, texts, logits_len, texts_len)
         return loss
 
-    def read_batch(self):
-        """Read a batch from the train_loader.
-
-        Returns
-        -------
-        List[Tensor]
-            A batch.
-        """
-        try:
-            batch = next(self.iterator)
-        except StopIteration as e:
-            raise e
-        return batch
-
-    def train_batch(self):
+    def train_batch(self, batch_data):
         start = time.time()
-        batch = self.read_batch()
-        data_loader_time = time.time() - start
-
-        self.optimizer.clear_grad()
         self.model.train()
-        audio, text, audio_len, text_len = batch
-        batch_size = audio.shape[0]
+
+        audio, text, audio_len, text_len = batch_data
         outputs = self.model(audio, text, audio_len, text_len)
-        loss = self.compute_losses(batch, outputs)
+        loss = self.compute_losses(batch_data, outputs)
         loss.backward()
         self.optimizer.step()
+        self.optimizer.clear_grad()
 
         iteration_time = time.time() - start
 
         losses_np = {
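The reordering in train_batch lands on the standard Paddle 2.x step sequence: backward, step, then clear_grad, so gradients are zeroed after the update rather than before the forward pass. A self-contained toy sketch (model, optimizer, and data here are placeholders):

import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.SGD(learning_rate=0.1,
                                 parameters=model.parameters())

x = paddle.randn([8, 4])
loss = model(x).mean()

loss.backward()         # accumulate gradients
optimizer.step()        # apply the update
optimizer.clear_grad()  # reset gradients for the next batch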
@@ -104,13 +90,9 @@ class DeepSpeech2Trainer(Trainer):
         msg = "Train: Rank: {}, ".format(dist.get_rank())
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
-                                                  iteration_time)
-        msg += f"batch size: {batch_size}, "
+        msg += "time: {:>.3f}s, ".format(iteration_time)
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
-        #if self.iteration % 100 == 0:
         self.logger.info(msg)
 
         if dist.get_rank() == 0 and self.visualizer:
@@ -118,6 +100,14 @@ class DeepSpeech2Trainer(Trainer):
             self.visualizer.add_scalar("train/{}".format(k), v,
                                        self.iteration)
 
+    def new_epoch(self):
+        """Reset the train loader and increment ``epoch``."""
+        if self.parallel:
+            # batch sampler epoch starts from 0
+            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.epoch += 1
+
     def train(self):
         """The training process.
@ -126,21 +116,20 @@ class DeepSpeech2Trainer(Trainer):
""" """
self.new_epoch() self.new_epoch()
while self.epoch <= self.config.training.n_epoch: while self.epoch <= self.config.training.n_epoch:
try: for batch in self.train_loader:
self.iteration += 1 self.iteration += 1
self.train_batch() self.train_batch(batch)
# if self.iteration % self.config.training.valid_interval == 0: # if self.iteration % self.config.training.valid_interval == 0:
# self.valid() # self.valid()
# if self.iteration % self.config.training.save_interval == 0: # if self.iteration % self.config.training.save_interval == 0:
# self.save() # self.save()
except StopIteration:
self.iteration -= 1 #epoch end, iteration ahead 1 self.valid()
self.valid() self.save()
self.save() self.lr_scheduler.step()
self.lr_scheduler.step() self.new_epoch()
self.new_epoch()
def compute_metrics(self, inputs, outputs): def compute_metrics(self, inputs, outputs):
pass pass
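The new_epoch hook exists so that, in the parallel case, each epoch gets a fresh shuffle that is still consistent across ranks: a distributed batch sampler seeds its permutation from the epoch number, so skipping set_epoch replays the same order every epoch. A minimal sketch with paddle.io (toy dataset; the trainer makes the equivalent set_epoch call on its own batch sampler):

from paddle.io import Dataset, DistributedBatchSampler

class ToyDataset(Dataset):
    def __len__(self):
        return 16
    def __getitem__(self, idx):
        return idx

sampler = DistributedBatchSampler(ToyDataset(), batch_size=4, shuffle=True)
for epoch in range(2):
    sampler.set_epoch(epoch)   # new, rank-consistent permutation per epoch
    print(list(sampler))       # batches of indices for this epoch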
@@ -152,14 +141,13 @@ class DeepSpeech2Trainer(Trainer):
         valid_losses = defaultdict(list)
         for i, batch in enumerate(self.valid_loader):
             audio, text, audio_len, text_len = batch
-            batch_size = audio.shape[0]
             outputs = self.model(audio, text, audio_len, text_len)
             loss = self.compute_losses(batch, outputs)
             metrics = self.compute_metrics(batch, outputs)
 
             valid_losses['val_loss'].append(float(loss))
             valid_losses['val_loss_div_batchsize'].append(
-                float(loss) / batch_size)
+                float(loss) / self.config.data.batch_size)
 
         # write visual log
         valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
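One caveat with the new denominator: dividing by the configured batch size matches dividing by the actual batch size only when every validation batch is full (drop_last, or evenly divisible data); a ragged final batch skews the average. Toy numbers:

config_batch_size = 64
summed_losses = [128.0, 120.0, 30.0]   # per-batch summed losses
actual_sizes = [64, 64, 16]            # last batch is ragged

by_actual = sum(l / n for l, n in zip(summed_losses, actual_sizes)) / 3
by_config = sum(l / config_batch_size for l in summed_losses) / 3
print(by_actual, by_config)            # ~1.917 vs ~1.448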
