Merge pull request #657 from PaddlePaddle/add_utt

add utt to datapipeline
pull/669/head
Hui Zhang 4 years ago committed by GitHub
commit 1cd88d2619
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -43,7 +43,8 @@ class DeepSpeech2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
start = time.time() start = time.time()
loss = self.model(*batch_data) utt, audio, audio_len, text, text_len = batch_data
loss = self.model(audio, audio_len, text, text_len)
loss.backward() loss.backward()
layer_tools.print_grads(self.model, print_func=None) layer_tools.print_grads(self.model, print_func=None)
self.optimizer.step() self.optimizer.step()
@ -73,9 +74,10 @@ class DeepSpeech2Trainer(Trainer):
num_seen_utts = 1 num_seen_utts = 1
total_loss = 0.0 total_loss = 0.0
for i, batch in enumerate(self.valid_loader): for i, batch in enumerate(self.valid_loader):
loss = self.model(*batch) utt, audio, audio_len, text, text_len = batch
loss = self.model(audio, audio_len, text, text_len)
if paddle.isfinite(loss): if paddle.isfinite(loss):
num_utts = batch[0].shape[0] num_utts = batch[1].shape[0]
num_seen_utts += num_utts num_seen_utts += num_utts
total_loss += float(loss) * num_utts total_loss += float(loss) * num_utts
valid_losses['val_loss'].append(float(loss)) valid_losses['val_loss'].append(float(loss))
@ -191,7 +193,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
trans.append(''.join([chr(i) for i in ids])) trans.append(''.join([chr(i) for i in ids]))
return trans return trans
def compute_metrics(self, audio, audio_len, texts, texts_len): def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None):
cfg = self.config.decoding cfg = self.config.decoding
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@ -213,11 +215,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cutoff_top_n=cfg.cutoff_top_n, cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch) num_processes=cfg.num_proc_bsearch)
for target, result in zip(target_transcripts, result_transcripts): for utt, target, result in zip(utts, target_transcripts, result_transcripts):
errors, len_ref = errors_func(target, result) errors, len_ref = errors_func(target, result)
errors_sum += errors errors_sum += errors
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout:
fout.write(utt + " " + result + "\n")
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result)) (target, result))
logger.info("Current error rate [%s] = %f" % logger.info("Current error rate [%s] = %f" %
@ -238,9 +242,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cfg = self.config cfg = self.config
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
with open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch) utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout)
errors_sum += metrics['errors_sum'] errors_sum += metrics['errors_sum']
len_refs += metrics['len_refs'] len_refs += metrics['len_refs']
num_ins += metrics['num_ins'] num_ins += metrics['num_ins']

@ -76,8 +76,9 @@ class U2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(*batch_data) loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward() loss.backward()
@ -119,9 +120,10 @@ class U2Trainer(Trainer):
num_seen_utts = 1 num_seen_utts = 1
total_loss = 0.0 total_loss = 0.0
for i, batch in enumerate(self.valid_loader): for i, batch in enumerate(self.valid_loader):
loss, attention_loss, ctc_loss = self.model(*batch) utt, audio, audio_len, text, text_len = batch
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
if paddle.isfinite(loss): if paddle.isfinite(loss):
num_utts = batch[0].shape[0] num_utts = batch[1].shape[0]
num_seen_utts += num_utts num_seen_utts += num_utts
total_loss += float(loss) * num_utts total_loss += float(loss) * num_utts
valid_losses['val_loss'].append(float(loss)) valid_losses['val_loss'].append(float(loss))
@ -366,7 +368,7 @@ class U2Tester(U2Trainer):
trans.append(''.join([chr(i) for i in ids])) trans.append(''.join([chr(i) for i in ids]))
return trans return trans
def compute_metrics(self, audio, audio_len, texts, texts_len, fout=None): def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None):
cfg = self.config.decoding cfg = self.config.decoding
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@ -393,13 +395,13 @@ class U2Tester(U2Trainer):
simulate_streaming=cfg.simulate_streaming) simulate_streaming=cfg.simulate_streaming)
decode_time = time.time() - start_time decode_time = time.time() - start_time
for target, result in zip(target_transcripts, result_transcripts): for utt, target, result in zip(utts, target_transcripts, result_transcripts):
errors, len_ref = errors_func(target, result) errors, len_ref = errors_func(target, result)
errors_sum += errors errors_sum += errors
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(result + "\n") fout.write(utt + " " + result + "\n")
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result)) (target, result))
logger.info("One example error rate [%s] = %f" % logger.info("One example error rate [%s] = %f" %

@ -51,7 +51,10 @@ class SpeechCollator():
audio_lens = [] audio_lens = []
texts = [] texts = []
text_lens = [] text_lens = []
for audio, text in batch: utts = []
for utt, audio, text in batch:
#utt
utts.append(utt)
# audio # audio
audios.append(audio.T) # [T, D] audios.append(audio.T) # [T, D]
audio_lens.append(audio.shape[1]) audio_lens.append(audio.shape[1])
@ -75,4 +78,4 @@ class SpeechCollator():
padded_texts = pad_sequence( padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64) texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64)
return padded_audios, audio_lens, padded_texts, text_lens return utts, padded_audios, audio_lens, padded_texts, text_lens

@ -347,4 +347,6 @@ class ManifestDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
instance = self._manifest[idx] instance = self._manifest[idx]
return self.process_utterance(instance["feat"], instance["text"]) feat, text =self.process_utterance(instance["feat"],
instance["text"])
return instance["utt"], feat, text

@ -905,6 +905,7 @@ class U2InferModel(U2Model):
def __init__(self, configs: dict): def __init__(self, configs: dict):
super().__init__(configs) super().__init__(configs)
def forward(self, def forward(self,
feats, feats,
feats_lengths, feats_lengths,

@ -114,7 +114,8 @@ class ConvBn(nn.Layer):
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply # TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x) # masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks) x = x.multiply(masks)
return x, x_len return x, x_len

@ -309,6 +309,6 @@ class RNNStack(nn.Layer):
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1] masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply # TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x) masks = masks.astype(x.dtype)
x = x.multiply(masks) x = x.multiply(masks)
return x, x_len return x, x_len

@ -26,7 +26,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -1,7 +1,7 @@
dev-clean/ dev-clean
dev-other/ dev-other
test-clean/ test-clean
test-other/ test-other
train-clean-100/ train-clean-100
train-clean-360/ train-clean-360
train-other-500/ train-other-500

@ -24,7 +24,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -24,7 +24,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -11,7 +11,7 @@ avg_num=1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num} avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ###ckpt = deepspeech2
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@ -26,7 +26,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -70,7 +70,7 @@ model:
training: training:
n_epoch: 20 n_epoch: 2
accum_grad: 1 accum_grad: 1
global_grad_clip: 5.0 global_grad_clip: 5.0
optim: adam optim: adam
@ -85,7 +85,7 @@ training:
decoding: decoding:
batch_size: 64 batch_size: 8 #64
error_rate_type: wer error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm

@ -20,12 +20,12 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} ./local/train.sh ${conf_path} ${ckpt}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

Loading…
Cancel
Save