pull/877/head · huangyuxin · commit 37e02308b2

@ -15,7 +15,7 @@ from yacs.config import CfgNode
from deepspeech.exps.u2_st.model import U2STTester
from deepspeech.exps.u2_st.model import U2STTrainer
from deepspeech.io.collator_st import SpeechCollator
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.u2_st import U2STModel

@ -322,7 +322,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[-1]
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
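The added `.lower()` makes the suffix check case-insensitive, so Kaldi-style paths such as `feats.ARK:123` resolve to the same loader as `feats.ark:123`. A minimal standalone sketch of the intended behaviour (not the project's full `LoadInputsAndTargets.file_type`, which handles more suffixes):

```python
def file_type(filepath: str) -> str:
    """Guess the loader type from a (possibly Kaldi-style) path like 'feats.ark:123'."""
    # "dump/feats.ARK:2713" -> "dump/feats.ARK" -> "ARK" -> "ark"
    suffix = filepath.split(":")[0].split(".")[-1].lower()
    if suffix == "ark":
        return "mat"
    elif suffix == "scp":
        return "scp"
    raise NotImplementedError(f"Not supported: loader_type={suffix}")

assert file_type("dump/feats.ARK:2713") == "mat"
assert file_type("data/feats.scp") == "scp"
```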

@ -113,7 +113,8 @@ class U2STBaseModel(nn.Layer):
asr_weight: float=0.0,
ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0,
length_normalized_loss: bool=False):
length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__()
@ -650,7 +651,7 @@ class U2STModel(U2STBaseModel):
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=model_conf['ctc_dropout_rate'],
dropout_rate=model_conf['ctc_dropoutrate'],
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
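Taken together, the new `**kwargs` in `U2STBaseModel.__init__` and the `ctc_dropoutrate` key let the constructor be fed the YAML `model_conf` block directly, even when it carries keys the base class does not consume itself. A hedged illustration of the pattern with a simplified stand-in class (not the real `U2STModel` signature):

```python
class TinySTModel:
    def __init__(self, ctc_weight: float = 0.0, lsm_weight: float = 0.0,
                 length_normalized_loss: bool = False, **kwargs):
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
        # **kwargs absorbs config keys consumed elsewhere (e.g. by the CTC head),
        # such as ctc_dropoutrate and ctc_grad_norm_type.
        self.ctc_weight = ctc_weight
        self.extra = kwargs

# keys mirror the model_conf block in the config further down this diff
model_conf = {
    "ctc_weight": 0.5,
    "ctc_dropoutrate": 0.0,
    "ctc_grad_norm_type": "batch",
    "lsm_weight": 0.1,
    "length_normalized_loss": False,
}
model = TinySTModel(**model_conf)
print(model.extra["ctc_dropoutrate"])  # 0.0
```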

@ -14,6 +14,7 @@
"""This module provides functions to calculate error rate in different level.
e.g. wer for word-level, cer for char-level.
"""
import editdistance
import numpy as np
__all__ = ['word_errors', 'char_errors', 'wer', 'cer']
@ -89,6 +90,7 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
hyp_words = list(filter(None, hypothesis.split(delimiter)))
edit_distance = _levenshtein_distance(ref_words, hyp_words)
# edit_distance = editdistance.eval(ref_words, hyp_words)
return float(edit_distance), len(ref_words)
@ -119,6 +121,7 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))
edit_distance = _levenshtein_distance(reference, hypothesis)
# edit_distance = editdistance.eval(reference, hypothesis)
return float(edit_distance), len(reference)
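The commented-out lines point at the intended drop-in replacement: `editdistance.eval` computes the same Levenshtein distance as the pure-Python `_levenshtein_distance`, typically much faster, which is why `editdistance` is added to the requirements below. A small hedged sketch of WER/CER computed that way (standalone helpers, not the module's public API):

```python
import editdistance

def wer(reference: str, hypothesis: str, delimiter: str = " ") -> float:
    """Word error rate: word-level edit distance / number of reference words."""
    ref_words = list(filter(None, reference.split(delimiter)))
    hyp_words = list(filter(None, hypothesis.split(delimiter)))
    return editdistance.eval(ref_words, hyp_words) / len(ref_words)

def cer(reference: str, hypothesis: str) -> float:
    """Character error rate: char-level edit distance / number of reference chars."""
    return editdistance.eval(reference, hypothesis) / len(reference)

print(wer("the cat sat on the mat", "the cat sit on mat"))  # 2 errors / 6 words ~ 0.333
print(cer("今天天气很好", "今天天气好"))  # 1 deletion / 6 chars ~ 0.167
```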

@ -93,20 +93,25 @@ def pad_sequence(sequences: List[paddle.Tensor],
for i, tensor in enumerate(sequences):
length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor
logger.info(
f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
)
if batch_first:
# TODO (Hui Zhang): set_value op does not support `end==start`
# TODO (Hui Zhang): set_value op not support int16
# TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length, ...] = tensor
out_tensor[i, :length] = tensor
else:
out_tensor[i, length, ...] = tensor
out_tensor[i, length] = tensor
else:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i, ...] = tensor
out_tensor[:length, i] = tensor
else:
out_tensor[length, i, ...] = tensor
out_tensor[length, i] = tensor
return out_tensor
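The `length != 0` guard works around Paddle's `set_value` op rejecting empty (`end == start`) slices, and the dropped `...` indexing is the other workaround noted in the TODOs. A hedged numpy illustration of the padding semantics this function implements (same behaviour, not the Paddle code path):

```python
import numpy as np
from typing import List

def pad_sequence_np(sequences: List[np.ndarray], batch_first: bool = True,
                    padding_value: float = 0.0) -> np.ndarray:
    """Pad variable-length arrays along dim 0 to the longest one, like pad_sequence."""
    max_len = max(s.shape[0] for s in sequences)
    trailing = sequences[0].shape[1:]
    if batch_first:
        out = np.full((len(sequences), max_len) + trailing, padding_value,
                      dtype=sequences[0].dtype)
    else:
        out = np.full((max_len, len(sequences)) + trailing, padding_value,
                      dtype=sequences[0].dtype)
    for i, s in enumerate(sequences):
        length = s.shape[0]
        if length == 0:
            continue  # nothing to copy; mirrors the guarded `length != 0` branch
        if batch_first:
            out[i, :length] = s
        else:
            out[:length, i] = s
    return out

batch = pad_sequence_np([np.ones(5), np.ones(3), np.zeros(0)])
print(batch.shape)  # (3, 5); the shorter sequences are zero-padded
```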

@ -1,3 +1,3 @@
TED-En-Zh
TED_EnZh
data
exp

@ -6,5 +6,10 @@
| Data Subset | Duration in Seconds (min ~ max) |
| --- | --- |
| data/manifest.train | 0.942 ~ 60 |
| data/manifest.dev | 1.151 ~ 39 |
| data/manifest.test | 1.1 ~ 42.746 |
## Transformer
| Model | Params | Config | Char-BLEU |
| --- | --- | --- | --- |
| Transformer+ASR MTL | 50.26M | conf/transformer_joint_noam.yaml | 17.38 |

@ -1,5 +1,7 @@
#!/bin/bash
set -e
stage=-1
stop_stage=100
@ -7,7 +9,7 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
data_dir=/mnt/dataset/TED_EnZh
data_dir=./TED_EnZh
source ${MAIN_ROOT}/utils/parse_options.sh

@ -6,7 +6,7 @@ stage=0
stop_stage=100
conf_path=conf/transformer_joint_noam.yaml
avg_num=5
data_path=./TED-En-Zh # path to unzipped data
data_path=./TED_EnZh # path to unzipped data
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}

@ -0,0 +1,3 @@
data
exp
test.profile

@ -1,3 +1,9 @@
# TIMIT
Results will be organized and updated soon.
### Transformer
| Model | Params | Config | Decode method | Loss | PER |
| --- | --- | --- | --- | --- | --- |
| transformer | 5.17M | conf/transformer.yaml | attention | 49.25688171386719 | 0.510742 |
| transformer | 5.17M | conf/transformer.yaml | ctc_greedy_search | 49.25688171386719 | 0.382398 |
| transformer | 5.17M | conf/transformer.yaml | ctc_prefix_beam_search | 49.25688171386719 | 0.367429 |
| transformer | 5.17M | conf/transformer.yaml | attention_rescoring | 49.25688171386719 | 0.357173 |

@ -3,12 +3,12 @@ data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5 # second
max_input_len: 30.0 # second
min_input_len: 0.0 # second
max_input_len: 10.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 100.0
max_output_len: 150.0 # tokens
min_output_input_ratio: 0.005
max_output_input_ratio: 1000.0
collator:
vocab_filepath: data/vocab.txt
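The `data` block acts as a manifest filter: utterances whose duration, token count, or output-to-input ratio falls outside these bounds are dropped before training, and the updated bounds suit TIMIT's short utterances (at most 10 s of audio, at most 150 output tokens). A hedged sketch of that filtering (the `duration`/`text` manifest keys are assumptions about the schema, not verified):

```python
def keep_utterance(entry: dict,
                   min_input_len=0.0, max_input_len=10.0,       # seconds
                   min_output_len=0.0, max_output_len=150.0,    # tokens
                   min_output_input_ratio=0.005,
                   max_output_input_ratio=1000.0) -> bool:
    """Return True if a manifest entry survives the configured length filters."""
    duration = entry["duration"]    # assumed: audio length in seconds
    n_tokens = len(entry["text"])   # assumed: output length in tokens/chars
    if not (min_input_len <= duration <= max_input_len):
        return False
    if not (min_output_len <= n_tokens <= max_output_len):
        return False
    ratio = n_tokens / max(duration, 1e-9)  # guard against zero-length audio
    return min_output_input_ratio <= ratio <= max_output_input_ratio

manifest = [{"duration": 3.2, "text": "sh iy z ax"}, {"duration": 42.0, "text": "..."}]
print([keep_utterance(m) for m in manifest])  # [True, False]; 42 s exceeds max_input_len
```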
@ -42,10 +42,10 @@ model:
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
output_size: 128 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
linear_units: 1024 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
@ -56,7 +56,7 @@ model:
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
linear_units: 1024
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
@ -65,26 +65,26 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_weight: 0.5
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: batch
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
n_epoch: 200
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 400
warmup_steps: 2000
lr_decay: 1.0
log_interval: 100
log_interval: 10
checkpoint:
kbest_n: 50
latest_n: 5
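Since the trainer reads this file through `yacs` (`from yacs.config import CfgNode` in the imports above), the edited values are easy to sanity-check by loading the YAML directly. A minimal hedged sketch using plain PyYAML rather than the project's exact config loader, assuming the usual top-level `data`/`collator`/`model`/`training` layout of these configs:

```python
import yaml

# conf/transformer.yaml as edited in this diff (path relative to the TIMIT example dir)
with open("conf/transformer.yaml") as f:
    config = yaml.safe_load(f)

print(config["model"]["encoder_conf"]["output_size"])         # 128 after this change
print(config["model"]["model_conf"]["ctc_weight"])            # 0.5
print(config["training"]["optim_conf"]["lr"])                 # 0.004
print(config["training"]["scheduler_conf"]["warmup_steps"])   # 2000
```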

@ -1,10 +1,18 @@
#!/bin/bash
set -e
stage=0
stop_stage=50
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
@ -23,44 +31,67 @@ fi
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
for type in ctc_prefix_beam_search; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
exit 0

@ -1,13 +1,15 @@
#!/bin/bash
set -e
source path.sh
. path.sh || exit 1;
stage=0
stop_stage=50
conf_path=conf/transformer.yaml
avg_num=10
TIMIT_path= #path of TIMIT (Required, e.g. /export/corpora5/LDC/LDC93S1/timit/TIMIT)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
TIMIT_path=/path/to/TIMIT
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')

@ -19,3 +19,4 @@ tqdm
typeguard
visualdl==2.2.0
yacs
editdistance