pull/877/head
huangyuxin 4 years ago
commit 37e02308b2

@@ -15,7 +15,7 @@ from yacs.config import CfgNode
 from deepspeech.exps.u2_st.model import U2STTester
 from deepspeech.exps.u2_st.model import U2STTrainer
-from deepspeech.io.collator_st import SpeechCollator
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2_st import U2STModel

@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
                 "Not supported: loader_type={}".format(filetype))

     def file_type(self, filepath):
-        suffix = filepath.split(":")[0].split('.')[-1]
+        suffix = filepath.split(":")[0].split('.')[-1].lower()
         if suffix == 'ark':
             return 'mat'
         elif suffix == 'scp':
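
For context, a minimal standalone sketch of what the added `.lower()` changes; the `scp`/fallback branches here are assumptions for illustration, not quoted from the file:

```python
# Hypothetical standalone version of the file_type logic above,
# only to show the effect of lower-casing the suffix.
def file_type(filepath: str) -> str:
    # "feats.ark:123" -> "ark"; with .lower(), "FEATS.ARK:123" also matches
    suffix = filepath.split(":")[0].split('.')[-1].lower()
    if suffix == 'ark':
        return 'mat'
    elif suffix == 'scp':
        return 'scp'          # assumed return value for illustration
    raise ValueError("Not supported: suffix={}".format(suffix))

assert file_type("feats.ark:123") == 'mat'
assert file_type("FEATS.ARK:123") == 'mat'   # previously fell through to the error
```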

@@ -113,7 +113,8 @@ class U2STBaseModel(nn.Layer):
                 asr_weight: float=0.0,
                 ignore_id: int=IGNORE_ID,
                 lsm_weight: float=0.0,
-                 length_normalized_loss: bool=False):
+                 length_normalized_loss: bool=False,
+                 **kwargs):
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
        super().__init__()
@@ -650,7 +651,7 @@ class U2STModel(U2STBaseModel):
                odim=vocab_size,
                enc_n_units=encoder.output_size(),
                blank_id=0,
-                dropout_rate=model_conf['ctc_dropout_rate'],
+                dropout_rate=model_conf['ctc_dropoutrate'],
                reduction=True,  # sum
                batch_average=True,  # sum / batch_size
                grad_norm_type=model_conf['ctc_grad_norm_type'])
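
The two model hunks above work together with the YAML further down in this diff: the code now reads the key spelled `ctc_dropoutrate` (the spelling used in the config), and the added `**kwargs` presumably lets the constructor absorb `model_conf` entries it does not declare. A hypothetical sketch of that pattern, with invented helper names:

```python
# Hypothetical illustration; key names mirror the model_conf hunk below.
model_conf = {
    "ctc_weight": 0.5,
    "ctc_dropoutrate": 0.0,        # note: no underscore before "rate"
    "ctc_grad_norm_type": "batch",
    "lsm_weight": 0.1,
    "length_normalized_loss": False,
}

# model_conf["ctc_dropout_rate"]  # would raise KeyError: the YAML key is 'ctc_dropoutrate'
dropout_rate = model_conf["ctc_dropoutrate"]

def init_model(ctc_weight=0.0, lsm_weight=0.0,
               length_normalized_loss=False, **kwargs):
    # Extra keys (ctc_dropoutrate, ctc_grad_norm_type, ...) land in kwargs
    # instead of raising "unexpected keyword argument".
    return ctc_weight, sorted(kwargs)

print(init_model(**model_conf))
# (0.5, ['ctc_dropoutrate', 'ctc_grad_norm_type'])
```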

@@ -14,6 +14,7 @@
 """This module provides functions to calculate error rate in different level.
 e.g. wer for word-level, cer for char-level.
 """
+import editdistance
 import numpy as np

 __all__ = ['word_errors', 'char_errors', 'wer', 'cer']
@@ -89,6 +90,7 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
     hyp_words = list(filter(None, hypothesis.split(delimiter)))

     edit_distance = _levenshtein_distance(ref_words, hyp_words)
+    # edit_distance = editdistance.eval(ref_words, hyp_words)
     return float(edit_distance), len(ref_words)
@@ -119,6 +121,7 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
     hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))

     edit_distance = _levenshtein_distance(reference, hypothesis)
+    # edit_distance = editdistance.eval(reference, hypothesis)
     return float(edit_distance), len(reference)
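
These hunks import `editdistance` (also added to the requirements at the bottom of this diff) and leave the calls commented out as a drop-in alternative to the pure-Python `_levenshtein_distance`. A minimal sketch of that swap, with `word_errors` reduced to the lines that matter:

```python
# Sketch of the commented-out alternative: editdistance.eval() returns the
# same Levenshtein count as the pure-Python helper, but is C-backed.
import editdistance

def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    if ignore_case:
        reference, hypothesis = reference.lower(), hypothesis.lower()
    ref_words = list(filter(None, reference.split(delimiter)))
    hyp_words = list(filter(None, hypothesis.split(delimiter)))
    edit_distance = editdistance.eval(ref_words, hyp_words)
    return float(edit_distance), len(ref_words)

errs, ref_len = word_errors("i like monkeys", "i like bananas")
print(errs / ref_len)  # WER = 1/3
```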

@@ -93,20 +93,25 @@ def pad_sequence(sequences: List[paddle.Tensor],
     for i, tensor in enumerate(sequences):
         length = tensor.shape[0]
         # use index notation to prevent duplicate references to the tensor
+        logger.info(
+            f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
+        )
         if batch_first:
             # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support int16
+            # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
             if length != 0:
-                out_tensor[i, :length, ...] = tensor
+                out_tensor[i, :length] = tensor
             else:
-                out_tensor[i, length, ...] = tensor
+                out_tensor[i, length] = tensor
         else:
             # TODO (Hui Zhang): set_value op not supprot `end==start`
             # out_tensor[:length, i, ...] = tensor
             if length != 0:
-                out_tensor[:length, i, ...] = tensor
+                out_tensor[:length, i] = tensor
             else:
-                out_tensor[length, i, ...] = tensor
+                out_tensor[length, i] = tensor

     return out_tensor
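
On the slicing change itself: dropping the trailing `...` does not change which elements are assigned, since `x[i, :length]` and `x[i, :length, ...]` address the same sub-array; the rewrite only sidesteps the Paddle `set_value` limitations noted in the TODOs. A small sketch of that indexing equivalence (NumPy is used here only to illustrate the indexing, not the Paddle op):

```python
# Illustrates only the indexing equivalence behind the change above;
# the real code assigns into a paddle.Tensor, not a NumPy array.
import numpy as np

out_tensor = np.zeros((2, 5, 3))   # (batch, max_len, feat)
tensor = np.ones((4, 3))           # one sequence of length 4
length = tensor.shape[0]

a = out_tensor.copy()
b = out_tensor.copy()
a[0, :length, ...] = tensor        # old spelling
b[0, :length] = tensor             # new spelling
assert (a == b).all()              # the same elements are written
```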

@@ -1,3 +1,3 @@
-TED-En-Zh
+TED_EnZh
 data
 exp

@@ -8,3 +8,8 @@
 | data/manifest.train | 0.942 ~ 60 |
 | data/manifest.dev | 1.151 ~ 39 |
 | data/manifest.test | 1.1 ~ 42.746 |
+
+## Transformer
+| Model | Params | Config | Char-BLEU |
+| --- | --- | --- | --- |
+| Transformer+ASR MTL | 50.26M | conf/transformer_joint_noam.yaml | 17.38 |

@@ -1,5 +1,7 @@
 #!/bin/bash

+set -e
+
 stage=-1
 stop_stage=100
@@ -7,7 +9,7 @@ stop_stage=100
 nbpe=8000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
-data_dir=/mnt/dataset/TED_EnZh
+data_dir=./TED_EnZh

 source ${MAIN_ROOT}/utils/parse_options.sh

@@ -6,7 +6,7 @@ stage=0
 stop_stage=100
 conf_path=conf/transformer_joint_noam.yaml
 avg_num=5
-data_path=./TED-En-Zh # path to unzipped data
+data_path=./TED_EnZh # path to unzipped data

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 avg_ckpt=avg_${avg_num}

@@ -0,0 +1,3 @@
+data
+exp
+test.profile

@@ -1,3 +1,9 @@
 # TIMIT
-Results will be organized and updated soon.
+
+### Transformer
+| Model | Params | Config | Decode method | Loss | PER |
+| --- | --- | --- | --- | --- | --- |
+| transformer | 5.17M | conf/transformer.yaml | attention | 49.25688171386719 | 0.510742 |
+| transformer | 5.17M | conf/transformer.yaml | ctc_greedy_search | 49.25688171386719 | 0.382398 |
+| transformer | 5.17M | conf/transformer.yaml | ctc_prefix_beam_search | 49.25688171386719 | 0.367429 |
+| transformer | 5.17M | conf/transformer.yaml | attention_rescoring | 49.25688171386719 | 0.357173 |

@@ -3,12 +3,12 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  min_input_len: 0.5   # second
-  max_input_len: 30.0  # second
+  min_input_len: 0.0   # second
+  max_input_len: 10.0  # second
   min_output_len: 0.0  # tokens
-  max_output_len: 400.0  # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
+  max_output_len: 150.0  # tokens
+  min_output_input_ratio: 0.005
+  max_output_input_ratio: 1000.0

 collator:
   vocab_filepath: data/vocab.txt
@@ -42,10 +42,10 @@ model:
   # encoder related
   encoder: transformer
   encoder_conf:
-    output_size: 256     # dimension of attention
+    output_size: 128     # dimension of attention
     attention_heads: 4
-    linear_units: 2048   # the number of units of position-wise feed forward
-    num_blocks: 12       # the number of encoder blocks
+    linear_units: 1024   # the number of units of position-wise feed forward
+    num_blocks: 6        # the number of encoder blocks
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
     attention_dropout_rate: 0.0
@@ -56,7 +56,7 @@ model:
   decoder: transformer
   decoder_conf:
     attention_heads: 4
-    linear_units: 2048
+    linear_units: 1024
     num_blocks: 6
     dropout_rate: 0.1
     positional_dropout_rate: 0.1
@@ -65,26 +65,26 @@ model:
   # hybrid CTC/attention
   model_conf:
-    ctc_weight: 0.3
+    ctc_weight: 0.5
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: batch
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

 training:
-  n_epoch: 120
+  n_epoch: 200
   accum_grad: 2
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
-    lr: 0.002
+    lr: 0.004
     weight_decay: 1e-06
   scheduler: warmuplr     # pytorch v1.1.0+ required
   scheduler_conf:
-    warmup_steps: 400
+    warmup_steps: 2000
     lr_decay: 1.0
-  log_interval: 100
+  log_interval: 10
   checkpoint:
     kbest_n: 50
     latest_n: 5
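
The optimizer changes above interact: with a Noam-style warmup, the configured `lr` is the peak value and it is reached only after `warmup_steps` updates, so raising `lr` to 0.004 while raising `warmup_steps` to 2000 gives a gentler ramp to a higher peak. A hedged sketch, assuming `warmuplr` follows the usual formula (an assumption, not quoted from this diff):

```python
# Assumed warmuplr formula:
#   lr(step) = base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
def warmup_lr(step, base_lr, warmup_steps):
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

old = [warmup_lr(s, 0.002, 400) for s in (100, 400, 2000)]
new = [warmup_lr(s, 0.004, 2000) for s in (100, 400, 2000)]
print(old)  # ramps fast, peaks at 0.002 by step 400, then decays ~ step**-0.5
print(new)  # ramps more gently, peaks at 0.004 only at step 2000
```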

@@ -1,10 +1,18 @@
 #!/bin/bash

+set -e
+
+stage=0
+stop_stage=50
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
 if [ $# != 2 ];then
     echo "usage: ${0} config_path ckpt_path_prefix"
     exit -1
 fi

 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
@@ -23,7 +31,8 @@ fi
 #    exit 1
 #fi

-for type in attention ctc_greedy_search; do
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    for type in attention ctc_greedy_search; do
         echo "decoding ${type}"
         if [ ${chunk_mode} == true ];then
             # stream decoding only support batchsize=1
@@ -43,9 +52,12 @@ for type in attention ctc_greedy_search; do
             echo "Failed in evaluation!"
             exit 1
         fi
     done
+fi

-for type in ctc_prefix_beam_search attention_rescoring; do
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    for type in ctc_prefix_beam_search; do
         echo "decoding ${type}"
         batch_size=1
         python3 -u ${BIN_DIR}/test.py \
@@ -60,7 +72,26 @@ for type in ctc_prefix_beam_search attention_rescoring; do
             echo "Failed in evaluation!"
             exit 1
         fi
     done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    for type in attention_rescoring; do
+        echo "decoding ${type}"
+        batch_size=1
+        python3 -u ${BIN_DIR}/test.py \
+        --nproc ${ngpu} \
+        --config ${config_path} \
+        --result_file ${ckpt_prefix}.${type}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}
+
+        if [ $? -ne 0 ]; then
+            echo "Failed in evaluation!"
+            exit 1
+        fi
+    done
+fi

 exit 0

@@ -1,13 +1,15 @@
 #!/bin/bash
 set -e
-source path.sh
+. path.sh || exit 1;

 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
 avg_num=10
-TIMIT_path= #path of TIMIT (Required, e.g. /export/corpora5/LDC/LDC93S1/timit/TIMIT)
+TIMIT_path=/path/to/TIMIT

-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')

@@ -19,3 +19,4 @@ tqdm
 typeguard
 visualdl==2.2.0
 yacs
+editdistance