add librispeech asr1

pull/1225/head
huangyuxin 3 years ago
parent fb6d1e2c11
commit 41eeed0450

@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \
--decode_config ${decode_config_path} \ --decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decode_batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"

@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do
# stream decoding only supports batchsize=1 # stream decoding only supports batchsize=1
batch_size=1 batch_size=1
else else
batch_size=64 batch_size=1
fi fi
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
mkdir -p ${output_dir} mkdir -p ${output_dir}
@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do
--decode_config ${decode_config_path} \ --decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.decode_batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
--decode_config ${decode_config_path} \ --decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"

@ -46,8 +46,8 @@ for type in attention_rescoring; do
--decode_config ${decode_config_path} \ --decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.decode_batch_size ${batch_size} \ --opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file} --audio_file ${audio_file}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then

@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
```text ```text
hifigan_csmsc_ckpt_0.1.1 hifigan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train hifigan ├── default.yaml # default config used to train hifigan
├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan ├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
└── snapshot_iter_2500000.pdz # generator parameters of hifigan └── snapshot_iter_2500000.pdz # generator parameters of hifigan
``` ```
## Acknowledgement ## Acknowledgement

@ -1,103 +1,99 @@
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: conformer # encoder related
encoder_conf: encoder: conformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: True input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
use_cnn_module: True normalize_before: True
cnn_module_kernel: 15 use_cnn_module: True
activation_type: 'swish' cnn_module_kernel: 15
pos_enc_layer_type: 'rel_pos' activation_type: 'swish'
selfattention_layer_type: 'rel_selfattn' pos_enc_layer_type: 'rel_pos'
causal: True selfattention_layer_type: 'rel_selfattn'
use_dynamic_chunk: true causal: True
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster use_dynamic_chunk: true
use_dynamic_left_chunk: false cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
augmentation_config: conf/preprocess.yaml spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
feat_dim: 80 mean_std_filepath: ""
stride_ms: 10.0 augmentation_config: conf/preprocess.yaml
window_ms: 25.0 feat_dim: 80
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs stride_ms: 10.0
batch_size: 16 window_ms: 25.0
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced batch_size: 16
minibatches: 0 # for debug maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
batch_count: auto maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
batch_bins: 0 minibatches: 0 # for debug
batch_frames_in: 0 batch_count: auto
batch_frames_out: 0 batch_bins: 0
batch_frames_inout: 0 batch_frames_in: 0
augmentation_config: conf/preprocess.yaml batch_frames_out: 0
num_workers: 0 batch_frames_inout: 0
subsampling_factor: 1 augmentation_config: conf/preprocess.yaml
num_encs: 1 num_workers: 0
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 120
accum_grad: 8
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
training:
n_epoch: 120
accum_grad: 8
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.

@ -1,103 +1,90 @@
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
use_dynamic_chunk: true normalize_before: true
use_dynamic_left_chunk: false use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
augmentation_config: conf/preprocess.yaml spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
feat_dim: 80 mean_std_filepath: ""
stride_ms: 10.0 augmentation_config: conf/preprocess.yaml
window_ms: 25.0 feat_dim: 80
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs stride_ms: 10.0
batch_size: 64 window_ms: 25.0
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced batch_size: 64
minibatches: 0 # for debug maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
batch_count: auto maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
batch_bins: 0 minibatches: 0 # for debug
batch_frames_in: 0 batch_count: auto
batch_frames_out: 0 batch_bins: 0
batch_frames_inout: 0 batch_frames_in: 0
augmentation_config: conf/preprocess.yaml batch_frames_out: 0
num_workers: 0 batch_frames_inout: 0
subsampling_factor: 1 augmentation_config: conf/preprocess.yaml
num_encs: 1 num_workers: 0
subsampling_factor: 1
num_encs: 1
training: ###########################################
n_epoch: 120 # Training #
accum_grad: 1 ###########################################
global_grad_clip: 5.0 n_epoch: 120
optim: adam accum_grad: 1
optim_conf: global_grad_clip: 5.0
lr: 0.001 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: warmuplr lr: 0.001
scheduler_conf: weight_decay: 1e-06
warmup_steps: 25000 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
log_interval: 100 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.

@ -1,104 +1,97 @@
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: conformer # encoder related
encoder_conf: encoder: conformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: True input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
use_cnn_module: True normalize_before: True
cnn_module_kernel: 15 use_cnn_module: True
activation_type: 'swish' cnn_module_kernel: 15
pos_enc_layer_type: 'rel_pos' activation_type: 'swish'
selfattention_layer_type: 'rel_selfattn' pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_grad_norm_type: null ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
augmentation_config: conf/preprocess.yaml spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
feat_dim: 80 mean_std_filepath: ""
stride_ms: 10.0 augmentation_config: conf/preprocess.yaml
window_ms: 25.0 feat_dim: 80
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs stride_ms: 10.0
batch_size: 16 window_ms: 25.0
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced batch_size: 16
minibatches: 0 # for debug maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
batch_count: auto maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
batch_bins: 0 minibatches: 0 # for debug
batch_frames_in: 0 batch_count: auto
batch_frames_out: 0 batch_bins: 0
batch_frames_inout: 0 batch_frames_in: 0
augmentation_config: conf/preprocess.yaml batch_frames_out: 0
num_workers: 0 batch_frames_inout: 0
subsampling_factor: 1 augmentation_config: conf/preprocess.yaml
num_encs: 1 num_workers: 0
subsampling_factor: 1
num_encs: 1
training: ###########################################
n_epoch: 70 # Training #
accum_grad: 8 ###########################################
global_grad_clip: 3.0 n_epoch: 70
optim: adam accum_grad: 8
optim_conf: global_grad_clip: 3.0
lr: 0.004 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: warmuplr lr: 0.004
scheduler_conf: weight_decay: 1e-06
warmup_steps: 25000 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
log_interval: 100 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 50 log_interval: 100
latest_n: 5 checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,110 +1,89 @@
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.5 # second dev_manifest: data/manifest.dev
max_input_len: 30.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 100.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'spm' ###########################################
spm_model_prefix: 'data/lang_char/bpe_unigram_5000' vocab_filepath: data/lang_char/vocab.txt
mean_std_filepath: "" unit_type: 'spm'
augmentation_config: conf/preprocess.yaml spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
feat_dim: 80 mean_std_filepath: ""
stride_ms: 10.0 augmentation_config: conf/preprocess.yaml
window_ms: 25.0 feat_dim: 80
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs stride_ms: 10.0
batch_size: 32 window_ms: 25.0
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced batch_size: 32
minibatches: 0 # for debug maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
batch_count: auto maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
batch_bins: 0 minibatches: 0 # for debug
batch_frames_in: 0 batch_count: auto
batch_frames_out: 0 batch_bins: 0
batch_frames_inout: 0 batch_frames_in: 0
augmentation_config: conf/preprocess.yaml batch_frames_out: 0
num_workers: 0 batch_frames_inout: 0
subsampling_factor: 1 augmentation_config: conf/preprocess.yaml
num_encs: 1 num_workers: 0
subsampling_factor: 1
num_encs: 1
training:
n_epoch: 120
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
###########################################
# Training #
###########################################
n_epoch: 120
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5

@ -0,0 +1,11 @@
decode_batch_size: 128
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.

@ -0,0 +1,11 @@
decode_batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
batch_size=1 batch_size=1
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"

@ -15,8 +15,8 @@ recog_set="test-clean"
stage=0 stage=0
stop_stage=100 stop_stage=100
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
chunk_mode=false chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
exit -1 exit -1
fi fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
audio_file=$3 ckpt_prefix=$3
audio_file=$4
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@ -49,10 +50,11 @@ for type in attention_rescoring; do
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_config ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \ --opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file} --audio_file ${audio_file}
#score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}

@ -8,6 +8,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=30 avg_num=30
audio_file=data/demo_002_en.wav audio_file=data/demo_002_en.wav
@ -34,17 +35,17 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi fi
if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then

@ -1,116 +1,105 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.tiny # Data #
dev_manifest: data/manifest.tiny ###########################################
test_manifest: data/manifest.tiny train_manifest: data/manifest.tiny
min_input_len: 0.5 # second dev_manifest: data/manifest.tiny
max_input_len: 20.0 # second test_manifest: data/manifest.tiny
min_output_len: 0.0 # tokens min_input_len: 0.5 # second
max_output_len: 400.0 # tokens max_input_len: 20.0 # second
min_output_input_ratio: 0.05 min_output_len: 0.0 # tokens
max_output_input_ratio: 10.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
mean_std_filepath: ""
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
###########################################
# Dataloader #
###########################################
mean_std_filepath: ""
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/lang_char/bpe_unigram_200'
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related ############################################
decoder: transformer # Network Architecture #
decoder_conf: ############################################
attention_heads: 4 cmvn_file: "data/mean_std.json"
linear_units: 2048 cmvn_file_type: "json"
num_blocks: 6 # encoder related
dropout_rate: 0.1 encoder: conformer
positional_dropout_rate: 0.1 encoder_conf:
self_attention_dropout_rate: 0.0 output_size: 256 # dimension of attention
src_attention_dropout_rate: 0.0 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# hybrid CTC/attention # decoder related
model_conf: decoder: transformer
ctc_weight: 0.3 decoder_conf:
lsm_weight: 0.1 # label smoothing option attention_heads: 4
length_normalized_loss: false linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 5
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 1
checkpoint:
kbest_n: 10
latest_n: 1
###########################################
# training #
###########################################
n_epoch: 5
accum_grad: 4
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-06
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 1
checkpoint:
kbest_n: 10
latest_n: 1
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,110 +1,98 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.tiny # Data #
dev_manifest: data/manifest.tiny ###########################################
test_manifest: data/manifest.tiny train_manifest: data/manifest.tiny
min_input_len: 0.5 # second dev_manifest: data/manifest.tiny
max_input_len: 20.0 # second test_manifest: data/manifest.tiny
min_output_len: 0.0 # tokens min_input_len: 0.5 # second
max_output_len: 400.0 # tokens max_input_len: 20.0 # second
min_output_input_ratio: 0.05 min_output_len: 0.0 # tokens
max_output_input_ratio: 10.0 max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator: ###########################################
mean_std_filepath: data/mean_std.json # Dataloader #
vocab_filepath: data/lang_char/vocab.txt ###########################################
unit_type: 'spm' mean_std_filepath: data/mean_std.json
spm_model_prefix: 'data/lang_char/bpe_unigram_200' vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml unit_type: 'spm'
batch_size: 4 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
raw_wav: True # use raw_wav or kaldi feature augmentation_config: conf/preprocess.yaml
spectrum_type: fbank #linear, mfcc, fbank batch_size: 4
feat_dim: 80 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 80
target_sample_rate: 16000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 16000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: transformer # encoder related
encoder_conf: encoder: transformer
output_size: 256 # dimension of attention encoder_conf:
attention_heads: 4 output_size: 256 # dimension of attention
linear_units: 2048 # the number of units of position-wise feed forward attention_heads: 4
num_blocks: 12 # the number of encoder blocks linear_units: 2048 # the number of units of position-wise feed forward
dropout_rate: 0.1 num_blocks: 12 # the number of encoder blocks
positional_dropout_rate: 0.1 dropout_rate: 0.1
attention_dropout_rate: 0.0 positional_dropout_rate: 0.1
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 attention_dropout_rate: 0.0
normalize_before: true input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
dropout_rate: 0.1 dropout_rate: 0.1
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 5 # training #
accum_grad: 1 ###########################################
global_grad_clip: 5.0 n_epoch: 5
optim: adam accum_grad: 1
optim_conf: global_grad_clip: 5.0
lr: 0.002 optim: adam
weight_decay: 1e-06 optim_conf:
scheduler: warmuplr lr: 0.002
scheduler_conf: weight_decay: 1e-06
warmup_steps: 25000 scheduler: warmuplr
lr_decay: 1.0 scheduler_conf:
log_interval: 1 warmup_steps: 25000
checkpoint: lr_decay: 1.0
kbest_n: 2 log_interval: 1
latest_n: 1 checkpoint:
kbest_n: 2
latest_n: 1
decoding:
batch_size: 8 #64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -46,7 +46,7 @@ if __name__ == "__main__":
if args.decode_config: if args.decode_config:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_config)
config.decoding = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()

@ -50,7 +50,7 @@ if __name__ == "__main__":
if args.decode_config: if args.decode_config:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_config)
config.decoding = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()

@ -81,7 +81,7 @@ class U2Infer():
ilen = paddle.to_tensor(feat.shape[0]) ilen = paddle.to_tensor(feat.shape[0])
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
decode_config = self.config.decoding decode_config = self.config.decode
result_transcripts = self.model.decode( result_transcripts = self.model.decode(
xs, xs,
ilen, ilen,
@ -135,7 +135,7 @@ if __name__ == "__main__":
if args.decode_config: if args.decode_config:
decode_confs = CfgNode(new_allowed=True) decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_config) decode_confs.merge_from_file(args.decode_config)
config.decoding = decode_confs config.decode = decode_confs
if args.opts: if args.opts:
config.merge_from_list(args.opts) config.merge_from_list(args.opts)
config.freeze() config.freeze()

@ -29,7 +29,7 @@ U2Model.params(_C)
U2Trainer.params(_C) U2Trainer.params(_C)
_C.decoding = U2Tester.params() _C.decode = U2Tester.params()
def get_cfg_defaults(): def get_cfg_defaults():

Loading…
Cancel
Save