From 41eeed0450bea7c5fb097887d3a2e8dac55a6f28 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 29 Dec 2021 08:38:11 +0000 Subject: [PATCH] add librispeech asr1 --- examples/aishell/asr1/local/align.sh | 2 +- examples/aishell/asr1/local/test.sh | 10 +- examples/aishell/asr1/local/test_wav.sh | 4 +- examples/csmsc/voc5/README.md | 4 +- .../asr1/conf/chunk_conformer.yaml | 184 ++++++++-------- .../asr1/conf/chunk_transformer.yaml | 179 ++++++++-------- examples/librispeech/asr1/conf/conformer.yaml | 181 ++++++++-------- .../librispeech/asr1/conf/transformer.yaml | 183 +++++++--------- .../asr1/conf/tuning/chunk_decode.yaml | 11 + .../librispeech/asr1/conf/tuning/decode.yaml | 11 + examples/librispeech/asr1/local/align.sh | 10 +- examples/librispeech/asr1/local/test.sh | 22 +- examples/librispeech/asr1/local/test_wav.sh | 14 +- examples/librispeech/asr1/run.sh | 7 +- examples/tiny/asr1/conf/conformer.yaml | 197 +++++++++--------- examples/tiny/asr1/conf/transformer.yaml | 188 ++++++++--------- paddlespeech/s2t/exps/u2/bin/alignment.py | 2 +- paddlespeech/s2t/exps/u2/bin/test.py | 2 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 4 +- paddlespeech/s2t/exps/u2/config.py | 2 +- 20 files changed, 590 insertions(+), 627 deletions(-) create mode 100644 examples/librispeech/asr1/conf/tuning/chunk_decode.yaml create mode 100644 examples/librispeech/asr1/conf/tuning/decode.yaml diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index f526c8a4..95472e10 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \ --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.decode_batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 2c092127..cab7c34e 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do # stream decoding only support batchsize=1 batch_size=1 else - batch_size=64 + batch_size=1 fi output_dir=${ckpt_prefix} mkdir -p ${output_dir} @@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.decode_batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
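Note on the `decoding.*` -> `decode.*` rename above: each `--opts KEY VALUE` pair the shell scripts pass now resolves against a `decode` node of the experiment config. A minimal sketch of that merge, assuming a yacs-style `CfgNode` (which matches the `merge_from_file`/`merge_from_list` calls in the `paddlespeech/s2t/exps/u2` changes at the end of this patch); the paths and override values here are illustrative, not taken from the patch:

```python
# Sketch only, not part of the patch. Mirrors the decode-config merge done in
# paddlespeech/s2t/exps/u2/bin/test.py below; paths and values are examples.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("conf/conformer.yaml")             # model/data/training keys

decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file("conf/tuning/decode.yaml")   # decode-only keys
config.decode = decode_confs                              # new `decode` node (was `decoding`)

# `--opts decode.decoding_method ctc_greedy_search --opts decode.decode_batch_size 1`
# reaches the config as a flat key/value list:
config.merge_from_list(["decode.decoding_method", "ctc_greedy_search",
                        "decode.decode_batch_size", 1])
config.freeze()
assert config.decode.decode_batch_size == 1
```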
diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh
index 4866e642..661013b1 100755
--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@@ -46,8 +46,8 @@ for type in attention_rescoring; do
     --decode_config ${decode_config_path} \
     --result_file ${output_dir}/${type}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.decode_batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
     --audio_file ${audio_file}
 
 if [ $? -ne 0 ]; then
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index 21afe6ee..be06f830 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml                  # default config used to train hifigan
-├── feats_stats.npy               # generator parameters of hifigan
-└── snapshot_iter_2500000.pdz     # statistics used to normalize spectrogram when training hifigan
+├── feats_stats.npy               # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz     # generator parameters of hifigan
 ```
 
 ## Acknowledgement
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 662d559c..ace61d36 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -1,103 +1,99 @@
-# network architecture
-model:
-    cmvn_file:
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: conformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: True
-        use_cnn_module: True
-        cnn_module_kernel: 15
-        activation_type: 'swish'
-        pos_enc_layer_type: 'rel_pos'
-        selfattention_layer_type: 'rel_selfattn'
-        causal: True
-        use_dynamic_chunk: true
-        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-        use_dynamic_left_chunk: false
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+    causal: True
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
 
-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 8 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 120 - accum_grad: 8 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. 
Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index bc77ba41..d6d84eb1 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -1,103 +1,90 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### 
+vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 \ No newline at end of file diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index 5a570897..bb028e69 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -1,104 +1,97 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' 
epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 70 - accum_grad: 8 - global_grad_clip: 3.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 70 +accum_grad: 8 +global_grad_clip: 3.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index b7f33e22..f81234f1 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 32 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is 
automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 120 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..0760e721 --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. 
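The new tuning files above carry the old `decoding:` block out of the training YAMLs into standalone decode configs. The `decoding_chunk_size` comment encodes three regimes; a small helper restating those semantics for clarity (the function is an illustrative assumption and does not exist in PaddleSpeech; loading assumes a yacs-style `CfgNode`):

```python
# Restates the decoding_chunk_size semantics documented in chunk_decode.yaml
# above; illustrative only.
from yacs.config import CfgNode

decode = CfgNode(new_allowed=True)
decode.merge_from_file("examples/librispeech/asr1/conf/tuning/chunk_decode.yaml")

def chunk_regime(decoding_chunk_size: int) -> str:
    if decoding_chunk_size < 0:
        return "decode with the full utterance (no chunking)"
    if decoding_chunk_size > 0:
        return f"decode with a fixed chunk size of {decoding_chunk_size}"
    raise ValueError("0 is reserved for dynamic-chunk training; prohibited here")

print(chunk_regime(decode.decoding_chunk_size))  # -1 -> full utterance
print(decode.simulate_streaming)                 # True in the chunk config
```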
diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index c65d611c..95472e10 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index aa06132e..ddb6c6b6 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -15,8 +15,8 @@ recog_set="test-clean" stage=0 stop_stage=100 -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -76,10 +78,11 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh index ab6d685d..60eaadbf 100755 --- a/examples/librispeech/asr1/local/test_wav.sh +++ b/examples/librispeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -49,10 +50,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index f839e5af..116dae12 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -8,6 +8,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 audio_file=data/demo_002_en.wav @@ -34,17 +35,17 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb850902..085581f2 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,105 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: 
data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.5 # second +max_input_len: 20.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +augmentation_config: conf/preprocess.yaml +batch_size: 4 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + 
selfattention_layer_type: 'rel_selfattn' - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# training # +########################################### +n_epoch: 5 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
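As in the librispeech configs, the tiny configs above are flattened: the old `data:`/`collator:`/`model:`/`training:` sections become banner comments and every key moves to the top level of a single config. A sketch of the resulting access-path change, assuming a yacs-style `CfgNode`; the asserted values come from `examples/tiny/asr1/conf/conformer.yaml` above:

```python
# Access-path change from the config flattening; sketch only.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("examples/tiny/asr1/conf/conformer.yaml")

# old nested layout (removed): config.training.n_epoch, config.model.encoder
# new flat layout (added):     config.n_epoch,          config.encoder
assert config.n_epoch == 5
assert config.encoder == "conformer"
assert config.batch_size == 4    # was config.collator.batch_size
```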
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index c641d1f5..95c7df50 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,110 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.5 # second +max_input_len: 20.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +augmentation_config: conf/preprocess.yaml +batch_size: 4 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - 
self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 2 - latest_n: 1 +########################################### +# training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 2 + latest_n: 1 -decoding: - batch_size: 8 #64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index f8397ed0..229f696d 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -46,7 +46,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index f179ea81..419594bf 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -50,7 +50,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e5671a43..766e4173 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -81,7 +81,7 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - decode_config = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, @@ -135,7 +135,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 537d0c21..2b4f6fb2 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -29,7 +29,7 @@ U2Model.params(_C) U2Trainer.params(_C) -_C.decoding = U2Tester.params() +_C.decode = U2Tester.params() def get_cfg_defaults():
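Finally, `_C.decode = U2Tester.params()` keeps a default `decode` node in the experiment config, so the renamed keys resolve even before a `--decode_config` file is merged in. A short consumption sketch following the `test_wav.py` pattern above; it assumes `get_cfg_defaults()` returns a modifiable clone of `_C` and that `U2Tester.params()` defines `decode_batch_size`:

```python
# Sketch of reading the renamed node at runtime; assumptions noted above.
from paddlespeech.s2t.exps.u2.config import get_cfg_defaults

config = get_cfg_defaults()                               # includes the `decode` defaults
config.merge_from_list(["decode.decode_batch_size", 1])   # same path as --opts
config.freeze()

decode_config = config.decode        # was config.decoding before this patch
print(decode_config.decode_batch_size)
```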