diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md
index c08b94e29..9fc2856ce 100644
--- a/examples/wenetspeech/asr1/README.md
+++ b/examples/wenetspeech/asr1/README.md
@@ -12,3 +12,34 @@ show model.tar.gz
 ```
 tar tf model.tar.gz
 ```
+
+Another way to pack the model is:
+
+```bash
+tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml conf/tuning/ conf/chunk_conformer.yaml conf/preprocess.yaml data/mean_std.json exp/chunk_conformer/checkpoints/
+```
+
+## Export Static Model
+
+>> `data/test_meeting/data.list`
+>> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"}
+
+>> Test Wav:
+>> wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+### U2 chunk conformer
+>> UniDecoder
+>> Make sure `reverse_weight` in the config is `0.0`
+>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz
+```
+tar zxvf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz
+./local/export.sh conf/chunk_conformer.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
+```
+
+### U2++ chunk conformer
+>> BiDecoder
+>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.0.model.tar.gz
+>> Make sure `reverse_weight` in the config is not `0.0`
+
+```
+./local/export.sh conf/chunk_conformer_u2pp.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
+```
diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
index 69fa223a1..d2f43d873 100644
--- a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
@@ -39,6 +39,7 @@ decoder_conf:
 model_conf:
     ctc_weight: 0.3
     lsm_weight: 0.1 # label smoothing option
+    reverse_weight: 0.0 # unidirectional decoder
     length_normalized_loss: false
     init_type: 'kaiming_uniform'
 
@@ -53,8 +54,9 @@ test_manifest: data/test_meeting/data.list
 ###########################################
 # Dataloader                              #
 ###########################################
-vocab_filepath: data/lang_char/vocab.txt
+use_streaming_data: True
 unit_type: 'char'
+vocab_filepath: data/lang_char/vocab.txt
 preprocess_config: conf/preprocess.yaml
 spm_model_prefix: ''
 feat_dim: 80
diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml
new file mode 100644
index 000000000..2bb2006b5
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml
@@ -0,0 +1,100 @@
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512 # dimension of attention
+    attention_heads: 8
+    linear_units: 2048 # the number of units of position-wise feed forward
+    num_blocks: 12 # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: swish
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+# decoder related
+decoder: bitransformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 3 # the number of decoder blocks
+    r_num_blocks: 3 # only for bitransformer
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1 # label smoothing option
+    length_normalized_loss: false
+    reverse_weight: 0.3 # only for bitransformer decoder
+    init_type: 'kaiming_uniform' # !Warning: need to convergence
+
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/train_l/data.list
+dev_manifest: data/dev/data.list
+test_manifest: data/test_meeting/data.list
+
+###########################################
+# Dataloader                              #
+###########################################
+use_stream_data: True
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+spm_model_prefix: ''
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+do_filter: True
+maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced
+minlen_in: 10
+minlen_out: 0
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 150
+accum_grad: 8
+global_grad_clip: 5.0
+dist_sampler: False
+optim: adam
+optim_conf:
+    lr: 0.002
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+log_interval: 100
+checkpoint:
+    kbest_n: 50
+    latest_n: 5
diff --git a/examples/wenetspeech/asr1/local/export.sh b/examples/wenetspeech/asr1/local/export.sh
index 735c4f8e5..1f89afd6b 100755
--- a/examples/wenetspeech/asr1/local/export.sh
+++ b/examples/wenetspeech/asr1/local/export.sh
@@ -14,6 +14,8 @@ jit_model_export_path=$3
 
 # export can not using StreamdataDataloader, set use_stream_dta False
+# u2: reverse_weight should be 0.0
+# u2pp: reverse_weight should match the config file, e.g. 0.3
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 7609b71e0..2279812ba 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -565,7 +565,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
             [len(hyp[0]) for hyp in hyps], place=device,
             dtype=paddle.long)  # (beam_size,)
         hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
-        logger.info(
+        logger.debug(
             f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
         hyps_lens = hyps_lens + 1  # Add <sos> at begining
 
@@ -590,7 +590,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
 
             # last decoder output token is `eos`, for laste decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
-            logger.info(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
+            logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
 
             if reverse_weight > 0:
                 r_score = 0.0
@@ -598,7 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                     r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
                 r_score += r_decoder_out[i][len(hyp[0])][self.eos]
 
-                logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
+                logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
 
                 score = score * (1 - reverse_weight) + r_score * reverse_weight
 
@@ -608,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 best_score = score
                 best_index = i
 
-        logger.info(f"result: {hyps[best_index]}")
+        logger.debug(f"result: {hyps[best_index]}")
         return hyps[best_index][0]
 
     @jit.to_static(property=True)
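
The `u2.py` hunks above only change log levels and fix the r2l log line to print `r_score` instead of `score`, but they sit inside the attention-rescoring step that `reverse_weight` in the configs controls. Below is a minimal, standalone Python sketch of that combination step. It is not the PaddleSpeech implementation: the decoder outputs are stubbed as plain nested lists of log-probabilities, and the function name `rescore_hypotheses` plus the `ctc_weight` interpolation are illustrative assumptions based on the loops visible in the hunks.

```python
# Hypothetical, simplified sketch of U2/U2++ attention rescoring.
# decoder_out[i][j][w] / r_decoder_out[i][j][w] stand in for the log-probs
# of the left-to-right and right-to-left decoders (paddle tensors in the repo).
from typing import List, Tuple


def rescore_hypotheses(
        hyps: List[Tuple[List[int], float]],     # (token ids, CTC score) per hypothesis
        decoder_out: List[List[List[float]]],    # l2r decoder log-probs
        r_decoder_out: List[List[List[float]]],  # r2l decoder log-probs
        eos: int,
        ctc_weight: float = 0.5,
        reverse_weight: float = 0.0) -> List[int]:
    """Return the token ids of the best hypothesis after rescoring."""
    best_score, best_index = -float("inf"), 0
    for i, (tokens, ctc_score) in enumerate(hyps):
        # Left-to-right (l2r) attention score: one log-prob per token, plus <eos>.
        score = sum(decoder_out[i][j][w] for j, w in enumerate(tokens))
        score += decoder_out[i][len(tokens)][eos]

        if reverse_weight > 0:
            # Right-to-left (r2l) score from the reverse decoder (U2++ only);
            # token j of the hypothesis is read at reversed position len - 1 - j.
            r_score = sum(r_decoder_out[i][len(tokens) - j - 1][w]
                          for j, w in enumerate(tokens))
            r_score += r_decoder_out[i][len(tokens)][eos]
            # Interpolate the two attention scores with reverse_weight.
            score = score * (1 - reverse_weight) + r_score * reverse_weight

        # Fold in the CTC prefix score (assumed interpolation via ctc_weight).
        score += ctc_score * ctc_weight
        if score > best_score:
            best_score, best_index = score, i
    return hyps[best_index][0]
```

With `reverse_weight: 0.0` (the U2 config above) the r2l branch never runs, which is why the export notes require `0.0` for U2 and a non-zero value matching the config (e.g. `0.3`) for U2++.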
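
The comments added to `local/export.sh` amount to a consistency check between the decoder type and `model_conf.reverse_weight` in the YAML. A small helper along these lines could enforce it before export; the function name, the `is_u2pp` flag, and the idea of asserting in Python are illustrative only, not part of the repo, while the config keys come from the files above.

```python
# Hypothetical pre-export sanity check for reverse_weight; PyYAML is the only dependency.
import yaml


def check_reverse_weight(config_path: str, is_u2pp: bool) -> float:
    """Read model_conf.reverse_weight and verify it matches the model type."""
    with open(config_path, "r", encoding="utf-8") as f:
        conf = yaml.safe_load(f)
    reverse_weight = float(conf.get("model_conf", {}).get("reverse_weight", 0.0))
    if is_u2pp:
        # U2++ uses a bitransformer (BiDecoder), so the r2l branch must be weighted.
        assert reverse_weight > 0.0, "U2++ expects a non-zero reverse_weight, e.g. 0.3"
    else:
        # U2 has only the l2r decoder (UniDecoder).
        assert reverse_weight == 0.0, "U2 expects reverse_weight == 0.0"
    return reverse_weight


# e.g. check_reverse_weight("conf/chunk_conformer_u2pp.yaml", is_u2pp=True)
```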