more config for u2pp

pull/2425/head
Hui Zhang 3 years ago
parent 00b2c1c8fb
commit b10512eb0e

@@ -12,3 +12,34 @@ show model.tar.gz
```bash
tar tf model.tar.gz
```
Another way is:
```bash
tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml conf/tuning/ conf/chunk_conformer.yaml conf/preprocess.yaml data/mean_std.json exp/chunk_conformer/checkpoints/
```
## Export Static Model
>> `data/test_meeting/data.list`
>> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"}
>> Test Wav:
>> wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
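Each line of `data/test_meeting/data.list` is a self-contained JSON record like the one above. A minimal sketch of reading it with only the standard library; the field names are taken from the sample entry, and note that `shape` holds `[duration_in_seconds, feat_dim]` for sound inputs:
```python
import json

# Parse the jsonlines manifest shown above (field names follow the sample entry).
with open("data/test_meeting/data.list", encoding="utf8") as f:
    for line in f:
        entry = json.loads(line)
        wav_path = entry["input"][0]["feat"]             # path to the wav file
        duration, feat_dim = entry["input"][0]["shape"]  # seconds, fbank dim
        text = entry["output"][0]["text"]                # reference transcript
        token_ids = [int(t) for t in entry["output"][0]["tokenid"].split()]
        print(entry["utt"], wav_path, duration, feat_dim, text, len(token_ids))
```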
### U2 chunk conformer
>> UniDecoder (unidirectional decoder)
>> Make sure `reverse_weight` in config is `0.0`
>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz
```bash
tar zxvf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz
./local/export.sh conf/chunk_conformer.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
```
### U2++ chunk conformer
>> BiDecoder (bidirectional decoder)
>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.0.model.tar.gz
>> Make sure `reverse_weight` in config is not `0.0`
```bash
./local/export.sh conf/chunk_conformer_u2pp.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
```
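Since the only config difference that matters for export is `reverse_weight`, a quick sanity check before exporting can save a failed run. A hedged sketch, assuming the configs are plain YAML with top-level `decoder` and `model_conf` keys as shown in the config hunks below (requires PyYAML):
```python
import yaml  # pip install pyyaml

def check_reverse_weight(config_path: str) -> None:
    """Illustrative check: reverse_weight must match the decoder type."""
    with open(config_path, encoding="utf8") as f:
        cfg = yaml.safe_load(f)
    decoder = cfg.get("decoder", "")
    rw = cfg.get("model_conf", {}).get("reverse_weight", 0.0)
    if decoder == "bitransformer" and rw == 0.0:
        print(f"{config_path}: BiDecoder expects reverse_weight > 0.0, got {rw}")
    if decoder != "bitransformer" and rw != 0.0:
        print(f"{config_path}: unidirectional decoder expects reverse_weight == 0.0, got {rw}")

check_reverse_weight("conf/chunk_conformer.yaml")       # U2
check_reverse_weight("conf/chunk_conformer_u2pp.yaml")  # U2++
```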

@@ -39,6 +39,7 @@ decoder_conf:
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1  # label smoothing option
    reverse_weight: 0.0  # unidirectional decoder
    length_normalized_loss: false
    init_type: 'kaiming_uniform'
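For reference, `ctc_weight` and `reverse_weight` enter the U2 objective as interpolation coefficients. The following sketch shows the standard combination; the names are illustrative, not PaddleSpeech's API:
```python
def u2_loss(loss_ctc: float, loss_l2r: float, loss_r2l: float,
            ctc_weight: float = 0.3, reverse_weight: float = 0.0) -> float:
    # A bidirectional (U2++) decoder blends forward and reverse attention
    # losses; reverse_weight == 0.0 reduces to the plain L2R decoder above.
    loss_att = (1.0 - reverse_weight) * loss_l2r + reverse_weight * loss_r2l
    # Hybrid CTC/attention objective.
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att
```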
@@ -53,8 +54,9 @@ test_manifest: data/test_meeting/data.list
###########################################
# Dataloader #
###########################################
use_stream_data: True
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80

@@ -0,0 +1,100 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512  # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12  # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d  # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: True
    use_cnn_module: True
    cnn_module_kernel: 15
    activation_type: swish
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm'  # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3  # the number of decoder blocks
    r_num_blocks: 3  # only for bitransformer
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1  # label smoothing option
    length_normalized_loss: false
    reverse_weight: 0.3  # only for bitransformer decoder
    init_type: 'kaiming_uniform'  # !Warning: needed for convergence
###########################################
# Data #
###########################################
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list
###########################################
# Dataloader #
###########################################
use_stream_data: True
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
do_filter: True
maxlen_in: 1200 # if do_filter == False && input length > maxlen_in, batch size is automatically reduced
maxlen_out: 100 # if do_filter == False && output length > maxlen_out, batch size is automatically reduced
minlen_in: 10
minlen_out: 0
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
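The `warmuplr` entry above pairs `lr: 0.002` with `warmup_steps: 25000`. Assuming it follows the usual ESPnet-style warmup schedule (an assumption, not verified against PaddleSpeech's implementation), the learning rate behaves like:
```python
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    # Rises roughly linearly to base_lr at warmup_steps, then decays ~ step**-0.5.
    step = max(step, 1)
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
```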

@@ -14,6 +14,8 @@ jit_model_export_path=$3
# export can not use StreamdataDataloader; set use_stream_data to False
# u2: reverse_weight should be 0.0
# u2pp: reverse_weight should match the config file, e.g. 0.3
python3 -u ${BIN_DIR}/export.py \
--ngpu ${ngpu} \
--config ${config_path} \

@@ -565,7 +565,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
    [len(hyp[0]) for hyp in hyps], place=device,
    dtype=paddle.long)  # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
logger.debug(
    f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
hyps_lens = hyps_lens + 1  # Add <sos> at beginning
@@ -590,7 +590,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
# last decoder output token is `eos`, for last decoder input token.
score += decoder_out[i][len(hyp[0])][self.eos]
logger.debug(f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
if reverse_weight > 0:
    r_score = 0.0
r_score = 0.0
@@ -598,7 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
        r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
    r_score += r_decoder_out[i][len(hyp[0])][self.eos]
    logger.info(f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
    score = score * (1 - reverse_weight) + r_score * reverse_weight
@@ -608,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
    best_score = score
    best_index = i
logger.debug(f"result: {hyps[best_index]}")
return hyps[best_index][0]
@jit.to_static(property=True)
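The fragments above come from attention rescoring: each n-best CTC hypothesis is rescored by the forward (l2r) decoder, optionally by the reverse (r2l) decoder, and the two are blended with `reverse_weight` (the change here also fixes the r2l log line to report `r_score` rather than `score`). A simplified sketch of that blend, with illustrative names:
```python
def rescore(score_l2r: float, score_r2l: float, reverse_weight: float) -> float:
    # Mirror of the combination in U2BaseModel.attention_rescoring:
    # reverse_weight == 0.0 keeps the pure left-to-right decoder score.
    return score_l2r * (1.0 - reverse_weight) + score_r2l * reverse_weight
```
In the surrounding code the CTC score (`hyp[1]` in the log lines) also contributes to the final ranking of hypotheses.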
