diff --git a/deepspeech/models/ds2/conv.py b/deepspeech/models/ds2/conv.py
index 8bf48b2c..ce962a44 100644
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
@@ -41,13 +41,6 @@ def conv_output_size(I, F, P, S):
     return (I - F + 2 * P - S) // S
 
 
-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl)
-
-
 class ConvBn(nn.Layer):
     """Convolution layer with batch normalization.
 
diff --git a/deepspeech/modules/subsampling.py b/deepspeech/modules/subsampling.py
index 40fa7b00..3bed62f3 100644
--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
@@ -108,8 +108,8 @@ class Conv2dSubsampling4(BaseSubsampling):
             nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
         self.subsampling_rate = 4
         # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
-        # 6 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 6 = (3 - 1) * 1 + (3 - 1) * 2
         self.right_context = 6
 
     def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
@@ -160,10 +160,10 @@ class Conv2dSubsampling6(BaseSubsampling):
         # when Padding == 0, O = (I - F - S) // S
         self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
         # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
-        # 14 = (3 - 1) / 2 * 2 * 1 + (5 - 1) / 2 * 3 * 2
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 10 = (3 - 1) * 1 + (5 - 1) * 2
         self.subsampling_rate = 6
-        self.right_context = 14
+        self.right_context = 10
 
     def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                 ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@@ -214,8 +214,8 @@ class Conv2dSubsampling8(BaseSubsampling):
             odim)
         self.subsampling_rate = 8
         # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
-        # 14 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + (3 - 1) / 2 * 2 * 4
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
         self.right_context = 14
 
     def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md
index 537496a6..e5ebfcba 100644
--- a/examples/aishell/s0/README.md
+++ b/examples/aishell/s0/README.md
@@ -10,7 +10,7 @@
 
 | Model | Params | Release | Config | Test set | Loss | CER |
 | --- | --- | --- | --- | --- | --- | --- |
-| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 |
+| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |
 | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
 | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
 | DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index c4ff246f..7f0a1462 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
   share_rnn_weights: False
 
 training:
-  n_epoch: 50
+  n_epoch: 80
   lr: 2e-3
   lr_decay: 0.83
   weight_decay: 1e-06
diff --git a/examples/aishell/s0/local/train.sh b/examples/aishell/s0/local/train.sh
index d42e51fa..3438a735 100755
--- a/examples/aishell/s0/local/train.sh
+++ b/examples/aishell/s0/local/train.sh
@@ -19,7 +19,7 @@ fi
 
 mkdir -p exp
 
-seed=1024
+seed=10086
 if [ ${seed} ]; then
     export FLAGS_cudnn_deterministic=True
 fi
diff --git a/utils/avg.sh b/utils/avg.sh
index c8a6ddfe..399c9574 100755
--- a/utils/avg.sh
+++ b/utils/avg.sh
@@ -1,19 +1,29 @@
 #! /usr/bin/env bash
 
-if [ $# != 2 ]; then
-    echo "usage: ${0} ckpt_dir avg_num"
+if [ $# != 3 ]; then
+    echo "usage: ${0} [best|latest] ckpt_dir avg_num"
     exit -1
 fi
 
 ckpt_dir=${1}
-average_num=${2}
+avg_mode=${2} # best,latest
+average_num=${3}
 decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
 
-avg_model.py \
---dst_model ${decode_checkpoint} \
---ckpt_dir ${ckpt_dir} \
---num ${average_num} \
---val_best
+if [ $avg_mode == best ];then
+    # best
+    avg_model.py \
+    --dst_model ${decode_checkpoint} \
+    --ckpt_dir ${ckpt_dir} \
+    --num ${average_num} \
+    --val_best
+else
+    # latest
+    avg_model.py \
+    --dst_model ${decode_checkpoint} \
+    --ckpt_dir ${ckpt_dir} \
+    --num ${average_num}
+fi
 
 if [ $? -ne 0 ]; then
     echo "Failed in avg ckpt!"
diff --git a/utils/tarball.sh b/utils/tarball.sh
index 5f7c21a3..ac8bdb39 100755
--- a/utils/tarball.sh
+++ b/utils/tarball.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 4 ];then
-    echo "usage: $0 ckpt_prefix model_config mean_std vocab"
+if [ $# != 5 ];then
+    echo "usage: $0 ckpt_prefix model_config mean_std vocab pack_name"
     exit -1
 fi
 
@@ -9,6 +9,7 @@ ckpt_prefix=$1
 model_config=$2
 mean_std=$3
 vocab=$4
+pack_name=$5
 
 output=release
 
@@ -27,6 +28,6 @@ cp ${ckpt_prefix}.* ${output}
 # model config, mean std, vocab
 cp ${model_config} ${mean_std} ${vocab} ${output}
 
-tar zcvf release.tar.gz ${output}
+tar zcvf ${pack_name}.release.tar.gz ${output}
 
-echo "tarball: ${pack_name}.release.tar.gz done!"
+echo "tarball: ${pack_name}.release.tar.gz done!"
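
Note on the right-context fix in `deepspeech/modules/subsampling.py`: each conv layer contributes `(kernel_size - 1) * frame_rate_of_this_layer` frames of right context, where a layer's frame rate is the product of the time strides of the layers before it. Below is a minimal sketch that checks the corrected totals, assuming the kernel/stride pairs implied by the arithmetic in the comments (3/2 + 3/2 for the x4 module, 3/2 + 5/3 for x6, 3/2 + 3/2 + 3/2 for x8); the helper `right_context_frames` is illustrative only and not part of the patch.

```python
def right_context_frames(layers):
    """layers: (kernel_size, stride) pairs along the time axis, in order.

    Each layer sees (kernel_size - 1) extra future frames at its own frame
    rate; the frame rate of a layer is the product of all preceding strides.
    """
    total, frame_rate = 0, 1
    for kernel_size, stride in layers:
        total += (kernel_size - 1) * frame_rate
        frame_rate *= stride
    return total


# Conv2dSubsampling4: 6 = (3 - 1) * 1 + (3 - 1) * 2
assert right_context_frames([(3, 2), (3, 2)]) == 6
# Conv2dSubsampling6: 10 = (3 - 1) * 1 + (5 - 1) * 2
assert right_context_frames([(3, 2), (5, 3)]) == 10
# Conv2dSubsampling8: 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
assert right_context_frames([(3, 2), (3, 2), (3, 2)]) == 14
```

The old comment's extra `/ 2 * stride` factor happened to give the same totals for the x4 and x8 cases, but for Conv2dSubsampling6 it yielded 14 instead of the correct 10, which is why `right_context` changes only there.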