pull/877/head · huangyuxin · commit 37e02308b2

@ -15,7 +15,7 @@ from yacs.config import CfgNode
from deepspeech.exps.u2_st.model import U2STTester
from deepspeech.exps.u2_st.model import U2STTrainer
from deepspeech.io.collator_st import SpeechCollator
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.u2_st import U2STModel

@ -322,7 +322,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[-1]
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
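The added `.lower()` makes the suffix check case-insensitive, so Kaldi-style paths such as `feats.ARK:123` resolve to the same loader as `feats.ark:123`. A minimal standalone sketch of the intended behaviour (not the project's full `LoadInputsAndTargets.file_type`, which handles more suffixes):

```python
def file_type(filepath: str) -> str:
    """Guess the loader type from a (possibly Kaldi-style) path like 'feats.ark:123'."""
    # "dump/feats.ARK:2713" -> "dump/feats.ARK" -> "ARK" -> "ark"
    suffix = filepath.split(":")[0].split(".")[-1].lower()
    if suffix == "ark":
        return "mat"
    elif suffix == "scp":
        return "scp"
    raise NotImplementedError(f"Not supported: loader_type={suffix}")

assert file_type("dump/feats.ARK:2713") == "mat"
assert file_type("data/feats.scp") == "scp"
```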

@ -113,7 +113,8 @@ class U2STBaseModel(nn.Layer):
asr_weight: float=0.0,
ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0,
length_normalized_loss: bool=False):
length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__()
@ -650,7 +651,7 @@ class U2STModel(U2STBaseModel):
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=model_conf['ctc_dropout_rate'],
dropout_rate=model_conf['ctc_dropoutrate'],
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
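Taken together, the new `**kwargs` in `U2STBaseModel.__init__` and the `ctc_dropoutrate` key let the constructor be fed the YAML `model_conf` block directly, even when it carries keys the base class does not consume itself. A hedged illustration of the pattern with a simplified stand-in class (not the real `U2STModel` signature):

```python
class TinySTModel:
    def __init__(self, ctc_weight: float = 0.0, lsm_weight: float = 0.0,
                 length_normalized_loss: bool = False, **kwargs):
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
        # **kwargs absorbs config keys consumed elsewhere (e.g. by the CTC head),
        # such as ctc_dropoutrate and ctc_grad_norm_type.
        self.ctc_weight = ctc_weight
        self.extra = kwargs

# keys mirror the model_conf block in the config further down this diff
model_conf = {
    "ctc_weight": 0.5,
    "ctc_dropoutrate": 0.0,
    "ctc_grad_norm_type": "batch",
    "lsm_weight": 0.1,
    "length_normalized_loss": False,
}
model = TinySTModel(**model_conf)
print(model.extra["ctc_dropoutrate"])  # 0.0
```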

@ -14,6 +14,7 @@
"""This module provides functions to calculate error rate in different level.
e.g. wer for word-level, cer for char-level.
"""
import editdistance
import numpy as np
__all__ = ['word_errors', 'char_errors', 'wer', 'cer']
@ -89,6 +90,7 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
hyp_words = list(filter(None, hypothesis.split(delimiter)))
edit_distance = _levenshtein_distance(ref_words, hyp_words)
# edit_distance = editdistance.eval(ref_words, hyp_words)
return float(edit_distance), len(ref_words)
@ -119,6 +121,7 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))
edit_distance = _levenshtein_distance(reference, hypothesis)
# edit_distance = editdistance.eval(reference, hypothesis)
return float(edit_distance), len(reference)
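The commented-out lines point at the intended drop-in replacement: `editdistance.eval` computes the same Levenshtein distance as the pure-Python `_levenshtein_distance`, typically much faster, which is why `editdistance` is added to the requirements below. A small hedged sketch of WER/CER computed that way (standalone helpers, not the module's public API):

```python
import editdistance

def wer(reference: str, hypothesis: str, delimiter: str = " ") -> float:
    """Word error rate: word-level edit distance / number of reference words."""
    ref_words = list(filter(None, reference.split(delimiter)))
    hyp_words = list(filter(None, hypothesis.split(delimiter)))
    return editdistance.eval(ref_words, hyp_words) / len(ref_words)

def cer(reference: str, hypothesis: str) -> float:
    """Character error rate: char-level edit distance / number of reference chars."""
    return editdistance.eval(reference, hypothesis) / len(reference)

print(wer("the cat sat on the mat", "the cat sit on mat"))  # 2 errors / 6 words ~ 0.333
print(cer("今天天气很好", "今天天气好"))  # 1 deletion / 6 chars ~ 0.167
```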

@ -93,20 +93,25 @@ def pad_sequence(sequences: List[paddle.Tensor],
for i, tensor in enumerate(sequences):
length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor
logger.info(
f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
)
if batch_first:
# TODO (Hui Zhang): set_value op does not support `end==start`
# TODO (Hui Zhang): set_value op not support int16
# TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length, ...] = tensor
out_tensor[i, :length] = tensor
else:
out_tensor[i, length, ...] = tensor
out_tensor[i, length] = tensor
else:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i, ...] = tensor
out_tensor[:length, i] = tensor
else:
out_tensor[length, i, ...] = tensor
out_tensor[length, i] = tensor
return out_tensor
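The `length != 0` guard works around Paddle's `set_value` op rejecting empty (`end == start`) slices, and the dropped `...` indexing is the other workaround noted in the TODOs. A hedged numpy illustration of the padding semantics this function implements (same behaviour, not the Paddle code path):

```python
import numpy as np
from typing import List

def pad_sequence_np(sequences: List[np.ndarray], batch_first: bool = True,
                    padding_value: float = 0.0) -> np.ndarray:
    """Pad variable-length arrays along dim 0 to the longest one, like pad_sequence."""
    max_len = max(s.shape[0] for s in sequences)
    trailing = sequences[0].shape[1:]
    if batch_first:
        out = np.full((len(sequences), max_len) + trailing, padding_value,
                      dtype=sequences[0].dtype)
    else:
        out = np.full((max_len, len(sequences)) + trailing, padding_value,
                      dtype=sequences[0].dtype)
    for i, s in enumerate(sequences):
        length = s.shape[0]
        if length == 0:
            continue  # nothing to copy; mirrors the guarded `length != 0` branch
        if batch_first:
            out[i, :length] = s
        else:
            out[:length, i] = s
    return out

batch = pad_sequence_np([np.ones(5), np.ones(3), np.zeros(0)])
print(batch.shape)  # (3, 5); the shorter sequences are zero-padded
```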

@ -1,3 +1,3 @@
TED-En-Zh
TED_EnZh
data
exp

@ -6,5 +6,10 @@
| Data Subset | Duration in Seconds (min ~ max) |
| --- | --- |
| data/manifest.train | 0.942 ~ 60 |
| data/manifest.dev | 1.151 ~ 39 |
| data/manifest.test | 1.1 ~ 42.746 |
## Transformer
| Model | Params | Config | Char-BLEU |
| --- | --- | --- | --- |
| Transformer+ASR MTL | 50.26M | conf/transformer_joint_noam.yaml | 17.38 |

@ -1,5 +1,7 @@
#!/bin/bash
set -e
stage=-1
stop_stage=100
@ -7,7 +9,7 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
data_dir=/mnt/dataset/TED_EnZh
data_dir=./TED_EnZh
source ${MAIN_ROOT}/utils/parse_options.sh

@ -6,7 +6,7 @@ stage=0
stop_stage=100
conf_path=conf/transformer_joint_noam.yaml
avg_num=5
data_path=./TED-En-Zh # path to unzipped data
data_path=./TED_EnZh # path to unzipped data
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}

@ -0,0 +1,3 @@
data
exp
test.profile

@ -1,3 +1,9 @@
# TIMIT
Results will be organized and updated soon.
### Transformer
| Model | Params | Config | Decode method | Loss | PER |
| --- | --- | --- | --- | --- | --- |
| transformer | 5.17M | conf/transformer.yaml | attention | 49.25688171386719 | 0.510742 |
| transformer | 5.17M | conf/transformer.yaml | ctc_greedy_search | 49.25688171386719 | 0.382398 |
| transformer | 5.17M | conf/transformer.yaml | ctc_prefix_beam_search | 49.25688171386719 | 0.367429 |
| transformer | 5.17M | conf/transformer.yaml | attention_rescoring | 49.25688171386719 | 0.357173 |

@ -3,12 +3,12 @@ data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5 # second
max_input_len: 30.0 # second
min_input_len: 0.0 # second
max_input_len: 10.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 100.0
max_output_len: 150.0 # tokens
min_output_input_ratio: 0.005
max_output_input_ratio: 1000.0
collator:
vocab_filepath: data/vocab.txt
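The `data` block acts as a manifest filter: utterances whose duration, token count, or output-to-input ratio falls outside these bounds are dropped before training, and the updated bounds suit TIMIT's short utterances (at most 10 s of audio, at most 150 output tokens). A hedged sketch of that filtering (the `duration`/`text` manifest keys are assumptions about the schema, not verified):

```python
def keep_utterance(entry: dict,
                   min_input_len=0.0, max_input_len=10.0,       # seconds
                   min_output_len=0.0, max_output_len=150.0,    # tokens
                   min_output_input_ratio=0.005,
                   max_output_input_ratio=1000.0) -> bool:
    """Return True if a manifest entry survives the configured length filters."""
    duration = entry["duration"]    # assumed: audio length in seconds
    n_tokens = len(entry["text"])   # assumed: output length in tokens/chars
    if not (min_input_len <= duration <= max_input_len):
        return False
    if not (min_output_len <= n_tokens <= max_output_len):
        return False
    ratio = n_tokens / max(duration, 1e-9)  # guard against zero-length audio
    return min_output_input_ratio <= ratio <= max_output_input_ratio

manifest = [{"duration": 3.2, "text": "sh iy z ax"}, {"duration": 42.0, "text": "..."}]
print([keep_utterance(m) for m in manifest])  # [True, False]; 42 s exceeds max_input_len
```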
@ -42,10 +42,10 @@ model:
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
output_size: 128 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
linear_units: 1024 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
@ -56,7 +56,7 @@ model:
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
linear_units: 1024
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
@ -65,26 +65,26 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_weight: 0.5
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: batch
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
n_epoch: 200
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 400
warmup_steps: 2000
lr_decay: 1.0
log_interval: 100
log_interval: 10
checkpoint:
kbest_n: 50
latest_n: 5
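Since the trainer reads this file through `yacs` (`from yacs.config import CfgNode` in the imports above), the edited values are easy to sanity-check by loading the YAML directly. A minimal hedged sketch using plain PyYAML rather than the project's exact config loader, assuming the usual top-level `data`/`collator`/`model`/`training` layout of these configs:

```python
import yaml

# conf/transformer.yaml as edited in this diff (path relative to the TIMIT example dir)
with open("conf/transformer.yaml") as f:
    config = yaml.safe_load(f)

print(config["model"]["encoder_conf"]["output_size"])         # 128 after this change
print(config["model"]["model_conf"]["ctc_weight"])            # 0.5
print(config["training"]["optim_conf"]["lr"])                 # 0.004
print(config["training"]["scheduler_conf"]["warmup_steps"])   # 2000
```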

@ -1,10 +1,18 @@
#!/bin/bash
set -e
stage=0
stop_stage=50
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
@ -23,44 +31,67 @@ fi
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
for type in ctc_prefix_beam_search; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
fi
exit 0

@ -1,13 +1,15 @@
#!/bin/bash
set -e
source path.sh
. path.sh || exit 1;
stage=0
stop_stage=50
conf_path=conf/transformer.yaml
avg_num=10
TIMIT_path= #path of TIMIT (Required, e.g. /export/corpora5/LDC/LDC93S1/timit/TIMIT)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
TIMIT_path=/path/to/TIMIT
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')

@ -19,3 +19,4 @@ tqdm
typeguard
visualdl==2.2.0
yacs
editdistance