more decoding method (#618)

* more decoding method * all decode method test scripts; result readme * exp libri confi * parallel data scripts; more mask test; need pybind11 repo * speed perturb config * libri conf test set
3 years ago · 0a3a840bee
parent 295f8bdad5
commit 0a3a840bee
15 changed files with 140 additions and 55 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,7 @@
 .DS_Store
 *.pyc
 .vscode
-*.log
+*log
 *.pdmodel
 *.pdiparams*
 *.zip
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -168,7 +168,7 @@ class DeepSpeech2Trainer(Trainer):
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            num_workers=config.data.num_workers)
        self.valid_loader = DataLoader(
            dev_dataset,
            batch_size=config.data.batch_size,
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@ -450,7 +450,7 @@ class U2Tester(U2Trainer):
        logger.info(msg)

        # test meta results
-        err_meta_path = os.path.splitext(self.args.checkpoint_path)[0] + '.err'
+        err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
        err_type_str = "{}".format(error_rate_type)
        with open(err_meta_path, 'w') as f:
            data = json.dumps({
@ -471,6 +471,8 @@ class U2Tester(U2Trainer):
                errors_sum,
                "ref_len":
                len_refs,
+                "decode_method":
+                self.config.decoding.decoding_method,
            })
            f.write(data + '\n')

--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@ -66,19 +66,22 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for dataset in train dev test; do
+    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "char" \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${dataset}.raw" \
-        --output_path="data/manifest.${dataset}"
-    done
+                --feat_type "raw" \
+                --cmvn_path "data/mean_std.json" \
+                --unit_type "char" \
+                --vocab_path="data/vocab.txt" \
+                --manifest_path="data/manifest.${dataset}.raw" \
+                --output_path="data/manifest.${dataset}"

-    if [ $? -ne 0 ]; then
-        echo "Formt mnaifest failed. Terminated."
-        exit 1
-    fi
+        if [ $? -ne 0 ]; then
+                echo "Formt mnaifest failed. Terminated."
+                exit 1
+        fi
+    } &
+    done
+    wait
 fi

 echo "Aishell data preparation done."
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/s1/README.md
@ -0,0 +1,14 @@
+# Aishell
+
+## Conformer
+| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
+
+## Transformer
+| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| transformer | conf/transformer.yaml | spec_aug + shift | test | attention | - | - |
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@ -14,7 +14,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    python3 ${TARGET_DIR}/aishell/aishell.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/aishell"
-    
+
    if [ $? -ne 0 ]; then
        echo "Prepare Aishell failed. Terminated."
        exit 1
@ -33,7 +33,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --count_threshold=0 \
    --vocab_path="data/vocab.txt" \
    --manifest_paths "data/manifest.train.raw"
-    
+
    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
@ -56,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --num_samples=-1 \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
-    
+
    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
@ -67,19 +67,22 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for dataset in train dev test; do
+    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "char" \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${dataset}.raw" \
-        --output_path="data/manifest.${dataset}"
+            --feat_type "raw" \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}"
+
+        if [ $? -ne 0 ]; then
+            echo "Formt mnaifest failed. Terminated."
+            exit 1
+        fi
+    } &
    done
-    
-    if [ $? -ne 0 ]; then
-        echo "Formt mnaifest failed. Terminated."
-        exit 1
-    fi
+    wait
 fi

 echo "Aishell data preparation done."
--- a/examples/aishell/s1/local/test.sh
+++ b/examples/aishell/s1/local/test.sh
@ -21,17 +21,39 @@ ckpt_prefix=$2
 #    exit 1
 #fi

-python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi

+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    batch_size=64
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done

 exit 0
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -1,3 +1,3 @@
 # ASR
 * s0 is for deepspeech2
-* s1 is for U2
+* s1 is for transformer/conformer/U2
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@ -0,0 +1,16 @@
+# LibriSpeech
+
+## Conformer
+| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| conformer | conf/conformer.yaml | spec_aug + shift | test-all | attention | test-all 6.35 | 0.057117 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.35 | 0.030162 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | test-all 6.35 | 0.037910 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | test-all 6.35 | 0.037761 |
+| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | test-all 6.35 | 0.032115 |
+
+## Transformer
+| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| transformer | conf/transformer.yaml | spec_aug + shift | test-all | attention | test-all 6.98 | 0.066500 |
+| transformer | conf/transformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.98 | 0.036 |
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@ -2,7 +2,7 @@
 data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
+  test_manifest: data/manifest.test-clean
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
@ -14,7 +14,7 @@ data:
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
+  max_output_input_ratio: 10.0 
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@ -2,7 +2,7 @@
 data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
+  test_manifest: data/manifest.test-clean
  vocab_filepath: data/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
--- a/examples/librispeech/s1/local/test.sh
+++ b/examples/librispeech/s1/local/test.sh
@ -21,17 +21,39 @@ ckpt_prefix=$2
 #    exit 1
 #fi

-python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    batch_size=64
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done


 exit 0
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
--- a/requirements.txt
+++ b/requirements.txt
@ -8,4 +8,5 @@ SoundFile==0.9.0.post1
 sox
 tensorboardX
 typeguard
-yacs
+yacs
+pybind11
--- a/tests/mask_test.py
+++ b/tests/mask_test.py
@ -50,7 +50,9 @@ class TestU2Model(unittest.TestCase):

    def test_make_pad_mask(self):
        res = make_pad_mask(self.lengths)
+        res1 = make_non_pad_mask(self.lengths).logical_not()
        self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
+        self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())


 if __name__ == '__main__':