From 0defc658e109d8bd208961cfb868786fb64270ed Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 02:49:05 +0000 Subject: [PATCH] update aishell/librispeech transformer result; wenetspeech pretrain conformer result --- examples/aishell/s1/README.md | 10 +++ examples/aishell/s1/conf/transformer.yaml | 2 +- examples/aishell/s1/local/test_hub.sh | 2 - examples/librispeech/s1/README.md | 8 +-- examples/wenetspeech/README.md | 54 ++++++++++++++ examples/wenetspeech/asr1/.gitignore | 3 + examples/wenetspeech/asr1/README.md | 24 +++++++ examples/wenetspeech/asr1/local/data.sh | 0 examples/wenetspeech/asr1/local/test.sh | 70 ++++++++++++++++++- .../asr1/local/wenetspeech_data_prep.sh | 0 .../frontend/featurizer/text_featurizer.py | 4 +- paddlespeech/s2t/models/u2/u2.py | 2 +- 12 files changed, 169 insertions(+), 10 deletions(-) create mode 100644 examples/wenetspeech/README.md create mode 100644 examples/wenetspeech/asr1/.gitignore create mode 100644 examples/wenetspeech/asr1/README.md mode change 100644 => 100755 examples/wenetspeech/asr1/local/data.sh mode change 100644 => 100755 examples/wenetspeech/asr1/local/test.sh mode change 100644 => 100755 examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 0096c73e..8c53f95f 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | \ No newline at end of file diff --git a/examples/aishell/s1/conf/transformer.yaml b/examples/aishell/s1/conf/transformer.yaml index 7803097a..c021f66b 100644 --- a/examples/aishell/s1/conf/transformer.yaml +++ b/examples/aishell/s1/conf/transformer.yaml @@ -73,7 +73,7 @@ model: training: - n_epoch: 240 + n_epoch: 120 accum_grad: 2 global_grad_clip: 5.0 optim: adam diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/s1/local/test_hub.sh index 99b141c8..6e78ec78 100755 --- a/examples/aishell/s1/local/test_hub.sh +++ b/examples/aishell/s1/local/test_hub.sh @@ -23,8 +23,6 @@ fi # exit 1 #fi - - for type in attention_rescoring; do echo "decoding ${type}" batch_size=1 diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index b7ec93eb..20255db8 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER 
| | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 | \ No newline at end of file diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md new file mode 100644 index 00000000..fbb322d6 --- /dev/null +++ b/examples/wenetspeech/README.md @@ -0,0 +1,54 @@ +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) + +A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition + +## Description + +### Creation + +All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data. 
+ +### Categories + +In summary, WenetSpeech groups all data into 3 categories, as the following table shows: + +| Set | Hours | Confidence | Usage | +|------------|-------|-------------|---------------------------------------| +| High Label | 10005 | >=0.95 | Supervised Training | +| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training | +| Unlabel | 9952 | / | Unsupervised training or Pre-training | +| In Total | 22435 | / | All above | + +### High Label Data + +We classify the high label into 10 groups according to its domain, speaking style, and scenarios. + +| Domain | Youtube | Podcast | Total | +|-------------|---------|---------|--------| +| audiobook | 0 | 250.9 | 250.9 | +| commentary | 112.6 | 135.7 | 248.3 | +| documentary | 386.7 | 90.5 | 477.2 | +| drama | 4338.2 | 0 | 4338.2 | +| interview | 324.2 | 614 | 938.2 | +| news | 0 | 868 | 868 | +| reading | 0 | 1110.2 | 1110.2 | +| talk | 204 | 90.7 | 294.7 | +| variety | 603.3 | 224.5 | 827.8 | +| others | 144 | 507.5 | 651.5 | +| Total | 6113 | 3892 | 10005 | + +As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales. 
+ +| Training Subsets | Confidence | Hours | +|------------------|-------------|-------| +| L | [0.95, 1.0] | 10005 | +| M | 1.0 | 1000 | +| S | 1.0 | 100 | + +### Evaluation Sets + +| Evaluation Sets | Hours | Source | Description | +|-----------------|-------|--------------|-----------------------------------------------------------------------------------------| +| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | +| TEST\_NET | 23 | Internet | Match test | +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore new file mode 100644 index 00000000..02a22922 --- /dev/null +++ b/examples/wenetspeech/asr1/.gitignore @@ -0,0 +1,3 @@ +data +exp +*.profile diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md new file mode 100644 index 00000000..5aff041f --- /dev/null +++ b/examples/wenetspeech/asr1/README.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | 
spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh old mode 100644 new mode 100755 index e7c64346..47bd2f63 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1 +1,69 @@ -decode_modes="attention_rescoring ctc_greedy_search" \ No newline at end of file +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + + +for type in attention ctc_greedy_search; do + echo "decoding ${type}" + if [ ${chunk_mode} == true ];then + # stream decoding only support batchsize=1 + batch_size=1 + else + batch_size=64 + fi + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" 
+ exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7f3bd9e1..21f512e9 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -92,7 +92,9 @@ class TextFeaturizer(): tokens = self.tokenize(text) ids = [] for token in tokens: - token = token if token in self.vocab_dict else self.unk + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk ids.append(self.vocab_dict[token]) return ids diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 916a6a05..4f833372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file']: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN(