[asr] update librispeech conformer result (#1116)

* update librispeech result * change conf order
4 years ago · c4a79ccea4
parent 65f684806e
commit c4a79ccea4
4 changed files with 105 additions and 126 deletions
--- a/examples/librispeech/asr1/RESULTS.md
+++ b/examples/librispeech/asr1/RESULTS.md
@ -1,15 +1,18 @@
 # LibriSpeech

 ## Conformer
+train: Epoch 70, 4 V100-32G, best avg: 20
+
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.738649845123291 | 0.041159 |  
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.738649845123291 | 0.039847 |  
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.738649845123291 | 0.039790 |  
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.738649845123291 | 0.034617 |  
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention | 6.433612394332886 | 0.039771 |  
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.433612394332886 | 0.040342 |  
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.433612394332886 | 0.040342 |  
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention_rescoring | 6.433612394332886 | 0.033761 |  


 ## Chunk Conformer
+
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
 | conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.11 | 0.063193 |  
@ -20,7 +23,7 @@

 ## Transformer

-train: Epoch 120, 4 V100-32G, 27 Day, avg: 10
+train: Epoch 120, 4 V100-32G, 27 Day, best avg: 10

 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@ -1,41 +1,3 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 30.0
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
-
-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
 # network architecture
 model:
    cmvn_file: 
@ -80,6 +42,39 @@ model:
        length_normalized_loss: false


+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+
+
+collator:
+  vocab_filepath: data/lang_char/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/preprocess.yaml
+  batch_size: 16
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+
 training:
  n_epoch: 240
  accum_grad: 8
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@ -1,41 +1,3 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5  # second
-  max_input_len: 30.0 # second
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
-
-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
 # network architecture
 model:
    cmvn_file: 
@ -73,6 +35,37 @@ model:
        length_normalized_loss: false


+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+
+collator:
+  vocab_filepath: data/lang_char/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/preprocess.yaml
+  batch_size: 64
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
 training:
  n_epoch: 120
  accum_grad: 1
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@ -1,41 +1,3 @@
-# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.5  # seconds
-  max_input_len: 30.0 # seconds
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
-
-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
 # network architecture
 model:
    cmvn_file: 
@ -76,8 +38,40 @@ model:
        length_normalized_loss: false


+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test-clean
+
+
+collator:
+  vocab_filepath: data/lang_char/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/preprocess.yaml
+  batch_size: 16
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
 training:
-  n_epoch: 120
+  n_epoch: 70
  accum_grad: 8
  global_grad_clip: 3.0
  optim: adam
@ -98,13 +92,7 @@ decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.