From bb2a370b230a7e67521bf7aba6732f509ce4c430 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 28 Dec 2021 21:09:50 +0800
Subject: [PATCH] [asr] remove useless conf of librispeech (#1227)

* remove useless conf

* format code

* update conf

* update conf

* update conf
---
 examples/csmsc/voc5/README.md                 |  6 +--
 .../asr1/conf/chunk_conformer.yaml            | 44 +++++++------------
 .../asr1/conf/chunk_transformer.yaml          | 35 +++++++--------
 examples/librispeech/asr1/conf/conformer.yaml | 32 +++++++-------
 .../librispeech/asr1/conf/transformer.yaml    | 29 ++++++------
 paddlespeech/s2t/exps/u2/model.py             |  4 +-
 paddlespeech/t2s/frontend/zh_frontend.py      |  2 +-
 .../t2s/models/fastspeech2/fastspeech2.py     | 12 +++--
 8 files changed, 76 insertions(+), 88 deletions(-)

diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index c12cea7f5..bfe28d046 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -127,10 +127,10 @@ HiFiGAN checkpoint contains files listed below.
 
 ```text
 hifigan_csmsc_ckpt_0.1.1
-├── default.yaml # default config used to train hifigan
-├── feats_stats.npy # generator parameters of hifigan
+├── default.yaml              # default config used to train hifigan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training hifigan
 └── snapshot_iter_2500000.pdz # generator parameters of hifigan
 ```
 
 ## Acknowledgement
-We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2872b69ef..662d559c0 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -47,63 +47,51 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
-
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 training:
-  n_epoch: 240
+  n_epoch: 120
   accum_grad: 8
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
     lr: 0.001
-    weight_decay: 1e-06
+    weight_decay: 1e-06
   scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
-    lr_decay: 1.0
   log_interval: 100
   checkpoint:
     kbest_n: 50
     latest_n: 5
 
-
 decoding:
   batch_size: 128
   error_rate_type: wer
   decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
   beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
   ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
   decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
       # <0: for decoding, use full chunk.
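
The `maxlen_in` / `maxlen_out` keys introduced above implement length-adaptive batching: when an utterance's input frames or output tokens exceed the cap, the effective batch size is scaled down so the padded batch still fits in memory. Below is a minimal sketch of that behaviour, assuming a simple proportional rule rather than the exact formula of PaddleSpeech's ESPnet-style batcher:

```python
# Illustrative only: a rough stand-in for the "batch size is automatically
# reduced" behaviour named in the maxlen_in/maxlen_out comments, not the
# actual PaddleSpeech/ESPnet batching code.
def reduced_batch_size(base: int, input_len: int, output_len: int,
                       maxlen_in: int = 512, maxlen_out: int = 150) -> int:
    """Shrink the batch size in proportion to how far an utterance
    exceeds the input/output length caps (never below 1)."""
    factor = max(input_len / maxlen_in, output_len / maxlen_out, 1.0)
    return max(1, int(base / factor))

print(reduced_batch_size(16, input_len=1024, output_len=40))  # -> 8
print(reduced_batch_size(16, input_len=256, output_len=40))   # -> 16 (under both caps)
```
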
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 275e940af..bc77ba41a 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -34,36 +34,35 @@ model:
     lsm_weight: 0.1 # label smoothing option
     length_normalized_loss: false
 
-
 data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
+
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 64
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
@@ -101,6 +100,4 @@ decoding:
       # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here.
   num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
-
-
+  simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
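
The decoding comments above define three regimes for `decoding_chunk_size` (<0: whole utterance, >0: fixed-size chunks, 0: reserved for training) plus a bounded left context via `num_decoding_left_chunks`. The sketch below illustrates just that windowing, assuming nothing about the real chunk-wise decoder in `paddlespeech.s2t` beyond what the comments state:

```python
# Hedged sketch of the chunked decoding window described by the config
# comments; the real streaming encoder/decoder is considerably more involved.
import numpy as np

def iter_chunks(feats, chunk_size, num_left_chunks=-1):
    """Yield (left_context, chunk) views over a (T, D) feature array.

    chunk_size <= 0       -> one full-utterance chunk (offline decoding).
    num_left_chunks == -1 -> attend to all previous chunks.
    """
    if chunk_size <= 0:
        yield feats[:0], feats
        return
    for start in range(0, len(feats), chunk_size):
        left = 0 if num_left_chunks < 0 else max(
            0, start - num_left_chunks * chunk_size)
        yield feats[left:start], feats[start:start + chunk_size]

feats = np.zeros((100, 80), dtype="float32")  # 100 frames of 80-dim fbank
windows = list(iter_chunks(feats, chunk_size=16, num_left_chunks=2))
print(len(windows))  # -> 7 windows for a 100-frame utterance
```

Setting `simulate_streaming: true` then corresponds to feeding such windows to the encoder incrementally instead of attending over the whole utterance at once.
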
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 1193f14b1..5a5708979 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -34,6 +34,7 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_grad_norm_type: null
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
 
@@ -50,25 +51,24 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
+
 
 training:
   n_epoch: 70
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index a90efe482..b7f33e223 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -51,24 +51,23 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 32
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
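
All four LibriSpeech configs drop the same dead `collator` keys (`raw_wav`, `spectrum_type`, `delta_delta`, `dither`, `target_sample_rate`, and friends). A hypothetical lint helper (not part of PaddleSpeech) shows how such drift between a YAML file and the keys the trainer actually reads could be caught automatically; the key set below is transcribed from the new configs:

```python
# Hypothetical helper, not part of PaddleSpeech; requires PyYAML.
import yaml

KNOWN_COLLATOR_KEYS = {
    "vocab_filepath", "unit_type", "spm_model_prefix", "mean_std_filepath",
    "augmentation_config", "feat_dim", "stride_ms", "window_ms", "sortagrad",
    "batch_size", "maxlen_in", "maxlen_out", "minibatches", "batch_count",
    "batch_bins", "batch_frames_in", "batch_frames_out", "batch_frames_inout",
    "num_workers", "subsampling_factor", "num_encs",
}

def stale_collator_keys(conf_path):
    """Return config keys under 'collator' that the trainer never reads."""
    with open(conf_path) as f:
        conf = yaml.safe_load(f)
    return sorted(set(conf.get("collator", {})) - KNOWN_COLLATOR_KEYS)

# Before this patch, stale_collator_keys("conf/transformer.yaml") would have
# reported raw_wav, spectrum_type, dither, target_dB and the other dead keys.
```
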
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9fb7067fb..6b529b400 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -265,7 +265,7 @@ class U2Trainer(Trainer):
                 batch_frames_in=config.collator.batch_frames_in,
                 batch_frames_out=config.collator.batch_frames_out,
                 batch_frames_inout=config.collator.batch_frames_inout,
-                preprocess_conf=config.collator.augmentation_config,
+                preprocess_conf=config.collator.augmentation_config,
                 n_iter_processes=config.collator.num_workers,
                 subsampling_factor=1,
                 num_encs=1)
@@ -284,7 +284,7 @@ class U2Trainer(Trainer):
                 batch_frames_in=0,
                 batch_frames_out=0,
                 batch_frames_inout=0,
-                preprocess_conf=config.collator.augmentation_config,
+                preprocess_conf=config.collator.augmentation_config,
                 n_iter_processes=config.collator.num_workers,
                 subsampling_factor=1,
                 num_encs=1)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 5cfa44b1d..a905c412d 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -106,7 +106,7 @@ class Frontend():
         for seg in segments:
             phones = []
             # Remove all English words from the sentence
-            seg = re.sub('[a-zA-Z]+','',seg)
+            seg = re.sub('[a-zA-Z]+', '', seg)
             seg_cut = psg.lcut(seg)
             initials = []
             finals = []
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 1679f0374..a5fb7fab7 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -942,7 +942,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
         """
         spk_id = paddle.to_tensor(spk_id)
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
+            text,
+            durations=None,
+            pitch=None,
+            energy=None,
+            spk_emb=spk_emb,
+            spk_id=spk_id)
         # priority: groundtruth > scale/bias > previous output
         # set durations
         if isinstance(durations, np.ndarray):
@@ -995,9 +1000,8 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             pitch=pitch,
             energy=energy,
             use_teacher_forcing=True,
-            spk_emb=spk_emb,
-            spk_id=spk_id
-        )
+            spk_emb=spk_emb,
+            spk_id=spk_id)
 
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
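
One behavioural note on the `zh_frontend.py` hunk: `re.sub('[a-zA-Z]+', '', seg)` deletes runs of Latin letters outright rather than substituting anything, so mixed-script input loses its English tokens before jieba segmentation. A quick standard-library check:

```python
# Demonstrates the zh_frontend regex: English words are removed,
# not transliterated, before the text reaches the segmenter.
import re

seg = "今天weather真nice"
print(re.sub('[a-zA-Z]+', '', seg))  # -> 今天真
```
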