From bb2a370b230a7e67521bf7aba6732f509ce4c430 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 28 Dec 2021 21:09:50 +0800
Subject: [PATCH] [asr] remove useless conf of librispeech (#1227)

* remove useless conf

* format code

* update conf

* update conf

* update conf
---
 examples/csmsc/voc5/README.md                 |  6 +--
 .../asr1/conf/chunk_conformer.yaml            | 44 +++++++------------
 .../asr1/conf/chunk_transformer.yaml          | 35 +++++++--------
 examples/librispeech/asr1/conf/conformer.yaml | 32 +++++++-------
 .../librispeech/asr1/conf/transformer.yaml    | 29 ++++++------
 paddlespeech/s2t/exps/u2/model.py             |  4 +-
 paddlespeech/t2s/frontend/zh_frontend.py      |  2 +-
 .../t2s/models/fastspeech2/fastspeech2.py     | 12 +++--
 8 files changed, 76 insertions(+), 88 deletions(-)

diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index c12cea7f5..bfe28d046 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -127,10 +127,10 @@ HiFiGAN checkpoint contains files listed below.
 
 ```text
 hifigan_csmsc_ckpt_0.1.1
-├── default.yaml # default config used to train hifigan
-├── feats_stats.npy # generator parameters of hifigan
+├── default.yaml              # default config used to train hifigan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training hifigan
 └── snapshot_iter_2500000.pdz # generator parameters of hifigan
 ```
 
 ## Acknowledgement
-We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2872b69ef..662d559c0 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -47,63 +47,51 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
-
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 training:
-  n_epoch: 240
+  n_epoch: 120
   accum_grad: 8
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
     lr: 0.001
-    weight_decay: 1e-06
+    weight_decay: 1e-06
   scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
-    lr_decay: 1.0
   log_interval: 100
   checkpoint:
     kbest_n: 50
     latest_n: 5
 
-
 decoding:
   batch_size: 128
   error_rate_type: wer
   decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
   beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
   ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
   decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
       # <0: for decoding, use full chunk.
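
The `maxlen_in` / `maxlen_out` keys introduced above implement length-adaptive batching: when an utterance's input frames or output tokens exceed the cap, the effective batch size is scaled down so the padded batch still fits in memory. Below is a minimal sketch of that behaviour, assuming a simple proportional rule rather than the exact formula of PaddleSpeech's ESPnet-style batcher:

```python
# Illustrative only: a rough stand-in for the "batch size is automatically
# reduced" behaviour named in the maxlen_in/maxlen_out comments, not the
# actual PaddleSpeech/ESPnet batching code.
def reduced_batch_size(base: int, input_len: int, output_len: int,
                       maxlen_in: int = 512, maxlen_out: int = 150) -> int:
    """Shrink the batch size in proportion to how far an utterance
    exceeds the input/output length caps (never below 1)."""
    factor = max(input_len / maxlen_in, output_len / maxlen_out, 1.0)
    return max(1, int(base / factor))

print(reduced_batch_size(16, input_len=1024, output_len=40))  # -> 8
print(reduced_batch_size(16, input_len=256, output_len=40))   # -> 16 (under both caps)
```
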
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 275e940af..bc77ba41a 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -34,36 +34,35 @@ model:
     lsm_weight: 0.1 # label smoothing option
     length_normalized_loss: false
 
-
 data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
+
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 64
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
@@ -101,6 +100,4 @@ decoding:
       # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here.
   num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
-
-
+  simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
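
The decoding comments above define three regimes for `decoding_chunk_size` (<0: whole utterance, >0: fixed-size chunks, 0: reserved for training) plus a bounded left context via `num_decoding_left_chunks`. The sketch below illustrates just that windowing, assuming nothing about the real chunk-wise decoder in `paddlespeech.s2t` beyond what the comments state:

```python
# Hedged sketch of the chunked decoding window described by the config
# comments; the real streaming encoder/decoder is considerably more involved.
import numpy as np

def iter_chunks(feats, chunk_size, num_left_chunks=-1):
    """Yield (left_context, chunk) views over a (T, D) feature array.

    chunk_size <= 0       -> one full-utterance chunk (offline decoding).
    num_left_chunks == -1 -> attend to all previous chunks.
    """
    if chunk_size <= 0:
        yield feats[:0], feats
        return
    for start in range(0, len(feats), chunk_size):
        left = 0 if num_left_chunks < 0 else max(
            0, start - num_left_chunks * chunk_size)
        yield feats[left:start], feats[start:start + chunk_size]

feats = np.zeros((100, 80), dtype="float32")  # 100 frames of 80-dim fbank
windows = list(iter_chunks(feats, chunk_size=16, num_left_chunks=2))
print(len(windows))  # -> 7 windows for a 100-frame utterance
```

Setting `simulate_streaming: true` then corresponds to feeding such windows to the encoder incrementally instead of attending over the whole utterance at once.
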
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 1193f14b1..5a5708979 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -34,6 +34,7 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_grad_norm_type: null
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
 
@@ -50,25 +51,24 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
+
 
 training:
   n_epoch: 70
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index a90efe482..b7f33e223 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -51,24 +51,23 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank # linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 32
+  maxlen_in: 512 # if input length > maxlen_in, batch size is automatically reduced
+  maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
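
All four LibriSpeech configs drop the same dead `collator` keys (`raw_wav`, `spectrum_type`, `delta_delta`, `dither`, `target_sample_rate`, and friends). A hypothetical lint helper (not part of PaddleSpeech) shows how such drift between a YAML file and the keys the trainer actually reads could be caught automatically; the key set below is transcribed from the new configs:

```python
# Hypothetical helper, not part of PaddleSpeech; requires PyYAML.
import yaml

KNOWN_COLLATOR_KEYS = {
    "vocab_filepath", "unit_type", "spm_model_prefix", "mean_std_filepath",
    "augmentation_config", "feat_dim", "stride_ms", "window_ms", "sortagrad",
    "batch_size", "maxlen_in", "maxlen_out", "minibatches", "batch_count",
    "batch_bins", "batch_frames_in", "batch_frames_out", "batch_frames_inout",
    "num_workers", "subsampling_factor", "num_encs",
}

def stale_collator_keys(conf_path):
    """Return config keys under 'collator' that the trainer never reads."""
    with open(conf_path) as f:
        conf = yaml.safe_load(f)
    return sorted(set(conf.get("collator", {})) - KNOWN_COLLATOR_KEYS)

# Before this patch, stale_collator_keys("conf/transformer.yaml") would have
# reported raw_wav, spectrum_type, dither, target_dB and the other dead keys.
```
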
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9fb7067fb..6b529b400 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -265,7 +265,7 @@ class U2Trainer(Trainer):
                 batch_frames_in=config.collator.batch_frames_in,
                 batch_frames_out=config.collator.batch_frames_out,
                 batch_frames_inout=config.collator.batch_frames_inout,
-                preprocess_conf=config.collator.augmentation_config,
+                preprocess_conf=config.collator.augmentation_config,
                 n_iter_processes=config.collator.num_workers,
                 subsampling_factor=1,
                 num_encs=1)
@@ -284,7 +284,7 @@ class U2Trainer(Trainer):
                 batch_frames_in=0,
                 batch_frames_out=0,
                 batch_frames_inout=0,
-                preprocess_conf=config.collator.augmentation_config,
+                preprocess_conf=config.collator.augmentation_config,
                 n_iter_processes=config.collator.num_workers,
                 subsampling_factor=1,
                 num_encs=1)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 5cfa44b1d..a905c412d 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -106,7 +106,7 @@ class Frontend():
         for seg in segments:
             phones = []
             # Remove all English words from the sentence
-            seg = re.sub('[a-zA-Z]+','',seg)
+            seg = re.sub('[a-zA-Z]+', '', seg)
             seg_cut = psg.lcut(seg)
             initials = []
             finals = []
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 1679f0374..a5fb7fab7 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -942,7 +942,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
         """
         spk_id = paddle.to_tensor(spk_id)
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
+            text,
+            durations=None,
+            pitch=None,
+            energy=None,
+            spk_emb=spk_emb,
+            spk_id=spk_id)
         # priority: groundtruth > scale/bias > previous output
         # set durations
         if isinstance(durations, np.ndarray):
@@ -995,9 +1000,8 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             pitch=pitch,
             energy=energy,
             use_teacher_forcing=True,
-            spk_emb=spk_emb,
-            spk_id=spk_id
-        )
+            spk_emb=spk_emb,
+            spk_id=spk_id)
 
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
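
One behavioural note on the `zh_frontend.py` hunk: `re.sub('[a-zA-Z]+', '', seg)` deletes runs of Latin letters outright rather than substituting anything, so mixed-script input loses its English tokens before jieba segmentation. A quick standard-library check:

```python
# Demonstrates the zh_frontend regex: English words are removed,
# not transliterated, before the text reaches the segmenter.
import re

seg = "今天weather真nice"
print(re.sub('[a-zA-Z]+', '', seg))  # -> 今天真
```
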