Merge branch 'develop' into spec_aug2

pull/673/head
commit 60ac4bc2d8

@@ -87,3 +87,9 @@ pull_request_rules:
     actions:
       label:
         add: ["Docker"]
+  - name: "auto add label=Deployment"
+    conditions:
+      - files~=^speechnn/
+    actions:
+      label:
+        add: ["Deployment"]

File diff suppressed because it is too large

@@ -11,94 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from yacs.config import CfgNode as CN
+from yacs.config import CfgNode

+from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model

-_C = CN()
+_C = CfgNode()

-_C.data = CN(
-    dict(
-        train_manifest="",
-        dev_manifest="",
-        test_manifest="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
-        max_duration=float('inf'),
-        min_duration=0.0,
-        stride_ms=10.0,  # ms
-        window_ms=20.0,  # ms
-        n_fft=None,  # fft points
-        max_freq=None,  # None for samplerate/2
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-        feat_dim=0,  # 'mfcc', 'fbank'
-        delat_delta=False,  # 'mfcc', 'fbank'
-        target_sample_rate=16000,  # target sample rate
-        use_dB_normalization=True,
-        target_dB=-20,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.data = ManifestDataset.params()

-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
-    ))
+_C.collator = SpeechCollator.params()

-_C.collator = CN(
-    dict(
-        augmentation_config="",
-        random_seed=0,
-        mean_std_filepath="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-        feat_dim=0,  # 'mfcc', 'fbank'
-        delta_delta=False,  # 'mfcc', 'fbank'
-        stride_ms=10.0,  # ms
-        window_ms=20.0,  # ms
-        n_fft=None,  # fft points
-        max_freq=None,  # None for samplerate/2
-        target_sample_rate=16000,  # target sample rate
-        use_dB_normalization=True,
-        target_dB=-20,
-        dither=1.0,  # feature dither
-        keep_transcription_text=False
-    ))
-
-DeepSpeech2Model.params(_C.model)
+_C.model = DeepSpeech2Model.params()

-_C.training = CN(
-    dict(
-        lr=5e-4,  # learning rate
-        lr_decay=1.0,  # learning rate decay
-        weight_decay=1e-6,  # the coeff of weight decay
-        global_grad_clip=5.0,  # the global norm clip
-        n_epoch=50,  # train epochs
-    ))
+_C.training = DeepSpeech2Trainer.params()

-_C.decoding = CN(
-    dict(
-        alpha=2.5,  # Coef of LM for beam search.
-        beta=0.3,  # Coef of WC for beam search.
-        cutoff_prob=1.0,  # Cutoff probability for pruning.
-        cutoff_top_n=40,  # Cutoff number for pruning.
-        lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-        decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-        error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-        num_proc_bsearch=8,  # # of CPUs for beam search.
-        beam_size=500,  # Beam search width.
-        batch_size=128,  # decoding batch size
-    ))
+_C.decoding = DeepSpeech2Tester.params()


 def get_cfg_defaults():
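The refactor above replaces the inline `CN(dict(...))` blocks with a `params()` classmethod on each owning class, so defaults live next to the code that consumes them and `get_cfg_defaults()` only clones the assembled root. A minimal sketch of the pattern, using a hypothetical `MyComponent` (not a class from this repo):

```python
from typing import Optional

from yacs.config import CfgNode


class MyComponent():
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        # Defaults are declared where they are consumed.
        default = CfgNode(dict(foo=1, bar="baz"))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default


_C = CfgNode()
_C.component = MyComponent.params()


def get_cfg_defaults():
    """Return a clone so callers can mutate it without touching _C."""
    return _C.clone()
```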

@@ -15,11 +15,13 @@
 import time
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional

 import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle.io import DataLoader
+from yacs.config import CfgNode

 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
@@ -33,11 +35,26 @@ from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()


 class DeepSpeech2Trainer(Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # training config
+        default = CfgNode(
+            dict(
+                lr=5e-4,  # learning rate
+                lr_decay=1.0,  # learning rate decay
+                weight_decay=1e-6,  # the coeff of weight decay
+                global_grad_clip=5.0,  # the global norm clip
+                n_epoch=50,  # train epochs
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)
@@ -55,7 +72,7 @@ class DeepSpeech2Trainer(Trainer):
             'train_loss': float(loss),
         }
         msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.data.batch_size)
+        msg += "batch size: {}, ".format(self.config.collator.batch_size)
         msg += ', '.join('{}: {:>.6f}'.format(k, v)
                          for k, v in losses_np.items())
         logger.info(msg)
@@ -143,44 +160,67 @@ class DeepSpeech2Trainer(Trainer):
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

         if self.parallel:
             batch_sampler = SortagradDistributedBatchSampler(
                 train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         else:
             batch_sampler = SortagradBatchSampler(
                 train_dataset,
                 shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers)
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
         logger.info("Setup train/valid Dataloader!")
 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)
@@ -193,7 +233,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -215,7 +261,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
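`errors_func` switches between character- and word-level edit distance depending on `error_rate_type`. A self-contained illustration of the accumulation (generic Levenshtein helper, not the repo's `error_rate` module):

```python
def _levenshtein(ref, hyp):
    # Single-row dynamic-programming edit distance.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,              # deletion
                           cur[j - 1] + 1,           # insertion
                           prev[j - 1] + (r != h)))  # substitution
        prev = cur
    return prev[-1]


def word_errors(reference, hypothesis):
    ref_words = reference.split()
    return float(_levenshtein(ref_words, hypothesis.split())), len(ref_words)


errors_sum, len_refs = 0.0, 0
for target, result in [("the cat sat", "the cat sit")]:
    errors, len_ref = word_errors(target, result)
    errors_sum += errors
    len_refs += len_ref
print(errors_sum / len_refs)  # WER = 1/3 for this pair
```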
@@ -245,7 +292,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         with open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
                 utts, audio, audio_len, texts, texts_len = batch
-                metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout)
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
                 errors_sum += metrics['errors_sum']
                 len_refs += metrics['len_refs']
                 num_ins += metrics['num_ins']
@@ -324,8 +372,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # return raw text

         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""

         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -337,6 +383,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         test_dataset = ManifestDataset.from_config(config)

         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         # return text ord id
         self.test_loader = DataLoader(
             test_dataset,

@@ -15,6 +15,7 @@ from yacs.config import CfgNode

 from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model

@@ -22,12 +23,7 @@ _C = CfgNode()

 _C.data = ManifestDataset.params()

-_C.collator = CfgNode(
-    dict(
-        augmentation_config="",
-        unit_type="char",
-        keep_transcription_text=False
-    ))
+_C.collator = SpeechCollator.params()

 _C.model = U2Model.params()

@@ -78,7 +78,8 @@ class U2Trainer(Trainer):
         start = time.time()

         utt, audio, audio_len, text, text_len = batch_data
-        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                    text_len)
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         loss.backward()
@@ -100,7 +101,7 @@ class U2Trainer(Trainer):
             if (batch_index + 1) % train_conf.log_interval == 0:
                 msg += "train time: {:>.3f}s, ".format(iteration_time)
-                msg += "batch size: {}, ".format(self.config.data.batch_size)
+                msg += "batch size: {}, ".format(self.config.collator.batch_size)
                 msg += "accum: {}, ".format(train_conf.accum_grad)
                 msg += ', '.join('{}: {:>.6f}'.format(k, v)
                                  for k, v in losses_np.items())
@@ -121,7 +122,8 @@ class U2Trainer(Trainer):
         total_loss = 0.0
         for i, batch in enumerate(self.valid_loader):
             utt, audio, audio_len, text, text_len = batch
-            loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
+            loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                        text_len)
             if paddle.isfinite(loss):
                 num_utts = batch[1].shape[0]
                 num_seen_utts += num_utts
@@ -211,51 +213,52 @@ class U2Trainer(Trainer):
     def setup_dataloader(self):
         config = self.config.clone()
         config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
         config.data.manifest = config.data.train_manifest
         train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)

-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
         if self.parallel:
             batch_sampler = SortagradDistributedBatchSampler(
                 train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         else:
             batch_sampler = SortagradBatchSampler(
                 train_dataset,
                 shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)

         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)

         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""

         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -264,9 +267,11 @@ class U2Trainer(Trainer):
         # config.data.max_output_len = float('inf')  # tokens
         # config.data.min_output_input_ratio = 0.00
         # config.data.max_output_input_ratio = float('inf')
+
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,
@@ -369,7 +374,13 @@ class U2Tester(U2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -396,7 +407,8 @@ class U2Tester(U2Trainer):
             simulate_streaming=cfg.simulate_streaming)
         decode_time = time.time() - start_time

-        for utt, target, result in zip(utts, target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref

@@ -151,13 +151,3 @@ class SpeechFeaturizer(object):
             TextFeaturizer: object.
         """
         return self._text_featurizer
-
-    # @property
-    # def text_feature(self):
-    #     """Return the text feature object.
-
-    #     Returns:
-    #         TextFeaturizer: object.
-    #     """
-    #     return self._text_featurizer

@@ -11,21 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import io
+from collections import namedtuple
+from typing import Optional

 import numpy as np
+from yacs.config import CfgNode

 from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
-import io
-import time
-from yacs.config import CfgNode
-from typing import Optional
-from collections import namedtuple
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.io.utility import pad_sequence
+from deepspeech.utils.log import Log

 __all__ = ["SpeechCollator"]
@@ -34,6 +33,7 @@ logger = Log(__name__).getlog()

 # namedtupe need global for pickle.
 TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+

 class SpeechCollator():
     @classmethod
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
@@ -56,8 +56,7 @@ class SpeechCollator():
                 use_dB_normalization=True,
                 target_dB=-20,
                 dither=1.0,  # feature dither
-                keep_transcription_text=False
-            ))
+                keep_transcription_text=False))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -75,8 +74,8 @@ class SpeechCollator():
         """
         assert 'augmentation_config' in config.collator
         assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.data
-        assert 'vocab_filepath' in config.data
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
         assert 'specgram_type' in config.collator
         assert 'n_fft' in config.collator
         assert config.collator
@@ -84,7 +83,9 @@ class SpeechCollator():
         if isinstance(config.collator.augmentation_config, (str, bytes)):
             if config.collator.augmentation_config:
                 aug_file = io.open(
-                    config.collator.augmentation_config, mode='r', encoding='utf8')
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
             else:
                 aug_file = io.StringIO(initial_value='{}', newline='')
         else:
@@ -92,56 +93,78 @@ class SpeechCollator():
             assert isinstance(aug_file, io.StringIO)

         speech_collator = cls(
             aug_file=aug_file,
             random_seed=0,
-            mean_std_filepath=config.data.mean_std_filepath,
+            mean_std_filepath=config.collator.mean_std_filepath,
             unit_type=config.collator.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
+            vocab_filepath=config.collator.vocab_filepath,
             spm_model_prefix=config.collator.spm_model_prefix,
             specgram_type=config.collator.specgram_type,
             feat_dim=config.collator.feat_dim,
             delta_delta=config.collator.delta_delta,
             stride_ms=config.collator.stride_ms,
             window_ms=config.collator.window_ms,
             n_fft=config.collator.n_fft,
             max_freq=config.collator.max_freq,
             target_sample_rate=config.collator.target_sample_rate,
             use_dB_normalization=config.collator.use_dB_normalization,
             target_dB=config.collator.target_dB,
             dither=config.collator.dither,
-            keep_transcription_text=config.collator.keep_transcription_text
-        )
+            keep_transcription_text=config.collator.keep_transcription_text)
         return speech_collator

-    def __init__(self, aug_file, mean_std_filepath,
-                 vocab_filepath, spm_model_prefix,
-                 random_seed=0,
-                 unit_type="char",
-                 specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-                 feat_dim=0,  # 'mfcc', 'fbank'
-                 delta_delta=False,  # 'mfcc', 'fbank'
-                 stride_ms=10.0,  # ms
-                 window_ms=20.0,  # ms
-                 n_fft=None,  # fft points
-                 max_freq=None,  # None for samplerate/2
-                 target_sample_rate=16000,  # target sample rate
-                 use_dB_normalization=True,
-                 target_dB=-20,
-                 dither=1.0,
-                 keep_transcription_text=True):
-        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one bach.
-
-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
-        """
+    def __init__(
+            self,
+            aug_file,
+            mean_std_filepath,
+            vocab_filepath,
+            spm_model_prefix,
+            random_seed=0,
+            unit_type="char",
+            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            feat_dim=0,  # 'mfcc', 'fbank'
+            delta_delta=False,  # 'mfcc', 'fbank'
+            stride_ms=10.0,  # ms
+            window_ms=20.0,  # ms
+            n_fft=None,  # fft points
+            max_freq=None,  # None for samplerate/2
+            target_sample_rate=16000,  # target sample rate
+            use_dB_normalization=True,
+            target_dB=-20,
+            dither=1.0,
+            keep_transcription_text=True):
+        """SpeechCollator Collator
+
+        Args:
+            unit_type(str): token unit type, e.g. char, word, spm
+            vocab_filepath (str): vocab file path.
+            mean_std_filepath (str): mean and std file path, which suffix is *.npy
+            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
+            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
+            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+            window_ms (float, optional): window size in ms. Defaults to 20.0.
+            n_fft (int, optional): fft points for rfft. Defaults to None.
+            max_freq (int, optional): max cut freq. Defaults to None.
+            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
+            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
+            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
+            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
+            target_dB (int, optional): target dB. Defaults to -20.
+            random_seed (int, optional): for random generator. Defaults to 0.
+            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
+                if ``keep_transcription_text`` is False, text is token ids else is raw string.
+
+        Do augmentations
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
+        """
         self._keep_transcription_text = keep_transcription_text
         self._local_data = TarLocalData(tar2info={}, tar2object={})
         self._augmentation_pipeline = AugmentationPipeline(
-            augmentation_config=aug_file.read(),
-            random_seed=random_seed)
+            augmentation_config=aug_file.read(), random_seed=random_seed)

         self._normalizer = FeatureNormalizer(
             mean_std_filepath) if mean_std_filepath else None
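One detail worth calling out in `from_config`: an empty `collator.augmentation_config` falls back to an in-memory `'{}'` spec, which yields an empty augmentation pipeline. That is the mechanism the trainers rely on when they blank the field for the dev/test collators:

```python
import io

aug_spec = ""  # e.g. config.collator.augmentation_config after blanking
if aug_spec:
    aug_file = io.open(aug_spec, mode='r', encoding='utf8')
else:
    aug_file = io.StringIO(initial_value='{}', newline='')
print(aug_file.read())  # '{}' -> no augmentors configured
```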
@@ -203,34 +226,23 @@ class SpeechCollator():
             where transcription part could be token ids or text.
         :rtype: tuple of (2darray, list)
         """
-        start_time = time.time()
         if isinstance(audio_file, str) and audio_file.startswith('tar:'):
             speech_segment = SpeechSegment.from_file(
                 self._subfile_from_tar(audio_file), transcript)
         else:
             speech_segment = SpeechSegment.from_file(audio_file, transcript)
-        load_wav_time = time.time() - start_time
-        #logger.debug(f"load wav time: {load_wav_time}")

         # audio augment
-        start_time = time.time()
-        self._augmentation_pipeline.transform_audio(speech_segment, single)
-        audio_aug_time = time.time() - start_time
-        #logger.debug(f"audio augmentation time: {audio_aug_time}")
+        self._augmentation_pipeline.transform_audio(speech_segment)

-        start_time = time.time()
         specgram, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self._keep_transcription_text)
         if self._normalizer:
             specgram = self._normalizer.apply(specgram)
-        feature_time = time.time() - start_time
-        #logger.debug(f"audio & test feature time: {feature_time}")

         # specgram augment
-        start_time = time.time()
-        specgram = self._augmentation_pipeline.transform_feature(specgram, single)
-        feature_aug_time = time.time() - start_time
-        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
+        specgram = self._augmentation_pipeline.transform_feature(specgram)
         return specgram, transcript_part

     def __call__(self, batch):
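With the timing scaffolding removed, `process_utterance` reads as a fixed four-stage pipeline: load, waveform augmentation, featurize plus normalize, then feature augmentation. A hedged standalone sketch with stand-in stages (the real ones live in `deepspeech.frontend`):

```python
import numpy as np


def transform_audio(wav):     # waveform-level augmentation (speed, volume, ...)
    return wav


def featurize(wav):           # waveform -> spectrogram-like frames
    return np.abs(np.fft.rfft(wav.reshape(-1, 160), axis=-1))


def normalize(spec):          # stand-in for FeatureNormalizer.apply
    return (spec - spec.mean()) / (spec.std() + 1e-8)


def transform_feature(spec):  # feature-level augmentation (e.g. masking)
    return spec


wav = np.random.randn(16000)
specgram = transform_feature(normalize(featurize(transform_audio(wav))))
```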
@@ -288,18 +300,6 @@ class SpeechCollator():
         text_lens = np.array(text_lens).astype(np.int64)
         return utts, padded_audios, audio_lens, padded_texts, text_lens

-    # @property
-    # def text_feature(self):
-    #     return self._speech_featurizer.text_feature
-
-    # @property
-    # def stride_ms(self):
-    #     return self._speech_featurizer.stride_ms
-
-    ###########
     @property
     def manifest(self):
         return self._manifest
@@ -326,4 +326,4 @@ class SpeechCollator():

     @property
     def stride_ms(self):
         return self._speech_featurizer.stride_ms

@@ -11,20 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io
-import tarfile
-import time
-from collections import namedtuple
 from typing import Optional

-import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode

-from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
-from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
-from deepspeech.frontend.normalizer import FeatureNormalizer
-from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
@@ -40,22 +31,13 @@ class ManifestDataset(Dataset):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         default = CfgNode(
             dict(
-                train_manifest="",
-                dev_manifest="",
-                test_manifest="",
                 manifest="",
-                unit_type="char",
-                vocab_filepath="",
-                spm_model_prefix="",
-                mean_std_filepath="",
-                augmentation_config="",
                 max_input_len=27.0,
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
-                min_output_input_ratio=0.0,
-            ))
+                min_output_input_ratio=0.0, ))

         if config is not None:
             config.merge_from_other_cfg(default)
@@ -73,51 +55,19 @@ class ManifestDataset(Dataset):
         """
         assert 'manifest' in config.data
         assert config.data.manifest
-        assert 'keep_transcription_text' in config.collator
-
-        if isinstance(config.data.augmentation_config, (str, bytes)):
-            if config.data.augmentation_config:
-                aug_file = io.open(
-                    config.data.augmentation_config, mode='r', encoding='utf8')
-            else:
-                aug_file = io.StringIO(initial_value='{}', newline='')
-        else:
-            aug_file = config.data.augmentation_config
-            assert isinstance(aug_file, io.StringIO)

         dataset = cls(
             manifest_path=config.data.manifest,
-            unit_type=config.data.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
-            mean_std_filepath=config.data.mean_std_filepath,
-            spm_model_prefix=config.data.spm_model_prefix,
-            augmentation_config=aug_file.read(),
             max_input_len=config.data.max_input_len,
             min_input_len=config.data.min_input_len,
             max_output_len=config.data.max_output_len,
             min_output_len=config.data.min_output_len,
             max_output_input_ratio=config.data.max_output_input_ratio,
-            min_output_input_ratio=config.data.min_output_input_ratio,
-        )
+            min_output_input_ratio=config.data.min_output_input_ratio, )
         return dataset

-    def _read_vocab(self, vocab_filepath):
-        """Load vocabulary from file."""
-        vocab_lines = []
-        with open(vocab_filepath, 'r', encoding='utf-8') as file:
-            vocab_lines.extend(file.readlines())
-        vocab_list = [line[:-1] for line in vocab_lines]
-        return vocab_list
-
     def __init__(self,
                  manifest_path,
-                 unit_type,
-                 vocab_filepath,
-                 mean_std_filepath,
-                 spm_model_prefix=None,
-                 augmentation_config='{}',
                  max_input_len=float('inf'),
                  min_input_len=0.0,
                  max_output_len=float('inf'),
@@ -128,34 +78,16 @@ class ManifestDataset(Dataset):
         Args:
             manifest_path (str): manifest josn file path
-            unit_type(str): token unit type, e.g. char, word, spm
-            vocab_filepath (str): vocab file path.
-            mean_std_filepath (str): mean and std file path, which suffix is *.npy
-            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
-            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
             max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
             min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
             max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
             min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
             max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
             min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
-            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
-            window_ms (float, optional): window size in ms. Defaults to 20.0.
-            n_fft (int, optional): fft points for rfft. Defaults to None.
-            max_freq (int, optional): max cut freq. Defaults to None.
-            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
-            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
-            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
-            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
-            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
-            target_dB (int, optional): target dB. Defaults to -20.
-            random_seed (int, optional): for random generator. Defaults to 0.
-            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
         """
         super().__init__()
-        # self._rng = np.random.RandomState(random_seed)

         # read manifest
         self._manifest = read_manifest(
             manifest_path=manifest_path,
@@ -167,52 +99,6 @@ class ManifestDataset(Dataset):
             min_output_input_ratio=min_output_input_ratio)
         self._manifest.sort(key=lambda x: x["feat_shape"][0])

-        # self._vocab_list = self._read_vocab(vocab_filepath)
-
-    # @property
-    # def manifest(self):
-    #     return self._manifest
-
-    # @property
-    # def vocab_size(self):
-    #     """Return the vocabulary size.
-
-    #     Returns:
-    #         int: Vocabulary size.
-    #     """
-    #     return len(self._vocab_list)
-
-    # @property
-    # def vocab_list(self):
-    #     """Return the vocabulary in list.
-
-    #     Returns:
-    #         List[str]:
-    #     """
-    #     return self._vocab_list
-
-    # @property
-    # def vocab_dict(self):
-    #     """Return the vocabulary in dict.
-
-    #     Returns:
-    #         Dict[str, int]:
-    #     """
-    #     vocab_dict = dict(
-    #         [(token, idx) for (idx, token) in enumerate(self._vocab_list)])
-    #     return vocab_dict
-
-    # @property
-    # def feature_size(self):
-    #     """Return the audio feature size.
-
-    #     Returns:
-    #         int: audio feature size.
-    #     """
-    #     return self._manifest[0]["feat_shape"][-1]
-
     def __len__(self):
         return len(self._manifest)
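For orientation, each manifest entry records feature and token shapes, and the dataset filters on them before sorting by duration. A hedged sketch of that filtering (field names as used above; `token_shape` is assumed from context, and the real logic lives in `deepspeech.frontend.utility.read_manifest`):

```python
def keep(entry, max_input_len=27.0, min_input_len=0.0,
         min_output_input_ratio=0.0, max_output_input_ratio=float('inf')):
    duration = entry["feat_shape"][0]   # seconds for raw wav
    n_tokens = entry["token_shape"][0]  # output length in modeling units
    ratio = n_tokens / duration
    return (min_input_len <= duration <= max_input_len and
            min_output_input_ratio <= ratio <= max_output_input_ratio)


manifest = [
    {"feat": "a.wav", "feat_shape": [7.3, 161], "token_shape": [24, 29]},
    {"feat": "b.wav", "feat_shape": [31.0, 161], "token_shape": [80, 29]},
]
manifest = [e for e in manifest if keep(e)]      # drops the 31 s utterance
manifest.sort(key=lambda x: x["feat_shape"][0])  # SortaGrad-friendly order
```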

@@ -905,7 +905,6 @@ class U2InferModel(U2Model):
     def __init__(self, configs: dict):
         super().__init__(configs)
-
     def forward(self,
                 feats,
                 feats_lengths,

@@ -3,20 +3,18 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  mean_std_filepath: data/mean_std.json
-  vocab_filepath: data/vocab.txt
-  batch_size: 64  # one gpu
   min_input_len: 0.0
   max_input_len: 27.0  # second
   min_output_len: 0.0
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0

 collator:
+  mean_std_filepath: data/mean_std.json
+  unit_type: char
+  vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
@@ -32,6 +30,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 64  # one gpu

 model:
   num_conv_layers: 2
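After this move, everything that feeds batching (batch size, workers, shuffle policy, SortaGrad) lives under `collator:` alongside the feature settings. A quick yacs check of the new layout (path illustrative):

```python
from yacs.config import CfgNode

with open("conf/deepspeech2.yaml") as f:  # the file shown above
    config = CfgNode.load_cfg(f)
print(config.collator.batch_size)   # 64
print(config.collator.num_workers)  # 0
```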

@@ -9,6 +9,16 @@
 | conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
 | conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |

+## Chunk Conformer
+
+| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
+| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
+| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
+| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
+
 ## Transformer

 | Model | Config | Augmentation| Test set | Decode method | Loss | WER |

@@ -3,17 +3,20 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
-  batch_size: 32
   min_input_len: 0.5
   max_input_len: 20.0  # second
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0

+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/augmentation.json
+  batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank  #linear, mfcc, fbank
   feat_dim: 80
@@ -30,7 +33,7 @@ collator:
   keep_transcription_text: False
   sortagrad: True
   shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2

# network architecture
@@ -78,7 +81,7 @@ model:

 training:
-  n_epoch: 180
+  n_epoch: 240
   accum_grad: 4
   global_grad_clip: 5.0
   optim: adam

@@ -3,17 +3,20 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
-  batch_size: 64
   min_input_len: 0.5
   max_input_len: 20.0  # second
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0

+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/augmentation.json
+  batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank  #linear, mfcc, fbank
   feat_dim: 80
@@ -32,7 +35,6 @@ collator:
   shuffle_method: batch_shuffle
   num_workers: 2

-
 # network architecture
 model:
   cmvn_file: "data/mean_std.json"

@@ -2,22 +2,19 @@
 data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/vocab.txt
-  batch_size: 4
   min_input_len: 0.0
   max_input_len: 27.0
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0

 collator:
+  mean_std_filepath: data/mean_std.json
+  unit_type: char
+  vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
@@ -33,6 +30,10 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0
+  batch_size: 4

 model:
   num_conv_layers: 2
@@ -42,7 +43,7 @@ model:
   share_rnn_weights: True

 training:
-  n_epoch: 23
+  n_epoch: 24
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06

@@ -3,26 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
-  batch_size: 4
   min_input_len: 0.5  # second
   max_input_len: 20.0  # second
   min_output_len: 0.0  # tokens
   max_output_len: 400.0  # tokens
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  raw_wav: True  # use raw_wav or kaldi feature
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0  #2

 collator:
+  vocab_filepath: data/vocab.txt
+  mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
   random_seed: 0
-  spm_model_prefix:
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
   specgram_type: fbank
   feat_dim: 80
   delta_delta: False
@@ -35,6 +29,12 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  batch_size: 4
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 0  #2
+  raw_wav: True  # use raw_wav or kaldi feature

 # network architecture
 model:

@@ -0,0 +1,3 @@
build
dist
*.egg-info/

@@ -165,9 +165,13 @@ class STFT(torch.nn.Module):
         # self.kernel_cos = torch.nn.Parameter(self.kernel_cos, requires_grad=self.trainable)

         # Applying window functions to the Fourier kernels
-        window_mask = torch.tensor(window_mask)
-        wsin = kernel_sin * window_mask
-        wcos = kernel_cos * window_mask
+        if window:
+            window_mask = torch.tensor(window_mask)
+            wsin = kernel_sin * window_mask
+            wcos = kernel_cos * window_mask
+        else:
+            wsin = kernel_sin
+            wcos = kernel_cos

         if self.trainable==False:
             self.register_buffer('wsin', wsin)
@@ -179,7 +183,6 @@ class STFT(torch.nn.Module):
             self.register_parameter('wsin', wsin)
             self.register_parameter('wcos', wcos)

-        # Prepare the shape of window mask so that it can be used later in inverse
         self.register_buffer('window_mask', window_mask.unsqueeze(0).unsqueeze(-1))
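The change above skips the window multiplication when no window function is requested, which is equivalent to a rectangular window. A standalone numpy illustration of what windowing the Fourier kernels means (not nnAudio's code):

```python
import numpy as np

n_fft = 8
n_bin = n_fft // 2 + 1
k = np.arange(n_fft)
# One row of analysis kernels per frequency bin.
kernel_cos = np.cos(2 * np.pi * np.outer(np.arange(n_bin), k) / n_fft)
kernel_sin = np.sin(2 * np.pi * np.outer(np.arange(n_bin), k) / n_fft)

window = np.hanning(n_fft)
wcos = kernel_cos * window  # windowed analysis kernels
wsin = kernel_sin * window
# With window=None the kernels are used as-is, i.e. a rectangular window.
```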

@@ -2,29 +2,26 @@ import setuptools
 import codecs
 import os.path

-with open("README.md", "r") as fh:
-    long_description = fh.read()
-

 def read(rel_path):
     here = os.path.abspath(os.path.dirname(__file__))
     with codecs.open(os.path.join(here, rel_path), 'r') as fp:
         return fp.read()


 def get_version(rel_path):
     for line in read(rel_path).splitlines():
         if line.startswith('__version__'):
             delim = '"' if '"' in line else "'"
             return line.split(delim)[1]
     else:
         raise RuntimeError("Unable to find version string.")


 setuptools.setup(
     name="nnAudio",  # Replace with your own username
     version=get_version("nnAudio/__init__.py"),
     author="KinWaiCheuk",
     author_email="u3500684@connect.hku.hk",
     description="A fast GPU audio processing toolbox with 1D convolutional neural network",
-    long_description=long_description,
+    long_description='',
     long_description_content_type="text/markdown",
     url="https://github.com/KinWaiCheuk/nnAudio",
     packages=setuptools.find_packages(),
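`get_version` avoids importing the package at build time by scanning `__init__.py` for the version assignment. A tiny demo of the parsing it relies on (contents inlined, value illustrative):

```python
source = '__version__ = "0.2.0"\n'

for line in source.splitlines():
    if line.startswith('__version__'):
        delim = '"' if '"' in line else "'"
        print(line.split(delim)[1])  # -> 0.2.0
```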

@@ -0,0 +1,146 @@
from typing import Tuple

import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F


def frame(x: Tensor,
          num_samples: Tensor,
          win_length: int,
          hop_length: int,
          clip: bool = True) -> Tuple[Tensor, Tensor]:
    """Extract frames from audio.

    Parameters
    ----------
    x : Tensor
        Shape (N, T), batched waveform.
    num_samples : Tensor
        Shape (N, ), number of samples of each waveform.
    win_length : int
        Window length.
    hop_length : int
        Number of samples shifted between adjacent frames.
    clip : bool, optional
        Whether to clip audio that does not fit into the last frame, by
        default True

    Returns
    -------
    frames : Tensor
        Shape (N, win_length, T').
    num_frames : Tensor
        Shape (N, ) number of valid frames
    """
    assert hop_length <= win_length
    num_frames = (num_samples - win_length) // hop_length
    padding = (0, 0)
    if not clip:
        num_frames += 1
        # NOTE: pad hop_length - 1 to the right to ensure that there is at most
        # one frame dangling to the right edge
        padding = (0, hop_length - 1)

    # identity kernels: each output channel copies one sample of the window
    weight = paddle.eye(win_length).unsqueeze(1)

    frames = F.conv1d(x.unsqueeze(1),
                      weight,
                      padding=padding,
                      stride=(hop_length, ))
    return frames, num_frames


class STFT(nn.Layer):
    """A module for computing stft transformation in a differentiable way.

    Parameters
    ------------
    n_fft : int
        Number of samples in a frame.
    hop_length : int
        Number of samples shifted between adjacent frames.
    win_length : int
        Length of the window.
    clip : bool
        Whether to clip audio that does not fit into the last frame.
    """

    def __init__(self,
                 n_fft: int,
                 hop_length: int,
                 win_length: int,
                 window_type: str = None,
                 clip: bool = True):
        super().__init__()
        self.hop_length = hop_length
        self.win_length = win_length  # used by forward() below
        self.n_bin = 1 + n_fft // 2
        self.n_fft = n_fft
        self.clip = clip

        # calculate window
        if window_type is None:
            window = np.ones(win_length)
        elif window_type == "hann":
            window = np.hanning(win_length)
        elif window_type == "hamming":
            window = np.hamming(win_length)
        else:
            raise ValueError("Not supported yet!")

        if win_length < n_fft:
            window = np.pad(window, (0, n_fft - win_length))
        elif win_length > n_fft:
            window = window[:n_fft]

        # (n_bins, n_fft) complex
        kernel_size = min(n_fft, win_length)
        weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
        w_real = weight.real
        w_imag = weight.imag

        # (2 * n_bins, kernel_size)
        w = np.concatenate([w_real, w_imag], axis=0)
        w = w * window

        # (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
        w = np.expand_dims(w, 1)
        weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
        self.register_buffer("weight", weight)

    def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
        """Compute the stft transform.

        Parameters
        ------------
        x : Tensor [shape=(B, T)]
            The input waveform.
        num_samples : Tensor
            Number of samples of each waveform.

        Returns
        ------------
        D : Tensor
            Shape(N, T', n_bins, 2) Spectrogram.
        num_frames : Tensor
            Shape (N,) number of valid frames of each spectrogram
        """
        num_frames = (num_samples - self.win_length) // self.hop_length
        padding = (0, 0)
        if not self.clip:
            num_frames += 1
            padding = (0, self.hop_length - 1)

        batch_size, _ = paddle.shape(x)
        x = x.unsqueeze(-1)
        D = F.conv1d(x,
                     self.weight,
                     stride=(self.hop_length, ),
                     padding=padding,
                     data_format="NLC")
        D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
        return D, num_frames
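A shape-level smoke test of the module as listed (hedged: uses `win_length == n_fft`, since the window-padding branch mixes kernel widths and is untested here):

```python
import paddle

x = paddle.randn([2, 16000])                    # 1 s of audio at 16 kHz
num_samples = paddle.to_tensor([16000, 12800])

frames, n = frame(x, num_samples, win_length=512, hop_length=160)
print(frames.shape)  # [2, 512, 97]: one 512-sample slice per output step

stft = STFT(n_fft=512, hop_length=160, win_length=512, window_type="hann")
D, num_frames = stft(x, num_samples)
print(D.shape)       # [2, 97, 257, 2] per the docstring
print(num_frames)    # [96, 76]: fully-valid frames per utterance
```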

@@ -18,7 +18,13 @@ function clean() {
 }
 trap clean EXIT

+# ckpt_prefix dir
+if [ -d ${ckpt_prefix} ];then
+    cp -r ${ckpt_prefix} ${output}
+fi
+# ckpt_prefix.{json,...}
 cp ${ckpt_prefix}.* ${output}
+# model config, mean std, vocab
 cp ${model_config} ${mean_std} ${vocab} ${output}

 tar zcvf release.tar.gz ${output}
