Merge branch 'develop' into spec_aug2

pull/673/head
Authored by Blank 4 years ago; committed by GitHub.
commit 60ac4bc2d8

@@ -87,3 +87,9 @@ pull_request_rules:
actions:
label:
add: ["Docker"]
- name: "auto add label=Deployment"
conditions:
- files~=^speechnn/
actions:
label:
add: ["Deployment"]

File diff suppressed because it is too large.

@@ -11,94 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
from yacs.config import CfgNode
from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
_C = CN()
_C.data = CN(
dict(
train_manifest="",
dev_manifest="",
test_manifest="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
mean_std_filepath="",
augmentation_config="",
max_duration=float('inf'),
min_duration=0.0,
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delat_delta=False, # 'mfcc', 'fbank'
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
batch_size=32, # batch size
num_workers=0, # data loader workers
sortagrad=False, # sorted in first epoch when True
shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle'
))
_C = CfgNode()
_C.model = CN(
dict(
num_conv_layers=2, #Number of stacking convolution layers.
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
))
_C.data = ManifestDataset.params()
_C.collator =CN(
dict(
augmentation_config="",
random_seed=0,
mean_std_filepath="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0, # feature dither
keep_transcription_text=False
))
_C.collator = SpeechCollator.params()
DeepSpeech2Model.params(_C.model)
_C.model = DeepSpeech2Model.params()
_C.training = CN(
dict(
lr=5e-4, # learning rate
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=5.0, # the global norm clip
n_epoch=50, # train epochs
))
_C.training = DeepSpeech2Trainer.params()
_C.decoding = CN(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=500, # Beam search width.
batch_size=128, # decoding batch size
))
_C.decoding = DeepSpeech2Tester.params()
def get_cfg_defaults():
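This is the pattern the whole refactor follows: instead of one monolithic default config, each component publishes its defaults through a classmethod params(), and the experiment config composes those nodes. A minimal sketch of the mechanism, assuming only yacs CfgNode semantics (the component and field names here are illustrative, not from the repo):

from typing import Optional

from yacs.config import CfgNode


class SomeComponent:
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        # component-local defaults live next to the component
        default = CfgNode(dict(lr=5e-4, n_epoch=50))
        if config is not None:
            # merge the defaults into a caller-supplied node in place
            config.merge_from_other_cfg(default)
        return default


_C = CfgNode()
_C.training = SomeComponent.params()  # compose per-component defaults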

@@ -15,11 +15,13 @@
import time
from collections import defaultdict
from pathlib import Path
from typing import Optional
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from yacs.config import CfgNode
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
@@ -33,11 +35,26 @@ from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class DeepSpeech2Trainer(Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# training config
default = CfgNode(
dict(
lr=5e-4, # learning rate
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=5.0, # the global norm clip
n_epoch=50, # train epochs
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -55,7 +72,7 @@ class DeepSpeech2Trainer(Trainer):
'train_loss': float(loss),
}
msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.data.batch_size)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
@@ -143,44 +160,67 @@ class DeepSpeech2Trainer(Trainer):
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
dev_dataset = ManifestDataset.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.data.sortagrad,
shuffle_method=config.data.shuffle_method)
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.data.sortagrad,
shuffle_method=config.data.shuffle_method)
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
collate_fn = SpeechCollator.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=config.data.num_workers)
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers)
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn)
collate_fn=collate_fn_dev)
logger.info("Setup train/valid Dataloader!")
class DeepSpeech2Tester(DeepSpeech2Trainer):
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
# testing config
default = CfgNode(
dict(
alpha=2.5, # Coef of LM for beam search.
beta=0.3, # Coef of WC for beam search.
cutoff_prob=1.0, # Cutoff probability for pruning.
cutoff_top_n=40, # Cutoff number for pruning.
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
num_proc_bsearch=8, # # of CPUs for beam search.
beam_size=500, # Beam search width.
batch_size=128, # decoding batch size
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self, config, args):
super().__init__(config, args)
@@ -193,7 +233,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
trans.append(''.join([chr(i) for i in ids]))
return trans
def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout = None):
def compute_metrics(self,
utts,
audio,
audio_len,
texts,
texts_len,
fout=None):
cfg = self.config.decoding
errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -215,7 +261,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch)
for utt, target, result in zip(utts, target_transcripts, result_transcripts):
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
errors, len_ref = errors_func(target, result)
errors_sum += errors
len_refs += len_ref
@@ -245,7 +292,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
with open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts, texts_len, fout)
metrics = self.compute_metrics(utts, audio, audio_len, texts,
texts_len, fout)
errors_sum += metrics['errors_sum']
len_refs += metrics['len_refs']
num_ins += metrics['num_ins']
@@ -324,8 +372,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
# return raw text
config.data.manifest = config.data.test_manifest
config.data.keep_transcription_text = True
config.data.augmentation_config = ""
# filtering test examples yields fewer of them, but avoids any mismatch with training,
# and allows a large batch size to save time, so filter test examples now.
# config.data.min_input_len = 0.0 # second
@@ -337,6 +383,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
test_dataset = ManifestDataset.from_config(config)
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
# return text ord id
self.test_loader = DataLoader(
test_dataset,
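The ordering in setup_dataloader above matters: the train collator is built while augmentation_config is still set, and only afterwards is it cleared so the dev side featurizes without perturbation. A condensed sketch of that sequence, assuming SpeechCollator.from_config reads the config once at construction time (all names are from this diff):

collate_fn_train = SpeechCollator.from_config(config)  # augmentation enabled
config.collator.augmentation_config = ""               # then disable it
collate_fn_dev = SpeechCollator.from_config(config)    # dev: clean features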

@@ -15,6 +15,7 @@ from yacs.config import CfgNode
from deepspeech.exps.u2.model import U2Tester
from deepspeech.exps.u2.model import U2Trainer
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.u2 import U2Model
@@ -22,12 +23,7 @@ _C = CfgNode()
_C.data = ManifestDataset.params()
_C.collator =CfgNode(
dict(
augmentation_config="",
unit_type="char",
keep_transcription_text=False
))
_C.collator = SpeechCollator.params()
_C.model = U2Model.params()

@@ -78,7 +78,8 @@ class U2Trainer(Trainer):
start = time.time()
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
@@ -100,7 +101,7 @@ class U2Trainer(Trainer):
if (batch_index + 1) % train_conf.log_interval == 0:
msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.data.batch_size)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
@@ -121,7 +122,8 @@ class U2Trainer(Trainer):
total_loss = 0.0
for i, batch in enumerate(self.valid_loader):
utt, audio, audio_len, text, text_len = batch
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, text_len)
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
num_seen_utts += num_utts
@@ -211,51 +213,52 @@ class U2Trainer(Trainer):
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.data.keep_transcription_text = False
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
dev_dataset = ManifestDataset.from_config(config)
collate_fn = SpeechCollator.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.data.sortagrad,
shuffle_method=config.data.shuffle_method)
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.data.sortagrad,
shuffle_method=config.data.shuffle_method)
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=config.data.num_workers, )
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.data.batch_size,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn)
collate_fn=collate_fn_dev)
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
config.data.keep_transcription_text = True
config.data.augmentation_config = ""
# filtering test examples yields fewer of them, but avoids any mismatch with training,
# and allows a large batch size to save time, so filter test examples now.
# config.data.min_input_len = 0.0 # second
@@ -264,9 +267,11 @@ class U2Trainer(Trainer):
# config.data.max_output_len = float('inf') # tokens
# config.data.min_output_input_ratio = 0.00
# config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
@@ -369,7 +374,13 @@ class U2Tester(U2Trainer):
trans.append(''.join([chr(i) for i in ids]))
return trans
def compute_metrics(self, utts, audio, audio_len, texts, texts_len, fout=None):
def compute_metrics(self,
utts,
audio,
audio_len,
texts,
texts_len,
fout=None):
cfg = self.config.decoding
errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -396,7 +407,8 @@ class U2Tester(U2Trainer):
simulate_streaming=cfg.simulate_streaming)
decode_time = time.time() - start_time
for utt, target, result in zip(utts, target_transcripts, result_transcripts):
for utt, target, result in zip(utts, target_transcripts,
result_transcripts):
errors, len_ref = errors_func(target, result)
errors_sum += errors
len_refs += len_ref

@@ -151,13 +151,3 @@ class SpeechFeaturizer(object):
TextFeaturizer: object.
"""
return self._text_featurizer
# @property
# def text_feature(self):
# """Return the text feature object.
# Returns:
# TextFeaturizer: object.
# """
# return self._text_featurizer

@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
from collections import namedtuple
from typing import Optional
import numpy as np
from yacs.config import CfgNode
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
import io
import time
from yacs.config import CfgNode
from typing import Optional
from collections import namedtuple
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log
__all__ = ["SpeechCollator"]
@@ -34,6 +33,7 @@ logger = Log(__name__).getlog()
# namedtuple needs to be global for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
class SpeechCollator():
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
@@ -56,8 +56,7 @@ class SpeechCollator():
use_dB_normalization=True,
target_dB=-20,
dither=1.0, # feature dither
keep_transcription_text=False
))
keep_transcription_text=False))
if config is not None:
config.merge_from_other_cfg(default)
@@ -75,8 +74,8 @@ class SpeechCollator():
"""
assert 'augmentation_config' in config.collator
assert 'keep_transcription_text' in config.collator
assert 'mean_std_filepath' in config.data
assert 'vocab_filepath' in config.data
assert 'mean_std_filepath' in config.collator
assert 'vocab_filepath' in config.collator
assert 'specgram_type' in config.collator
assert 'n_fft' in config.collator
assert config.collator
@@ -84,7 +83,9 @@ class SpeechCollator():
if isinstance(config.collator.augmentation_config, (str, bytes)):
if config.collator.augmentation_config:
aug_file = io.open(
config.collator.augmentation_config, mode='r', encoding='utf8')
config.collator.augmentation_config,
mode='r',
encoding='utf8')
else:
aug_file = io.StringIO(initial_value='{}', newline='')
else:
@@ -92,56 +93,78 @@ class SpeechCollator():
assert isinstance(aug_file, io.StringIO)
speech_collator = cls(
aug_file=aug_file,
random_seed=0,
mean_std_filepath=config.data.mean_std_filepath,
unit_type=config.collator.unit_type,
vocab_filepath=config.data.vocab_filepath,
spm_model_prefix=config.collator.spm_model_prefix,
specgram_type=config.collator.specgram_type,
feat_dim=config.collator.feat_dim,
delta_delta=config.collator.delta_delta,
stride_ms=config.collator.stride_ms,
window_ms=config.collator.window_ms,
n_fft=config.collator.n_fft,
max_freq=config.collator.max_freq,
target_sample_rate=config.collator.target_sample_rate,
use_dB_normalization=config.collator.use_dB_normalization,
target_dB=config.collator.target_dB,
dither=config.collator.dither,
keep_transcription_text=config.collator.keep_transcription_text
)
aug_file=aug_file,
random_seed=0,
mean_std_filepath=config.collator.mean_std_filepath,
unit_type=config.collator.unit_type,
vocab_filepath=config.collator.vocab_filepath,
spm_model_prefix=config.collator.spm_model_prefix,
specgram_type=config.collator.specgram_type,
feat_dim=config.collator.feat_dim,
delta_delta=config.collator.delta_delta,
stride_ms=config.collator.stride_ms,
window_ms=config.collator.window_ms,
n_fft=config.collator.n_fft,
max_freq=config.collator.max_freq,
target_sample_rate=config.collator.target_sample_rate,
use_dB_normalization=config.collator.use_dB_normalization,
target_dB=config.collator.target_dB,
dither=config.collator.dither,
keep_transcription_text=config.collator.keep_transcription_text)
return speech_collator
def __init__(self, aug_file, mean_std_filepath,
vocab_filepath, spm_model_prefix,
random_seed=0,
unit_type="char",
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0,
keep_transcription_text=True):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one bach.
def __init__(
self,
aug_file,
mean_std_filepath,
vocab_filepath,
spm_model_prefix,
random_seed=0,
unit_type="char",
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0,
keep_transcription_text=True):
"""SpeechCollator Collator
if ``keep_transcription_text`` is False, text is token ids else is raw string.
Args:
unit_type(str): token unit type, e.g. char, word, spm
vocab_filepath (str): vocab file path.
mean_std_filepath (str): mean and std file path (suffix *.npy).
spm_model_prefix (str): spm model prefix; required if `unit_type` is spm.
augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
stride_ms (float, optional): stride size in ms. Defaults to 10.0.
window_ms (float, optional): window size in ms. Defaults to 20.0.
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, used by 'mfcc' and 'fbank'. Defaults to None.
delta_delta (bool, optional): whether to add delta-delta features, for 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): seed for the random generator. Defaults to 0.
keep_transcription_text (bool, optional): if True, keep transcripts as raw text (used outside training); if False, tokenize them into ids. Defaults to False.
"""
self._keep_transcription_text = keep_transcription_text
self._local_data = TarLocalData(tar2info={}, tar2object={})
self._augmentation_pipeline = AugmentationPipeline(
augmentation_config=aug_file.read(),
random_seed=random_seed)
augmentation_config=aug_file.read(), random_seed=random_seed)
self._normalizer = FeatureNormalizer(
mean_std_filepath) if mean_std_filepath else None
@@ -203,34 +226,23 @@ class SpeechCollator():
where transcription part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
start_time = time.time()
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
speech_segment = SpeechSegment.from_file(
self._subfile_from_tar(audio_file), transcript)
else:
speech_segment = SpeechSegment.from_file(audio_file, transcript)
load_wav_time = time.time() - start_time
#logger.debug(f"load wav time: {load_wav_time}")
# audio augment
start_time = time.time()
self._augmentation_pipeline.transform_audio(speech_segment, single)
audio_aug_time = time.time() - start_time
#logger.debug(f"audio augmentation time: {audio_aug_time}")
self._augmentation_pipeline.transform_audio(speech_segment)
start_time = time.time()
specgram, transcript_part = self._speech_featurizer.featurize(
speech_segment, self._keep_transcription_text)
if self._normalizer:
specgram = self._normalizer.apply(specgram)
feature_time = time.time() - start_time
#logger.debug(f"audio & test feature time: {feature_time}")
# specgram augment
start_time = time.time()
specgram = self._augmentation_pipeline.transform_feature(specgram, single)
feature_aug_time = time.time() - start_time
#logger.debug(f"audio feature augmentation time: {feature_aug_time}")
specgram = self._augmentation_pipeline.transform_feature(specgram)
return specgram, transcript_part
def __call__(self, batch):
@@ -288,18 +300,6 @@ class SpeechCollator():
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
# @property
# def text_feature(self):
# return self._speech_featurizer.text_feature
# @property
# def stride_ms(self):
# return self._speech_featurizer.stride_ms
###########
@property
def manifest(self):
return self._manifest
@@ -326,4 +326,4 @@ class SpeechCollator():
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
return self._speech_featurizer.stride_ms
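With the timing instrumentation removed, process_utterance above reduces to a five-stage pipeline. Restated as a compact sketch, with every name taken from this diff:

def process_utterance(self, audio_file, transcript):
    # 1. load waveform (possibly from inside a tar archive) with its transcript
    if isinstance(audio_file, str) and audio_file.startswith('tar:'):
        speech_segment = SpeechSegment.from_file(
            self._subfile_from_tar(audio_file), transcript)
    else:
        speech_segment = SpeechSegment.from_file(audio_file, transcript)
    # 2. waveform-domain augmentation
    self._augmentation_pipeline.transform_audio(speech_segment)
    # 3. featurize: spectrogram plus token ids (or raw text)
    specgram, transcript_part = self._speech_featurizer.featurize(
        speech_segment, self._keep_transcription_text)
    # 4. mean/std normalization, when a cmvn file was provided
    if self._normalizer:
        specgram = self._normalizer.apply(specgram)
    # 5. feature-domain augmentation (e.g. SpecAugment)
    specgram = self._augmentation_pipeline.transform_feature(specgram)
    return specgram, transcript_part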

@@ -11,20 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import tarfile
import time
from collections import namedtuple
from typing import Optional
import numpy as np
from paddle.io import Dataset
from yacs.config import CfgNode
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
@@ -40,22 +31,13 @@ class ManifestDataset(Dataset):
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
train_manifest="",
dev_manifest="",
test_manifest="",
manifest="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
mean_std_filepath="",
augmentation_config="",
max_input_len=27.0,
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0,
))
min_output_input_ratio=0.0, ))
if config is not None:
config.merge_from_other_cfg(default)
@@ -73,51 +55,19 @@ class ManifestDataset(Dataset):
"""
assert 'manifest' in config.data
assert config.data.manifest
assert 'keep_transcription_text' in config.collator
if isinstance(config.data.augmentation_config, (str, bytes)):
if config.data.augmentation_config:
aug_file = io.open(
config.data.augmentation_config, mode='r', encoding='utf8')
else:
aug_file = io.StringIO(initial_value='{}', newline='')
else:
aug_file = config.data.augmentation_config
assert isinstance(aug_file, io.StringIO)
dataset = cls(
manifest_path=config.data.manifest,
unit_type=config.data.unit_type,
vocab_filepath=config.data.vocab_filepath,
mean_std_filepath=config.data.mean_std_filepath,
spm_model_prefix=config.data.spm_model_prefix,
augmentation_config=aug_file.read(),
max_input_len=config.data.max_input_len,
min_input_len=config.data.min_input_len,
max_output_len=config.data.max_output_len,
min_output_len=config.data.min_output_len,
max_output_input_ratio=config.data.max_output_input_ratio,
min_output_input_ratio=config.data.min_output_input_ratio,
)
min_output_input_ratio=config.data.min_output_input_ratio, )
return dataset
def _read_vocab(self, vocab_filepath):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r', encoding='utf-8') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
return vocab_list
def __init__(self,
manifest_path,
unit_type,
vocab_filepath,
mean_std_filepath,
spm_model_prefix=None,
augmentation_config='{}',
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
@@ -128,34 +78,16 @@ class ManifestDataset(Dataset):
Args:
manifest_path (str): manifest json file path
unit_type(str): token unit type, e.g. char, word, spm
vocab_filepath (str): vocab file path.
mean_std_filepath (str): mean and std file path, which suffix is *.npy
spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
max_input_len (float, optional): maximum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum output seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum output seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output/input seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output/input seq length ratio. Defaults to 0.05.
stride_ms (float, optional): stride size in ms. Defaults to 10.0.
window_ms (float, optional): window size in ms. Defaults to 20.0.
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): for random generator. Defaults to 0.
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
"""
super().__init__()
# self._rng = np.random.RandomState(random_seed)
# read manifest
self._manifest = read_manifest(
manifest_path=manifest_path,
@@ -167,52 +99,6 @@ class ManifestDataset(Dataset):
min_output_input_ratio=min_output_input_ratio)
self._manifest.sort(key=lambda x: x["feat_shape"][0])
# self._vocab_list = self._read_vocab(vocab_filepath)
# @property
# def manifest(self):
# return self._manifest
# @property
# def vocab_size(self):
# """Return the vocabulary size.
# Returns:
# int: Vocabulary size.
# """
# return len(self._vocab_list)
# @property
# def vocab_list(self):
# """Return the vocabulary in list.
# Returns:
# List[str]:
# """
# return self._vocab_list
# @property
# def vocab_dict(self):
# """Return the vocabulary in dict.
# Returns:
# Dict[str, int]:
# """
# vocab_dict = dict(
# [(token, idx) for (idx, token) in enumerate(self._vocab_list)])
# return vocab_dict
# @property
# def feature_size(self):
# """Return the audio feature size.
# Returns:
# int: audio feature size.
# """
# return self._manifest[0]["feat_shape"][-1]
def __len__(self):
return len(self._manifest)
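For orientation, the min/max filters in params() above act per manifest entry inside read_manifest. A hedged sketch of the predicate (the feat_shape key appears in this diff; the output-length field is an assumption for illustration):

def _keep(entry):
    duration = entry["feat_shape"][0]  # input length: seconds or frames
    out_len = len(entry["text"])       # hypothetical output-length field
    return (min_input_len <= duration <= max_input_len and
            min_output_len <= out_len <= max_output_len and
            min_output_input_ratio <= out_len / duration <= max_output_input_ratio)

entries = [e for e in entries if _keep(e)]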

@@ -905,7 +905,6 @@ class U2InferModel(U2Model):
def __init__(self, configs: dict):
super().__init__(configs)
def forward(self,
feats,
feats_lengths,

@@ -3,20 +3,18 @@ data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
batch_size: 64 # one gpu
min_input_len: 0.0
max_input_len: 27.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
@@ -32,6 +30,10 @@ collator:
target_dB: -20
dither: 1.0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
batch_size: 64 # one gpu
model:
num_conv_layers: 2

@@ -9,6 +9,16 @@
| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
## Chunk Conformer
| Model | Config | Augmentation | Test set | Decode method | Chunk | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
## Transformer
| Model | Config | Augmentation | Test set | Decode method | Loss | WER |

@@ -3,17 +3,20 @@ data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
batch_size: 32
min_input_len: 0.5
max_input_len: 20.0 # second
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
@@ -30,7 +33,7 @@ data:
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
num_workers: 2
# network architecture
@@ -78,7 +81,7 @@ model:
training:
n_epoch: 180
n_epoch: 240
accum_grad: 4
global_grad_clip: 5.0
optim: adam

@@ -3,17 +3,20 @@ data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
batch_size: 64
min_input_len: 0.5
max_input_len: 20.0 # second
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
@@ -32,7 +35,6 @@ data:
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"

@@ -2,22 +2,19 @@
data:
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
batch_size: 4
test_manifest: data/manifest.tiny
min_input_len: 0.0
max_input_len: 27.0
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
@@ -33,6 +30,10 @@ collator:
target_dB: -20
dither: 1.0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
batch_size: 4
model:
num_conv_layers: 2
@@ -42,7 +43,7 @@ model:
share_rnn_weights: True
training:
n_epoch: 23
n_epoch: 24
lr: 1e-5
lr_decay: 1.0
weight_decay: 1e-06

@@ -3,26 +3,20 @@ data:
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
mean_std_filepath: ""
batch_size: 4
min_input_len: 0.5 # second
max_input_len: 20.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0 #2
collator:
vocab_filepath: data/vocab.txt
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
specgram_type: fbank
feat_dim: 80
delta_delta: False
@@ -35,6 +29,12 @@ collator:
target_dB: -20
dither: 1.0
keep_transcription_text: False
batch_size: 4
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0 #2
raw_wav: True # use raw_wav or kaldi feature
# network architecture
model:

@@ -0,0 +1,3 @@
build
dist
*.egg-info/

@@ -165,9 +165,13 @@ class STFT(torch.nn.Module):
# self.kernel_cos = torch.nn.Parameter(self.kernel_cos, requires_grad=self.trainable)
# Applying window functions to the Fourier kernels
window_mask = torch.tensor(window_mask)
wsin = kernel_sin * window_mask
wcos = kernel_cos * window_mask
if window:
window_mask = torch.tensor(window_mask)
wsin = kernel_sin * window_mask
wcos = kernel_cos * window_mask
else:
wsin = kernel_sin
wcos = kernel_cos
if self.trainable==False:
self.register_buffer('wsin', wsin)
@@ -179,7 +183,6 @@ class STFT(torch.nn.Module):
self.register_parameter('wsin', wsin)
self.register_parameter('wcos', wcos)
# Prepare the shape of window mask so that it can be used later in inverse
self.register_buffer('window_mask', window_mask.unsqueeze(0).unsqueeze(-1))

@@ -2,29 +2,26 @@ import setuptools
import codecs
import os.path
with open("README.md", "r") as fh:
long_description = fh.read()
def read(rel_path):
here = os.path.abspath(os.path.dirname(__file__))
with codecs.open(os.path.join(here, rel_path), 'r') as fp:
return fp.read()
return fp.read()
def get_version(rel_path):
for line in read(rel_path).splitlines():
if line.startswith('__version__'):
delim = '"' if '"' in line else "'"
return line.split(delim)[1]
else:
raise RuntimeError("Unable to find version string.")
raise RuntimeError("Unable to find version string.")
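# The dedent above is the substantive fix: previously `raise` was the
# else-branch of the `if`, so the first line that did not start with
# '__version__' raised immediately. With `raise` after the loop, it only
# fires when no line matched:
#
#     for line in read(rel_path).splitlines():
#         if line.startswith('__version__'):
#             delim = '"' if '"' in line else "'"
#             return line.split(delim)[1]
#     raise RuntimeError("Unable to find version string.")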
setuptools.setup(
name="nnAudio", # Replace with your own username
version=get_version("nnAudio/__init__.py"),
author="KinWaiCheuk",
author_email="u3500684@connect.hku.hk",
description="A fast GPU audio processing toolbox with 1D convolutional neural network",
long_description=long_description,
long_description='',
long_description_content_type="text/markdown",
url="https://github.com/KinWaiCheuk/nnAudio",
packages=setuptools.find_packages(),

@@ -0,0 +1,146 @@
from typing import Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
def frame(x: Tensor,
num_samples: Tensor,
win_length: int,
hop_length: int,
clip: bool = True) -> Tuple[Tensor, Tensor]:
"""Extract frames from audio.
Parameters
----------
x : Tensor
Shape (N, T), batched waveform.
num_samples : Tensor
Shape (N, ), number of samples of each waveform.
win_length : int
Window length.
hop_length : int
Number of samples shifted between adjacent frames.
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
default True
Returns
-------
frames : Tensor
Shape (N, T', win_length).
num_frames : Tensor
Shape (N, ) number of valid frames
"""
assert hop_length <= win_length
num_frames = 1 + (num_samples - win_length) // hop_length # count of full frames
padding = (0, 0)
if not clip:
num_frames += 1
# NOTE: pad hop_length - 1 to the right to ensure that there is at most
# one frame dangling at the right edge
padding = (0, hop_length - 1)
weight = paddle.eye(win_length).unsqueeze(1)
frames = F.conv1d(x.unsqueeze(1),
weight,
padding=padding,
stride=(hop_length, ))
return frames, num_frames
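# A hypothetical usage sketch (the values below are not part of this file):
# 25 ms windows with a 10 ms hop over 16 kHz waveforms.
#
#     x = paddle.randn([2, 16000])
#     num_samples = paddle.to_tensor([16000, 12000])
#     frames, num_frames = frame(x, num_samples, win_length=400, hop_length=160)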
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
clip : bool
Whether to clip audio that does not fit into the last frame.
"""
def __init__(self,
n_fft: int,
hop_length: int,
win_length: int,
window_type: str = None,
clip: bool = True):
super().__init__()
self.hop_length = hop_length
self.win_length = win_length # forward() needs this for the frame count
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
self.clip = clip
# calculate window
if window_type is None:
window = np.ones(win_length)
elif window_type == "hann":
window = np.hanning(win_length)
elif window_type == "hamming":
window = np.hamming(win_length)
else:
raise ValueError("Not supported yet!")
if win_length < n_fft:
window = np.pad(window, (0, n_fft - win_length)) # window is still a numpy array here
elif win_length > n_fft:
window = window[:n_fft]
# (n_bins, n_fft) complex
kernel_size = min(n_fft, win_length)
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
w_real = weight.real
w_imag = weight.imag
# (2 * n_bins, kernel_size)
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
w = np.expand_dims(w, 1)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
num_samples : Tensor
Number of samples of each waveform.
Returns
------------
D : Tensor
Shape(N, T', n_bins, 2) Spectrogram.
num_frames: Tensor
Shape (N,) number of samples of each spectrogram
"""
num_frames = 1 + (num_samples - self.win_length) // self.hop_length
padding = (0, 0)
if not self.clip:
num_frames += 1
padding = (0, self.hop_length - 1)
batch_size = paddle.shape(x)[0] # x: (B, T)
x = x.unsqueeze(-1) # (B, T, 1), channel-last for data_format="NLC"
D = F.conv1d(x,
self.weight, # input first, weight second
stride=(self.hop_length, ),
padding=padding,
data_format="NLC")
D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
return D, num_frames
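A short usage sketch for the module above (illustrative values; shapes follow the docstrings in this file, with n_bin = 1 + n_fft // 2 = 257):

import paddle

stft = STFT(n_fft=512, hop_length=160, win_length=400, window_type="hann")
x = paddle.randn([4, 16000])                          # (B, T) waveforms
num_samples = paddle.full([4], 16000, dtype="int64")  # valid samples per item
D, num_frames = stft(x, num_samples)                  # D: (4, T', 257, 2)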

@@ -18,7 +18,13 @@ function clean() {
}
trap clean EXIT
cp ${ckpt_prefix}.* ${output}
# ckpt_prefix dir
if [ -d ${ckpt_prefix} ];then
cp -r ${ckpt_prefix} ${output}
fi
# ckpt_prefix.{json,...}
cp ${ckpt_prefix}.* ${output}
# model config, mean std, vocab
cp ${model_config} ${mean_std} ${vocab} ${output}
tar zcvf release.tar.gz ${output}
