diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index b781c4a8e..d52b0dca7 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -14,3 +14,9 @@ import _locale _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) + + + + + + diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py index a91958105..83be8e32e 100644 --- a/paddlespeech/audio/__init__.py +++ b/paddlespeech/audio/__init__.py @@ -14,12 +14,12 @@ from . import compliance from . import datasets from . import features +from . import text +from . import transform +from . import streamdata from . import functional from . import io from . import metric from . import sox_effects -from . import streamdata -from . import text -from . import transform from .backends import load from .backends import save diff --git a/paddlespeech/audio/streamdata/__init__.py b/paddlespeech/audio/streamdata/__init__.py index 47a2e79b3..753fcc11b 100644 --- a/paddlespeech/audio/streamdata/__init__.py +++ b/paddlespeech/audio/streamdata/__init__.py @@ -4,66 +4,67 @@ # Modified from https://github.com/webdataset/webdataset # # flake8: noqa -from .cache import cached_tarfile_samples -from .cache import cached_tarfile_to_samples -from .cache import lru_cleanup -from .cache import pipe_cleaner -from .compat import FluidWrapper -from .compat import WebDataset -from .compat import WebLoader -from .extradatasets import MockDataset -from .extradatasets import with_epoch -from .extradatasets import with_length -from .filters import associate -from .filters import audio_cmvn -from .filters import audio_compute_fbank -from .filters import audio_data_filter -from .filters import audio_padding -from .filters import audio_resample -from .filters import audio_spec_aug -from .filters import audio_tokenize -from .filters import batched -from .filters import decode -from .filters import detshuffle -from .filters import extract_keys -from .filters import getfirst -from .filters import info -from .filters import map -from .filters import map_dict -from .filters import map_tuple -from .filters import pipelinefilter -from .filters import placeholder -from .filters import rename -from .filters import rename_keys -from .filters import select -from .filters import shuffle -from .filters import slice -from .filters import sort -from .filters import to_tuple -from .filters import transform_with -from .filters import unbatched -from .filters import xdecode -from .handlers import ignore_and_continue -from .handlers import ignore_and_stop -from .handlers import reraise_exception -from .handlers import warn_and_continue -from .handlers import warn_and_stop -from .mix import RandomMix -from .mix import RoundRobin + +from .cache import ( + cached_tarfile_samples, + cached_tarfile_to_samples, + lru_cleanup, + pipe_cleaner, +) +from .compat import WebDataset, WebLoader, FluidWrapper +from .extradatasets import MockDataset, with_epoch, with_length +from .filters import ( + associate, + batched, + decode, + detshuffle, + extract_keys, + getfirst, + info, + map, + map_dict, + map_tuple, + pipelinefilter, + rename, + rename_keys, + audio_resample, + select, + shuffle, + slice, + to_tuple, + transform_with, + unbatched, + xdecode, + audio_data_filter, + audio_tokenize, + audio_resample, + audio_compute_fbank, + audio_spec_aug, + sort, + audio_padding, + audio_cmvn, + placeholder, +) +from .handlers import ( + ignore_and_continue, + ignore_and_stop, + reraise_exception, + warn_and_continue, + warn_and_stop, +) from .pipeline import 
DataPipeline -from .shardlists import MultiShardSample -from .shardlists import non_empty -from .shardlists import resampled -from .shardlists import ResampledShards -from .shardlists import shardspec -from .shardlists import SimpleShardList -from .shardlists import single_node_only -from .shardlists import split_by_node -from .shardlists import split_by_worker -from .tariterators import tarfile_samples -from .tariterators import tarfile_to_samples -from .utils import PipelineStage -from .utils import repeatedly -from .writer import numpy_dumps -from .writer import ShardWriter -from .writer import TarWriter +from .shardlists import ( + MultiShardSample, + ResampledShards, + SimpleShardList, + non_empty, + resampled, + shardspec, + single_node_only, + split_by_node, + split_by_worker, +) +from .tariterators import tarfile_samples, tarfile_to_samples +from .utils import PipelineStage, repeatedly +from .writer import ShardWriter, TarWriter, numpy_dumps +from .mix import RandomMix, RoundRobin diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py index d7f7937bd..ca0e2ea2f 100644 --- a/paddlespeech/audio/streamdata/autodecode.py +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -5,19 +5,18 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Automatically decode webdataset samples.""" -import io -import json -import os -import pickle -import re -import tempfile + +import io, json, os, pickle, re, tempfile from functools import partial import numpy as np + """Extensions passed on to the image decoder.""" image_extensions = "jpg jpeg png ppm pgm pbm pnm".split() + ################################################################ # handle basic datatypes ################################################################ @@ -129,7 +128,7 @@ def call_extension_handler(key, data, f, extensions): target = target.split(".") if len(target) > len(extension): continue - if extension[-len(target):] == target: + if extension[-len(target) :] == target: return f(data) return None @@ -269,6 +268,7 @@ def imagehandler(imagespec, extensions=image_extensions): ################################################################ # torch video ################################################################ + ''' def torch_video(key, data): """Decode video using the torchvideo library. 
@@ -289,6 +289,7 @@ def torch_video(key, data): return torchvision.io.read_video(fname, pts_unit="sec") ''' + ################################################################ # paddlespeech.audio ################################################################ @@ -358,6 +359,7 @@ def gzfilter(key, data): # decode entire training amples ################################################################ + default_pre_handlers = [gzfilter] default_post_handlers = [basichandlers] @@ -385,8 +387,7 @@ class Decoder: pre = default_pre_handlers if post is None: post = default_post_handlers - assert all(callable(h) - for h in handlers), f"one of {handlers} not callable" + assert all(callable(h) for h in handlers), f"one of {handlers} not callable" assert all(callable(h) for h in pre), f"one of {pre} not callable" assert all(callable(h) for h in post), f"one of {post} not callable" self.handlers = pre + handlers + post diff --git a/paddlespeech/audio/streamdata/cache.py b/paddlespeech/audio/streamdata/cache.py index faa196398..e7bbffa1b 100644 --- a/paddlespeech/audio/streamdata/cache.py +++ b/paddlespeech/audio/streamdata/cache.py @@ -2,10 +2,7 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset -import os -import random -import re -import sys +import itertools, os, random, re, sys from urllib.parse import urlparse from . import filters @@ -43,7 +40,7 @@ def lru_cleanup(cache_dir, cache_size, keyfn=os.path.getctime, verbose=False): os.remove(fname) -def download(url, dest, chunk_size=1024**2, verbose=False): +def download(url, dest, chunk_size=1024 ** 2, verbose=False): """Download a file from `url` to `dest`.""" temp = dest + f".temp{os.getpid()}" with gopen.gopen(url) as stream: @@ -68,11 +65,12 @@ def pipe_cleaner(spec): def get_file_cached( - spec, - cache_size=-1, - cache_dir=None, - url_to_name=pipe_cleaner, - verbose=False, ): + spec, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + verbose=False, +): if cache_size == -1: cache_size = default_cache_size if cache_dir is None: @@ -109,14 +107,15 @@ verbose_cache = int(os.environ.get("WDS_VERBOSE_CACHE", "0")) def cached_url_opener( - data, - handler=reraise_exception, - cache_size=-1, - cache_dir=None, - url_to_name=pipe_cleaner, - validator=check_tar_format, - verbose=False, - always=False, ): + data, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + validator=check_tar_format, + verbose=False, + always=False, +): """Given a stream of url names (packaged in `dict(url=url)`), yield opened streams.""" verbose = verbose or verbose_cache for sample in data: @@ -133,7 +132,8 @@ def cached_url_opener( cache_size=cache_size, cache_dir=cache_dir, url_to_name=url_to_name, - verbose=verbose, ) + verbose=verbose, + ) if verbose: print("# opening %s" % dest, file=sys.stderr) assert os.path.exists(dest) @@ -143,8 +143,9 @@ def cached_url_opener( data = f.read(200) os.remove(dest) raise ValueError( - "%s (%s) is not a tar archive, but a %s, contains %s" % - (dest, url, ftype, repr(data))) + "%s (%s) is not a tar archive, but a %s, contains %s" + % (dest, url, ftype, repr(data)) + ) try: stream = open(dest, "rb") sample.update(stream=stream) @@ -157,7 +158,7 @@ def cached_url_opener( continue raise exn except Exception as exn: - exn.args = exn.args + (url, ) + exn.args = exn.args + (url,) if handler(exn): continue else: @@ -165,13 +166,14 @@ def cached_url_opener( def 
cached_tarfile_samples( - src, - handler=reraise_exception, - cache_size=-1, - cache_dir=None, - verbose=False, - url_to_name=pipe_cleaner, - always=False, ): + src, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + verbose=False, + url_to_name=pipe_cleaner, + always=False, +): streams = cached_url_opener( src, handler=handler, @@ -179,7 +181,8 @@ def cached_tarfile_samples( cache_dir=cache_dir, verbose=verbose, url_to_name=url_to_name, - always=always, ) + always=always, + ) samples = tar_file_and_group_expander(streams, handler=handler) return samples diff --git a/paddlespeech/audio/streamdata/compat.py b/paddlespeech/audio/streamdata/compat.py index 9012eeb10..deda53384 100644 --- a/paddlespeech/audio/streamdata/compat.py +++ b/paddlespeech/audio/streamdata/compat.py @@ -2,17 +2,17 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset -import yaml +from dataclasses import dataclass +from itertools import islice +from typing import List + +import braceexpand, yaml from . import autodecode -from . import cache -from . import filters -from . import shardlists -from . import tariterators +from . import cache, filters, shardlists, tariterators from .filters import reraise_exception -from .paddle_utils import DataLoader -from .paddle_utils import IterableDataset from .pipeline import DataPipeline +from .paddle_utils import DataLoader, IterableDataset class FluidInterface: @@ -26,8 +26,7 @@ class FluidInterface: return self.compose(filters.unbatched()) def listed(self, batchsize, partial=True): - return self.compose( - filters.batched(), batchsize=batchsize, collation_fn=None) + return self.compose(filters.batched(), batchsize=batchsize, collation_fn=None) def unlisted(self): return self.compose(filters.unlisted()) @@ -44,19 +43,9 @@ class FluidInterface: def map(self, f, handler=reraise_exception): return self.compose(filters.map(f, handler=handler)) - def decode(self, - *args, - pre=None, - post=None, - only=None, - partial=False, - handler=reraise_exception): - handlers = [ - autodecode.ImageHandler(x) if isinstance(x, str) else x - for x in args - ] - decoder = autodecode.Decoder( - handlers, pre=pre, post=post, only=only, partial=partial) + def decode(self, *args, pre=None, post=None, only=None, partial=False, handler=reraise_exception): + handlers = [autodecode.ImageHandler(x) if isinstance(x, str) else x for x in args] + decoder = autodecode.Decoder(handlers, pre=pre, post=post, only=only, partial=partial) return self.map(decoder, handler=handler) def map_dict(self, handler=reraise_exception, **kw): @@ -91,12 +80,12 @@ class FluidInterface: def audio_data_filter(self, *args, **kw): return self.compose(filters.audio_data_filter(*args, **kw)) - + def audio_tokenize(self, *args, **kw): return self.compose(filters.audio_tokenize(*args, **kw)) def resample(self, *args, **kw): - return self.compose(filters.resample(*args, **kw)) + return self.compose(filters.resample(*args, **kw)) def audio_compute_fbank(self, *args, **kw): return self.compose(filters.audio_compute_fbank(*args, **kw)) @@ -113,28 +102,27 @@ class FluidInterface: def audio_cmvn(self, cmvn_file): return self.compose(filters.audio_cmvn(cmvn_file)) - class WebDataset(DataPipeline, FluidInterface): """Small fluid-interface wrapper for DataPipeline.""" def __init__( - self, - urls, - handler=reraise_exception, - resampled=False, - repeat=False, - shardshuffle=None, - cache_size=0, - 
cache_dir=None, - detshuffle=False, - nodesplitter=shardlists.single_node_only, - verbose=False, ): + self, + urls, + handler=reraise_exception, + resampled=False, + repeat=False, + shardshuffle=None, + cache_size=0, + cache_dir=None, + detshuffle=False, + nodesplitter=shardlists.single_node_only, + verbose=False, + ): super().__init__() if isinstance(urls, IterableDataset): assert not resampled self.append(urls) - elif isinstance(urls, str) and (urls.endswith(".yaml") or - urls.endswith(".yml")): + elif isinstance(urls, str) and (urls.endswith(".yaml") or urls.endswith(".yml")): with (open(urls)) as stream: spec = yaml.safe_load(stream) assert "datasets" in spec @@ -164,7 +152,9 @@ class WebDataset(DataPipeline, FluidInterface): handler=handler, verbose=verbose, cache_size=cache_size, - cache_dir=cache_dir, )) + cache_dir=cache_dir, + ) + ) class FluidWrapper(DataPipeline, FluidInterface): diff --git a/paddlespeech/audio/streamdata/extradatasets.py b/paddlespeech/audio/streamdata/extradatasets.py index 76361c24a..e6d617724 100644 --- a/paddlespeech/audio/streamdata/extradatasets.py +++ b/paddlespeech/audio/streamdata/extradatasets.py @@ -5,10 +5,20 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + + """Train PyTorch models directly from POSIX tar archive. Code works locally or over HTTP connections. """ + +import itertools as itt +import os +import random +import sys + +import braceexpand + from . import utils from .paddle_utils import IterableDataset from .utils import PipelineStage @@ -53,7 +63,8 @@ class repeatedly(IterableDataset, PipelineStage): return utils.repeatedly( source, nepochs=self.nepochs, - nbatches=self.nbatches, ) + nbatches=self.nbatches, + ) class with_epoch(IterableDataset): diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py index 68d6830bb..82b9c6bab 100644 --- a/paddlespeech/audio/streamdata/filters.py +++ b/paddlespeech/audio/streamdata/filters.py @@ -3,6 +3,7 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + # Modified from https://github.com/webdataset/webdataset # Modified from wenet(https://github.com/wenet-e2e/wenet) """A collection of iterators for data transformations. @@ -11,29 +12,28 @@ These functions are plain iterator functions. You can find curried versions in webdataset.filters, and you can find IterableDataset wrappers in webdataset.processing. """ + import io -import itertools -import os -import random -import re -import sys -import time from fnmatch import fnmatch -from functools import reduce +import re +import itertools, os, random, sys, time +from functools import reduce, wraps -import paddle +import numpy as np from . import autodecode -from . import utils +from . import utils +from .paddle_utils import PaddleTensor +from .utils import PipelineStage + from .. import backends from ..compliance import kaldi +import paddle from ..transform.cmvn import GlobalCMVN -from ..transform.spec_augment import freq_mask -from ..transform.spec_augment import time_mask -from ..transform.spec_augment import time_warp from ..utils.tensor_utils import pad_sequence -from .utils import PipelineStage - +from ..transform.spec_augment import time_warp +from ..transform.spec_augment import time_mask +from ..transform.spec_augment import freq_mask class FilterFunction(object): """Helper class for currying pipeline stages. 
@@ -159,12 +159,10 @@ def transform_with(sample, transformers): result[i] = f(sample[i]) return result - ### # Iterators ### - def _info(data, fmt=None, n=3, every=-1, width=50, stream=sys.stderr, name=""): """Print information about the samples that are passing through. @@ -280,16 +278,10 @@ def _log_keys(data, logfile=None): log_keys = pipelinefilter(_log_keys) -def _minedecode(x): - if isinstance(x, str): - return autodecode.imagehandler(x) - else: - return x - - def _decode(data, *args, handler=reraise_exception, **kw): """Decode data based on the decoding functions given as arguments.""" - decoder = _minedecode + + decoder = lambda x: autodecode.imagehandler(x) if isinstance(x, str) else x handlers = [decoder(x) for x in args] f = autodecode.Decoder(handlers, **kw) @@ -333,24 +325,15 @@ def _rename(data, handler=reraise_exception, keep=True, **kw): for sample in data: try: if not keep: - yield { - k: getfirst(sample, v, missing_is_error=True) - for k, v in kw.items() - } + yield {k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()} else: def listify(v): return v.split(";") if isinstance(v, str) else v to_be_replaced = {x for v in kw.values() for x in listify(v)} - result = { - k: v - for k, v in sample.items() if k not in to_be_replaced - } - result.update({ - k: getfirst(sample, v, missing_is_error=True) - for k, v in kw.items() - }) + result = {k: v for k, v in sample.items() if k not in to_be_replaced} + result.update({k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()}) yield result except Exception as exn: if handler(exn): @@ -398,11 +381,7 @@ def _map_dict(data, handler=reraise_exception, **kw): map_dict = pipelinefilter(_map_dict) -def _to_tuple(data, - *args, - handler=reraise_exception, - missing_is_error=True, - none_is_error=None): +def _to_tuple(data, *args, handler=reraise_exception, missing_is_error=True, none_is_error=None): """Convert dict samples to tuples.""" if none_is_error is None: none_is_error = missing_is_error @@ -411,10 +390,7 @@ def _to_tuple(data, for sample in data: try: - result = tuple([ - getfirst(sample, f, missing_is_error=missing_is_error) - for f in args - ]) + result = tuple([getfirst(sample, f, missing_is_error=missing_is_error) for f in args]) if none_is_error and any(x is None for x in result): raise ValueError(f"to_tuple {args} got {sample.keys()}") yield result @@ -487,28 +463,19 @@ rsample = pipelinefilter(_rsample) slice = pipelinefilter(itertools.islice) -def _extract_keys(source, - *patterns, - duplicate_is_error=True, - ignore_missing=False): +def _extract_keys(source, *patterns, duplicate_is_error=True, ignore_missing=False): for sample in source: result = [] for pattern in patterns: - pattern = pattern.split(";") if isinstance(pattern, - str) else pattern - matches = [ - x for x in sample.keys() - if any(fnmatch("." + x, p) for p in pattern) - ] + pattern = pattern.split(";") if isinstance(pattern, str) else pattern + matches = [x for x in sample.keys() if any(fnmatch("." 
+ x, p) for p in pattern)] if len(matches) == 0: if ignore_missing: continue else: - raise ValueError( - f"Cannot find {pattern} in sample keys {sample.keys()}.") + raise ValueError(f"Cannot find {pattern} in sample keys {sample.keys()}.") if len(matches) > 1 and duplicate_is_error: - raise ValueError( - f"Multiple sample keys {sample.keys()} match {pattern}.") + raise ValueError(f"Multiple sample keys {sample.keys()} match {pattern}.") value = sample[matches[0]] result.append(value) yield tuple(result) @@ -517,12 +484,7 @@ def _extract_keys(source, extract_keys = pipelinefilter(_extract_keys) -def _rename_keys(source, - *args, - keep_unselected=False, - must_match=True, - duplicate_is_error=True, - **kw): +def _rename_keys(source, *args, keep_unselected=False, must_match=True, duplicate_is_error=True, **kw): renamings = [(pattern, output) for output, pattern in args] renamings += [(pattern, output) for output, pattern in kw.items()] for sample in source: @@ -542,15 +504,11 @@ def _rename_keys(source, continue if new_name in new_sample: if duplicate_is_error: - raise ValueError( - f"Duplicate value in sample {sample.keys()} after rename." - ) + raise ValueError(f"Duplicate value in sample {sample.keys()} after rename.") continue new_sample[new_name] = value if must_match and not all(matched.values()): - raise ValueError( - f"Not all patterns ({matched}) matched sample keys ({sample.keys()})." - ) + raise ValueError(f"Not all patterns ({matched}) matched sample keys ({sample.keys()}).") yield new_sample @@ -583,18 +541,18 @@ def find_decoder(decoders, path): if fname.startswith("__"): return lambda x: x for pattern, fun in decoders[::-1]: - if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), - pattern): + if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), pattern): return fun return None def _xdecode( - source, - *args, - must_decode=True, - defaults=default_decoders, - **kw, ): + source, + *args, + must_decode=True, + defaults=default_decoders, + **kw, +): decoders = list(defaults) + list(args) decoders += [("*." + k, v) for k, v in kw.items()] for sample in source: @@ -617,18 +575,18 @@ def _xdecode( new_sample[path] = value yield new_sample - xdecode = pipelinefilter(_xdecode) + def _audio_data_filter(source, - frame_shift=10, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): + frame_shift=10, + max_length=10240, + min_length=10, + token_max_length=200, + token_min_length=1, + min_output_input_ratio=0.0005, + max_output_input_ratio=1): """ Filter sample according to feature and label length Inplace operation. 
@@ -655,8 +613,7 @@ def _audio_data_filter(source, assert 'wav' in sample assert 'label' in sample # sample['wav'] is paddle.Tensor, we have 100 frames every second (default) - num_frames = sample['wav'].shape[1] / sample['sample_rate'] * ( - 1000 / frame_shift) + num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (1000 / frame_shift) if num_frames < min_length: continue if num_frames > max_length: @@ -672,15 +629,13 @@ def _audio_data_filter(source, continue yield sample - audio_data_filter = pipelinefilter(_audio_data_filter) - def _audio_tokenize(source, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): + symbol_table, + bpe_model=None, + non_lang_syms=None, + split_with_space=False): """ Decode text to chars or BPE Inplace operation @@ -738,10 +693,8 @@ def _audio_tokenize(source, sample['label'] = label yield sample - audio_tokenize = pipelinefilter(_audio_tokenize) - def _audio_resample(source, resample_rate=16000): """ Resample data. Inplace operation. @@ -760,22 +713,18 @@ def _audio_resample(source, resample_rate=16000): waveform = sample['wav'] if sample_rate != resample_rate: sample['sample_rate'] = resample_rate - sample['wav'] = paddle.to_tensor( - backends.soundfile_backend.resample( - waveform.numpy(), - src_sr=sample_rate, - target_sr=resample_rate)) + sample['wav'] = paddle.to_tensor(backends.soundfile_backend.resample( + waveform.numpy(), src_sr = sample_rate, target_sr = resample_rate + )) yield sample - audio_resample = pipelinefilter(_audio_resample) - def _audio_compute_fbank(source, - num_mel_bins=80, - frame_length=25, - frame_shift=10, - dither=0.0): + num_mel_bins=80, + frame_length=25, + frame_shift=10, + dither=0.0): """ Extract fbank Args: @@ -797,33 +746,30 @@ def _audio_compute_fbank(source, waveform = sample['wav'] waveform = waveform * (1 << 15) # Only keep fname, feat, label - mat = kaldi.fbank( - waveform, - n_mels=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sr=sample_rate) + mat = kaldi.fbank(waveform, + n_mels=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + energy_floor=0.0, + sr=sample_rate) yield dict(fname=sample['fname'], label=sample['label'], feat=mat) audio_compute_fbank = pipelinefilter(_audio_compute_fbank) - -def _audio_spec_aug( - source, - max_w=5, - w_inplace=True, - w_mode="PIL", - max_f=30, - num_f_mask=2, - f_inplace=True, - f_replace_with_zero=False, - max_t=40, - num_t_mask=2, - t_inplace=True, - t_replace_with_zero=False, ): +def _audio_spec_aug(source, + max_w=5, + w_inplace=True, + w_mode="PIL", + max_f=30, + num_f_mask=2, + f_inplace=True, + f_replace_with_zero=False, + max_t=40, + num_t_mask=2, + t_inplace=True, + t_replace_with_zero=False,): """ Do spec augmentation Inplace operation @@ -847,23 +793,12 @@ def _audio_spec_aug( for sample in source: x = sample['feat'] x = x.numpy() - x = time_warp(x, max_time_warp=max_w, inplace=w_inplace, mode=w_mode) - x = freq_mask( - x, - F=max_f, - n_mask=num_f_mask, - inplace=f_inplace, - replace_with_zero=f_replace_with_zero) - x = time_mask( - x, - T=max_t, - n_mask=num_t_mask, - inplace=t_inplace, - replace_with_zero=t_replace_with_zero) + x = time_warp(x, max_time_warp=max_w, inplace = w_inplace, mode= w_mode) + x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = f_inplace, replace_with_zero = f_replace_with_zero) + x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = t_inplace, replace_with_zero = t_replace_with_zero) 
sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32) yield sample - audio_spec_aug = pipelinefilter(_audio_spec_aug) @@ -894,10 +829,8 @@ def _sort(source, sort_size=500): for x in buf: yield x - sort = pipelinefilter(_sort) - def _batched(source, batch_size=16): """ Static batch the data by `batch_size` @@ -917,10 +850,8 @@ def _batched(source, batch_size=16): if len(buf) > 0: yield buf - batched = pipelinefilter(_batched) - def dynamic_batched(source, max_frames_in_batch=12000): """ Dynamic batch the data until the total frames in batch reach `max_frames_in_batch` @@ -961,8 +892,8 @@ def _audio_padding(source): """ for sample in source: assert isinstance(sample, list) - feats_length = paddle.to_tensor( - [x['feat'].shape[0] for x in sample], dtype="int64") + feats_length = paddle.to_tensor([x['feat'].shape[0] for x in sample], + dtype="int64") order = paddle.argsort(feats_length, descending=True) feats_lengths = paddle.to_tensor( [sample[i]['feat'].shape[0] for i in order], dtype="int64") @@ -971,20 +902,20 @@ def _audio_padding(source): sorted_labels = [ paddle.to_tensor(sample[i]['label'], dtype="int32") for i in order ] - label_lengths = paddle.to_tensor( - [x.shape[0] for x in sorted_labels], dtype="int64") - padded_feats = pad_sequence( - sorted_feats, batch_first=True, padding_value=0) - padding_labels = pad_sequence( - sorted_labels, batch_first=True, padding_value=-1) - - yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths = paddle.to_tensor([x.shape[0] for x in sorted_labels], + dtype="int64") + padded_feats = pad_sequence(sorted_feats, + batch_first=True, + padding_value=0) + padding_labels = pad_sequence(sorted_labels, + batch_first=True, + padding_value=-1) + + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, label_lengths) - audio_padding = pipelinefilter(_audio_padding) - def _audio_cmvn(source, cmvn_file): global_cmvn = GlobalCMVN(cmvn_file) for batch in source: @@ -992,16 +923,13 @@ def _audio_cmvn(source, cmvn_file): padded_feats = padded_feats.numpy() padded_feats = global_cmvn(padded_feats) padded_feats = paddle.to_tensor(padded_feats, dtype=paddle.float32) - yield (sorted_keys, padded_feats, feats_lengths, padding_labels, - label_lengths) - + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths) audio_cmvn = pipelinefilter(_audio_cmvn) - def _placeholder(source): for data in source: yield data - placeholder = pipelinefilter(_placeholder) diff --git a/paddlespeech/audio/streamdata/gopen.py b/paddlespeech/audio/streamdata/gopen.py index 60a434603..457d048a6 100644 --- a/paddlespeech/audio/streamdata/gopen.py +++ b/paddlespeech/audio/streamdata/gopen.py @@ -3,12 +3,12 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). 
# + + """Open URLs by calling subcommands.""" -import os -import re -import sys -from subprocess import PIPE -from subprocess import Popen + +import os, sys, re +from subprocess import PIPE, Popen from urllib.parse import urlparse # global used for printing additional node information during verbose output @@ -31,13 +31,14 @@ class Pipe: """ def __init__( - self, - *args, - mode=None, - timeout=7200.0, - ignore_errors=False, - ignore_status=[], - **kw, ): + self, + *args, + mode=None, + timeout=7200.0, + ignore_errors=False, + ignore_status=[], + **kw, + ): """Create an IO Pipe.""" self.ignore_errors = ignore_errors self.ignore_status = [0] + ignore_status @@ -74,7 +75,8 @@ class Pipe: if verbose: print( f"pipe exit [{self.status} {os.getpid()}:{self.proc.pid}] {self.args} {info}", - file=sys.stderr, ) + file=sys.stderr, + ) if self.status not in self.ignore_status and not self.ignore_errors: raise Exception(f"{self.args}: exit {self.status} (read) {info}") @@ -112,11 +114,9 @@ class Pipe: self.close() -def set_options(obj, - timeout=None, - ignore_errors=None, - ignore_status=None, - handler=None): +def set_options( + obj, timeout=None, ignore_errors=None, ignore_status=None, handler=None +): """Set options for Pipes. This function can be called on any stream. It will set pipe options only @@ -168,14 +168,16 @@ def gopen_pipe(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141], ) # skipcq: BAN-B604 + ignore_status=[141], + ) # skipcq: BAN-B604 elif mode[0] == "w": return Pipe( cmd, mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141], ) # skipcq: BAN-B604 + ignore_status=[141], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") @@ -194,7 +196,8 @@ def gopen_curl(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": cmd = f"curl -s -L -T - '{url}'" return Pipe( @@ -202,7 +205,8 @@ def gopen_curl(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 26], ) # skipcq: BAN-B604 + ignore_status=[141, 26], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") @@ -222,13 +226,15 @@ def gopen_htgs(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": raise ValueError(f"{mode}: cannot write") else: raise ValueError(f"{mode}: unknown mode") + def gopen_gsutil(url, mode="rb", bufsize=8192): """Open a URL with `curl`. @@ -243,7 +249,8 @@ def gopen_gsutil(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": cmd = f"gsutil cp - '{url}'" return Pipe( @@ -251,11 +258,13 @@ def gopen_gsutil(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 26], ) # skipcq: BAN-B604 + ignore_status=[141, 26], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") + def gopen_error(url, *args, **kw): """Raise a value error. 
@@ -276,7 +285,8 @@ gopen_schemes = dict( ftps=gopen_curl, scp=gopen_curl, gs=gopen_gsutil, - htgs=gopen_htgs, ) + htgs=gopen_htgs, +) def gopen(url, mode="rb", bufsize=8192, **kw): diff --git a/paddlespeech/audio/streamdata/handlers.py b/paddlespeech/audio/streamdata/handlers.py index 0173e5373..7f3d28b62 100644 --- a/paddlespeech/audio/streamdata/handlers.py +++ b/paddlespeech/audio/streamdata/handlers.py @@ -3,6 +3,7 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + """Pluggable exception handlers. These are functions that take an exception as an argument and then return... @@ -13,8 +14,8 @@ These are functions that take an exception as an argument and then return... They are used as handler= arguments in much of the library. """ -import time -import warnings + +import time, warnings def reraise_exception(exn): diff --git a/paddlespeech/audio/streamdata/mix.py b/paddlespeech/audio/streamdata/mix.py index 37556ed94..7d790f00f 100644 --- a/paddlespeech/audio/streamdata/mix.py +++ b/paddlespeech/audio/streamdata/mix.py @@ -5,12 +5,17 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Classes for mixing samples from multiple sources.""" -import random + +import itertools, os, random, time, sys +from functools import reduce, wraps import numpy as np -from .paddle_utils import IterableDataset +from . import autodecode, utils +from .paddle_utils import PaddleTensor, IterableDataset +from .utils import PipelineStage def round_robin_shortest(*sources): diff --git a/paddlespeech/audio/streamdata/paddle_utils.py b/paddlespeech/audio/streamdata/paddle_utils.py index c2ad8756b..02bc4c841 100644 --- a/paddlespeech/audio/streamdata/paddle_utils.py +++ b/paddlespeech/audio/streamdata/paddle_utils.py @@ -5,11 +5,12 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Mock implementations of paddle interfaces when paddle is not available.""" + try: - from paddle.io import DataLoader - from paddle.io import IterableDataset + from paddle.io import DataLoader, IterableDataset except ModuleNotFoundError: class IterableDataset: @@ -21,3 +22,12 @@ except ModuleNotFoundError: """Empty implementation of DataLoader when paddle is not available.""" pass + +try: + from paddle import Tensor as PaddleTensor +except ModuleNotFoundError: + + class TorchTensor: + """Empty implementation of PaddleTensor when paddle is not available.""" + + pass diff --git a/paddlespeech/audio/streamdata/pipeline.py b/paddlespeech/audio/streamdata/pipeline.py index ff16760ae..7339a762a 100644 --- a/paddlespeech/audio/streamdata/pipeline.py +++ b/paddlespeech/audio/streamdata/pipeline.py @@ -3,12 +3,15 @@ # See the LICENSE file for licensing terms (BSD-style). 
# Modified from https://github.com/webdataset/webdataset #%% -import copy -import sys +import copy, os, random, sys, time +from dataclasses import dataclass from itertools import islice +from typing import List -from .paddle_utils import DataLoader -from .paddle_utils import IterableDataset +import braceexpand, yaml + +from .handlers import reraise_exception +from .paddle_utils import DataLoader, IterableDataset from .utils import PipelineStage @@ -19,7 +22,8 @@ def add_length_method(obj): Combined = type( obj.__class__.__name__ + "_Length", (obj.__class__, IterableDataset), - {"__len__": length}, ) + {"__len__": length}, + ) obj.__class__ = Combined return obj diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py index 54f501052..cfaf9a64b 100644 --- a/paddlespeech/audio/streamdata/shardlists.py +++ b/paddlespeech/audio/streamdata/shardlists.py @@ -4,30 +4,28 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + # Modified from https://github.com/webdataset/webdataset + """Train PyTorch models directly from POSIX tar archive. Code works locally or over HTTP connections. """ -import os -import random -import sys -import time -from dataclasses import dataclass -from dataclasses import field + +import os, random, sys, time +from dataclasses import dataclass, field from itertools import islice from typing import List -import braceexpand -import yaml +import braceexpand, yaml from . import utils -from ..utils.log import Logger from .filters import pipelinefilter from .paddle_utils import IterableDataset -logger = Logger(__name__) +from ..utils.log import Logger +logger = Logger(__name__) def expand_urls(urls): if isinstance(urls, str): urllist = urls.split("::") @@ -66,8 +64,7 @@ class SimpleShardList(IterableDataset): def split_by_node(src, group=None): - rank, world_size, worker, num_workers = utils.paddle_worker_info( - group=group) + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) logger.info(f"world_size:{world_size}, rank:{rank}") if world_size > 1: for s in islice(src, rank, None, world_size): @@ -78,11 +75,9 @@ def split_by_node(src, group=None): def single_node_only(src, group=None): - rank, world_size, worker, num_workers = utils.paddle_worker_info( - group=group) + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) if world_size > 1: - raise ValueError( - "input pipeline needs to be reconfigured for multinode training") + raise ValueError("input pipeline needs to be reconfigured for multinode training") for s in src: yield s @@ -109,8 +104,7 @@ def resampled_(src, n=sys.maxsize): rng = random.Random(seed) print("# resampled loading", file=sys.stderr) items = list(src) - print( - f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr) + print(f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr) for i in range(n): yield rng.choice(items) @@ -124,9 +118,7 @@ def non_empty(src): yield s count += 1 if count == 0: - raise ValueError( - "pipeline stage received no data at all and this was declared as an error" - ) + raise ValueError("pipeline stage received no data at all and this was declared as an error") @dataclass @@ -146,6 +138,10 @@ def expand(s): return os.path.expanduser(os.path.expandvars(s)) +class MultiShardSample(IterableDataset): + def __init__(self, fname): + """Construct a shardlist from multiple sources using a YAML spec.""" + self.epoch = -1 class 
MultiShardSample(IterableDataset): def __init__(self, fname): """Construct a shardlist from multiple sources using a YAML spec.""" @@ -160,23 +156,20 @@ class MultiShardSample(IterableDataset): else: with open(fname) as stream: spec = yaml.safe_load(stream) - assert set(spec.keys()).issubset( - set("prefix datasets buckets".split())), list(spec.keys()) + assert set(spec.keys()).issubset(set("prefix datasets buckets".split())), list(spec.keys()) prefix = expand(spec.get("prefix", "")) self.sources = [] for ds in spec["datasets"]: - assert set(ds.keys()).issubset( - set("buckets name shards resample choose".split())), list( - ds.keys()) + assert set(ds.keys()).issubset(set("buckets name shards resample choose".split())), list( + ds.keys() + ) buckets = ds.get("buckets", spec.get("buckets", [])) if isinstance(buckets, str): buckets = [buckets] buckets = [expand(s) for s in buckets] if buckets == []: buckets = [""] - assert len( - buckets - ) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented" + assert len(buckets) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented" bucket = buckets[0] name = ds.get("name", "@" + bucket) urls = ds["shards"] @@ -184,19 +177,15 @@ class MultiShardSample(IterableDataset): urls = [urls] # urls = [u for url in urls for u in braceexpand.braceexpand(url)] urls = [ - prefix + os.path.join(bucket, u) - for url in urls for u in braceexpand.braceexpand(expand(url)) + prefix + os.path.join(bucket, u) for url in urls for u in braceexpand.braceexpand(expand(url)) ] resample = ds.get("resample", -1) nsample = ds.get("choose", -1) if nsample > len(urls): - raise ValueError( - f"perepoch {nsample} must be no greater than the number of shards" - ) + raise ValueError(f"perepoch {nsample} must be no greater than the number of shards") if (nsample > 0) and (resample > 0): raise ValueError("specify only one of perepoch or choose") - entry = MSSource( - name=name, urls=urls, perepoch=nsample, resample=resample) + entry = MSSource(name=name, urls=urls, perepoch=nsample, resample=resample) self.sources.append(entry) print(f"# {name} {len(urls)} {nsample}", file=sys.stderr) @@ -214,7 +203,7 @@ class MultiShardSample(IterableDataset): # sample without replacement l = list(source.urls) self.rng.shuffle(l) - l = l[:source.perepoch] + l = l[: source.perepoch] else: l = list(source.urls) result += l @@ -238,11 +227,12 @@ class ResampledShards(IterableDataset): """An iterable dataset yielding a list of urls.""" def __init__( - self, - urls, - nshards=sys.maxsize, - worker_seed=None, - deterministic=False, ): + self, + urls, + nshards=sys.maxsize, + worker_seed=None, + deterministic=False, + ): """Sample shards from the shard list with replacement. 
:param urls: a list of URLs as a Python list or brace notation string @@ -262,8 +252,7 @@ class ResampledShards(IterableDataset): if self.deterministic: seed = utils.make_seed(self.worker_seed(), self.epoch) else: - seed = utils.make_seed(self.worker_seed(), self.epoch, - os.getpid(), time.time_ns(), os.urandom(4)) + seed = utils.make_seed(self.worker_seed(), self.epoch, os.getpid(), time.time_ns(), os.urandom(4)) if os.environ.get("WDS_SHOW_SEED", "0") == "1": print(f"# ResampledShards seed {seed}") self.rng = random.Random(seed) diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py index 79b81c0ce..b1616918c 100644 --- a/paddlespeech/audio/streamdata/tariterators.py +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -3,12 +3,13 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). + # Modified from https://github.com/webdataset/webdataset # Modified from wenet(https://github.com/wenet-e2e/wenet) + """Low level iteration functions for tar archives.""" -import random -import re -import tarfile + +import random, re, tarfile import braceexpand @@ -26,7 +27,6 @@ import numpy as np AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - def base_plus_ext(path): """Split off all file extensions. @@ -47,8 +47,12 @@ def valid_sample(sample): :param sample: sample to be checked """ - return (sample is not None and isinstance(sample, dict) and - len(list(sample.keys())) > 0 and not sample.get("__bad__", False)) + return ( + sample is not None + and isinstance(sample, dict) + and len(list(sample.keys())) > 0 + and not sample.get("__bad__", False) + ) # FIXME: UNUSED @@ -75,16 +79,16 @@ def url_opener(data, handler=reraise_exception, **kw): sample.update(stream=stream) yield sample except Exception as exn: - exn.args = exn.args + (url, ) + exn.args = exn.args + (url,) if handler(exn): continue else: break -def tar_file_iterator(fileobj, - skip_meta=r"__[^/]*__($|/)", - handler=reraise_exception): +def tar_file_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): """Iterate over tar file, yielding filename, content pairs for the given tar stream. 
:param fileobj: byte stream suitable for tarfile @@ -99,8 +103,11 @@ def tar_file_iterator(fileobj, continue if fname is None: continue - if ("/" not in fname and fname.startswith(meta_prefix) and - fname.endswith(meta_suffix)): + if ( + "/" not in fname + and fname.startswith(meta_prefix) + and fname.endswith(meta_suffix) + ): # skipping metadata for now continue if skip_meta is not None and re.match(skip_meta, fname): @@ -111,10 +118,8 @@ def tar_file_iterator(fileobj, assert pos > 0 prefix, postfix = name[:pos], name[pos + 1:] if postfix == 'wav': - waveform, sample_rate = paddlespeech.audio.load( - stream.extractfile(tarinfo), normal=False) - result = dict( - fname=prefix, wav=waveform, sample_rate=sample_rate) + waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False) + result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate) else: txt = stream.extractfile(tarinfo).read().decode('utf8').strip() result = dict(fname=prefix, txt=txt) @@ -123,17 +128,16 @@ def tar_file_iterator(fileobj, stream.members = [] except Exception as exn: if hasattr(exn, "args") and len(exn.args) > 0: - exn.args = (exn.args[0] + " @ " + str(fileobj), ) + exn.args[1:] + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] if handler(exn): continue else: break del stream - -def tar_file_and_group_iterator(fileobj, - skip_meta=r"__[^/]*__($|/)", - handler=reraise_exception): +def tar_file_and_group_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): """ Expand a stream of open tar files into a stream of tar file contents. And groups the file with same prefix @@ -163,11 +167,8 @@ def tar_file_and_group_iterator(fileobj, if postfix == 'txt': example['txt'] = file_obj.read().decode('utf8').strip() elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = paddlespeech.audio.load( - file_obj, normal=False) - waveform = paddle.to_tensor( - np.expand_dims(np.array(waveform), 0), - dtype=paddle.float32) + waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False) + waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32) example['wav'] = waveform example['sample_rate'] = sample_rate @@ -175,21 +176,19 @@ def tar_file_and_group_iterator(fileobj, example[postfix] = file_obj.read() except Exception as exn: if hasattr(exn, "args") and len(exn.args) > 0: - exn.args = (exn.args[0] + " @ " + str(fileobj), - ) + exn.args[1:] + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] if handler(exn): continue else: break valid = False - # logging.warning('error to parse {}'.format(name)) + # logging.warning('error to parse {}'.format(name)) prev_prefix = prefix if prev_prefix is not None: example['fname'] = prev_prefix yield example stream.close() - def tar_file_expander(data, handler=reraise_exception): """Expand a stream of open tar files into a stream of tar file contents. 
@@ -201,8 +200,9 @@ def tar_file_expander(data, handler=reraise_exception): assert isinstance(source, dict) assert "stream" in source for sample in tar_file_iterator(source["stream"]): - assert (isinstance(sample, dict) and "data" in sample and - "fname" in sample) + assert ( + isinstance(sample, dict) and "data" in sample and "fname" in sample + ) sample["__url__"] = url yield sample except Exception as exn: @@ -213,6 +213,8 @@ def tar_file_expander(data, handler=reraise_exception): break + + def tar_file_and_group_expander(data, handler=reraise_exception): """Expand a stream of open tar files into a stream of tar file contents. @@ -224,8 +226,9 @@ def tar_file_and_group_expander(data, handler=reraise_exception): assert isinstance(source, dict) assert "stream" in source for sample in tar_file_and_group_iterator(source["stream"]): - assert (isinstance(sample, dict) and "wav" in sample and - "txt" in sample and "fname" in sample) + assert ( + isinstance(sample, dict) and "wav" in sample and "txt" in sample and "fname" in sample + ) sample["__url__"] = url yield sample except Exception as exn: @@ -236,11 +239,7 @@ def tar_file_and_group_expander(data, handler=reraise_exception): break -def group_by_keys(data, - keys=base_plus_ext, - lcase=True, - suffixes=None, - handler=None): +def group_by_keys(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None): """Return function over iterator that groups key, value pairs into samples. :param keys: function that splits the key into key and extension (base_plus_ext) @@ -255,8 +254,8 @@ def group_by_keys(data, print( prefix, suffix, - current_sample.keys() - if isinstance(current_sample, dict) else None, ) + current_sample.keys() if isinstance(current_sample, dict) else None, + ) if prefix is None: continue if lcase: diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py index 94dab9052..c7294f2bf 100644 --- a/paddlespeech/audio/streamdata/utils.py +++ b/paddlespeech/audio/streamdata/utils.py @@ -4,23 +4,22 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). 
# + # Modified from https://github.com/webdataset/webdataset + """Miscellaneous utility functions.""" + import importlib import itertools as itt import os import re import sys -from typing import Any -from typing import Callable -from typing import Iterator -from typing import Union +from typing import Any, Callable, Iterator, Optional, Union from ..utils.log import Logger logger = Logger(__name__) - def make_seed(*args): seed = 0 for arg in args: @@ -38,7 +37,7 @@ def identity(x: Any) -> Any: return x -def safe_eval(s: str, expr: str="{}"): +def safe_eval(s: str, expr: str = "{}"): """Evaluate the given expression more safely.""" if re.sub("[^A-Za-z0-9_]", "", s) != s: raise ValueError(f"safe_eval: illegal characters in: '{s}'") @@ -55,9 +54,9 @@ def lookup_sym(sym: str, modules: list): return None -def repeatedly0(loader: Iterator, - nepochs: int=sys.maxsize, - nbatches: int=sys.maxsize): +def repeatedly0( + loader: Iterator, nepochs: int = sys.maxsize, nbatches: int = sys.maxsize +): """Repeatedly returns batches from a DataLoader.""" for epoch in range(nepochs): for sample in itt.islice(loader, nbatches): @@ -70,11 +69,12 @@ def guess_batchsize(batch: Union[tuple, list]): def repeatedly( - source: Iterator, - nepochs: int=None, - nbatches: int=None, - nsamples: int=None, - batchsize: Callable[..., int]=guess_batchsize, ): + source: Iterator, + nepochs: int = None, + nbatches: int = None, + nsamples: int = None, + batchsize: Callable[..., int] = guess_batchsize, +): """Repeatedly yield samples from an iterator.""" epoch = 0 batch = 0 @@ -93,7 +93,6 @@ def repeatedly( if nepochs is not None and epoch >= nepochs: return - def paddle_worker_info(group=None): """Return node and worker info for PyTorch and some distributed environments.""" rank = 0 @@ -117,7 +116,7 @@ def paddle_worker_info(group=None): else: try: from paddle.io import get_worker_info - worker_info = get_worker_info() + worker_info = paddle.io.get_worker_info() if worker_info is not None: worker = worker_info.id num_workers = worker_info.num_workers @@ -127,7 +126,6 @@ def paddle_worker_info(group=None): return rank, world_size, worker, num_workers - def paddle_worker_seed(group=None): """Compute a distinct, deterministic RNG seed for each worker and node.""" rank, world_size, worker, num_workers = paddle_worker_info(group=group) diff --git a/paddlespeech/audio/streamdata/writer.py b/paddlespeech/audio/streamdata/writer.py index 3928a3ba6..7d4f7703b 100644 --- a/paddlespeech/audio/streamdata/writer.py +++ b/paddlespeech/audio/streamdata/writer.py @@ -5,24 +5,18 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Classes and functions for writing tar files and WebDataset files.""" -import io -import json -import pickle -import re -import tarfile -import time -from typing import Any -from typing import Callable -from typing import Optional -from typing import Union + +import io, json, pickle, re, tarfile, time +from typing import Any, Callable, Optional, Union import numpy as np from . import gopen -def imageencoder(image: Any, format: str="PNG"): # skipcq: PYL-W0622 +def imageencoder(image: Any, format: str = "PNG"): # skipcq: PYL-W0622 """Compress an image using PIL and return it as a string. Can handle float or uint8 images. @@ -73,7 +67,6 @@ def bytestr(data: Any): return data.encode("ascii") return str(data).encode("ascii") - def paddle_dumps(data: Any): """Dump data into a bytestring using paddle.dumps. 
@@ -89,7 +82,6 @@ def paddle_dumps(data: Any): paddle.save(data, stream) return stream.getvalue() - def numpy_dumps(data: np.ndarray): """Dump data into a bytestring using numpy npy format. @@ -147,8 +139,9 @@ def add_handlers(d, keys, value): def make_handlers(): """Create a list of handlers for encoding data.""" handlers = {} - add_handlers(handlers, "cls cls2 class count index inx id", - lambda x: str(x).encode("ascii")) + add_handlers( + handlers, "cls cls2 class count index inx id", lambda x: str(x).encode("ascii") + ) add_handlers(handlers, "txt text transcript", lambda x: x.encode("utf-8")) add_handlers(handlers, "html htm", lambda x: x.encode("utf-8")) add_handlers(handlers, "pyd pickle", pickle.dumps) @@ -159,8 +152,7 @@ def make_handlers(): add_handlers(handlers, "json jsn", lambda x: json.dumps(x).encode("utf-8")) add_handlers(handlers, "mp msgpack msg", mp_dumps) add_handlers(handlers, "cbor", cbor_dumps) - add_handlers(handlers, "jpg jpeg img image", - lambda data: imageencoder(data, "jpg")) + add_handlers(handlers, "jpg jpeg img image", lambda data: imageencoder(data, "jpg")) add_handlers(handlers, "png", lambda data: imageencoder(data, "png")) add_handlers(handlers, "pbm", lambda data: imageencoder(data, "pbm")) add_handlers(handlers, "pgm", lambda data: imageencoder(data, "pgm")) @@ -200,8 +192,7 @@ def encode_based_on_extension(sample: dict, handlers: dict): :param handlers: handlers for encoding """ return { - k: encode_based_on_extension1(v, k, handlers) - for k, v in list(sample.items()) + k: encode_based_on_extension1(v, k, handlers) for k, v in list(sample.items()) } @@ -267,14 +258,15 @@ class TarWriter: """ def __init__( - self, - fileobj, - user: str="bigdata", - group: str="bigdata", - mode: int=0o0444, - compress: Optional[bool]=None, - encoder: Union[None, bool, Callable]=True, - keep_meta: bool=False, ): + self, + fileobj, + user: str = "bigdata", + group: str = "bigdata", + mode: int = 0o0444, + compress: Optional[bool] = None, + encoder: Union[None, bool, Callable] = True, + keep_meta: bool = False, + ): """Create a tar writer. :param fileobj: stream to write data to @@ -338,7 +330,8 @@ class TarWriter: continue if not isinstance(v, (bytes, bytearray, memoryview)): raise ValueError( - f"{k} doesn't map to a bytes after encoding ({type(v)})") + f"{k} doesn't map to a bytes after encoding ({type(v)})" + ) key = obj["__key__"] for k in sorted(obj.keys()): if k == "__key__": @@ -356,8 +349,7 @@ class TarWriter: ti.uname = self.user ti.gname = self.group if not isinstance(v, (bytes, bytearray, memoryview)): - raise ValueError( - f"converter didn't yield bytes: {k}, {type(v)}") + raise ValueError(f"converter didn't yield bytes: {k}, {type(v)}") stream = io.BytesIO(v) self.tarstream.addfile(ti, stream) total += ti.size @@ -368,13 +360,14 @@ class ShardWriter: """Like TarWriter but splits into multiple shards.""" def __init__( - self, - pattern: str, - maxcount: int=100000, - maxsize: float=3e9, - post: Optional[Callable]=None, - start_shard: int=0, - **kw, ): + self, + pattern: str, + maxcount: int = 100000, + maxsize: float = 3e9, + post: Optional[Callable] = None, + start_shard: int = 0, + **kw, + ): """Create a ShardWriter. 
:param pattern: output file pattern @@ -407,7 +400,8 @@ class ShardWriter: self.fname, self.count, "%.1f GB" % (self.size / 1e9), - self.total, ) + self.total, + ) self.shard += 1 stream = open(self.fname, "wb") self.tarstream = TarWriter(stream, **self.kw) @@ -419,8 +413,11 @@ class ShardWriter: :param obj: sample to be written """ - if (self.tarstream is None or self.count >= self.maxcount or - self.size >= self.maxsize): + if ( + self.tarstream is None + or self.count >= self.maxcount + or self.size >= self.maxsize + ): self.next_stream() size = self.tarstream.write(obj) self.count += 1 diff --git a/paddlespeech/audio/text/text_featurizer.py b/paddlespeech/audio/text/text_featurizer.py index bcd6df54b..91c4d75c3 100644 --- a/paddlespeech/audio/text/text_featurizer.py +++ b/paddlespeech/audio/text/text_featurizer.py @@ -17,7 +17,6 @@ from typing import Union import sentencepiece as spm -from ..utils.log import Logger from .utility import BLANK from .utility import EOS from .utility import load_dict @@ -25,6 +24,7 @@ from .utility import MASKCTC from .utility import SOS from .utility import SPACE from .utility import UNK +from ..utils.log import Logger logger = Logger(__name__) diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py index 0825caec8..8044dc36f 100644 --- a/paddlespeech/audio/transform/perturb.py +++ b/paddlespeech/audio/transform/perturb.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) -import io -import os - -import h5py import librosa import numpy -import numpy as np import scipy import soundfile +import io +import os +import h5py +import numpy as np class SoundHDF5File(): """Collecting sound files to a HDF5 file @@ -110,7 +109,6 @@ class SoundHDF5File(): def close(self): self.file.close() - class SpeedPerturbation(): """SpeedPerturbation @@ -560,3 +558,4 @@ class RIRConvolve(): [scipy.convolve(x, r, mode="same") for r in rir], axis=-1) else: return scipy.convolve(x, rir, mode="same") + diff --git a/paddlespeech/audio/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py index b2635066f..029e7b8f5 100644 --- a/paddlespeech/audio/transform/spec_augment.py +++ b/paddlespeech/audio/transform/spec_augment.py @@ -14,7 +14,6 @@ # Modified from espnet(https://github.com/espnet/espnet) """Spec Augment module for preprocessing i.e., data augmentation""" import random - import numpy from PIL import Image diff --git a/paddlespeech/audio/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py index 864f3f994..99d50d81e 100644 --- a/paddlespeech/audio/transform/spectrogram.py +++ b/paddlespeech/audio/transform/spectrogram.py @@ -381,6 +381,36 @@ class LogMelSpectrogramKaldi(): mat = np.squeeze(mat.numpy()) return mat +class WavProcess(): + def __init__( + self, + dither=0.1): + """ + Args: + dither (float): Dithering constant + + Returns: + """ + + self.dither = dither + + def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + waveform = np.expand_dims(x, -1) + return waveform class LogMelSpectrogramKaldi_decay(): def __init__( diff --git a/paddlespeech/audio/transform/transformation.py b/paddlespeech/audio/transform/transformation.py index d24d6437c..e2f66dbf2 100644 --- a/paddlespeech/audio/transform/transformation.py +++ b/paddlespeech/audio/transform/transformation.py @@ -41,6 +41,7 @@ import_alias = dict( utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN", fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram", spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram", + wav_process="paddlespeech.audio.transform.spectrogram:WavProcess", stft="paddlespeech.audio.transform.spectrogram:Stft", istft="paddlespeech.audio.transform.spectrogram:IStft", stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram", diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7296776f9..f9b4439ec 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -99,9 +99,8 @@ class ASRExecutor(BaseExecutor): '-y', action="store_true", default=False, - help='No additional parameters required. \ - Once set this parameter, it means accepting the request of the program by default, \ - which includes transforming the audio sample rate') + help='No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate' + ) self.parser.add_argument( '--rtf', action="store_true", @@ -341,7 +340,7 @@ class ASRExecutor(BaseExecutor): audio = np.round(audio).astype("int16") return audio - def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False): + def _check(self, audio_file: str, sample_rate: int, force_yes: bool): self.sample_rate = sample_rate if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( @@ -435,17 +434,8 @@ class ASRExecutor(BaseExecutor): for id_, input_ in task_source.items(): try: - res = self( - audio_file=input_, - model=model, - lang=lang, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - decode_method=decode_method, - force_yes=force_yes, - rtf=rtf, - device=device) + res = self(input_, model, lang, sample_rate, config, ckpt_path, + decode_method, force_yes, rtf, device) task_results[id_] = res except Exception as e: has_exceptions = True diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index b53eed88c..3800c36db 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -191,7 +191,7 @@ class BaseExecutor(ABC): line = line.strip() if not line: continue - k, v = line.split() # space or \t + k, v = line.split() # space or \t job_contents[k] = v return job_contents diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 111987246..48ca1f98d 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -70,14 +70,6 @@ class VectorExecutor(BaseExecutor): type=str, default=None, help="Checkpoint file of model.") - self.parser.add_argument( - '--yes', - '-y', - action="store_true", - default=False, - help='No additional parameters required. 
\ - Once set this parameter, it means accepting the request of the program by default, \ - which includes transforming the audio sample rate') self.parser.add_argument( '--config', type=str, @@ -117,7 +109,6 @@ class VectorExecutor(BaseExecutor): sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path - force_yes = parser_args.yes device = parser_args.device # stage 1: configurate the verbose flag @@ -137,14 +128,8 @@ class VectorExecutor(BaseExecutor): # extract the speaker audio embedding if parser_args.task == "spk": logger.debug("do vector spk task") - res = self( - audio_file=input_, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) + res = self(input_, model, sample_rate, config, ckpt_path, + device) task_result[id_] = res elif parser_args.task == "score": logger.debug("do vector score task") @@ -160,22 +145,10 @@ class VectorExecutor(BaseExecutor): logger.debug( f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}" ) - enroll_embedding = self( - audio_file=enroll_audio, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) - test_embedding = self( - audio_file=test_audio, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) + enroll_embedding = self(enroll_audio, model, sample_rate, + config, ckpt_path, device) + test_embedding = self(test_audio, model, sample_rate, + config, ckpt_path, device) # get the score res = self.get_embeddings_score(enroll_embedding, @@ -249,7 +222,6 @@ class VectorExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, - force_yes: bool=False, device=paddle.get_device()): """Extract the audio embedding @@ -268,7 +240,7 @@ class VectorExecutor(BaseExecutor): """ # stage 0: check the audio format audio_file = os.path.abspath(audio_file) - if not self._check(audio_file, sample_rate, force_yes): + if not self._check(audio_file, sample_rate): sys.exit(-1) # stage 1: set the paddle runtime host device @@ -446,7 +418,7 @@ class VectorExecutor(BaseExecutor): logger.debug("audio extract the feat success") - def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False): + def _check(self, audio_file: str, sample_rate: int): """Check if the model sample match the audio sample rate Args: @@ -490,34 +462,13 @@ class VectorExecutor(BaseExecutor): logger.debug(f"The sample rate is {audio_sample_rate}") if audio_sample_rate != self.sample_rate: - logger.debug("The sample rate of the input file is not {}.\n \ + logger.error("The sample rate of the input file is not {}.\n \ The program will resample the wav file to {}.\n \ If the result does not meet your expectations,\n \ Please input the 16k 16 bit 1 channel wav file. \ ".format(self.sample_rate, self.sample_rate)) - if force_yes is False: - while (True): - logger.debug( - "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." 
- ) - content = input("Input(Y/N):") - if content.strip() == "Y" or content.strip( - ) == "y" or content.strip() == "yes" or content.strip( - ) == "Yes": - logger.debug( - "change the sampele rate, channel to 16k and 1 channel" - ) - break - elif content.strip() == "N" or content.strip( - ) == "n" or content.strip() == "no" or content.strip( - ) == "No": - logger.debug("Exit the program") - return False - else: - logger.warning("Not regular input, please input again") - self.change_format = True + sys.exit(-1) else: logger.debug("The audio file format is right") - self.change_format = False return True diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index f049879a3..872d564cd 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -1363,11 +1363,5 @@ g2pw_onnx_models = { 'md5': '7e049a55547da840502cf99e8a64f20e', }, - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip', - 'md5': - 'f8b60501770bff92ed6ce90860a610e6', - }, }, } diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 5fe2e16b9..f6476b9aa 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -114,7 +114,6 @@ if not hasattr(paddle.Tensor, 'new_full'): paddle.Tensor.new_full = new_full paddle.static.Variable.new_full = new_full - def contiguous(xs: paddle.Tensor) -> paddle.Tensor: return xs diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index 66ea29d08..90b7d8a18 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -20,8 +20,8 @@ import paddle import soundfile from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.models.ds2 import DeepSpeech2Model from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils import mp_tools @@ -38,24 +38,24 @@ class DeepSpeech2Tester_hub(): self.args = args self.config = config self.audio_file = args.audio_file - - self.preprocess_conf = config.preprocess_config - self.preprocess_args = {"train": False} - self.preprocessing = Transformation(self.preprocess_conf) - - self.text_feature = TextFeaturizer( - unit_type=config.unit_type, - vocab=config.vocab_filepath, - spm_model_prefix=config.spm_model_prefix) - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): - decode_batch_size = cfg.decode_batch_size - self.model.decoder.init_decoder( - decode_batch_size, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) - result_transcripts = self.model.decode(audio, audio_len) + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + return 
result_transcripts @mp_tools.rank_zero_only @@ -64,23 +64,16 @@ class DeepSpeech2Tester_hub(): self.model.eval() cfg = self.config audio_file = self.audio_file - - audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) - - audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") - - # fbank - feat = self.preprocessing(audio, **self.preprocess_args) - logger.info(f"feat shape: {feat.shape}") - - audio_len = paddle.to_tensor(feat.shape[0]) - audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, self.text_feature.vocab_list, cfg.decode) - + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -116,9 +109,11 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.input_dim = config.feat_dim - config.output_dim = self.text_feature.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size + model = DeepSpeech2Model.from_config(config) + self.model = model def setup_checkpointer(self): diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index db60083b0..67186081c 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -25,6 +25,8 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import StreamDataLoader from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory @@ -107,8 +109,7 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -135,8 +136,7 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -157,8 +157,7 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -226,18 +225,14 @@ class U2Trainer(Trainer): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) 
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) - self.align_loader = DataLoaderFactory.get_dataloader( - 'align', config, self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -250,8 +245,7 @@ class U2Trainer(Trainer): model_conf.output_dim = self.train_loader.vocab_size else: model_conf.input_dim = self.test_loader.feat_dim - model_conf.output_dim = self.test_loader.vocab_size - + model_conf.output_dim = 5538 model = U2Model.from_config(model_conf) if self.parallel: @@ -316,6 +310,11 @@ class U2Tester(U2Trainer): unit_type=self.config.unit_type, vocab=self.config.vocab_filepath, spm_model_prefix=self.config.spm_model_prefix) + + self.text_feature_test = TextFeaturizer( + unit_type=self.config.unit_type, + vocab='/home/zhangtianhao/workspace/PaddleSpeech/examples/aishell/asr1/data/lang_char/vocab.txt', + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -340,7 +339,7 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - target_transcripts = self.id2token(texts, texts_len, self.text_feature) + target_transcripts = self.id2token(texts, texts_len, self.text_feature_test) result_transcripts, result_tokenids = self.model.decode( audio, audio_len, diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 073d74293..cb015c116 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -105,8 +105,7 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -134,8 +133,7 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -155,8 +153,7 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -168,8 +165,8 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch : {}/{}, ".format( - batch_index + 1, len(self.train_loader)) + msg += "batch : {}/{}, ".format(batch_index + 1, + len(self.train_loader)) msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, 
".format(dataload_time) self.train_batch(batch_index, batch, msg) @@ -207,24 +204,21 @@ class U2Trainer(Trainer): self.use_streamdata = config.get("use_stream_data", False) if self.train: config = self.config.clone() - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) config = self.config.clone() config['preprocess_config'] = None - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: config = self.config.clone() config['preprocess_config'] = None - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) config = self.config.clone() config['preprocess_config'] = None - self.align_loader = DataLoaderFactory.get_dataloader( - 'align', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) logger.info("Setup test/align Dataloader!") + def setup_model(self): config = self.config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index d57c49546..603825435 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -121,8 +121,7 @@ class U2STTrainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -156,8 +155,7 @@ class U2STTrainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -177,8 +175,7 @@ class U2STTrainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -251,16 +248,14 @@ class U2STTrainer(Trainer): config['load_transcript'] = load_transcript self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) logger.info("Setup test Dataloader!") + def setup_model(self): config = self.config model_conf = config diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py @@ -0,0 +1,13 @@ +# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py new file mode 100644 index 000000000..4d16d9fa9 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for U2 model.""" +import cProfile + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + +# TODO(hui zhang): dynamic load + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + '--dict-path', type=str, default=None, help='dict path.') + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/train.py b/paddlespeech/s2t/exps/wav2vec2/bin/train.py new file mode 100644 index 000000000..b977b2a15 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/train.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Trainer for U2 model.""" +import cProfile +import os + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py new file mode 100644 index 000000000..587a279b3 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/model.py @@ -0,0 +1,465 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
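+
+# This module defines Wav2Vec2ASRTrainer, which runs the training loop with
+# gradient accumulation and optional time-domain speech augmentation, and
+# Wav2Vec2ASRTester, which decodes test audio through the text featurizer and
+# reports CER/WER metrics.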
+"""Contains wav2vec2 model.""" +import json +import os +import time +from collections import defaultdict +from collections import OrderedDict +from contextlib import nullcontext +from paddlespeech.s2t.utils import mp_tools + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import StreamDataLoader +from paddlespeech.s2t.io.dataloader import DataLoaderFactory +from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR +from paddlespeech.s2t.utils import error_rate + + +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.log import Log + +from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.speech_augmentation import TimeDomainSpecAugment +import pdb + + +logger = Log(__name__).getlog() + +class Wav2Vec2ASRTrainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch, msg): + train_conf = self.config + start = time.time() + + # forward + utt, wav, wavs_lens, target, target_lens = batch + wavs_lens_rate = wavs_lens / wav.shape[1] + target_lens_rate = target_lens / target.shape[1] + wav = wav[:,:,0] + if train_conf.augment: + wav = self.speech_augmentation(wav, wavs_lens_rate) + loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + # print(self.model.wav2vec2.feature_projection.projection.weight) + # print(self.model.wav2vec2.feature_extractor.conv_layers[0].conv.weight) + + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + losses_np = {'loss': float(loss) * train_conf.accum_grad} + + # loss backward + if (batch_index + 1) % train_conf.accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + # When using cpu w/o DDP, model does not have `no_sync` + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. 
+ context = nullcontext + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step old + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + # optimizer step new + # if (batch_index + 1) % train_conf.accum_grad == 0: + # self.optimizer.step() + # self.optimizer.clear_grad() + # self.iteration += 1 + + iteration_time = time.time() - start + + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + + if (batch_index + 1) % train_conf.accum_grad == 0: + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar( + tag='train/' + key, value=val, step=self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + if not self.use_streamdata: + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, wav, wavs_lens, target, target_lens = batch + wavs_lens_rate = wavs_lens / wav.shape[1] + target_lens_rate = target_lens / target.shape[1] + wav = wav[:,:,0] + loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(loss) * num_utts + valid_losses['val_loss'].append(float(loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + if not self.use_streamdata: + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def do_train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! 
+ # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + self.before_train() + + if not self.use_streamdata: + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + if not self.use_streamdata: + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,samples/s'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. 
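+                    # summing counts and losses across ranks so every rank
+                    # computes the same global average validation loss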
+ dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + self.use_streamdata = config.get("use_stream_data", False) + if self.train: + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + logger.info("Setup train/valid Dataloader!") + else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + logger.info("Setup test/align Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config + + with UpdateConfig(model_conf): + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size + + model = Wav2vec2ASR.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + # logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + if model_conf.augment: + self.speech_augmentation = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105]) + + if not self.train: + return + + train_config = config + optim_type = train_config.model_optim + optim_conf = train_config.model_optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.dnn_neurons, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.model_optim + optim_conf = train_config.model_optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "epsilon": optim_conf.epsilon, + "rho": optim_conf.rho, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beat2": 0.98 if optim_type == 'noam' else None, + } + + # optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup optimizer/lr_scheduler!") + + + +class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer): + def __init__(self, config, args): + super().__init__(config, args) + print(config) 
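+        # featurizer used by id2token()/compute_metrics() to map decoded
+        # token ids back to text when scoring hypotheses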
+ self.text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=config.vocab_filepath) + self.vocab_list = self.text_featurizer.vocab_list + + def id2token(self, texts, texts_len): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append( + self.text_featurizer.defeaturize(ids.numpy().tolist())) + return trans + + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): + decode_cfg = self.config.decode + errors_sum, len_refs, num_ins = 0.0, 0, 0 + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer + + start_time = time.time() + target_transcripts = self.id2token(texts, texts_len) + result_transcripts, result_tokenids = self.model.decode( + audio, + audio_len, + text_feature=self.text_featurizer, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size) + decode_time = time.time() - start_time + + for utt, target, result, rec_tids in zip( + utts, target_transcripts, result_transcripts, result_tokenids): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + if fout: + fout.write({ + "utt": utt, + "refs": [target], + "hyps": [result], + "hyps_tokenid": [rec_tids], + }) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info("One example error rate [%s] = %f" % ( + decode_cfg.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + len_refs=len_refs, + num_ins=num_ins, # num examples + error_rate=errors_sum / len_refs, + error_rate_type=decode_cfg.error_rate_type, + num_frames=audio_len.sum().numpy().item(), + decode_time=decode_time) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + self.model.eval() + + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + num_frames = 0.0 + num_time = 0.0 + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.vocab_list + decode_batch_size = decode_cfg.decode_batch_size + # self.model.decoder.init_decoder( + # decode_batch_size, vocab_list, decode_cfg.decoding_method, + # decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + # decode_cfg.beam_size, decode_cfg.cutoff_prob, + # decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + metrics = self.compute_metrics(*batch, fout=fout) + num_frames += metrics['num_frames'] + num_time += metrics["decode_time"] + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + rtf = num_time / (num_frames) + logger.info( + "RTF: %f, Error rate [%s] (%d/?) 
= %f" % + (rtf, error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + + err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err' + err_type_str = "{}".format(error_rate_type) + with open(err_meta_path, 'w') as f: + data = json.dumps({ + "epoch": + self.epoch, + "step": + self.iteration, + "rtf": + rtf, + error_rate_type: + errors_sum / len_refs, + "dataset_hour": (num_frames) / 1000.0 / 3600.0, + "process_hour": + num_time / 1000.0 / 3600.0, + "num_examples": + num_ins, + "err_sum": + errors_sum, + "ref_len": + len_refs, + "decode_method": + self.config.decode.decoding_method, + }) + f.write(data + '\n') + + @paddle.no_grad() + def export(self): + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + infer_model.eval() + static_model = infer_model.export() + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 4cc8274f9..735d29da2 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -22,16 +22,17 @@ import paddle from paddle.io import BatchSampler from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from yacs.config import CfgNode -import paddlespeech.audio.streamdata as streamdata -from paddlespeech.audio.text.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.dataset import TransformDataset from paddlespeech.s2t.io.reader import LoadInputsAndTargets from paddlespeech.s2t.utils.log import Log +import paddlespeech.audio.streamdata as streamdata +from paddlespeech.audio.text.text_featurizer import TextFeaturizer +from yacs.config import CfgNode + __all__ = ["BatchDataLoader", "StreamDataLoader"] logger = Log(__name__).getlog() @@ -60,7 +61,6 @@ def batch_collate(x): """ return x[0] - def read_preprocess_cfg(preprocess_conf_file): augment_conf = dict() preprocess_cfg = CfgNode(new_allowed=True) @@ -82,8 +82,7 @@ def read_preprocess_cfg(preprocess_conf_file): augment_conf['num_t_mask'] = process['n_mask'] augment_conf['t_inplace'] = process['inplace'] augment_conf['t_replace_with_zero'] = process['replace_with_zero'] - return augment_conf - + return augment_conf class StreamDataLoader(): def __init__(self, @@ -96,12 +95,12 @@ class StreamDataLoader(): frame_length=25, frame_shift=10, dither=0.0, - minlen_in: float=0.0, + minlen_in: float=0.0, maxlen_in: float=float('inf'), minlen_out: float=0.0, maxlen_out: float=float('inf'), resample_rate: int=16000, - shuffle_size: int=10000, + shuffle_size: int=10000, sort_size: int=1000, n_iter_processes: int=1, prefetch_factor: int=2, @@ -117,11 +116,11 @@ class StreamDataLoader(): text_featurizer = TextFeaturizer(unit_type, vocab_filepath) symbol_table = text_featurizer.vocab_dict - self.feat_dim = num_mel_bins - self.vocab_size = text_featurizer.vocab_size - + self.feat_dim = num_mel_bins + self.vocab_size = text_featurizer.vocab_size + augment_conf = read_preprocess_cfg(preprocess_conf) - + # The list of shard shardlist = [] with open(manifest_file, "r") as f: @@ -129,68 +128,58 @@ class StreamDataLoader(): 
shardlist.append(line.strip()) world_size = 1 try: - world_size = paddle.distributed.get_world_size() + world_size = paddle.distributed.get_world_size() except Exception as e: logger.warninig(e) - logger.warninig( - "can not get world_size using paddle.distributed.get_world_size(), use world_size=1" - ) - assert len(shardlist) >= world_size, \ - "the length of shard list should >= number of gpus/xpus/..." + logger.warninig("can not get world_size using paddle.distributed.get_world_size(), use world_size=1") + assert(len(shardlist) >= world_size, "the length of shard list should >= number of gpus/xpus/...") - update_n_iter_processes = int( - max(min(len(shardlist) / world_size - 1, self.n_iter_processes), 0)) + update_n_iter_processes = int(max(min(len(shardlist)/world_size - 1, self.n_iter_processes), 0)) logger.info(f"update_n_iter_processes {update_n_iter_processes}") if update_n_iter_processes != self.n_iter_processes: - self.n_iter_processes = update_n_iter_processes + self.n_iter_processes = update_n_iter_processes logger.info(f"change nun_workers to {self.n_iter_processes}") if self.dist_sampler: base_dataset = streamdata.DataPipeline( - streamdata.SimpleShardList(shardlist), streamdata.split_by_node - if train_mode else streamdata.placeholder(), + streamdata.SimpleShardList(shardlist), + streamdata.split_by_node if train_mode else streamdata.placeholder(), streamdata.split_by_worker, - streamdata.tarfile_to_samples(streamdata.reraise_exception)) + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) else: base_dataset = streamdata.DataPipeline( streamdata.SimpleShardList(shardlist), streamdata.split_by_worker, - streamdata.tarfile_to_samples(streamdata.reraise_exception)) + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) self.dataset = base_dataset.append_list( streamdata.audio_tokenize(symbol_table), - streamdata.audio_data_filter( - frame_shift=frame_shift, - max_length=maxlen_in, - min_length=minlen_in, - token_max_length=maxlen_out, - token_min_length=minlen_out), + streamdata.audio_data_filter(frame_shift=frame_shift, max_length=maxlen_in, min_length=minlen_in, token_max_length=maxlen_out, token_min_length=minlen_out), streamdata.audio_resample(resample_rate=resample_rate), - streamdata.audio_compute_fbank( - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither), - streamdata.audio_spec_aug(**augment_conf) - if train_mode else streamdata.placeholder( - ), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80) + streamdata.audio_compute_fbank(num_mel_bins=num_mel_bins, frame_length=frame_length, frame_shift=frame_shift, dither=dither), + streamdata.audio_spec_aug(**augment_conf) if train_mode else streamdata.placeholder(), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80) streamdata.shuffle(shuffle_size), streamdata.sort(sort_size=sort_size), streamdata.batched(batch_size), streamdata.audio_padding(), - streamdata.audio_cmvn(cmvn_file)) + streamdata.audio_cmvn(cmvn_file) + ) if paddle.__version__ >= '2.3.2': self.loader = streamdata.WebLoader( - self.dataset, - num_workers=self.n_iter_processes, - prefetch_factor=self.prefetch_factor, - batch_size=None) + self.dataset, + num_workers=self.n_iter_processes, + prefetch_factor = self.prefetch_factor, + batch_size=None + ) else: self.loader = streamdata.WebLoader( - self.dataset, - num_workers=self.n_iter_processes, - batch_size=None) + self.dataset, + num_workers=self.n_iter_processes, + batch_size=None + ) def __iter__(self): return 
self.loader.__iter__() @@ -199,9 +188,7 @@ class StreamDataLoader(): return self.__iter__() def __len__(self): - logger.info( - "Stream dataloader does not support calculate the length of the dataset" - ) + logger.info("Stream dataloader does not support calculate the length of the dataset") return -1 @@ -360,7 +347,7 @@ class DataLoaderFactory(): config['train_mode'] = True elif mode == 'valid': config['manifest'] = config.dev_manifest - config['train_mode'] = False + config['train_mode'] = False elif model == 'test' or mode == 'align': config['manifest'] = config.test_manifest config['train_mode'] = False @@ -371,31 +358,30 @@ class DataLoaderFactory(): config['maxlen_out'] = float('inf') config['dist_sampler'] = False else: - raise KeyError( - "not valid mode type!!, please input one of 'train, valid, test, align'" - ) + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") return StreamDataLoader( - manifest_file=config.manifest, - train_mode=config.train_mode, - unit_type=config.unit_type, - preprocess_conf=config.preprocess_config, - batch_size=config.batch_size, - num_mel_bins=config.feat_dim, - frame_length=config.window_ms, - frame_shift=config.stride_ms, - dither=config.dither, - minlen_in=config.minlen_in, - maxlen_in=config.maxlen_in, - minlen_out=config.minlen_out, - maxlen_out=config.maxlen_out, - resample_rate=config.resample_rate, - shuffle_size=config.shuffle_size, - sort_size=config.sort_size, - n_iter_processes=config.num_workers, - prefetch_factor=config.prefetch_factor, - dist_sampler=config.dist_sampler, - cmvn_file=config.cmvn_file, - vocab_filepath=config.vocab_filepath, ) + manifest_file=config.manifest, + train_mode=config.train_mode, + unit_type=config.unit_type, + preprocess_conf=config.preprocess_config, + batch_size=config.batch_size, + num_mel_bins=config.feat_dim, + frame_length=config.window_ms, + frame_shift=config.stride_ms, + dither=config.dither, + minlen_in=config.minlen_in, + maxlen_in=config.maxlen_in, + minlen_out=config.minlen_out, + maxlen_out=config.maxlen_out, + resample_rate=config.resample_rate, + shuffle_size=config.shuffle_size, + sort_size=config.sort_size, + n_iter_processes=config.num_workers, + prefetch_factor=config.prefetch_factor, + dist_sampler=config.dist_sampler, + cmvn_file=config.cmvn_file, + vocab_filepath=config.vocab_filepath, + ) else: if mode == 'train': config['manifest'] = config.train_manifest @@ -425,7 +411,7 @@ class DataLoaderFactory(): config['train_mode'] = False config['sortagrad'] = False config['batch_size'] = config.get('decode', dict()).get( - 'decode_batch_size', 1) + 'decode_batch_size', 1) config['maxlen_in'] = float('inf') config['maxlen_out'] = float('inf') config['minibatches'] = 0 @@ -441,10 +427,8 @@ class DataLoaderFactory(): config['dist_sampler'] = False config['shortest_first'] = False else: - raise KeyError( - "not valid mode type!!, please input one of 'train, valid, test, align'" - ) - + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") + return BatchDataLoader( json_file=config.manifest, train_mode=config.train_mode, @@ -466,3 +450,4 @@ class DataLoaderFactory(): num_encs=config.num_encs, dist_sampler=config.dist_sampler, shortest_first=config.shortest_first) + diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 5e018befb..44e452bb0 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -120,6 +120,7 @@ class LoadInputsAndTargets(): x = self._get_from_loader( 
filepath=inp["feat"], filetype=inp.get("filetype", "mat")) + x_feats_dict.setdefault(inp["name"], []).append(x) if self.load_output: @@ -236,6 +237,7 @@ class LoadInputsAndTargets(): :return: :rtype: np.ndarray """ + if filetype == "hdf5": # e.g. # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index b7ee80a7d..4557af86f 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -271,7 +271,7 @@ class DeepSpeech2Model(nn.Layer): enc_n_units=self.encoder.output_size, blank_id=blank_id, dropout_rate=0.0, - reduction=True, # sum + reduction_type="sum", # sum batch_average=True, # sum / batch_size grad_norm_type=ctc_grad_norm_type) diff --git a/paddlespeech/s2t/models/test.py b/paddlespeech/s2t/models/test.py new file mode 100644 index 000000000..488c386e1 --- /dev/null +++ b/paddlespeech/s2t/models/test.py @@ -0,0 +1,20 @@ +import paddle +import paddle.nn as nn + +class Model(nn.Layer): + def __init__(self): + super().__init__() + self.linear = nn.Linear(1024,1024) + + def forward(self, x): + return self.linear(x) + +model = Model() +x = paddle.uniform([100,1024], dtype='float32') +out = model(x) +loss = paddle.mean(out) +loss.backward() + +clip = nn.ClipGradByGlobalNorm(clip_norm=1.0) +optim = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=model.parameters(), grad_clip=clip) +optim.step() \ No newline at end of file diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 813e1e529..b6a4eb7fa 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. 
@@ -864,7 +864,7 @@ class U2Model(U2DecodeModel): enc_n_units=encoder.output_size(), blank_id=0, dropout_rate=dropout_rate, - reduction=True, # sum + reduction_type="sum", # sum batch_average=True, # sum / batch_size grad_norm_type=grad_norm_type) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..81ae43184 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -18,6 +18,7 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni """ import time from typing import Dict +from typing import List from typing import Optional from typing import Tuple @@ -25,8 +26,6 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN @@ -39,6 +38,8 @@ from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.log import Log +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["U2STModel", "U2STInferModel"] @@ -400,8 +401,8 @@ class U2STBaseModel(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. @@ -434,8 +435,8 @@ class U2STBaseModel(nn.Layer): paddle.Tensor: new conformer cnn cache required for next chunk, with same shape as the original cnn_cache. """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) + return self.encoder.forward_chunk( + xs, offset, required_cache_size, att_cache, cnn_cache) # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: @@ -611,7 +612,7 @@ class U2STModel(U2STBaseModel): enc_n_units=encoder.output_size(), blank_id=0, dropout_rate=dropout_rate, - reduction=True, # sum + reduction_type='sum', # sum batch_average=True, # sum / batch_size grad_norm_type=grad_norm_type) diff --git a/paddlespeech/s2t/models/wav2vec2/__init__.py b/paddlespeech/s2t/models/wav2vec2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/activations.py b/paddlespeech/s2t/models/wav2vec2/activations.py new file mode 100644 index 000000000..0158e8cb0 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/activations.py @@ -0,0 +1,175 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from packaging import version +from paddle import Tensor, nn + + +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * paddle.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def __init__(self): + super().__init__() + self.act = nn.functional.silu + + def _silu_python(self, input: Tensor) -> Tensor: + return input * paddle.sigmoid(input) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def __init__(self): + super().__init__() + self.act = nn.functional.mish + + def _mish_python(self, input: Tensor) -> Tensor: + return input * paddle.tanh(nn.functional.softplus(input)) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +ACT2FN = { + "gelu": GELUActivation(), + "gelu_10": ClippedGELUActivation(-10, 10), + "gelu_fast": FastGELUActivation(), + "gelu_new": NewGELUActivation(), + "gelu_python": GELUActivation(use_gelu_python=True), + "linear": LinearActivation(), + "mish": MishActivation(), + "quick_gelu": QuickGELUActivation(), + "relu": nn.ReLU(), + "sigmoid": nn.Sigmoid(), + "silu": SiLUActivation(), + "swish": SiLUActivation(), + "tanh": nn.Tanh(), +} + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py new file mode 100644 index 000000000..a5b509b66 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py @@ -0,0 +1,1129 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
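A minimal usage sketch for the activation registry added in activations.py above (illustrative only, not part of the patch; the tensor shape is arbitrary):

    import paddle
    from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN, get_activation

    act = get_activation("gelu_new")   # same instance as ACT2FN["gelu_new"]
    x = paddle.randn([2, 8])           # any float tensor
    y = act(x)                         # element-wise GELU approximation, same shape as x
    assert y.shape == x.shape
    # get_activation("unknown") raises KeyError listing the available ACT2FN names.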
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple +from collections import OrderedDict + +from dataclasses import fields +import paddle + + +class ModelOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a + tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular + python dictionary. + + + + You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple + before. + + + """ + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + if not len(class_fields): + raise ValueError(f"{self.__class__.__name__} has no fields.") + if not all(field.default is None for field in class_fields[1:]): + raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not paddle.is_tensor(first_field): + if isinstance(first_field, dict): + iterator = first_field.items() + first_field_iterator = True + else: + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple: + """ + Convert 
self to a tuple containing all the attributes/keys that are not `None`.
+        """
+        return tuple(self[k] for k in self.keys())
+
+
+@dataclass
+class BaseModelOutput(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states and attentions.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    last_hidden_state: paddle.Tensor = None
+    hidden_states: Optional[Tuple[paddle.Tensor]] = None
+    attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithNoAttention(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+    """
+
+    last_hidden_state: paddle.Tensor = None
+    hidden_states: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPooling(ModelOutput):
+    """
+    Base class for model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
+            Last layer hidden-state of the first token of the sequence (classification token) after further processing
+            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
+            the classification token after processing through a linear layer and a tanh activation function. The linear
+            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, + value states of the self-attention and the cross-attention layers if model is used in encoder-decoder + setting. Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Masked language modeling (MLM) loss. 
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. 
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class NextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided): + Next sequence prediction (classification) loss. + logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. 
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). 
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SemanticSegmenterOutput(ModelOutput): + """ + Base class for outputs of semantic segmentation models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ImageClassifierOutput(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also + called feature maps) of the model at the output of each stage. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class DepthEstimatorOutput(ModelOutput): + """ + Base class for outputs of depth estimation models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`paddle.Tensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + predicted_depth: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Wav2Vec2BaseModelOutput(ModelOutput): + """ + Base class for models that have been trained with the Wav2Vec2 loss objective. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`paddle.Tensor` of shape `(batch_size, sequence_length, conv_dim[-1])`): + Sequence of extracted feature vectors of the last convolutional layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + extract_features: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XVectorOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForXVector`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`): + Classification hidden states before AMSoftmax. + embeddings (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`): + Utterance embeddings used for vector similarity-based retrieval. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + embeddings: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py new file mode 100755 index 000000000..5accff120 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,1259 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Wav2Vec2 model.""" + +import math +import warnings +import paddle +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +from paddle import nn + +from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN +from paddlespeech.s2t.models.wav2vec2.modeling_outputs import ( + BaseModelOutput, + Wav2Vec2BaseModelOutput, + ModelOutput +) + + +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[paddle.Tensor] = None + projected_states: paddle.Tensor = None + projected_quantized_states: paddle.Tensor = None + codevector_perplexity: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + contrastive_loss: Optional[paddle.Tensor] = None + diversity_loss: Optional[paddle.Tensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[paddle.Tensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. 
+
+    Args:
+        shape: The shape for which to compute masks. This should be a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                   independently generated mask spans of length `mask_length` is computed by
+                   `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                   actual percentage will be smaller.
+        mask_length: size of the mask
+        min_masks: minimum number of masked spans
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}"
+        )
+
+    # epsilon is used for probabilistic rounding
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    input_lengths = (
+        attention_mask.sum(-1).detach().tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
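+        # For example: if `max_num_masked_span` is 3 but only 2 spans were sampled
+        # for this utterance, one extra copy of the dummy index is appended below so
+        # that every row of `spec_aug_mask_idxs` has the same number of start indices
+        # before the spans are expanded to length `mask_length`.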
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller than
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape((batch_size, max_num_masked_span * mask_length))
+
+    # add offset to the starting indices so that indices now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        (batch_size, max_num_masked_span * mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+def _sample_negative_indices(
+    features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
+):
+    """
+    Sample `num_negatives` vectors from feature vectors.
+    """
+    batch_size, sequence_length = features_shape
+
+    # generate indices of the positive vectors themselves, repeat them `num_negatives` times
+    sequence_length_range = np.arange(sequence_length)
+
+    # get `num_negatives` random vector indices from the same utterance
+    sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
+
+    mask_time_indices = (
+        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
+    )
+
+    for batch_idx in range(batch_size):
+        high = mask_time_indices[batch_idx].sum() - 1
+        mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
+
+        feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
+        sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
+        # avoid sampling the same positive vector, but keep the distribution uniform
+        sampled_indices[sampled_indices >= feature_indices] += 1
+
+        # remap to actual indices
+        sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
+
+        # correct for batch size
+        sampled_negative_indices[batch_idx] += batch_idx * sequence_length
+
+    return sampled_negative_indices
+
+
+class Wav2Vec2NoLayerNormConvLayer(nn.Layer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1D(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias_attr=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
return hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = hidden_states.transpose([0, 2, 1]) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose([0, 2, 1]) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Layer): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1D( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose([0, 2, 1]) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Layer): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Layer): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.LayerList(conv_layers) + self.gradient_checkpointing = 
False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + #if self._requires_grad and self.training: + # hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Layer): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], epsilon=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return paddle.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + key_value_states: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + layer_head_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.shape + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(paddle.Tensor, paddle.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention
+                # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+                # if encoder bi-directional self-attention `past_key_value` is always `None`
+                past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
+        key_states = key_states.reshape(proj_shape)
+        value_states = value_states.reshape(proj_shape)
+
+        src_len = key_states.shape[1]
+        attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1]))
+
+        if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]:
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.shape}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.shape != [bsz, 1, tgt_len, src_len]:
+                raise ValueError(
+                    f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}"
+                )
+            attn_weights = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.shape != [self.num_heads]:
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {[self.num_heads]}, but is"
+                    f" {layer_head_mask.shape}"
+                )
+            attn_weights = layer_head_mask.reshape((1, -1, 1, 1)) * attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+            attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+            attn_weights = attn_weights_reshaped.reshape((bsz * self.num_heads, tgt_len, src_len))
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = paddle.bmm(attn_probs, value_states)
+
+        if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]:
+            raise ValueError(
+                f"`attn_output` should be of size {[bsz * self.num_heads, tgt_len, self.head_dim]}, but is"
+                f" {attn_output.shape}"
+            )
+
+        attn_output = attn_output.reshape((bsz, self.num_heads, tgt_len, self.head_dim))
+        attn_output = attn_output.transpose([0, 2, 1, 3])
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
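+        # At this point `attn_output` has shape (bsz, tgt_len, num_heads, head_dim);
+        # the reshape below folds the head dimensions back into `embed_dim`.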
+ attn_output = attn_output.reshape((bsz, tgt_len, self.embed_dim)) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Wav2Vec2FeedForward(nn.Layer): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2EncoderLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2EncoderLayerStableLayerNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2Encoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + 
self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.LayerList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * np.iinfo(np.float32).min #torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer:# or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2EncoderStableLayerNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.LayerList( + [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = 
attention_mask.unsqueeze(-1).repeat_interleave(hidden_states.shape[2], axis=2) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * np.iinfo(np.float32).min # torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer:# or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2GumbelVectorQuantizer(nn.Layer): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
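+
+    For example, with the defaults in `Wav2Vec2ConfigPure` (2 codevector groups of
+    320 codewords each and `codevector_dim=256`), each frame is projected to
+    2 * 320 logits, one 128-dimensional codeword is selected per group (via
+    Gumbel-softmax during training, argmax otherwise), and the two codewords are
+    concatenated into a single 256-dimensional quantized vector.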
+ """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = paddle.static.create_parameter( + shape=[1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups], dtype='float32' + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = paddle.where(mask_extended, probs, paddle.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = paddle.exp(-paddle.sum(marginal_probs * paddle.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.reshape((batch_size * sequence_length * self.num_groups, -1)) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = paddle.softmax( + hidden_states.reshape((batch_size * sequence_length, self.num_groups, -1)).float(), axis=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_( + -1, codevector_idx.reshape((-1, 1)), 1.0 + ) + codevector_probs = codevector_probs.reshape((batch_size * sequence_length, self.num_groups, -1)) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.reshape((batch_size * sequence_length, -1)) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.reshape((batch_size * sequence_length, self.num_groups, self.num_vars, -1)) + codevectors = codevectors.sum(-2).reshape((batch_size, sequence_length, -1)) + + return codevectors, perplexity + + +class Wav2Vec2Adapter(nn.Layer): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.LayerList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = 
self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1D( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, axis=1) + + return hidden_states + + +class Wav2Vec2Model(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + # self.masked_spec_embed = nn.Parameter(paddle.Tensor(config.hidden_size).uniform_()) + #self.masked_spec_embed = paddle.uniform([config.hidden_size]) + self.masked_spec_embed = paddle.static.create_parameter(shape=[config.hidden_size], dtype='float32', default_initializer=paddle.nn.initializer.Uniform(low=0, high=1.0)) + if config.do_stable_layer_norm: + self.encoder = Wav2Vec2EncoderStableLayerNorm(config) + else: + self.encoder = Wav2Vec2Encoder(config) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: paddle.Tensor, + mask_time_indices: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
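+
+        For example, with `mask_time_prob=0.05` and `mask_time_length=10`, at most
+        roughly 5% of the time steps are covered by length-10 spans whose hidden
+        states are replaced by the learned `masked_spec_embed` vector; spans masked
+        along the feature axis are simply set to zero.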
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.shape + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = paddle.to_tensor(mask_time_indices, dtype=paddle.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = paddle.to_tensor(mask_feature_indices, dtype=paddle.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + def forward( + self, + input_values: Optional[paddle.Tensor], + attention_mask: Optional[paddle.Tensor] = None, + mask_time_indices: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + import numpy as np + np.save("data/paddle_input_values.npy", input_values.numpy()) + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose([0, 2, 1]) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + np.save("data/paddle_extract_features.npy", extract_features.numpy()) + hidden_states, extract_features = self.feature_projection(extract_features) + np.save("data/paddle_feature_projection.npy", hidden_states.numpy()) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + np.save("data/paddle_encoder_outputs.npy", hidden_states.numpy()) + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + 
hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + # self.init_weights() + # self._backward_compatibility_gradient_checkpointing() + pass + +class Wav2Vec2ConfigPure(): + model_type = "wav2vec2" + def __init__( + self, + vocab_size=32, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.1, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="layer", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=True, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + do_stable_layer_norm=True, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + self.output_attentions = False + self.output_hidden_states = False + self.use_return_dict = True + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. 
It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +def main(): + config = Wav2Vec2ConfigPure() + model = Wav2Vec2Model(config) + model_dict = model.state_dict() +# checkpoint_path = "wav2vec2_test" +# params_path = checkpoint_path + ".pdparams" +# paddle.save(model_dict, params_path) + revise_params_path = "exp/wav2vec2-large-960h-lv60-self.pdparams" + model_dict_revise = paddle.load(revise_params_path) + model.set_state_dict(model_dict_revise) + model.training = False + model.eval() + input_values = np.load("input_values.npy") + input_values = paddle.to_tensor(input_values) + outputs = model(input_values) + last_hidden_state = outputs.last_hidden_state + extract_features = outputs.extract_features + hidden_states = outputs.hidden_states + attentions = outputs.attentions + print (last_hidden_state) + np.save("paddle_last_hidden_state.npy", last_hidden_state.numpy()) + print ("extract_features") + print (extract_features) + np.save("paddle_extract_features.npy", extract_features.numpy()) + print ("hidden_states") + print (hidden_states) + print ("attentions") + print (attentions) + return + logits = logits.numpy() + np.save("paddle_logits.npy", logits) + +if __name__ == "__main__": + main() diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py new file mode 100644 index 000000000..057be1d46 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py @@ -0,0 +1,359 @@ +import os +import paddle +import speechbrain as sb +from 
speechbrain.processing.speech_augmentation import ( + SpeedPerturb, + DropFreq, + DropChunk, +) + + +class TimeDomainSpecAugment(paddle.nn.Layer): + """A time-domain approximation of the SpecAugment algorithm. + This augmentation module implements three augmentations in + the time-domain. + 1. Drop chunks of the audio (zero amplitude or white noise) + 2. Drop frequency bands (with band-drop filters) + 3. Speed peturbation (via resampling to slightly different rate) + Arguments + --------- + perturb_prob : float from 0 to 1 + The probability that a batch will have speed perturbation applied. + drop_freq_prob : float from 0 to 1 + The probability that a batch will have frequencies dropped. + drop_chunk_prob : float from 0 to 1 + The probability that a batch will have chunks dropped. + speeds : list of ints + A set of different speeds to use to perturb each batch. + See ``speechbrain.processing.speech_augmentation.SpeedPerturb`` + sample_rate : int + Sampling rate of the input waveforms. + drop_freq_count_low : int + Lowest number of frequencies that could be dropped. + drop_freq_count_high : int + Highest number of frequencies that could be dropped. + drop_chunk_count_low : int + Lowest number of chunks that could be dropped. + drop_chunk_count_high : int + Highest number of chunks that could be dropped. + drop_chunk_length_low : int + Lowest length of chunks that could be dropped. + drop_chunk_length_high : int + Highest length of chunks that could be dropped. + drop_chunk_noise_factor : float + The noise factor used to scale the white noise inserted, relative to + the average amplitude of the utterance. Default 0 (no noise inserted). + Example + ------- + >>> inputs = torch.randn([10, 16000]) + >>> feature_maker = TimeDomainSpecAugment(speeds=[80]) + >>> feats = feature_maker(inputs, torch.ones(10)) + >>> feats.shape + torch.Size([10, 12800]) + """ + + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, + ): + super().__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds + ) + self.drop_freq = DropFreq( + drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, + ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, + ) + + def forward(self, waveforms, lengths): + """Returns the distorted waveforms. + Arguments + --------- + waveforms : torch.Tensor + The waveforms to distort + """ + # Augmentation + with paddle.no_grad(): + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return + + +class DropFreq(torch.nn.Module): + """This class drops a random frequency from the signal. + The purpose of this class is to teach models to learn to rely on all parts + of the signal, not just a few frequency bands. + Arguments + --------- + drop_freq_low : float + The low end of frequencies that can be dropped, + as a fraction of the sampling rate / 2. 
+ drop_freq_high : float + The high end of frequencies that can be + dropped, as a fraction of the sampling rate / 2. + drop_count_low : int + The low end of number of frequencies that could be dropped. + drop_count_high : int + The high end of number of frequencies that could be dropped. + drop_width : float + The width of the frequency band to drop, as + a fraction of the sampling_rate / 2. + drop_prob : float + The probability that the batch of signals will have a frequency + dropped. By default, every batch has frequencies dropped. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropFreq() + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> dropped_signal = dropper(signal.unsqueeze(0)) + """ + + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, + ): + super().__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if torch.rand(1) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = torch.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, size=(1,), + ) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + torch.rand(drop_count) * drop_range + self.drop_freq_low + ) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = torch.zeros(1, filter_length, 1, device=waveforms.device) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter( + frequency, filter_length, self.drop_width, + ).to(waveforms.device) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + +class DropChunk(torch.nn.Module): + """This class drops portions of the input signal. + Using `DropChunk` as an augmentation strategy helps a models learn to rely + on all parts of the signal, since it can't expect a given part to be + present. + Arguments + --------- + drop_length_low : int + The low end of lengths for which to set the + signal to zero, in samples. + drop_length_high : int + The high end of lengths for which to set the + signal to zero, in samples. + drop_count_low : int + The low end of number of times that the signal + can be dropped to zero. + drop_count_high : int + The high end of number of times that the signal + can be dropped to zero. + drop_start : int + The first index for which dropping will be allowed. + drop_end : int + The last index for which dropping will be allowed. + drop_prob : float + The probability that the batch of signals will + have a portion dropped. By default, every batch + has portions dropped. 
+ noise_factor : float + The factor relative to average amplitude of an utterance + to use for scaling the white noise inserted. 1 keeps + the average amplitude the same, while 0 inserts all 0's. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.) + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> length = torch.ones(1) + >>> dropped_signal = dropper(signal, length) + >>> float(dropped_signal[:, 150]) + 0.0 + """ + + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, + ): + super().__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. 
+ Returns + ------- + Tensor of shape `[batch, time]` or + `[batch, time, channels]` + """ + + # Reading input list + lengths = (lengths * waveforms.size(1)).long() + batch_size = waveforms.size(0) + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if torch.rand(1) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = torch.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + size=(batch_size,), + ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = torch.randint( + low=self.drop_length_low, + high=self.drop_length_high + 1, + size=(drop_times[i],), + ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = torch.randint( + low=start_min, high=start_max + 1, size=(drop_times[i],), + ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j] : end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = torch.rand(length[j], device=waveforms.device) + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, start[j] : end[j]] = noise_vec + + return \ No newline at end of file diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py new file mode 100644 index 000000000..8eb56e759 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py @@ -0,0 +1,45 @@ +"""Vanilla Neural Network for simple tests. +Authors +* Elena Rastorgueva 2020 +""" +import paddle +from paddlespeech.s2t.models.wav2vec2.speechbrain.nnet import containers +import paddlespeech.s2t.models.wav2vec2.speechbrain as sb + + +class VanillaNN(containers.Sequential): + """A simple vanilla Deep Neural Network. + Arguments + --------- + activation : paddle class + A class used for constructing the activation layers. + dnn_blocks : int + The number of linear neural blocks to include. + dnn_neurons : int + The number of neurons in the linear layers. 
+ Example + ------- + >>> inputs = paddle.rand([10, 120, 60]) + >>> model = VanillaNN(input_shape=inputs.shape) + >>> outputs = model(inputs) + >>> outputs.shape + paddle.shape([10, 120, 512]) + """ + + def __init__( + self, + input_shape, + activation=paddle.nn.LeakyReLU, + dnn_blocks=2, + dnn_neurons=512, + ): + super().__init__(input_shape=input_shape) + + for block_index in range(dnn_blocks): + self.append( + sb.nnet.linear.Linear, + n_neurons=dnn_neurons, + bias=True, + layer_name="linear", + ) + self.append(activation(), layer_name="act") diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py new file mode 100644 index 000000000..f8f087714 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py @@ -0,0 +1,2 @@ +from . import linear +from . import containers diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py new file mode 100644 index 000000000..078806902 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py @@ -0,0 +1,132 @@ +import paddle +import inspect +import logging +import operator +import functools + +class Sequential(paddle.nn.LayerDict): + """A sequence of modules with potentially inferring shape on construction. + If layers are passed with names, these can be referenced with dot notation. + Arguments + --------- + input_shape : iterable + A list or tuple of ints or None, representing the expected shape of an + input tensor. None represents a variable-length dimension. If no + ``input_shape`` is passed, no shape inference will be performed. + *layers, **named_layers + The inputs are treated as a list of layers to be + applied in sequence. The output shape of each layer is used to + infer the shape of the following layer. If a tuple is returned, + only the shape of the first element is used to determine input + shape of the next layer (e.g. RNN returns output, hidden). 
+ Example + ------- + >>> inputs = paddle.rand(10, 40, 50) + >>> model = Sequential(input_shape=inputs.shape) + >>> model.append(Linear, n_neurons=100, layer_name="layer1") + >>> model.append(Linear, n_neurons=200, layer_name="layer2") + >>> outputs = model(inputs) + >>> outputs.shape + paddle.shape([10, 40, 200]) + >>> outputs = model.layer1(inputs) + >>> outputs.shape + paddle.shape([10, 40, 100]) + """ + + def __init__(self, *layers, input_shape=None, **named_layers): + super().__init__() + + # Make sure either layers or input_shape is passed + if not layers and input_shape is None and not named_layers: + raise ValueError("Must pass either layers or input shape") + + # Keep track of what layers need "lengths" passed + self.length_layers = [] + + # Replace None dimensions with arbitrary value + self.input_shape = input_shape + if input_shape and None in input_shape: + self.input_shape = list(input_shape) + for i, dim in enumerate(self.input_shape): + + # To reduce size of dummy tensors, use 1 for batch dim + if i == 0 and dim is None: + dim = 1 + + # Use 64 as nice round arbitrary value, big enough that + # halving this dimension a few times doesn't reach 1 + self.input_shape[i] = dim or 256 + + # Append non-named layers + for layer in layers: + self.append(layer) + + # Append named layers + for name, layer in named_layers.items(): + self.append(layer, layer_name=name) + + def append(self, layer, *args, layer_name=None, **kwargs): + """Add a layer to the list of layers, inferring shape if necessary. + Arguments + --------- + layer : A paddle.nn.Module class or object + If the layer is a class, it should accept an argument called + ``input_shape`` which will be inferred and passed. If the layer + is a module object, it is added as-is. + layer_name : str + The name of the layer, for reference. If the name is in use, + ``_{count}`` will be appended. + *args, **kwargs + These are passed to the layer if it is constructed. + """ + + # Compute layer_name + if layer_name is None: + layer_name = str(len(self)) + elif layer_name in self: + index = 0 + while f"{layer_name}_{index}" in self: + index += 1 + layer_name = f"{layer_name}_{index}" + + # Check if it needs to be constructed with input shape + if self.input_shape: + argspec = inspect.getfullargspec(layer) + if "input_shape" in argspec.args + argspec.kwonlyargs: + input_shape = self.get_output_shape() + layer = layer(*args, input_shape=input_shape, **kwargs) + + # Finally, append the layer. + try: + self[layer_name] = layer + # self.add_module(layer_name, layer) + except TypeError: + raise ValueError( + "Must pass `input_shape` at initialization and use " + "modules that take `input_shape` to infer shape when " + "using `append()`." + ) + + def get_output_shape(self): + """Returns expected shape of the output. + Computed by passing dummy input constructed with the + ``self.input_shape`` attribute. + """ + with paddle.no_grad(): + dummy_input = paddle.zeros(self.input_shape) + dummy_output = self(dummy_input) + return dummy_output.shape + + def forward(self, x): + """Applies layers in sequence, passing only the first element of tuples. + Arguments + --------- + x : paddle.Tensor + The input tensor to run through the network. 
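+        Returns
+        -------
+        paddle.Tensor
+            The output of the final layer. If a layer returns a tuple
+            (e.g. an RNN), only its first element is propagated.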
+ """ + for layer in self.values(): + x = layer(x) + if isinstance(x, tuple): + x = x[0] + + return x diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py new file mode 100644 index 000000000..26389d908 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py @@ -0,0 +1,73 @@ +"""Library implementing linear transformation. +Authors + * Mirco Ravanelli 2020 + * Davide Borra 2021 +""" + +import logging +import paddle +import paddle.nn as nn +from paddlespeech.s2t.modules import align + +logger = logging.getLogger(__name__) + + +class Linear(paddle.nn.Layer): + """Computes a linear transformation y = wx + b. + Arguments + --------- + n_neurons : int + It is the number of output neurons (i.e, the dimensionality of the + output). + input_shape: tuple + It is the shape of the input tensor. + input_size: int + Size of the input tensor. + bias : bool + If True, the additive bias b is adopted. + combine_dims : bool + If True and the input is 4D, combine 3rd and 4th dimensions of input. + Example + ------- + >>> inputs = paddle.rand(10, 50, 40) + >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100) + >>> output = lin_t(inputs) + >>> output.shape + paddle.shape([10, 50, 100]) + """ + + def __init__( + self, + n_neurons, + input_shape=None, + input_size=None, + bias=True, + combine_dims=False, + ): + super().__init__() + self.combine_dims = combine_dims + + if input_shape is None and input_size is None: + raise ValueError("Expected one of input_shape or input_size") + + if input_size is None: + input_size = input_shape[-1] + if len(input_shape) == 4 and self.combine_dims: + input_size = input_shape[2] * input_shape[3] + + # Weights are initialized following paddle approach + self.w = align.Linear(input_size, n_neurons, bias_attr=bias) + + def forward(self, x): + """Returns the linear transformation of input tensor. + Arguments + --------- + x : paddle.Tensor + Input to transform linearly. + """ + if x.rank == 4 and self.combine_dims: + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]) + + wx = self.w(x) + + return wx diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py new file mode 100644 index 000000000..aeae11c0b --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py @@ -0,0 +1,256 @@ +""" +Low level signal processing utilities +Authors + * Peter Plantinga 2020 + * Francois Grondin 2020 + * William Aris 2020 + * Samuele Cornell 2020 + * Sarthak Yadav 2022 +""" +import paddle +import math +from packaging import version +import numpy as np + +def blackman_window(window_length, periodic=True): + if window_length == 0: + return [] + if window_length == 1: + return paddle.ones([1]) + if periodic: + window_length += 1 + + + + + window = paddle.arange(window_length) * (np.pi / (window_length - 1)) + window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42 + return window[:-1] if periodic else window + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + """Compute amplitude of a batch of waveforms. + Arguments + --------- + waveform : tensor + The waveforms used for computing amplitude. + Shape should be `[time]` or `[batch, time]` or + `[batch, time, channels]`. + lengths : tensor + The lengths of the waveforms excluding the padding. 
+ Shape should be a single dimension, `[batch]`. + amp_type : str + Whether to compute "avg" average or "peak" amplitude. + Choose between ["avg", "peak"]. + scale : str + Whether to compute amplitude in "dB" or "linear" scale. + Choose between ["linear", "dB"]. + Returns + ------- + The average amplitude of the waveforms. + Example + ------- + >>> signal = torch.sin(torch.arange(16000.0)).unsqueeze(0) + >>> compute_amplitude(signal, signal.size(1)) + tensor([[0.6366]]) + """ + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0] + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) # clamp zeros + else: + raise NotImplementedError + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, + use_fft=False, + rotation_index=0, +): + """Use torch.nn.functional to perform 1d padding and conv. + Arguments + --------- + waveform : tensor + The tensor to perform operations on. + kernel : tensor + The filter to apply during convolution. + padding : int or tuple + The padding (pad_left, pad_right) to apply. + If an integer is passed instead, this is passed + to the conv1d function and pad_type is ignored. + pad_type : str + The type of padding to use. Passed directly to + `torch.nn.functional.pad`, see PyTorch documentation + for available options. + stride : int + The number of units to move each time convolution is applied. + Passed to conv1d. Has no effect if `use_fft` is True. + groups : int + This option is passed to `conv1d` to split the input into groups for + convolution. Input channels should be divisible by the number of groups. + use_fft : bool + When `use_fft` is passed `True`, then compute the convolution in the + spectral domain using complex multiply. This is more efficient on CPU + when the size of the kernel is large (e.g. reverberation). WARNING: + Without padding, circular convolution occurs. This makes little + difference in the case of reverberation, but may make more difference + with different kernels. + rotation_index : int + This option only applies if `use_fft` is true. If so, the kernel is + rolled by this amount before convolution to shift the output location. + Returns + ------- + The convolved waveform. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0).unsqueeze(2) + >>> kernel = torch.rand(1, 10, 1) + >>> signal = convolve1d(signal, kernel, padding=(9, 0)) + """ + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Move time dimension last, which pad and fft and conv expect. 
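+    # (Paddle's pad and conv1d used below expect channel-first NCL data, so
+    # [batch, time, channels] becomes [batch, channels, time] here; the result
+    # is transposed back before being returned.)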
+ waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, tuple): + waveform = paddle.nn.functional.pad( + x=waveform, pad=padding, mode=pad_type, + ) + + # This approach uses FFT, which is more efficient if the kernel is large + if use_fft: + + # Pad kernel to same length as signal, ensuring correct alignment + zero_length = waveform.shape[-1] - kernel.shape[-1] + + # Handle case where signal is shorter + if zero_length < 0: + kernel = kernel[..., :zero_length] + zero_length = 0 + + # Perform rotation to ensure alignment + zeros = paddle.zeros( + kernel.shape[0], kernel.shape[1], zero_length + ) + after_index = kernel[..., rotation_index:] + before_index = kernel[..., :rotation_index] + kernel = paddle.concat((after_index, zeros, before_index), axis=-1) + + # Multiply in frequency domain to convolve in time domain + # if version.parse(torch.__version__) > version.parse("1.6.0"): + import paddle.fft as fft + + result = fft.rfft(waveform) * fft.rfft(kernel) + convolved = fft.irfft(result, n=waveform.shape[-1]) + # else: + # f_signal = torch.rfft(waveform, 1) + # f_kernel = torch.rfft(kernel, 1) + # sig_real, sig_imag = f_signal.unbind(-1) + # ker_real, ker_imag = f_kernel.unbind(-1) + # f_result = torch.stack( + # [ + # sig_real * ker_real - sig_imag * ker_imag, + # sig_real * ker_imag + sig_imag * ker_real, + # ], + # dim=-1, + # ) + # convolved = torch.irfft( + # f_result, 1, signal_sizes=[waveform.size(-1)] + # ) + + # Use the implementation given by torch, which should be efficient on GPU + else: + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, tuple) else 0, + ) + + # Return time dimension to the second dimension. + return convolved.transpose([0, 2, 1]) + +def notch_filter(notch_freq, filter_width=101, notch_width=0.05): + """Returns a notch filter constructed from a high-pass and low-pass filter. + (from https://tomroelandts.com/articles/ + how-to-create-simple-band-pass-and-band-reject-filters) + Arguments + --------- + notch_freq : float + frequency to put notch as a fraction of the + sampling rate / 2. The range of possible inputs is 0 to 1. + filter_width : int + Filter width in samples. Longer filters have + smaller transition bands, but are more inefficient. + notch_width : float + Width of the notch, as a fraction of the sampling_rate / 2. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0).unsqueeze(2) + >>> kernel = notch_filter(0.25) + >>> notched_signal = convolve1d(signal, kernel) + """ + + # Check inputs + assert 0 < notch_freq <= 1 + assert filter_width % 2 != 0 + pad = filter_width // 2 + inputs = paddle.arange(filter_width) - pad + + # Avoid frequencies that are too low + notch_freq += notch_width + + # Define sinc function, avoiding division by zero + def sinc(x): + "Computes the sinc function." + + def _sinc(x): + return paddle.sin(x) / x + + # The zero is at the middle index + return paddle.concat([_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1 :])]) + + # Compute a low-pass filter with cutoff frequency notch_freq. + hlpf = sinc(3 * (notch_freq - notch_width) * inputs) + hlpf *= blackman_window(filter_width) + hlpf /= paddle.sum(hlpf) + + # Compute a high-pass filter with cutoff frequency notch_freq. 
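+    # The high-pass is the spectral inversion of a windowed-sinc low-pass:
+    # negate the taps (dividing by the negative sum also normalizes them) and
+    # add a unit impulse at the centre tap. Summing it with the low-pass above
+    # gives the band-reject (notch) kernel that is returned below.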
+ hhpf = sinc(3 * (notch_freq + notch_width) * inputs) + hhpf *= blackman_window(filter_width) + hhpf /= -paddle.sum(hhpf) + hhpf[pad] += 1 + + # Adding filters creates notch filter + return (hlpf + hhpf).view(1, -1, 1) + diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py new file mode 100644 index 000000000..1cbbe11af --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py @@ -0,0 +1,741 @@ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.signal_processing import ( + compute_amplitude, + convolve1d, + notch_filter) +import pdb +class SpeedPerturb(nn.Layer): + """Slightly speed up or slow down an audio signal. + Resample the audio signal at a rate that is similar to the original rate, + to achieve a slightly slower or slightly faster signal. This technique is + outlined in the paper: "Audio Augmentation for Speech Recognition" + Arguments + --------- + orig_freq : int + The frequency of the original signal. + speeds : list + The speeds that the signal should be changed to, as a percentage of the + original signal (i.e. `speeds` is divided by 100 to get a ratio). + perturb_prob : float + The chance that the batch will be speed- + perturbed. By default, every batch is perturbed. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90]) + >>> clean = signal.unsqueeze(0) + >>> perturbed = perturbator(clean) + >>> clean.shape + torch.Size([1, 52173]) + >>> perturbed.shape + torch.Size([1, 46956]) + """ + + def __init__( + self, orig_freq, speeds=[90, 100, 110], perturb_prob=1.0, + ): + super().__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + + return waveform.clone() + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=(1,))[0] + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + +class Resample(nn.Layer): + """This class resamples an audio signal using sinc-based interpolation. + + It is a modification of the `resample` function from torchaudio + (https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html) + + Arguments + --------- + orig_freq : int + the sampling frequency of the input signal. + new_freq : int + the new sampling frequency after this operation is performed. + lowpass_filter_width : int + Controls the sharpness of the filter, larger numbers result in a + sharper filter, but they are less efficient. Values from 4 to 10 are allowed. 
+ + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> resampler = Resample(orig_freq=16000, new_freq=8000) + >>> resampled = resampler(signal) + >>> signal.shape + torch.Size([1, 52173]) + >>> resampled.shape + torch.Size([1, 26087]) + """ + + def __init__( + self, orig_freq=16000, new_freq=16000, lowpass_filter_width=6, + ): + super().__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + """Compute the phases in polyphase filter. + + (almost directly from torchaudio.compliance.kaldi) + """ + + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + unsqueezed = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + """Resamples the waveform at the new frequency. + + This matches Kaldi's OfflineFeatureTpl ResampleWaveform which uses a + LinearResample (resample a signal at linearly spaced intervals to + up/downsample a signal). LinearResample (LR) means that the output + signal is at linearly spaced intervals (i.e the output signal has a + frequency of `new_freq`). It uses sinc/bandlimited interpolation to + upsample/downsample the signal. + + (almost directly from torchaudio.compliance.kaldi) + + https://ccrma.stanford.edu/~jos/resample/ + Theory_Ideal_Bandlimited_Interpolation.html + + https://github.com/kaldi-asr/kaldi/blob/master/src/feat/resample.h#L56 + + Arguments + --------- + waveforms : tensor + The batch of audio signals to resample. + + Returns + ------- + The waveforms at the new frequency. 
+ """ + + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros( + (batch_size, num_channels, tot_output_samp) + ) + # self.weights = self.weights.to(waveforms.device) + + # Check weights are on correct device + # if waveforms.device != self.weights.device: + # self.weights = self.weights.to(waveforms.device) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[..., first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, (left_padding, right_padding), data_format='NCL' + ) + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + weight=self.weights[i].repeat(num_channels, 1, 1), + stride=self.conv_stride, + groups=num_channels, + ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride + ) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, (left_padding, right_padding), data_format='NCL' + ) + dilated_conv_wave = dilated_conv_wave[..., :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + """Based on LinearResample::GetNumOutputSamples. + + LinearResample (LR) means that the output signal is at + linearly spaced intervals (i.e the output signal has a + frequency of ``new_freq``). It uses sinc/bandlimited + interpolation to upsample/downsample the signal. + + (almost directly from torchaudio.compliance.kaldi) + + Arguments + --------- + input_num_samp : int + The number of samples in each example in the batch. + + Returns + ------- + Number of samples in the output waveform. + """ + + # For exact computation, we measure time in "ticks" of 1.0 / tick_freq, + # where tick_freq is the least common multiple of samp_in and + # samp_out. + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. 
+ # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. + if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. + num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + """Based on LinearResample::SetIndexesAndWeights + + Retrieves the weights for resampling as well as the indices in which + they are valid. LinearResample (LR) means that the output signal is at + linearly spaced intervals (i.e the output signal has a frequency + of ``new_freq``). It uses sinc/bandlimited interpolation to + upsample/downsample the signal. + + Returns + ------- + - the place where each filter should start being applied + - the filters to be applied to the signal for resampling + """ + + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange( + start=0.0, end=self.output_samples + ) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width) + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + + inside_window_indices = delta_t.abs() < (window_width) + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * ( + 1 + + paddle.cos( + 2 + * math.pi + * lowpass_cutoff + / self.lowpass_filter_width + * delta_t[inside_window_indices] + ) + ) + t_eq_zero_indices = delta_t == 0.0 + t_not_eq_zero_indices = ~t_eq_zero_indices + + # sinc filter function + weights[t_not_eq_zero_indices] *= paddle.sin( + 2 * math.pi * lowpass_cutoff * delta_t[t_not_eq_zero_indices] + ) / (math.pi * delta_t[t_not_eq_zero_indices]) + + # limit of the function at t = 0 + weights[t_eq_zero_indices] *= 2 * lowpass_cutoff + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class DropFreq(nn.Layer): + """This class drops a random frequency from the signal. + The purpose of this class is to teach models to learn to rely on all parts + of the signal, not just a few frequency bands. + Arguments + --------- + drop_freq_low : float + The low end of frequencies that can be dropped, + as a fraction of the sampling rate / 2. + drop_freq_high : float + The high end of frequencies that can be + dropped, as a fraction of the sampling rate / 2. + drop_count_low : int + The low end of number of frequencies that could be dropped. + drop_count_high : int + The high end of number of frequencies that could be dropped. 
+ drop_width : float + The width of the frequency band to drop, as + a fraction of the sampling_rate / 2. + drop_prob : float + The probability that the batch of signals will have a frequency + dropped. By default, every batch has frequencies dropped. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropFreq() + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> dropped_signal = dropper(signal.unsqueeze(0)) + """ + + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, + ): + super().__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=(1,), + ) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand(drop_count) * drop_range + self.drop_freq_low + ) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter( + frequency, filter_length, self.drop_width, + ) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + +class DropChunk(nn.Layer): + """This class drops portions of the input signal. + Using `DropChunk` as an augmentation strategy helps a models learn to rely + on all parts of the signal, since it can't expect a given part to be + present. + Arguments + --------- + drop_length_low : int + The low end of lengths for which to set the + signal to zero, in samples. + drop_length_high : int + The high end of lengths for which to set the + signal to zero, in samples. + drop_count_low : int + The low end of number of times that the signal + can be dropped to zero. + drop_count_high : int + The high end of number of times that the signal + can be dropped to zero. + drop_start : int + The first index for which dropping will be allowed. + drop_end : int + The last index for which dropping will be allowed. + drop_prob : float + The probability that the batch of signals will + have a portion dropped. By default, every batch + has portions dropped. + noise_factor : float + The factor relative to average amplitude of an utterance + to use for scaling the white noise inserted. 1 keeps + the average amplitude the same, while 0 inserts all 0's. 
+ Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.) + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> length = torch.ones(1) + >>> dropped_signal = dropper(signal, length) + >>> float(dropped_signal[:, 150]) + 0.0 + """ + + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, + ): + super().__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + Returns + ------- + Tensor of shape `[batch, time]` or + `[batch, time, channels]` + """ + + # Reading input list + lengths = (lengths * waveforms.shape[1]).long() + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=(batch_size,), + ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 1, + shape=(drop_times[i],), + ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, high=start_max + 1, shape=(drop_times[i],), + ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j] : end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand(length[j]) + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, start[j] : end[j]] = 
noise_vec + + return dropped_waveform + + +class TimeDomainSpecAugment(nn.Layer): + """A time-domain approximation of the SpecAugment algorithm. + + This augmentation module implements three augmentations in + the time-domain. + + 1. Drop chunks of the audio (zero amplitude or white noise) + 2. Drop frequency bands (with band-drop filters) + 3. Speed peturbation (via resampling to slightly different rate) + + Arguments + --------- + perturb_prob : float from 0 to 1 + The probability that a batch will have speed perturbation applied. + drop_freq_prob : float from 0 to 1 + The probability that a batch will have frequencies dropped. + drop_chunk_prob : float from 0 to 1 + The probability that a batch will have chunks dropped. + speeds : list of ints + A set of different speeds to use to perturb each batch. + See ``speechbrain.processing.speech_augmentation.SpeedPerturb`` + sample_rate : int + Sampling rate of the input waveforms. + drop_freq_count_low : int + Lowest number of frequencies that could be dropped. + drop_freq_count_high : int + Highest number of frequencies that could be dropped. + drop_chunk_count_low : int + Lowest number of chunks that could be dropped. + drop_chunk_count_high : int + Highest number of chunks that could be dropped. + drop_chunk_length_low : int + Lowest length of chunks that could be dropped. + drop_chunk_length_high : int + Highest length of chunks that could be dropped. + drop_chunk_noise_factor : float + The noise factor used to scale the white noise inserted, relative to + the average amplitude of the utterance. Default 0 (no noise inserted). + + Example + ------- + >>> inputs = torch.randn([10, 16000]) + >>> feature_maker = TimeDomainSpecAugment(speeds=[80]) + >>> feats = feature_maker(inputs, torch.ones(10)) + >>> feats.shape + torch.Size([10, 12800]) + """ + + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, + ): + super().__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds + ) + self.drop_freq = DropFreq( + drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, + ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, + ) + + def forward(self, waveforms, lengths): + """Returns the distorted waveforms. 
+ + Arguments + --------- + waveforms : torch.Tensor + The waveforms to distort + """ + # Augmentation + with paddle.no_grad(): + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + return waveforms \ No newline at end of file diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py new file mode 100644 index 000000000..da243342c --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py @@ -0,0 +1,14 @@ +import paddle +import numpy as np + +def blackman_window(window_length, periodic=True): + if window_length == 0: + return [] + if window_length == 1: + return paddle.ones([1]) + if periodic: + window_length += 1 + + window = paddle.arange(window_length) * (np.pi / (window_length - 1)) + window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42 + return window[:-1] if periodic else window diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py new file mode 100644 index 000000000..e20a7e129 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -0,0 +1,287 @@ +import numpy as np +import os + +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ConfigPure +from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model +from paddlespeech.s2t.modules.mask import make_pad_mask +from paddlespeech.s2t.utils.utility import log_add + +from collections import defaultdict + +from paddlespeech.s2t.models.wav2vec2.speechbrain.lobes.models.VanillaNN import VanillaNN +from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC +from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank +from yacs.config import CfgNode + +class Wav2vec2ASR(nn.Layer): + def __init__(self, config: dict): + super().__init__() + + wav2vec2_config = Wav2Vec2ConfigPure() + wav2vec2 = Wav2Vec2Model(wav2vec2_config) + + model_dict = paddle.load(config.wav2vec2_params_path) + wav2vec2.set_state_dict(model_dict) + wav2vec2.eval() + self.normalize_wav = config.normalize_wav + self.output_norm = config.output_norm + if config.freeze_wav2vec2: + for parm in wav2vec2.parameters(): + parm.trainable = False + self.wav2vec2 = wav2vec2 + self.enc = VanillaNN(input_shape=[None,None,wav2vec2_config.hidden_size], activation=nn.LeakyReLU, dnn_blocks=config.dnn_blocks, dnn_neurons=config.dnn_neurons) + self.ctc = CTC(odim=config.output_dim, enc_n_units=config.dnn_neurons, blank_id=config.blank_id, dropout_rate=config.ctc_dropout_rate, reduction_type="mean") + + def train_batch(self): + wav, wavs_lens_rate, target, target_lens_rate = self._get_data() + ctc_loss = self(wav, wavs_lens_rate, target, target_lens_rate) + + + def forward(self, wav, wavs_lens_rate, target, target_lens_rate): + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + np.save("data/out.npy", out.numpy()) + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + + x = self.enc(feats) + x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64) + target_lens = (target_lens_rate * target.shape[1]).round().astype(paddle.int64) + + 
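+        # CTC loss over the frame-level encoder output: x is (B, T', dnn_neurons),
+        # and the absolute frame/label lengths were recovered above from the
+        # relative (padding-ratio) lengths.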
ctc_loss = self.ctc(x, x_lens, target, target_lens) + return ctc_loss + + + @paddle.no_grad() + def decode(self, + feats: paddle.Tensor, + feats_lengths: paddle.Tensor, + text_feature: Dict[str, int], + decoding_method: str, + beam_size: int): + batch_size = feats.shape[0] + if decoding_method is 'ctc_prefix_beam_search' and batch_size > 1: + logger.error( + f'decoding mode {decoding_method} must be running with batch_size == 1' + ) + logger.error(f"current batch_size is {batch_size}") + sys.exit(1) + + if decoding_method == 'ctc_greedy_search': + hyps = self.ctc_greedy_search(feats, feats_lengths) + res = [text_feature.defeaturize(hyp) for hyp in hyps] + res_tokenids = [hyp for hyp in hyps] + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif decoding_method == 'ctc_prefix_beam_search': + assert feats.shape[0] == 1 + hyp = self.ctc_prefix_beam_search( + feats, + feats_lengths, + beam_size) + res = [text_feature.defeaturize(hyp)] + res_tokenids = [hyp] + else: + raise ValueError(f"wav2vec2 not support decoding method: {decoding_method}") + + return res, res_tokenids + + @classmethod + def from_config(cls, config): + model = cls(config) + return model + + def ctc_greedy_search( + self, wav, wavs_lens_rate) -> List[List[int]]: + """ Apply CTC greedy search + Args: + speech (paddle.Tensor): (batch, max_len) + speech_length (paddle.Tensor): (batch, ) + Returns: + List[List[int]]: best path result + """ + batch_size = wav.shape[0] + wav = wav[:,:,0] + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + x = self.enc(feats) + x_lens = x.shape[1] + ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) + topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) + topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + # pad_mask = make_pad_mask(x_lens) # (B, maxlen) + # topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen) + + hyps = [hyp.tolist() for hyp in topk_index] + hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] + return hyps + + def _ctc_prefix_beam_search( + self, wav, wavs_lens_rate, beam_size, blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]: + """ CTC prefix beam search inner implementation + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood) + paddle.Tensor: encoder output, (1, max_len, encoder_dim), + it will be used for rescoring in attention rescoring mode + """ + wav = wav[:,:,0] + + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + + x = self.enc(feats) + maxlen = x.shape[1] + ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size) + ctc_probs = ctc_probs.squeeze(0) + + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + # blank_ending_score and none_blank_ending_score in ln domain + cur_hyps = [(tuple(), (0.0, -float('inf')))] + # 2. CTC beam search step by step + for t in range(0, maxlen): + logp = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # 2.1 First beam prune: select topk best + top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) + for s in top_k_index: + s = s.item() + ps = logp[s].item() + for prefix, (pb, pnb) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == blank_id: # blank + n_pb, n_pnb = next_hyps[prefix] + n_pb = log_add([n_pb, pb + ps, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + elif s == last: + # Update *ss -> *s; + n_pb, n_pnb = next_hyps[prefix] + n_pnb = log_add([n_pnb, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + + # 2.2 Second beam prune + next_hyps = sorted( + next_hyps.items(), + key=lambda x: log_add(list(x[1])), + reverse=True) + cur_hyps = next_hyps[:beam_size] + + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] + return hyps + + def ctc_prefix_beam_search(self, wav, wavs_lens_rate, beam_size) -> List[int]: + """ Apply CTC prefix beam search + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[int]: CTC prefix beam search nbest results + """ + hyps = self._ctc_prefix_beam_search( + wav, wavs_lens_rate, beam_size) + return hyps[0][0] + + # @jit.to_static + def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: + """ Export interface for c++ call, apply linear transform and log + softmax before ctc + Args: + xs (paddle.Tensor): encoder output, (B, T, D) + Returns: + paddle.Tensor: activation before ctc + """ + return self.ctc.log_softmax(xs) + + + def _get_data(self): + data_dir = "data" + wavs = np.load(os.path.join(data_dir, "wavs.npy")) + wavs_lens = np.load(os.path.join(data_dir, "wavs_lens.npy")) + tokens = np.load(os.path.join(data_dir, "tokens.npy")) + tokens_lens = np.load(os.path.join(data_dir, "tokens_lens.npy")) + + batch = (paddle.to_tensor(wavs), paddle.to_tensor(wavs_lens, dtype='float32'), + paddle.to_tensor(tokens, dtype='int32'), paddle.to_tensor(tokens_lens, dtype='float32')) + return batch + + +if __name__ == "__main__": + # wav2vec2_asr = Wav2vec2ASR(config={}) + # wav2vec2_asr.train_batch() + freeze = True + config = Wav2Vec2ConfigPure() + model = Wav2Vec2Model(config) + model_dict = model.state_dict() + revise_params_path = "exp/torch_to_paddle_revise.pdparams" + model_dict_revise = paddle.load(revise_params_path) + model.set_state_dict(model_dict_revise) + model.training = True + model.eval() + if freeze: + for parm in model.parameters(): + parm.requires_grad = False + # get enc() + enc = VanillaNN(input_shape=[None,None,1024], activation=paddle.nn.LeakyReLU, dnn_blocks=2, dnn_neurons=1024) + + ctc = CTC(odim=30, enc_n_units=1024, blank_id=0, dropout_rate=0.0) + + input_values = np.load("input_values.npy") + input_values = paddle.to_tensor(input_values) + + feats = model(input_values).last_hidden_state + x = enc(feats) + ctc_loss = ctc(enc, target) diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py index 34d796145..cacda2461 100644 --- a/paddlespeech/s2t/modules/align.py +++ b/paddlespeech/s2t/modules/align.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math - import paddle from paddle import nn +import math """ To align the initializer between paddle and torch, the API below are set defalut initializer with priority higger than global initializer. 
@@ -82,18 +81,10 @@ class Linear(nn.Linear): name=None): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Linear, self).__init__(in_features, out_features, weight_attr, bias_attr, name) @@ -113,18 +104,10 @@ class Conv1D(nn.Conv1D): data_format='NCL'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Conv1D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) @@ -145,18 +128,10 @@ class Conv2D(nn.Conv2D): data_format='NCHW'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Conv2D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 92990048d..b6d615867 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,6 +15,7 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math +from typing import Optional from typing import Tuple import paddle @@ -82,12 +83,11 @@ class MultiHeadedAttention(nn.Layer): return q, k, v - def forward_attention( - self, - value: paddle.Tensor, + def forward_attention(self, + value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) - ) -> paddle.Tensor: + mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), + ) -> 
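Editor's note: the align.py hunks above only reflow the ParamAttr construction; the initializer call itself is unchanged. A hedged sketch of the same wiring on a standalone Linear layer (the 256/512 sizes are illustrative), matching torch's default kaiming_uniform_ with negative_slope=sqrt(5):

# Sketch: give a paddle Linear the KaimingUniform init used by the aligned layers.
import math
import paddle
from paddle import nn

def kaiming_attr():
    return paddle.ParamAttr(
        initializer=nn.initializer.KaimingUniform(
            fan_in=None,                       # infer fan_in from the parameter shape
            negative_slope=math.sqrt(5),
            nonlinearity='leaky_relu'))

linear = nn.Linear(256, 512, weight_attr=kaiming_attr(), bias_attr=kaiming_attr())
print(linear.weight.shape, linear.bias.shape)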
paddle.Tensor: """Compute attention context vector. Args: value (paddle.Tensor): Transformed value, size @@ -108,7 +108,7 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if paddle.shape(mask)[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) mask = mask[:, :, :, :paddle.shape(scores)[-1]] @@ -127,15 +127,14 @@ class MultiHeadedAttention(nn.Layer): return self.linear_out(x) # (batch, time1, d_model) - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + pos_emb: paddle.Tensor = paddle.empty([0]), + cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -244,15 +243,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): return x - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + pos_emb: paddle.Tensor = paddle.empty([0]), + cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index b35fea5b9..c384b9c78 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -14,6 +14,7 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """ConvolutionModule definition.""" +from typing import Optional from typing import Tuple import paddle @@ -105,12 +106,11 @@ class ConvolutionModule(nn.Layer): ) self.activation = activation - def forward( - self, - x: paddle.Tensor, - mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + x: paddle.Tensor, + mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool), + cache: paddle.Tensor= paddle.zeros([0,0,0]), + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: x (paddle.Tensor): Input tensor (#batch, time, channels). 
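Editor's note: the attention hunks above turn `mask`, `pos_emb` and `cache` into default tensor arguments, with an empty `(0, 0, 0)` mask meaning "no masking" (hence the `paddle.shape(mask)[2] > 0` check). A hedged NumPy sketch of scaled dot-product attention following the same convention; shapes and names are illustrative only:

# "Empty mask means no mask" convention from forward_attention, in NumPy.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def forward_attention(value, scores, mask=np.ones((0, 0, 0), dtype=bool)):
    # value:  (batch, head, time2, d_k)
    # scores: (batch, head, time1, time2)
    # mask:   (batch, 1, time2) of "valid" booleans, or empty -> skip masking
    if mask.shape[-1] > 0:                        # mirrors paddle.shape(mask)[2] > 0
        invalid = ~mask[:, None, :, :]            # (batch, 1, 1, time2), broadcast over heads
        scores = np.where(invalid, -np.inf, scores)
    attn = softmax(scores)                        # (batch, head, time1, time2)
    return attn @ value                           # (batch, head, time1, d_k)

B, H, T, Dk = 2, 4, 5, 8
v = np.random.randn(B, H, T, Dk)
s = np.random.randn(B, H, T, T)
m = np.ones((B, 1, T), dtype=bool)
m[:, :, -2:] = False                              # last two frames are padding
print(forward_attention(v, s).shape, forward_attention(v, s, m).shape)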
@@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # [B, C, T] # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if paddle.shape(cache)[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 0f50db21d..652660e16 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -53,7 +53,7 @@ class CTCDecoderBase(nn.Layer): enc_n_units, blank_id=0, dropout_rate: float=0.0, - reduction: bool=True, + reduction_type: str="sum", batch_average: bool=True, grad_norm_type: Union[str, None]=None): """CTC decoder @@ -73,7 +73,7 @@ class CTCDecoderBase(nn.Layer): self.odim = odim self.dropout = nn.Dropout(dropout_rate) self.ctc_lo = Linear(enc_n_units, self.odim) - reduction_type = "sum" if reduction else "none" + reduction_type = reduction_type if reduction_type else "none" self.criterion = CTCLoss( blank=self.blank_id, reduction=reduction_type, diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index c8843b723..37b124e84 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[ - 0]) + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) if not self.normalize_before: x = self.norm1(x) @@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer): x = self.norm2(x) if self.concat_after: x_concat = paddle.cat( - (x, self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0]) + self.src_attn(x, memory, memory, memory_mask)[0]) if not self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index cf4e32fa4..bff2d69bb 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -14,6 +14,8 @@ # limitations under the License. 
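Editor's note: the ctc.py hunk replaces the boolean `reduction` flag with a `reduction_type` string. A hedged sketch of what "sum" reduction plus `batch_average` means for the loss value; the per-utterance losses below are made-up stand-ins for real CTCLoss output, not the module's implementation:

# Reduction semantics behind the ctc.py hunk, on toy per-utterance losses.
import numpy as np

per_utt_loss = np.array([12.3, 7.9, 15.4, 9.1])     # hypothetical CTC loss per utterance

def reduce_ctc(per_utt, reduction_type="sum", batch_average=True):
    if reduction_type == "none":
        return per_utt                               # keep one loss per utterance
    loss = per_utt.sum()                             # "sum" reduction over the batch
    if batch_average:
        loss = loss / len(per_utt)                   # then average over the batch size
    return loss

print(reduce_ctc(per_utt_loss))                      # scalar, batch-averaged
print(reduce_ctc(per_utt_loss, "none"))              # vector of per-utterance losses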
# Modified from wenet(https://github.com/wenet-e2e/wenet) """Encoder definition.""" +from typing import List +from typing import Optional from typing import Tuple import paddle @@ -175,9 +177,7 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, - paddle.zeros([0, 0, 0, 0]), - paddle.zeros([0, 0, 0, 0])) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]), - att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), + cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), + att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: @@ -227,7 +227,7 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers = paddle.shape(att_cache)[0] @@ -252,17 +252,14 @@ class BaseEncoder(nn.Layer): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) xs, _, new_att_cache, new_cnn_cache = layer( - xs, - att_mask, - pos_emb, - mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i + 1] - if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) + xs, att_mask, pos_emb, + att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, + cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_att_cache.append(new_att_cache[:,:, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) @@ -273,6 +270,7 @@ class BaseEncoder(nn.Layer): r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache + def forward_chunk_by_chunk( self, xs: paddle.Tensor, @@ -317,8 +315,8 @@ class BaseEncoder(nn.Layer): num_frames = xs.shape[1] required_cache_size = decoding_chunk_size * num_decoding_left_chunks - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) + att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) outputs = [] offset = 0 @@ -328,8 +326,7 @@ class BaseEncoder(nn.Layer): chunk_xs = xs[:, cur:end, :] (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache, - paddle.ones([0, 0, 0], dtype=paddle.bool)) + chunk_xs, offset, required_cache_size, att_cache, cnn_cache) outputs.append(y) offset += y.shape[1] diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 
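Editor's note: the encoder hunk above gives `forward_chunk` default empty attention/CNN caches and lets `forward_chunk_by_chunk` thread them between calls. A hedged, stubbed sketch of that streaming loop; the encoder below is a trivial stand-in (it just doubles its input and accumulates frames), not the real Conformer:

# Stubbed sketch of chunk-by-chunk decoding with caches carried across calls.
import numpy as np

class StubStreamingEncoder:
    """Trivial stand-in exposing the same forward_chunk contract as the hunk."""
    def forward_chunk(self, chunk_xs, offset, required_cache_size,
                      att_cache=np.zeros((0, 0)), cnn_cache=np.zeros((0, 0))):
        y = chunk_xs * 2.0                                     # placeholder "encoding"
        hist = chunk_xs if att_cache.size == 0 else np.concatenate(
            [att_cache, chunk_xs], axis=0)                     # frames seen so far
        if required_cache_size > 0:                            # trim to the cache budget
            hist = hist[-required_cache_size:]
        elif required_cache_size == 0:
            hist = hist[:0]
        return y, hist, cnn_cache                              # cnn cache unused in the stub

def forward_chunk_by_chunk(encoder, xs, decoding_chunk_size=4, num_left_chunks=2):
    required_cache_size = decoding_chunk_size * num_left_chunks
    att_cache = np.zeros((0, xs.shape[1]))
    cnn_cache = np.zeros((0, 0))
    outputs, offset = [], 0
    for cur in range(0, xs.shape[0], decoding_chunk_size):
        chunk = xs[cur:cur + decoding_chunk_size]
        y, att_cache, cnn_cache = encoder.forward_chunk(
            chunk, offset, required_cache_size, att_cache, cnn_cache)
        outputs.append(y)
        offset += y.shape[0]
    return np.concatenate(outputs, axis=0)

feats = np.random.randn(10, 16)                                # (frames, feat_dim)
print(forward_chunk_by_chunk(StubStreamingEncoder(), feats).shape)   # (10, 16)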
4555b535f..5f810dfde 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle. - Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn( - x, x, x, mask, paddle.empty([0]), cache=att_cache) + x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) @@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py index 6eae5713e..cdcf2e052 100644 --- a/paddlespeech/s2t/modules/initializer.py +++ b/paddlespeech/s2t/modules/initializer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import numpy as np class DefaultInitializerContext(object): """ diff --git a/paddlespeech/s2t/training/optimizer.py b/paddlespeech/s2t/training/optimizer.py index f7f70c570..75d3f5f5c 100644 --- a/paddlespeech/s2t/training/optimizer.py +++ b/paddlespeech/s2t/training/optimizer.py @@ -103,6 +103,8 @@ class OptimizerFactory(): grad_clip = ClipGradByGlobalNormWithLog( args['grad_clip']) if "grad_clip" in args else None + # grad_clip = paddle.nn.ClipGradByGlobalNorm( + # args['grad_clip']) if "grad_clip" in args else None weight_decay = L2Decay( args['weight_decay']) if "weight_decay" in args else None if weight_decay: diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py index b22f7ef85..3464e2299 100644 --- a/paddlespeech/s2t/training/scheduler.py +++ b/paddlespeech/s2t/training/scheduler.py @@ -106,6 +106,59 @@ class ConstantLR(LRScheduler): def get_lr(self): return self.base_lr +@register_scheduler +class NewBobScheduler(LRScheduler): + """ + Args: + learning_rate (float): The initial learning rate. It is a python float number. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``ConstantLR`` instance to schedule learning rate. 
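Editor's note: the optimizer hunk toggles between `ClipGradByGlobalNormWithLog` and the commented-out `paddle.nn.ClipGradByGlobalNorm`; both implement the same clip-by-global-norm rule. A hedged NumPy sketch of that rule on toy gradients (not the framework's implementation):

# Clip-by-global-norm: if the L2 norm over all gradients exceeds clip_norm,
# every gradient is scaled by clip_norm / global_norm.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    if global_norm <= clip_norm:
        return grads, global_norm
    scale = clip_norm / global_norm
    return [g * scale for g in grads], global_norm

grads = [np.random.randn(3, 3) * 10, np.random.randn(5) * 10]
clipped, norm_before = clip_by_global_norm(grads, clip_norm=5.0)
norm_after = np.sqrt(sum(float((g ** 2).sum()) for g in clipped))
print(norm_before, norm_after)    # norm_after is at most 5.0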
+ """ + def __init__( + self, + learning_rate, + annealing_factor=0.5, + improvement_threshold=0.0025, + patient=0, + ): + self.hyperparam_value = learning_rate + self.annealing_factor = annealing_factor + self.improvement_threshold = improvement_threshold + self.patient = patient + self.metric_values = [] + self.current_patient = self.patient + + def __call__(self, metric_value): + """Returns the current and new value for the hyperparameter. + + Arguments + --------- + metric_value : int + A number for determining whether to change the hyperparameter value. + """ + old_value = new_value = self.hyperparam_value + if len(self.metric_values) > 0: + prev_metric = self.metric_values[-1] + # Update value if improvement too small and patience is 0 + if prev_metric == 0: # Prevent division by zero + improvement = 0 + else: + improvement = (prev_metric - metric_value) / prev_metric + if improvement < self.improvement_threshold: + if self.current_patient == 0: + new_value *= self.annealing_factor + self.current_patient = self.patient + else: + self.current_patient -= 1 + # Store relevant info + self.metric_values.append(metric_value) + self.hyperparam_value = new_value + + return old_value, new_value + def dynamic_import_scheduler(module): """Import Scheduler class dynamically. diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index a7eb9892d..815b61e0f 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -19,6 +19,8 @@ from pathlib import Path import paddle from paddle import distributed as dist +dist.init_parallel_env() + from visualdl import LogWriter from paddlespeech.s2t.training.reporter import ObsScope @@ -130,7 +132,9 @@ class Trainer(): latest_n=self.config.checkpoint.latest_n) # set random seed if needed + print(args.seed) if args.seed: + print('***********') seed_all(args.seed) logger.info(f"Set seed {args.seed}") @@ -176,7 +180,7 @@ class Trainer(): def init_parallel(self): """Init environment for multiprocess training. 
""" - dist.init_parallel_env() + # dist.init_parallel_env() @mp_tools.rank_zero_only def save(self, tag=None, infos: dict=None): diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 55f241ec7..8650154e9 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -25,7 +25,6 @@ asr_python: cfg_path: # [optional] ckpt_path: # [optional] decode_method: 'attention_rescoring' - num_decoding_left_chunks: -1 force_yes: True device: # set 'gpu:id' or 'cpu' @@ -39,7 +38,6 @@ asr_inference: lang: 'zh' sample_rate: 16000 cfg_path: - num_decoding_left_chunks: -1 decode_method: force_yes: True diff --git a/paddlespeech/server/engine/asr/online/ctc_endpoint.py b/paddlespeech/server/engine/asr/online/ctc_endpoint.py index 1b8ad1cb7..b87dbe805 100644 --- a/paddlespeech/server/engine/asr/online/ctc_endpoint.py +++ b/paddlespeech/server/engine/asr/online/ctc_endpoint.py @@ -102,10 +102,8 @@ class OnlineCTCEndpoint: assert self.num_frames_decoded >= self.trailing_silence_frames assert self.frame_shift_in_ms > 0 - - decoding_something = ( - self.num_frames_decoded > self.trailing_silence_frames - ) and decoding_something + + decoding_something = (self.num_frames_decoded > self.trailing_silence_frames) and decoding_something utterance_length = self.num_frames_decoded * self.frame_shift_in_ms trailing_silence = self.trailing_silence_frames * self.frame_shift_in_ms diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py index 6daae5be3..ab4f11305 100644 --- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py @@ -21,12 +21,12 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils import onnx_infer diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py index 0fd5d1bc6..182e64180 100644 --- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py @@ -21,10 +21,10 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 87d88ee60..4df38f09d 100644 --- 
a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -21,10 +21,10 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.tensor_utils import add_sos_eos @@ -130,8 +130,8 @@ class PaddleASRConnectionHanddler: ## conformer # cache for conformer online - self.att_cache = paddle.zeros([0, 0, 0, 0]) - self.cnn_cache = paddle.zeros([0, 0, 0, 0]) + self.att_cache = paddle.zeros([0,0,0,0]) + self.cnn_cache = paddle.zeros([0,0,0,0]) self.encoder_out = None # conformer decoding state @@ -474,14 +474,9 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, - self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, - self.offset, - required_cache_size, - att_cache=self.att_cache, - cnn_cache=self.cnn_cache, - att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool)) + (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + chunk_xs, self.offset, required_cache_size, + self.att_cache, self.cnn_cache) outputs.append(y) # update the global offset, in decoding frame unit diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index e297e5c21..02c40fd12 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -68,12 +68,9 @@ class ASREngine(BaseEngine): return False self.executor._init_from_path( - model_type=self.config.model, - lang=self.config.lang, - sample_rate=self.config.sample_rate, - cfg_path=self.config.cfg_path, - decode_method=self.config.decode_method, - ckpt_path=self.config.ckpt_path) + self.config.model, self.config.lang, self.config.sample_rate, + self.config.cfg_path, self.config.decode_method, + self.config.ckpt_path) logger.info("Initialize ASR server engine successfully on device: %s." % (self.device)) diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 7b8f667db..f7d60648d 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -105,8 +105,7 @@ class PaddleVectorConnectionHandler: # we can not reuse the cache io.BytesIO(audio) data, # because the soundfile will change the io.BytesIO(audio) to the end # thus we should convert the base64 string to io.BytesIO when we need the audio data - if not self.executor._check( - io.BytesIO(audio), sample_rate, force_yes=True): + if not self.executor._check(io.BytesIO(audio), sample_rate): logger.debug("check the audio sample rate occurs error") return np.array([0.0]) diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index c00648b1f..2cb7a11a2 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -11,12 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import Collection +from typing import Dict +from typing import List +from typing import Tuple + import numpy as np import paddle from paddlespeech.t2s.datasets.batch import batch_sequences +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.modules.nets_utils import get_seg_pos from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import pad_list from paddlespeech.t2s.modules.nets_utils import phones_masking from paddlespeech.t2s.modules.nets_utils import phones_text_masking @@ -485,56 +492,180 @@ def vits_single_spk_batch_fn(examples): return batch -def vits_multi_spk_batch_fn(examples): - """ - Returns: - Dict[str, Any]: - - text (Tensor): Text index tensor (B, T_text). - - text_lengths (Tensor): Text length tensor (B,). - - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - - feats_lengths (Tensor): Feature length tensor (B,). - - speech (Tensor): Speech waveform tensor (B, T_wav). - - spk_id (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - - spk_emb (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - """ - # fields = ["text", "text_lengths", "feats", "feats_lengths", "speech", "spk_id"/"spk_emb"] - text = [np.array(item["text"], dtype=np.int64) for item in examples] - feats = [np.array(item["feats"], dtype=np.float32) for item in examples] - speech = [np.array(item["wave"], dtype=np.float32) for item in examples] - text_lengths = [ - np.array(item["text_lengths"], dtype=np.int64) for item in examples - ] - feats_lengths = [ - np.array(item["feats_lengths"], dtype=np.int64) for item in examples - ] +# for ERNIE SAT +class MLMCollateFn: + """Functor class of common_collate_fn()""" - text = batch_sequences(text) - feats = batch_sequences(feats) - speech = batch_sequences(speech) + def __init__( + self, + feats_extract, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + text_masking: bool=False, + attention_window: int=0, + not_sequence: Collection[str]=(), ): + self.mlm_prob = mlm_prob + self.mean_phn_span = mean_phn_span + self.feats_extract = feats_extract + self.not_sequence = set(not_sequence) + self.attention_window = attention_window + self.seg_emb = seg_emb + self.text_masking = text_masking - # convert each batch to paddle.Tensor - text = paddle.to_tensor(text) + def __call__(self, data: Collection[Tuple[str, Dict[str, np.ndarray]]] + ) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + return mlm_collate_fn( + data, + feats_extract=self.feats_extract, + mlm_prob=self.mlm_prob, + mean_phn_span=self.mean_phn_span, + seg_emb=self.seg_emb, + text_masking=self.text_masking, + not_sequence=self.not_sequence) + + +def mlm_collate_fn( + data: Collection[Tuple[str, Dict[str, np.ndarray]]], + feats_extract=None, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + text_masking: bool=False, + pad_value: int=0, + not_sequence: Collection[str]=(), +) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + uttids = [u for u, _ in data] + data = [d for _, d in data] + + assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching" + assert all(not k.endswith("_lens") + for k in data[0]), f"*_lens is reserved: {list(data[0])}" + + output = {} + for key in data[0]: + + array_list = [d[key] for d in data] + + # Assume the first axis is length: + # tensor_list: Batch x (Length, ...) 
+ tensor_list = [paddle.to_tensor(a) for a in array_list] + # tensor: (Batch, Length, ...) + tensor = pad_list(tensor_list, pad_value) + output[key] = tensor + + # lens: (Batch,) + if key not in not_sequence: + lens = paddle.to_tensor( + [d[key].shape[0] for d in data], dtype=paddle.int64) + output[key + "_lens"] = lens + + feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0])) feats = paddle.to_tensor(feats) - text_lengths = paddle.to_tensor(text_lengths) - feats_lengths = paddle.to_tensor(feats_lengths) + print("feats.shape:", feats.shape) + feats_lens = paddle.shape(feats)[0] + feats = paddle.unsqueeze(feats, 0) - batch = { - "text": text, - "text_lengths": text_lengths, - "feats": feats, - "feats_lengths": feats_lengths, - "speech": speech - } - # spk_emb has a higher priority than spk_id - if "spk_emb" in examples[0]: - spk_emb = [ - np.array(item["spk_emb"], dtype=np.float32) for item in examples - ] - spk_emb = batch_sequences(spk_emb) - spk_emb = paddle.to_tensor(spk_emb) - batch["spk_emb"] = spk_emb - elif "spk_id" in examples[0]: - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] - spk_id = paddle.to_tensor(spk_id) - batch["spk_id"] = spk_id - return batch + text = output["text"] + text_lens = output["text_lens"] + align_start = output["align_start"] + align_start_lens = output["align_start_lens"] + align_end = output["align_end"] + + max_tlen = max(text_lens) + max_slen = max(feats_lens) + + speech_pad = feats[:, :max_slen] + + text_pad = text + text_mask = make_non_pad_mask( + text_lens, text_pad, length_dim=1).unsqueeze(-2) + speech_mask = make_non_pad_mask( + feats_lens, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2) + + span_bdy = None + if 'span_bdy' in output.keys(): + span_bdy = output['span_bdy'] + + # dual_mask 的是混合中英时候同时 mask 语音和文本 + # ernie sat 在实现跨语言的时候都 mask 了 + if text_masking: + masked_pos, text_masked_pos = phones_text_masking( + xs_pad=speech_pad, + src_mask=speech_mask, + text_pad=text_pad, + text_mask=text_mask, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + mlm_prob=mlm_prob, + mean_phn_span=mean_phn_span, + span_bdy=span_bdy) + # 训练纯中文和纯英文的 -> a3t 没有对 phoneme 做 mask, 只对语音 mask 了 + # a3t 和 ernie sat 的区别主要在于做 mask 的时候 + else: + masked_pos = phones_masking( + xs_pad=speech_pad, + src_mask=speech_mask, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + mlm_prob=mlm_prob, + mean_phn_span=mean_phn_span, + span_bdy=span_bdy) + text_masked_pos = paddle.zeros(paddle.shape(text_pad)) + + output_dict = {} + + speech_seg_pos, text_seg_pos = get_seg_pos( + speech_pad=speech_pad, + text_pad=text_pad, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + seg_emb=seg_emb) + output_dict['speech'] = speech_pad + output_dict['text'] = text_pad + output_dict['masked_pos'] = masked_pos + output_dict['text_masked_pos'] = text_masked_pos + output_dict['speech_mask'] = speech_mask + output_dict['text_mask'] = text_mask + output_dict['speech_seg_pos'] = speech_seg_pos + output_dict['text_seg_pos'] = text_seg_pos + output = (uttids, output_dict) + return output + + +def build_mlm_collate_fn( + sr: int=24000, + n_fft: int=2048, + hop_length: int=300, + win_length: int=None, + n_mels: int=80, + fmin: int=80, + fmax: int=7600, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + epoch: int=-1, ): + feats_extract_class = LogMelFBank + + feats_extract = feats_extract_class( + sr=sr, + n_fft=n_fft, + 
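Editor's note: the core of `mlm_collate_fn` above is padding variable-length arrays along the first axis and recording a `<key>_lens` entry per key. A hedged NumPy sketch of that step; `pad_list` below is a local stand-in, not the helper imported from paddlespeech.t2s.modules.nets_utils, and the data is toy:

# Pad a batch of variable-length arrays and keep their original lengths.
import numpy as np

def pad_list(arrays, pad_value=0.0):
    max_len = max(a.shape[0] for a in arrays)
    out = np.full((len(arrays), max_len) + arrays[0].shape[1:], pad_value,
                  dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[0]] = a
    return out

data = [{"text": np.array([3, 7, 9])}, {"text": np.array([5, 2])}]
output = {}
for key in data[0]:
    array_list = [d[key] for d in data]
    output[key] = pad_list(array_list, pad_value=0)
    output[key + "_lens"] = np.array([a.shape[0] for a in array_list], dtype=np.int64)

print(output["text"])        # (2, 3) padded batch
print(output["text_lens"])   # [3 2]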
hop_length=hop_length, + win_length=win_length, + n_mels=n_mels, + fmin=fmin, + fmax=fmax) + + if epoch == -1: + mlm_prob_factor = 1 + else: + mlm_prob_factor = 0.8 + + return MLMCollateFn( + feats_extract=feats_extract, + mlm_prob=mlm_prob * mlm_prob_factor, + mean_phn_span=mean_phn_span, + seg_emb=seg_emb) diff --git a/paddlespeech/t2s/datasets/sampler.py b/paddlespeech/t2s/datasets/sampler.py index cbc9764c5..a69bc8600 100644 --- a/paddlespeech/t2s/datasets/sampler.py +++ b/paddlespeech/t2s/datasets/sampler.py @@ -1,9 +1,8 @@ +import paddle import math - import numpy as np from paddle.io import BatchSampler - class ErnieSATSampler(BatchSampler): """Sampler that restricts data loading to a subset of the dataset. In such case, each process can pass a DistributedBatchSampler instance @@ -71,7 +70,7 @@ class ErnieSATSampler(BatchSampler): assert isinstance(drop_last, bool), \ "drop_last should be a boolean number" - from paddle.distributed import ParallelEnv + from paddle.fluid.dygraph.parallel import ParallelEnv if num_replicas is not None: assert isinstance(num_replicas, int) and num_replicas > 0, \ @@ -111,8 +110,8 @@ class ErnieSATSampler(BatchSampler): subsampled_indices.extend(indices[i:i + self.batch_size]) indices = indices[len(indices) - last_batch_size:] - subsampled_indices.extend( - indices[self.local_rank * last_local_batch_size:( + subsampled_indices.extend(indices[ + self.local_rank * last_local_batch_size:( self.local_rank + 1) * last_local_batch_size]) return subsampled_indices diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index 464f51a3b..529a8221c 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -19,9 +19,9 @@ import librosa import numpy as np import pypinyin from praatio import textgrid - -from paddlespeech.t2s.exps.ernie_sat.utils import get_dict from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name +from paddlespeech.t2s.exps.ernie_sat.utils import get_dict + DICT_EN = 'tools/aligner/cmudict-0.7b' DICT_ZH = 'tools/aligner/simple.lexicon' @@ -30,7 +30,6 @@ MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip' MFA_PATH = 'tools/montreal-forced-aligner/bin' os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH'] - def _get_max_idx(dic): return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1] @@ -107,11 +106,11 @@ def alignment(wav_path: str, wav_name = os.path.basename(wav_path) utt = wav_name.split('.')[0] # prepare data for MFA - tmp_name = get_tmp_name(text=text) + tmp_name = get_tmp_name(text=text) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) - print("tmp_name in alignment:", tmp_name) + print("tmp_name in alignment:",tmp_name) shutil.copyfile(wav_path, tmpbase / wav_name) txt_name = utt + '.txt' @@ -341,7 +340,7 @@ def get_phns_spans(wav_path: str, if __name__ == '__main__': text = "For that reason cover should not be given." 
- phn, dur, word2phns = alignment("source/p243_313.wav", text, lang='en') + phn, dur, word2phns = alignment("exp/p243_313.wav", text, lang='en') print(phn, dur) print(word2phns) print("---------------------------------") @@ -353,7 +352,7 @@ if __name__ == '__main__': style=pypinyin.Style.TONE3, tone_sandhi=True) text_zh = " ".join(text_zh) - phn, dur, word2phns = alignment("source/000001.wav", text_zh, lang='zh') + phn, dur, word2phns = alignment("exp/000001.wav", text_zh, lang='zh') print(phn, dur) print(word2phns) print("---------------------------------") @@ -368,7 +367,7 @@ if __name__ == '__main__': print("---------------------------------") outs = get_phns_spans( - wav_path="source/p243_313.wav", + wav_path="exp/p243_313.wav", old_str="For that reason cover should not be given.", new_str="for that reason cover is impossible to be given.") diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index 21c9ae044..95b07367c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -11,41 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse -import os -from pathlib import Path -from typing import List - import librosa import numpy as np -import paddle -import pypinyin import soundfile as sf -import yaml -from pypinyin_dict.phrase_pinyin_data import large_pinyin -from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn -from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.exps.ernie_sat.align import get_phns_spans from paddlespeech.t2s.exps.ernie_sat.utils import eval_durs from paddlespeech.t2s.exps.ernie_sat.utils import get_dur_adj_factor from paddlespeech.t2s.exps.ernie_sat.utils import get_span_bdy -from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name -from paddlespeech.t2s.exps.syn_utils import get_am_inference -from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.exps.syn_utils import norm -from paddlespeech.t2s.utils import str2bool -large_pinyin.load() +from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name -def _p2id(phonemes: List[str]) -> np.ndarray: + + + + +def _p2id(self, phonemes: List[str]) -> np.ndarray: # replace unk phone with sp - phonemes = [phn if phn in vocab_phones else "sp" for phn in phonemes] + phonemes = [ + phn if phn in vocab_phones else "sp" for phn in phonemes + ] phone_ids = [vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) + def prep_feats_with_dur(wav_path: str, old_str: str='', new_str: str='', @@ -73,12 +67,12 @@ def prep_feats_with_dur(wav_path: str, fs=fs, n_shift=n_shift) - mfa_start = phns_spans_outs['mfa_start'] - mfa_end = phns_spans_outs['mfa_end'] - old_phns = phns_spans_outs['old_phns'] - new_phns = phns_spans_outs['new_phns'] - span_to_repl = phns_spans_outs['span_to_repl'] - span_to_add = phns_spans_outs['span_to_add'] + mfa_start = phns_spans_outs["mfa_start"] + mfa_end = phns_spans_outs["mfa_end"] + old_phns = phns_spans_outs["old_phns"] + new_phns = phns_spans_outs["new_phns"] + span_to_repl = 
phns_spans_outs["span_to_repl"] + span_to_add = phns_spans_outs["span_to_add"] # 中文的 phns 不一定都在 fastspeech2 的字典里, 用 sp 代替 if target_lang in {'en', 'zh'}: @@ -138,7 +132,7 @@ def prep_feats_with_dur(wav_path: str, [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]]) # 音频是正常遮住了 - sf.write(str("mask_wav.wav"), new_wav, samplerate=fs) + sf.write(str("new_wav.wav"), new_wav, samplerate=fs) # 4. get old and new mel span to be mask old_span_bdy = get_span_bdy( @@ -158,6 +152,8 @@ def prep_feats_with_dur(wav_path: str, return outs + + def prep_feats(wav_path: str, old_str: str='', new_str: str='', @@ -167,7 +163,7 @@ def prep_feats(wav_path: str, fs: int=24000, n_shift: int=300): - with_dur_outs = prep_feats_with_dur( + outs = prep_feats_with_dur( wav_path=wav_path, old_str=old_str, new_str=new_str, @@ -180,240 +176,138 @@ def prep_feats(wav_path: str, wav_name = os.path.basename(wav_path) utt_id = wav_name.split('.')[0] - wav = with_dur_outs['new_wav'] - phns = with_dur_outs['new_phns'] - mfa_start = with_dur_outs['new_mfa_start'] - mfa_end = with_dur_outs['new_mfa_end'] - old_span_bdy = with_dur_outs['old_span_bdy'] - new_span_bdy = with_dur_outs['new_span_bdy'] + wav = outs['new_wav'] + phns = outs['new_phns'] + mfa_start = outs['new_mfa_start'] + mfa_end = outs['new_mfa_end'] + old_span_bdy = outs['old_span_bdy'] + new_span_bdy = outs['new_span_bdy'] span_bdy = np.array(new_span_bdy) + text = _p2id(phns) mel = mel_extractor.get_log_mel_fbank(wav) erniesat_mean, erniesat_std = np.load(erniesat_stat) normed_mel = norm(mel, erniesat_mean, erniesat_std) - tmp_name = get_tmp_name(text=old_str) + tmp_name = get_tmp_name(text=old_str) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) + print("tmp_name in synthesize_e2e:",tmp_name) mel_path = tmpbase / 'mel.npy' - np.save(mel_path, normed_mel) + print("mel_path:",mel_path) + np.save(mel_path, logmel) durations = [e - s for e, s in zip(mfa_end, mfa_start)] - text = _p2id(phns) - datum = { - "utt_id": utt_id, - "spk_id": 0, - "text": text, - "text_lengths": len(text), - "speech_lengths": len(normed_mel), - "durations": durations, - "speech": np.load(mel_path), - "align_start": mfa_start, + datum={ + "utt_id": utt_id, + "spk_id": 0, + "text": text, + "text_lengths": len(text), + "speech_lengths": 115, + "durations": durations, + "speech": mel_path, + "align_start": mfa_start, "align_end": mfa_end, "span_bdy": span_bdy } batch = collate_fn([datum]) - outs = dict() - outs['batch'] = batch - outs['old_span_bdy'] = old_span_bdy - outs['new_span_bdy'] = new_span_bdy - return outs - - -def get_mlm_output(wav_path: str, - old_str: str='', - new_str: str='', - source_lang: str='en', - target_lang: str='en', - duration_adjust: bool=True, - fs: int=24000, - n_shift: int=300): - - prep_feats_outs = prep_feats( + print("batch:",batch) + + return batch, old_span_bdy, new_span_bdy + + +def decode_with_model(mlm_model: nn.Layer, + collate_fn, + wav_path: str, + old_str: str='', + new_str: str='', + source_lang: str='en', + target_lang: str='en', + use_teacher_forcing: bool=False, + duration_adjust: bool=True, + fs: int=24000, + n_shift: int=300, + token_list: List[str]=[]): + batch, old_span_bdy, new_span_bdy = prep_feats( + source_lang=source_lang, + target_lang=target_lang, wav_path=wav_path, old_str=old_str, new_str=new_str, - source_lang=source_lang, - target_lang=target_lang, duration_adjust=duration_adjust, fs=fs, - n_shift=n_shift) + n_shift=n_shift, + token_list=token_list) + - batch = 
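Editor's note: `prep_feats` above normalizes the extracted log-mel with the training statistics via `norm(mel, erniesat_mean, erniesat_std)`. A hedged sketch of that normalization and its inverse; the statistics below are random stand-ins for the erniesat_stat file:

# Per-bin mean/variance normalization of the log-mel features and its inverse.
import numpy as np

def norm(data, mean, std):
    return (data - mean) / std

def denorm(data, mean, std):
    return data * std + mean

mel = np.random.randn(115, 80)                  # (frames, n_mels) toy log-mel fbank
erniesat_mean, erniesat_std = np.zeros(80), np.ones(80) * 2.0
normed_mel = norm(mel, erniesat_mean, erniesat_std)
assert np.allclose(denorm(normed_mel, erniesat_mean, erniesat_std), mel)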
prep_feats_outs['batch'] - new_span_bdy = prep_feats_outs['new_span_bdy'] - old_span_bdy = prep_feats_outs['old_span_bdy'] - out_mels = erniesat_inference( - speech=batch['speech'], - text=batch['text'], - masked_pos=batch['masked_pos'], - speech_mask=batch['speech_mask'], - text_mask=batch['text_mask'], - speech_seg_pos=batch['speech_seg_pos'], - text_seg_pos=batch['text_seg_pos'], - span_bdy=new_span_bdy) + feats = collate_fn(batch)[1] + + if 'text_masked_pos' in feats.keys(): + feats.pop('text_masked_pos') + + output = mlm_model.inference( + text=feats['text'], + speech=feats['speech'], + masked_pos=feats['masked_pos'], + speech_mask=feats['speech_mask'], + text_mask=feats['text_mask'], + speech_seg_pos=feats['speech_seg_pos'], + text_seg_pos=feats['text_seg_pos'], + span_bdy=new_span_bdy, + use_teacher_forcing=use_teacher_forcing) # 拼接音频 - output_feat = paddle.concat(x=out_mels, axis=0) + output_feat = paddle.concat(x=output, axis=0) wav_org, _ = librosa.load(wav_path, sr=fs) - outs = dict() - outs['wav_org'] = wav_org - outs['output_feat'] = output_feat - outs['old_span_bdy'] = old_span_bdy - outs['new_span_bdy'] = new_span_bdy - - return outs + return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length -def get_wav(wav_path: str, - source_lang: str='en', - target_lang: str='en', - old_str: str='', - new_str: str='', - duration_adjust: bool=True, - fs: int=24000, - n_shift: int=300): +if __name__ == '__main__': + fs = 24000 + n_shift = 300 + wav_path = "exp/p243_313.wav" + old_str = "For that reason cover should not be given." + # for edit + # new_str = "for that reason cover is impossible to be given." + # for synthesize + append_str = "do you love me i love you so much" + new_str = old_str + append_str - outs = get_mlm_output( + ''' + outs = prep_feats_with_dur( wav_path=wav_path, old_str=old_str, new_str=new_str, - source_lang=source_lang, - target_lang=target_lang, - duration_adjust=duration_adjust, fs=fs, n_shift=n_shift) - wav_org = outs['wav_org'] - output_feat = outs['output_feat'] + new_wav = outs['new_wav'] + new_phns = outs['new_phns'] + new_mfa_start = outs['new_mfa_start'] + new_mfa_end = outs['new_mfa_end'] old_span_bdy = outs['old_span_bdy'] new_span_bdy = outs['new_span_bdy'] - masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]] - - with paddle.no_grad(): - alt_wav = voc_inference(masked_feat) - alt_wav = np.squeeze(alt_wav) - - old_time_bdy = [n_shift * x for x in old_span_bdy] - wav_replaced = np.concatenate( - [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) - - wav_dict = {"origin": wav_org, "output": wav_replaced} - return wav_dict - - -def parse_args(): - # parse args and config - parser = argparse.ArgumentParser( - description="Synthesize with acoustic model & vocoder") - # ernie sat - - parser.add_argument( - '--erniesat_config', - type=str, - default=None, - help='Config of acoustic model.') - parser.add_argument( - '--erniesat_ckpt', - type=str, - default=None, - help='Checkpoint file of acoustic model.') - parser.add_argument( - "--erniesat_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
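Editor's note: the splice step above converts frame-level span boundaries to sample indices with the hop size `n_shift` and drops the vocoded segment into the original waveform. A hedged NumPy sketch of that step; the "vocoder" is a dummy and all values are toy:

# Replace a span of the original waveform with the re-synthesized segment.
import numpy as np

fs, n_shift = 24000, 300
wav_org = np.random.randn(fs * 2)                   # 2 s of "original" audio
old_span_bdy = [40, 80]                             # mel frames to replace in the original
masked_feat = np.random.randn(60, 80)               # re-generated mel for the new span

def dummy_voc_inference(mel):
    return np.random.randn(mel.shape[0] * n_shift)  # one hop of samples per mel frame

alt_wav = dummy_voc_inference(masked_feat)
old_time_bdy = [n_shift * x for x in old_span_bdy]  # frame index -> sample index
wav_replaced = np.concatenate(
    [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
print(len(wav_org), len(alt_wav), len(wav_replaced))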
- ) - parser.add_argument( - "--phones_dict", type=str, default=None, help="phone vocabulary file.") - # vocoder - parser.add_argument( - '--voc', - type=str, - default='pwgan_csmsc', - choices=[ - 'pwgan_aishell3', - 'pwgan_vctk', - 'hifigan_aishell3', - 'hifigan_vctk', - ], - help='Choose vocoder type of tts task.') - parser.add_argument( - '--voc_config', type=str, default=None, help='Config of voc.') - parser.add_argument( - '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') - parser.add_argument( - "--voc_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training voc." - ) - # other - parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - - # ernie sat related - parser.add_argument("--task_name", type=str, help="task name") - parser.add_argument("--wav_path", type=str, help="path of old wav") - parser.add_argument("--old_str", type=str, help="old string") - parser.add_argument("--new_str", type=str, help="new string") - parser.add_argument( - "--source_lang", type=str, default="en", help="source language") - parser.add_argument( - "--target_lang", type=str, default="en", help="target language") - parser.add_argument( - "--duration_adjust", - type=str2bool, - default=True, - help="whether to adjust duration.") - parser.add_argument("--output_name", type=str, default="output.wav") - - args = parser.parse_args() - return args - + print("---------------------------------") -if __name__ == '__main__': - args = parse_args() + print("new_wav:", new_wav) + print("new_phns:", new_phns) + print("new_mfa_start:", new_mfa_start) + print("new_mfa_end:", new_mfa_end) + print("old_span_bdy:", old_span_bdy) + print("new_span_bdy:", new_span_bdy) + print("---------------------------------") + ''' - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: - paddle.set_device("gpu") - else: - print("ngpu should >= 0 !") + erniesat_config = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/local/default.yaml" - # evaluate(args) - with open(args.erniesat_config) as f: + with open(erniesat_config) as f: erniesat_config = CfgNode(yaml.safe_load(f)) - old_str = args.old_str - new_str = args.new_str - - # convert Chinese characters to pinyin - if args.source_lang == 'zh': - old_str = pypinyin.lazy_pinyin( - old_str, - neutral_tone_with_five=True, - style=pypinyin.Style.TONE3, - tone_sandhi=True) - old_str = ' '.join(old_str) - if args.target_lang == 'zh': - new_str = pypinyin.lazy_pinyin( - new_str, - neutral_tone_with_five=True, - style=pypinyin.Style.TONE3, - tone_sandhi=True) - new_str = ' '.join(new_str) - - if args.task_name == 'edit': - new_str = new_str - elif args.task_name == 'synthesize': - new_str = old_str + new_str - else: - new_str = old_str + new_str - print("new_str:", new_str) + + erniesat_stat = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/train/speech_stats.npy" # Extractor mel_extractor = LogMelFBank( @@ -425,51 +319,28 @@ if __name__ == '__main__': n_mels=erniesat_config.n_mels, fmin=erniesat_config.fmin, fmax=erniesat_config.fmax) + + collate_fn = build_erniesat_collate_fn( mlm_prob=erniesat_config.mlm_prob, mean_phn_span=erniesat_config.mean_phn_span, seg_emb=erniesat_config.model['enc_input_layer'] == 'sega_mlm', text_masking=False) - + + phones_dict='/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/phone_id_map.txt' vocab_phones = {} - with open(args.phones_dict, 'rt') as 
f: + with open(phones_dict, 'rt') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) - # ernie sat model - erniesat_inference = get_am_inference( - am='erniesat_dataset', - am_config=erniesat_config, - am_ckpt=args.erniesat_ckpt, - am_stat=args.erniesat_stat, - phones_dict=args.phones_dict) - - with open(args.voc_config) as f: - voc_config = CfgNode(yaml.safe_load(f)) - - # vocoder - voc_inference = get_voc_inference( - voc=args.voc, - voc_config=voc_config, - voc_ckpt=args.voc_ckpt, - voc_stat=args.voc_stat) - - erniesat_stat = args.erniesat_stat - - wav_dict = get_wav( - wav_path=args.wav_path, - source_lang=args.source_lang, - target_lang=args.target_lang, - old_str=old_str, - new_str=new_str, - duration_adjust=args.duration_adjust, - fs=erniesat_config.fs, - n_shift=erniesat_config.n_shift) - - sf.write( - args.output_name, wav_dict['output'], samplerate=erniesat_config.fs) - print( - f"\033[1;32;m Generated audio saved into {args.output_name} ! \033[0m") + prep_feats(wav_path=wav_path, + old_str=old_str, + new_str=new_str, + fs=fs, + n_shift=n_shift) + + + diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..af653ef89 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -25,6 +25,7 @@ from paddle import DataParallel from paddle import distributed as dist from paddle import nn from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode diff --git a/paddlespeech/t2s/exps/ernie_sat/utils.py b/paddlespeech/t2s/exps/ernie_sat/utils.py index 6805e513c..9169efa36 100644 --- a/paddlespeech/t2s/exps/ernie_sat/utils.py +++ b/paddlespeech/t2s/exps/ernie_sat/utils.py @@ -11,35 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import hashlib -import os from pathlib import Path from typing import Dict from typing import List from typing import Union +import os import numpy as np import paddle import yaml from yacs.config import CfgNode +import hashlib + from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference - def _get_user(): return os.path.expanduser('~').split('/')[-1] - def str2md5(string): md5_val = hashlib.md5(string.encode('utf8')).hexdigest() return md5_val - -def get_tmp_name(text: str): +def get_tmp_name(text:str): return _get_user() + '_' + str(os.getpid()) + '_' + str2md5(text) - def get_dict(dictfile: str): word2phns_dict = {} with open(dictfile, 'r') as fid: diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb78..127e1a3ba 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -82,10 +82,6 @@ def denorm(data, mean, std): return data * std + mean -def norm(data, mean, std): - return (data - mean) / std - - def get_chunks(data, block_size: int, pad_size: int): data_len = data.shape[1] chunks = [] @@ -298,8 +294,8 @@ def am_to_static(am_inference, am_name = am[:am.rindex('_')] am_dataset = am[am.rindex('_') + 1:] if am_name == 'fastspeech2': - if am_dataset in {"aishell3", "vctk", - "mix"} and speaker_dict is not None: + if am_dataset in {"aishell3", "vctk", "mix" + } and speaker_dict is not None: am_inference = jit.to_static( am_inference, input_spec=[ @@ -311,8 +307,8 @@ def am_to_static(am_inference, am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) elif am_name == 'speedyspeech': - if am_dataset in {"aishell3", "vctk", - "mix"} and speaker_dict is not None: + if am_dataset in {"aishell3", "vctk", "mix" + } and speaker_dict is not None: am_inference = jit.to_static( am_inference, input_spec=[ diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py index 968684b25..074b890f9 100644 --- a/paddlespeech/t2s/exps/vits/synthesize.py +++ b/paddlespeech/t2s/exps/vits/synthesize.py @@ -15,7 +15,6 @@ import argparse from pathlib import Path import jsonlines -import numpy as np import paddle import soundfile as sf import yaml @@ -24,7 +23,6 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.vits import VITS -from paddlespeech.t2s.utils import str2bool def evaluate(args): @@ -42,26 +40,8 @@ def evaluate(args): print(config) fields = ["utt_id", "text"] - converters = {} - - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("Evaluating voice cloning!") - fields += ["spk_emb"] - else: - print("single speaker vits!") - print("spk_num:", spk_num) - test_dataset = DataTable( - data=test_metadata, - fields=fields, - converters=converters, ) + test_dataset = DataTable(data=test_metadata, fields=fields) with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] @@ -69,7 +49,6 @@ def evaluate(args): print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num vits = VITS(idim=vocab_size, odim=odim, **config["model"]) vits.set_state_dict(paddle.load(args.ckpt)["main_params"]) @@ -86,15 +65,7 @@ def evaluate(args): phone_ids = 
paddle.to_tensor(datum["text"]) with timer() as t: with paddle.no_grad(): - spk_emb = None - spk_id = None - # multi speaker - if args.voice_cloning and "spk_emb" in datum: - spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) - elif "spk_id" in datum: - spk_id = paddle.to_tensor(datum["spk_id"]) - out = vits.inference( - text=phone_ids, sids=spk_id, spembs=spk_emb) + out = vits.inference(text=phone_ids) wav = out["wav"] wav = wav.numpy() N += wav.size @@ -119,13 +90,6 @@ def parse_args(): '--ckpt', type=str, default=None, help='Checkpoint file of VITS.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--voice-cloning", - type=str2bool, - default=False, - help="whether training voice cloning model.") # other parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py index f9d10ea62..33a413751 100644 --- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py @@ -42,23 +42,12 @@ def evaluate(args): # frontend frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict) - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - else: - print("single speaker vits!") - print("spk_num:", spk_num) - with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num vits = VITS(idim=vocab_size, odim=odim, **config["model"]) vits.set_state_dict(paddle.load(args.ckpt)["main_params"]) @@ -89,10 +78,7 @@ def evaluate(args): flags = 0 for i in range(len(phone_ids)): part_phone_ids = phone_ids[i] - spk_id = None - if spk_num is not None: - spk_id = paddle.to_tensor(args.spk_id) - out = vits.inference(text=part_phone_ids, sids=spk_id) + out = vits.inference(text=part_phone_ids) wav = out["wav"] if flags == 0: wav_all = wav @@ -123,13 +109,6 @@ def parse_args(): '--ckpt', type=str, default=None, help='Checkpoint file of VITS.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - '--spk_id', - type=int, - default=0, - help='spk id for multi speaker acoustic model') # other parser.add_argument( '--lang', diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c994faa5a..1a68d1326 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -28,7 +28,6 @@ from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.vits import VITS @@ -44,7 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import scheduler_classes from paddlespeech.t2s.training.seeding import 
seed_everything from paddlespeech.t2s.training.trainer import Trainer -from paddlespeech.t2s.utils import str2bool def train_sp(args, config): @@ -74,23 +72,6 @@ def train_sp(args, config): "wave": np.load, "feats": np.load, } - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("Training voice cloning!") - collate_fn = vits_multi_spk_batch_fn - fields += ["spk_emb"] - converters["spk_emb"] = np.load - else: - print("single speaker vits!") - collate_fn = vits_single_spk_batch_fn - print("spk_num:", spk_num) # construct dataset for training and validation with jsonlines.open(args.train_metadata, 'r') as reader: @@ -119,16 +100,18 @@ def train_sp(args, config): drop_last=False) print("samplers done!") + train_batch_fn = vits_single_spk_batch_fn + train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=collate_fn, + collate_fn=train_batch_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, batch_sampler=dev_sampler, - collate_fn=collate_fn, + collate_fn=train_batch_fn, num_workers=config.num_workers) print("dataloaders done!") @@ -138,7 +121,6 @@ def train_sp(args, config): print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num model = VITS(idim=vocab_size, odim=odim, **config["model"]) gen_parameters = model.generator.parameters() dis_parameters = model.discriminator.parameters() @@ -258,17 +240,6 @@ def main(): "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file for multiple speaker model.") - - parser.add_argument( - "--voice-cloning", - type=str2bool, - default=False, - help="whether training voice cloning model.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index 80cfea4a6..b51a4d7bc 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,28 +21,13 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.cli.vector import VectorExecutor from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.utils import str2bool from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder -def gen_random_embed(use_ecapa: bool=False): - if use_ecapa: - # Randomly generate numbers of -25 ~ 25, 192 is the dim of spk_emb - random_spk_emb = (-1 + 2 * np.random.rand(192)) * 25 - - # GE2E - else: - # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb - random_spk_emb = np.random.rand(256) * 0.2 - random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32') - return random_spk_emb - - def voice_cloning(args): # Init body. 
with open(args.am_config) as f: @@ -56,47 +41,30 @@ def voice_cloning(args): print(am_config) print(voc_config) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - input_dir = Path(args.input_dir) - # speaker encoder - if args.use_ecapa: - vec_executor = VectorExecutor() - # warm up - vec_executor( - audio_file=input_dir / os.listdir(input_dir)[0], force_yes=True) - print("ECAPA-TDNN Done!") - # use GE2E - else: - p = SpeakerVerificationPreprocessor( - sampling_rate=16000, - audio_norm_target_dBFS=-30, - vad_window_length=30, - vad_moving_average_width=8, - vad_max_silence_length=6, - mel_window_length=25, - mel_window_step=10, - n_mels=40, - partial_n_frames=160, - min_pad_coverage=0.75, - partial_overlap_ratio=0.5) - print("Audio Processor Done!") - - speaker_encoder = LSTMSpeakerEncoder( - n_mels=40, num_layers=3, hidden_size=256, output_size=256) - speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) - speaker_encoder.eval() - print("GE2E Done!") + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") frontend = Frontend(phone_vocab_path=args.phones_dict) print("frontend done!") - sentence = args.text - input_ids = frontend.get_input_ids(sentence, merge_sentences=True) - phone_ids = input_ids["phone_ids"][0] - # acoustic model am_inference = get_am_inference( am=args.am, @@ -112,19 +80,26 @@ def voice_cloning(args): voc_ckpt=args.voc_ckpt, voc_stat=args.voc_stat) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + sentence = args.text + + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"][0] + for name in os.listdir(input_dir): utt_id = name.split(".")[0] ref_audio_path = input_dir / name - if args.use_ecapa: - spk_emb = vec_executor(audio_file=ref_audio_path, force_yes=True) - spk_emb = paddle.to_tensor(spk_emb) - # GE2E - else: - mel_sequences = p.extract_mel_partials( - p.preprocess_wav(ref_audio_path)) - with paddle.no_grad(): - spk_emb = speaker_encoder.embed_utterance( - paddle.to_tensor(mel_sequences)) + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + # print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + spk_emb = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + # print("spk_emb shape: ", spk_emb.shape) + with paddle.no_grad(): wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) @@ -133,17 +108,16 @@ def voice_cloning(args): wav.numpy(), samplerate=am_config.fs) print(f"{utt_id} done!") - - # generate 5 random_spk_emb - for i in range(5): - random_spk_emb = gen_random_embed(args.use_ecapa) - utt_id = "random_spk_emb" - with paddle.no_grad(): - wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb)) - sf.write( - str(output_dir / (utt_id + "_" + str(i) + ".wav")), - wav.numpy(), - samplerate=am_config.fs) + # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb + random_spk_emb = np.random.rand(256) * 0.2 + 
random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32') + utt_id = "random_spk_emb" + with paddle.no_grad(): + wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb)) + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=am_config.fs) print(f"{utt_id} done!") @@ -197,15 +171,13 @@ def parse_args(): type=str, default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。", help="text to synthesize, a line") + parser.add_argument( "--ge2e_params_path", type=str, help="ge2e params path.") - parser.add_argument( - "--use_ecapa", - type=str2bool, - default=False, - help="whether to use ECAPA-TDNN as speaker encoder.") + parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + parser.add_argument( "--input-dir", type=str, diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py index 0eaeee5df..6e1ee0db8 100644 --- a/paddlespeech/t2s/frontend/g2pw/__init__.py +++ b/paddlespeech/t2s/frontend/g2pw/__init__.py @@ -1 +1,2 @@ from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter + diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py index 98af5f463..ab715dc36 100644 --- a/paddlespeech/t2s/frontend/g2pw/dataset.py +++ b/paddlespeech/t2s/frontend/g2pw/dataset.py @@ -81,12 +81,12 @@ def prepare_onnx_input(tokenizer, position_ids.append(position_id) outputs = { - 'input_ids': np.array(input_ids).astype(np.int64), - 'token_type_ids': np.array(token_type_ids).astype(np.int64), - 'attention_masks': np.array(attention_masks).astype(np.int64), + 'input_ids': np.array(input_ids), + 'token_type_ids': np.array(token_type_ids), + 'attention_masks': np.array(attention_masks), 'phoneme_masks': np.array(phoneme_masks).astype(np.float32), - 'char_ids': np.array(char_ids).astype(np.int64), - 'position_ids': np.array(position_ids).astype(np.int64), + 'char_ids': np.array(char_ids), + 'position_ids': np.array(position_ids), } return outputs diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 180e8ae15..9e708ec88 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -34,7 +34,7 @@ from paddlespeech.t2s.frontend.g2pw.utils import load_config from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified from paddlespeech.utils.env import MODEL_HOME -model_version = '1.1' +model_version = '1.0' def predict(session, onnx_input, labels): diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 101a1e503..6868d3357 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -61,11 +61,7 @@ class MixFrontend(): return False def is_end(self, before_char, after_char) -> bool: - flag = 0 - for char in (before_char, after_char): - if self.is_alphabet(char) or char == " ": - flag += 1 - if flag == 2: + if ((self.is_alphabet(before_char) or before_char == " ") and (self.is_alphabet(after_char) or after_char == " ")): return True else: return False @@ -90,11 +86,10 @@ class MixFrontend(): if point_index == 0 or point_index == len(text) - 1: new_text = text else: - if not self.is_end(text[point_index - 1], text[point_index + - 1]): + if not self.is_end(text[point_index - 1], text[point_index + 1]): new_text = text else: - new_text = text[:point_index] + "。" + text[point_index + 1:] + new_text = text[: point_index] + "。" + 
text[point_index + 1:] elif len(point_indexs) == 2: first_index = point_indexs[0] @@ -102,8 +97,7 @@ class MixFrontend(): # first if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): + if not self.is_end(text[first_index - 1], text[first_index + 1]): new_text += (text[:first_index] + ".") else: new_text += (text[:first_index] + "。") @@ -112,20 +106,18 @@ class MixFrontend(): # last if end_index != len(text) - 1: if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] + new_text += text[point_indexs[-2] + 1 : ] else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) + new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ]) else: - new_text += "." + new_text += "." else: first_index = point_indexs[0] end_index = point_indexs[-1] # first if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): + if not self.is_end(text[first_index - 1], text[first_index + 1]): new_text += (text[:first_index] + ".") else: new_text += (text[:first_index] + "。") @@ -134,20 +126,16 @@ class MixFrontend(): # middle for j in range(1, len(point_indexs) - 1): point_index = point_indexs[j] - if not self.is_end(text[point_index - 1], text[point_index + - 1]): - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + ".") + if not self.is_end(text[point_index - 1], text[point_index + 1]): + new_text += (text[point_indexs[j-1] + 1 : point_index] + ".") else: - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + "。") + new_text += (text[point_indexs[j-1] + 1 : point_index] + "。") # last if end_index != len(text) - 1: if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] + new_text += text[point_indexs[-2] + 1 : ] else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) + new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ]) else: new_text += "." 
@@ -236,7 +224,7 @@ class MixFrontend(): def get_input_ids(self, sentence: str, - merge_sentences: bool=False, + merge_sentences: bool=True, get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: @@ -244,29 +232,28 @@ class MixFrontend(): sentences = self._split(sentence) phones_list = [] result = {} + for text in sentences: phones_seg = [] segments = self._distinguish(text) for seg in segments: content = seg[0] lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=True, to_tensor=to_tensor) - else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=True, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - - phones_seg.append(input_ids["phone_ids"][0]) - if add_sp: - phones_seg.append(self.sp_id_tensor) - - if phones_seg == []: - phones_seg.append(self.sp_id_tensor) + if lang == "zh": + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=True, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + + elif lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=True, to_tensor=to_tensor) + + phones_seg.append(input_ids["phone_ids"][0]) + if add_sp: + phones_seg.append(self.sp_id_tensor) + phones = paddle.concat(phones_seg) phones_list.append(phones) diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 51b76f23f..2c7cf33fb 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -42,8 +42,3 @@ polyphonic: 咖喱: ['ga1','li5'] 时分: ['shi2','fen1'] 蚌埠: ['beng4','bu4'] - 驯服: ['xun4','fu2'] - 幸免于难: ['xing4','mian3','yu2','nan4'] - 恶行: ['e4','xing2'] - 唉: ['ai4'] - diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 9fff4272c..e5ef617a9 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -42,7 +42,7 @@ class ToneSandhi(): '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担', - '戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人', + '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在', @@ -60,7 +60,7 @@ class ToneSandhi(): '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄', - '将军' + '将军', '别人' } self.must_not_neural_tone_words = { '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', @@ -84,7 +84,7 @@ class ToneSandhi(): if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") - if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶": + if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": finals[-1] = finals[-1][:-1] + "5" elif len(word) >= 1 and word[-1] in "的地得": finals[-1] = finals[-1][:-1] + "5" @@ -169,7 +169,6 @@ class ToneSandhi(): return new_word_list def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: - if len(word) == 2 and self._all_tone_three(finals): finals[0] = finals[0][:-1] + "2" elif 
len(word) == 3: @@ -347,7 +346,6 @@ class ToneSandhi(): def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 8a54d3e63..ec1367736 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') diff --git a/paddlespeech/t2s/models/ernie_sat/__init__.py b/paddlespeech/t2s/models/ernie_sat/__init__.py index 87e7afe85..7e795370e 100644 --- a/paddlespeech/t2s/models/ernie_sat/__init__.py +++ b/paddlespeech/t2s/models/ernie_sat/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .ernie_sat import * from .ernie_sat_updater import * +from .mlm import * diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 08c43dc5f..54f5d542d 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -389,7 +389,7 @@ class MLM(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: + use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]: ''' Args: speech (paddle.Tensor): input speech (1, Tmax, D). @@ -657,7 +657,7 @@ class ErnieSAT(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=True, ) -> Dict[str, paddle.Tensor]: + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: return self.model.inference( speech=speech, text=text, diff --git a/paddlespeech/t2s/models/ernie_sat/mlm.py b/paddlespeech/t2s/models/ernie_sat/mlm.py new file mode 100644 index 000000000..647fdd9b4 --- /dev/null +++ b/paddlespeech/t2s/models/ernie_sat/mlm.py @@ -0,0 +1,579 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import Dict +from typing import List +from typing import Optional + +import paddle +import yaml +from paddle import nn +from yacs.config import CfgNode + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule +from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling + + +# MLM -> Mask Language Model +class mySequential(nn.Sequential): + def forward(self, *inputs): + for module in self._sub_layers.values(): + if type(inputs) == tuple: + inputs = module(*inputs) + else: + inputs = module(inputs) + return inputs + + +class MaskInputLayer(nn.Layer): + def __init__(self, out_features: int) -> None: + super().__init__() + self.mask_feature = paddle.create_parameter( + shape=(1, 1, out_features), + dtype=paddle.float32, + default_initializer=paddle.nn.initializer.Assign( + paddle.normal(shape=(1, 1, out_features)))) + + def forward(self, input: paddle.Tensor, + masked_pos: paddle.Tensor=None) -> paddle.Tensor: + masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input) + masked_input = masked_fill(input, masked_pos, 0) + masked_fill( + paddle.expand_as(self.mask_feature, input), ~masked_pos, 0) + return masked_input + + +class MLMEncoder(nn.Layer): + """Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + + """ + + def __init__(self, + idim: int, + vocab_size: int=0, + pre_speech_layer: int=0, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + text_masking: bool=False): + """Construct an Encoder object.""" + super().__init__() + self._output_size = attention_dim + self.text_masking = text_masking + if self.text_masking: + self.text_masking_layer = MaskInputLayer(attention_dim) + activation = get_activation(activation_type) + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "legacy_rel_pos": + pos_enc_class = LegacyRelPositionalEncoding + assert selfattention_layer_type == "legacy_rel_selfattn" + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + self.conv_subsampling_factor = 1 + if input_layer == "linear": + self.embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + self.embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "mlm": + self.segment_emb = None + self.speech_embed = mySequential( + MaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate)) + self.text_embed = nn.Sequential( + nn.Embedding( + vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "sega_mlm": + self.segment_emb = nn.Embedding( + 500, attention_dim, padding_idx=padding_idx) + self.speech_embed = mySequential( + 
MaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate)) + self.text_embed = nn.Sequential( + nn.Embedding( + vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + self.embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + self.embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "legacy_rel_selfattn": + assert pos_enc_layer_type == "legacy_rel_pos" + encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + + # feed-forward module definition + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation, ) + elif positionwise_layer_type == "conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + self.pre_speech_layer = pre_speech_layer + self.pre_speech_encoders = repeat( + self.pre_speech_layer, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def forward(self, + speech: paddle.Tensor, + text: paddle.Tensor, + masked_pos: paddle.Tensor, + speech_mask: paddle.Tensor=None, + text_mask: 
paddle.Tensor=None, + speech_seg_pos: paddle.Tensor=None, + text_seg_pos: paddle.Tensor=None): + """Encode input sequence. + + """ + if masked_pos is not None: + speech = self.speech_embed(speech, masked_pos) + else: + speech = self.speech_embed(speech) + if text is not None: + text = self.text_embed(text) + if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb: + speech_seg_emb = self.segment_emb(speech_seg_pos) + text_seg_emb = self.segment_emb(text_seg_pos) + text = (text[0] + text_seg_emb, text[1]) + speech = (speech[0] + speech_seg_emb, speech[1]) + if self.pre_speech_encoders: + speech, _ = self.pre_speech_encoders(speech, speech_mask) + + if text is not None: + xs = paddle.concat([speech[0], text[0]], axis=1) + xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1) + masks = paddle.concat([speech_mask, text_mask], axis=-1) + else: + xs = speech[0] + xs_pos_emb = speech[1] + masks = speech_mask + + xs, masks = self.encoders((xs, xs_pos_emb), masks) + + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + return xs, masks + + +class MLMDecoder(MLMEncoder): + def forward(self, xs: paddle.Tensor, masks: paddle.Tensor): + """Encode input sequence. + + Args: + xs (paddle.Tensor): Input tensor (#batch, time, idim). + masks (paddle.Tensor): Mask tensor (#batch, time). + + Returns: + paddle.Tensor: Output tensor (#batch, time, attention_dim). + paddle.Tensor: Mask tensor (#batch, time). + + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + return xs, masks + + +# encoder and decoder is nn.Layer, not str +class MLM(nn.Layer): + def __init__(self, + odim: int, + encoder: nn.Layer, + decoder: Optional[nn.Layer], + postnet_layers: int=0, + postnet_chans: int=0, + postnet_filts: int=0, + text_masking: bool=False): + + super().__init__() + self.odim = odim + self.encoder = encoder + self.decoder = decoder + self.vocab_size = encoder.text_embed[0]._num_embeddings + + if self.decoder is None or not (hasattr(self.decoder, + 'output_layer') and + self.decoder.output_layer is not None): + self.sfc = nn.Linear(self.encoder._output_size, odim) + else: + self.sfc = None + if text_masking: + self.text_sfc = nn.Linear( + self.encoder.text_embed[0]._embedding_dim, + self.vocab_size, + weight_attr=self.encoder.text_embed[0]._weight_attr) + else: + self.text_sfc = None + + self.postnet = (None if postnet_layers == 0 else Postnet( + idim=self.encoder._output_size, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=True, + dropout_rate=0.5, )) + + def inference( + self, + speech: paddle.Tensor, + text: paddle.Tensor, + masked_pos: paddle.Tensor, + speech_mask: paddle.Tensor, + text_mask: paddle.Tensor, + speech_seg_pos: paddle.Tensor, + text_seg_pos: paddle.Tensor, + span_bdy: List[int], + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: + ''' + Args: + speech (paddle.Tensor): input speech (1, Tmax, D). + text (paddle.Tensor): input text (1, Tmax2). + masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) + speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). + text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). + speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). + text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). 
+            span_bdy (List[int]): masked mel boundary of input speech (2,)
+            use_teacher_forcing (bool): whether to use teacher forcing
+        Returns:
+            List[Tensor]:
+                eg:
+                [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
+        '''
+
+        z_cache = None
+        if use_teacher_forcing:
+            before_outs, zs, *_ = self.forward(
+                speech=speech,
+                text=text,
+                masked_pos=masked_pos,
+                speech_mask=speech_mask,
+                text_mask=text_mask,
+                speech_seg_pos=speech_seg_pos,
+                text_seg_pos=text_seg_pos)
+            if zs is None:
+                zs = before_outs
+
+            speech = speech.squeeze(0)
+            outs = [speech[:span_bdy[0]]]
+            outs += [zs[0][span_bdy[0]:span_bdy[1]]]
+            outs += [speech[span_bdy[1]:]]
+            return outs
+        return None
+
+
+class MLMEncAsDecoder(MLM):
+    def forward(self,
+                speech: paddle.Tensor,
+                text: paddle.Tensor,
+                masked_pos: paddle.Tensor,
+                speech_mask: paddle.Tensor,
+                text_mask: paddle.Tensor,
+                speech_seg_pos: paddle.Tensor,
+                text_seg_pos: paddle.Tensor):
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        encoder_out, h_masks = self.encoder(
+            speech=speech,
+            text=text,
+            masked_pos=masked_pos,
+            speech_mask=speech_mask,
+            text_mask=text_mask,
+            speech_seg_pos=speech_seg_pos,
+            text_seg_pos=text_seg_pos)
+        if self.decoder is not None:
+            zs, _ = self.decoder(encoder_out, h_masks)
+        else:
+            zs = encoder_out
+        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+        if self.sfc is not None:
+            before_outs = paddle.reshape(
+                self.sfc(speech_hidden_states),
+                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+        else:
+            before_outs = speech_hidden_states
+        if self.postnet is not None:
+            after_outs = before_outs + paddle.transpose(
+                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+                [0, 2, 1])
+        else:
+            after_outs = None
+        return before_outs, after_outs, None
+
+
+class MLMDualMaksing(MLM):
+    def forward(self,
+                speech: paddle.Tensor,
+                text: paddle.Tensor,
+                masked_pos: paddle.Tensor,
+                speech_mask: paddle.Tensor,
+                text_mask: paddle.Tensor,
+                speech_seg_pos: paddle.Tensor,
+                text_seg_pos: paddle.Tensor):
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        encoder_out, h_masks = self.encoder(
+            speech=speech,
+            text=text,
+            masked_pos=masked_pos,
+            speech_mask=speech_mask,
+            text_mask=text_mask,
+            speech_seg_pos=speech_seg_pos,
+            text_seg_pos=text_seg_pos)
+        if self.decoder is not None:
+            zs, _ = self.decoder(encoder_out, h_masks)
+        else:
+            zs = encoder_out
+        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+        if self.text_sfc:
+            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
+            text_outs = paddle.reshape(
+                self.text_sfc(text_hidden_states),
+                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
+        if self.sfc is not None:
+            before_outs = paddle.reshape(
+                self.sfc(speech_hidden_states),
+                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+        else:
+            before_outs = speech_hidden_states
+        if self.postnet is not None:
+            after_outs = before_outs + paddle.transpose(
+                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+                [0, 2, 1])
+        else:
+            after_outs = None
+        return before_outs, after_outs, text_outs
+
+
+def build_model_from_file(config_file, model_file):
+
+    state_dict = paddle.load(model_file)
+    model_class = MLMDualMaksing if 'conformer_combine_vctk_aishell3_dual_masking' in config_file \
+        else MLMEncAsDecoder
+
+    # build the model
+    with open(config_file) as f:
+        conf = CfgNode(yaml.safe_load(f))
+    model = build_model(conf, model_class)
+    model.set_state_dict(state_dict)
+    return model, conf
+
+
+# select encoder and
decoder here +def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM: + if isinstance(args.token_list, str): + with open(args.token_list, encoding="utf-8") as f: + token_list = [line.rstrip() for line in f] + + # Overwriting token_list to keep it as "portable". + args.token_list = list(token_list) + elif isinstance(args.token_list, (tuple, list)): + token_list = list(args.token_list) + else: + raise RuntimeError("token_list must be str or list") + + vocab_size = len(token_list) + odim = 80 + + # Encoder + encoder_class = MLMEncoder + + if 'text_masking' in args.model_conf.keys() and args.model_conf[ + 'text_masking']: + args.encoder_conf['text_masking'] = True + else: + args.encoder_conf['text_masking'] = False + + encoder = encoder_class( + args.input_size, vocab_size=vocab_size, **args.encoder_conf) + + # Decoder + if args.decoder != 'no_decoder': + decoder_class = MLMDecoder + decoder = decoder_class( + idim=0, + input_layer=None, + **args.decoder_conf, ) + else: + decoder = None + + # Build model + model = model_class( + odim=odim, + encoder=encoder, + decoder=decoder, + **args.model_conf, ) + + # Initialize + if args.init is not None: + initialize(model, args.init) + + return model diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 359b66258..f87de91a2 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -522,82 +522,6 @@ class VITSGenerator(nn.Layer): return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1) - def voice_conversion( - self, - feats: paddle.Tensor=None, - feats_lengths: paddle.Tensor=None, - sids_src: Optional[paddle.Tensor]=None, - sids_tgt: Optional[paddle.Tensor]=None, - spembs_src: Optional[paddle.Tensor]=None, - spembs_tgt: Optional[paddle.Tensor]=None, - lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - """Run voice conversion. - Args: - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - Returns: - Tensor: Generated waveform tensor (B, T_wav). 
- """ - # encoder - g_src = None - g_tgt = None - if self.spks is not None: - # (B, global_channels, 1) - g_src = self.global_emb( - paddle.reshape(sids_src, [-1])).unsqueeze(-1) - g_tgt = self.global_emb( - paddle.reshape(sids_tgt, [-1])).unsqueeze(-1) - - if self.spk_embed_dim is not None: - # (B, global_channels, 1) - g_src_ = self.spemb_proj( - F.normalize(spembs_src.unsqueeze(0))).unsqueeze(-1) - if g_src is None: - g_src = g_src_ - else: - g_src = g_src + g_src_ - - # (B, global_channels, 1) - g_tgt_ = self.spemb_proj( - F.normalize(spembs_tgt.unsqueeze(0))).unsqueeze(-1) - if g_tgt is None: - g_tgt = g_tgt_ - else: - g_tgt = g_tgt + g_tgt_ - - if self.langs is not None: - # (B, global_channels, 1) - g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1) - - if g_src is None: - g_src = g_ - else: - g_src = g_src + g_ - - if g_tgt is None: - g_tgt = g_ - else: - g_tgt = g_tgt + g_ - - # forward posterior encoder - z, m_q, logs_q, y_mask = self.posterior_encoder( - feats, feats_lengths, g=g_src) - - # forward flow - # (B, H, T_feats) - z_p = self.flow(z, y_mask, g=g_src) - - # decoder - z_hat = self.flow(z_p, y_mask, g=g_tgt, inverse=True) - wav = self.decoder(z_hat * y_mask, g=g_tgt) - - return wav.squeeze(1) - def _generate_path(self, dur: paddle.Tensor, mask: paddle.Tensor) -> paddle.Tensor: """Generate path a.k.a. monotonic attention. diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 983bf0a36..5c476be77 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -381,7 +381,7 @@ class VITS(nn.Layer): if use_teacher_forcing: assert feats is not None feats = feats[None].transpose([0, 2, 1]) - feats_lengths = paddle.to_tensor(paddle.shape(feats)[2]) + feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]]) wav, att_w, dur = self.generator.inference( text=text, text_lengths=text_lengths, @@ -406,43 +406,3 @@ class VITS(nn.Layer): max_len=max_len, ) return dict( wav=paddle.reshape(wav, [-1]), att_w=att_w[0], duration=dur[0]) - - def voice_conversion( - self, - feats: paddle.Tensor, - sids_src: Optional[paddle.Tensor]=None, - sids_tgt: Optional[paddle.Tensor]=None, - spembs_src: Optional[paddle.Tensor]=None, - spembs_tgt: Optional[paddle.Tensor]=None, - lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - """Run voice conversion. - Args: - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,). - lids (Optional[Tensor]): Language index tensor (1,). - Returns: - Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). 
- """ - assert feats is not None - feats = feats[None].transpose([0, 2, 1]) - feats_lengths = paddle.to_tensor(paddle.shape(feats)[2]) - - sids_none = sids_src is None and sids_tgt is None - spembs_none = spembs_src is None and spembs_tgt is None - - assert not sids_none or not spembs_none - - wav = self.generator.voice_conversion( - feats, - feats_lengths, - sids_src, - sids_tgt, - spembs_src, - spembs_tgt, - lids, ) - - return dict(wav=paddle.reshape(wav, [-1])) diff --git a/paddlespeech/t2s/models/vits/vits_updater.py b/paddlespeech/t2s/models/vits/vits_updater.py index 9f8be6803..76271fd97 100644 --- a/paddlespeech/t2s/models/vits/vits_updater.py +++ b/paddlespeech/t2s/models/vits/vits_updater.py @@ -111,8 +111,6 @@ class VITSUpdater(StandardUpdater): text_lengths=batch["text_lengths"], feats=batch["feats"], feats_lengths=batch["feats_lengths"], - sids=batch.get("spk_id", None), - spembs=batch.get("spk_emb", None), forward_generator=turn == "generator") # Generator if turn == "generator": @@ -270,8 +268,6 @@ class VITSEvaluator(StandardEvaluator): text_lengths=batch["text_lengths"], feats=batch["feats"], feats_lengths=batch["feats_lengths"], - sids=batch.get("spk_id", None), - spembs=batch.get("spk_emb", None), forward_generator=turn == "generator") # Generator if turn == "generator": diff --git a/paddlespeech/t2s/training/updaters/standard_updater.py b/paddlespeech/t2s/training/updaters/standard_updater.py index 6d3aa7099..668d2fc69 100644 --- a/paddlespeech/t2s/training/updaters/standard_updater.py +++ b/paddlespeech/t2s/training/updaters/standard_updater.py @@ -24,11 +24,10 @@ from paddle.nn import Layer from paddle.optimizer import Optimizer from timer import timer -from paddlespeech.t2s.datasets.sampler import ErnieSATSampler from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updater import UpdaterBase from paddlespeech.t2s.training.updater import UpdaterState - +from paddlespeech.t2s.datasets.sampler import ErnieSATSampler class StandardUpdater(UpdaterBase): """An example of over-simplification. Things may not be that simple, but