wav2vec2 pipeline

pull/2374/head
tianhao zhang 3 years ago
parent 0975a332c4
commit c51da12b7f

@ -14,3 +14,9 @@
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@ -14,12 +14,12 @@
from . import compliance
from . import datasets
from . import features
from . import text
from . import transform
from . import streamdata
from . import functional
from . import io
from . import metric
from . import sox_effects
from . import streamdata
from . import text
from . import transform
from .backends import load
from .backends import save

@ -4,66 +4,67 @@
# Modified from https://github.com/webdataset/webdataset
#
# flake8: noqa
from .cache import cached_tarfile_samples
from .cache import cached_tarfile_to_samples
from .cache import lru_cleanup
from .cache import pipe_cleaner
from .compat import FluidWrapper
from .compat import WebDataset
from .compat import WebLoader
from .extradatasets import MockDataset
from .extradatasets import with_epoch
from .extradatasets import with_length
from .filters import associate
from .filters import audio_cmvn
from .filters import audio_compute_fbank
from .filters import audio_data_filter
from .filters import audio_padding
from .filters import audio_resample
from .filters import audio_spec_aug
from .filters import audio_tokenize
from .filters import batched
from .filters import decode
from .filters import detshuffle
from .filters import extract_keys
from .filters import getfirst
from .filters import info
from .filters import map
from .filters import map_dict
from .filters import map_tuple
from .filters import pipelinefilter
from .filters import placeholder
from .filters import rename
from .filters import rename_keys
from .filters import select
from .filters import shuffle
from .filters import slice
from .filters import sort
from .filters import to_tuple
from .filters import transform_with
from .filters import unbatched
from .filters import xdecode
from .handlers import ignore_and_continue
from .handlers import ignore_and_stop
from .handlers import reraise_exception
from .handlers import warn_and_continue
from .handlers import warn_and_stop
from .mix import RandomMix
from .mix import RoundRobin
from .cache import (
cached_tarfile_samples,
cached_tarfile_to_samples,
lru_cleanup,
pipe_cleaner,
)
from .compat import WebDataset, WebLoader, FluidWrapper
from .extradatasets import MockDataset, with_epoch, with_length
from .filters import (
associate,
batched,
decode,
detshuffle,
extract_keys,
getfirst,
info,
map,
map_dict,
map_tuple,
pipelinefilter,
rename,
rename_keys,
audio_resample,
select,
shuffle,
slice,
to_tuple,
transform_with,
unbatched,
xdecode,
audio_data_filter,
audio_tokenize,
audio_resample,
audio_compute_fbank,
audio_spec_aug,
sort,
audio_padding,
audio_cmvn,
placeholder,
)
from .handlers import (
ignore_and_continue,
ignore_and_stop,
reraise_exception,
warn_and_continue,
warn_and_stop,
)
from .pipeline import DataPipeline
from .shardlists import MultiShardSample
from .shardlists import non_empty
from .shardlists import resampled
from .shardlists import ResampledShards
from .shardlists import shardspec
from .shardlists import SimpleShardList
from .shardlists import single_node_only
from .shardlists import split_by_node
from .shardlists import split_by_worker
from .tariterators import tarfile_samples
from .tariterators import tarfile_to_samples
from .utils import PipelineStage
from .utils import repeatedly
from .writer import numpy_dumps
from .writer import ShardWriter
from .writer import TarWriter
from .shardlists import (
MultiShardSample,
ResampledShards,
SimpleShardList,
non_empty,
resampled,
shardspec,
single_node_only,
split_by_node,
split_by_worker,
)
from .tariterators import tarfile_samples, tarfile_to_samples
from .utils import PipelineStage, repeatedly
from .writer import ShardWriter, TarWriter, numpy_dumps
from .mix import RandomMix, RoundRobin
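The regrouped exports above are the building blocks of the streaming ASR pipeline this PR targets. As orientation only, a hedged sketch of how they might be chained; this composition is not taken from the PR, and the shard pattern, symbol table and CMVN path are placeholders:

```python
import paddlespeech.audio.streamdata as streamdata

symbol_table = {"<blank>": 0, "<unk>": 1}   # placeholder token map for audio_tokenize
dataset = streamdata.DataPipeline(
    streamdata.SimpleShardList("train-{000..099}.tar"),  # placeholder shard pattern
    streamdata.split_by_node,
    streamdata.split_by_worker,
    streamdata.tarfile_to_samples(),
    streamdata.audio_tokenize(symbol_table),              # adds token ids under 'label'
    streamdata.audio_resample(resample_rate=16000),
    streamdata.audio_data_filter(max_length=10240),       # drop out-of-range utterances
    streamdata.audio_compute_fbank(num_mel_bins=80, frame_length=25),
    streamdata.audio_spec_aug(),                          # SpecAugment on the fbank features
    streamdata.sort(sort_size=500),
    streamdata.batched(batch_size=16),
    streamdata.audio_padding(),                           # pad each batch, sorted by length
    streamdata.audio_cmvn("data/mean_std.json"),          # placeholder CMVN stats file
)
```

Every stage here is a curried `pipelinefilter` (see filters.py below), so nothing is read until the pipeline is actually iterated.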

@ -5,19 +5,18 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
"""Automatically decode webdataset samples."""
import io
import json
import os
import pickle
import re
import tempfile
import io, json, os, pickle, re, tempfile
from functools import partial
import numpy as np
"""Extensions passed on to the image decoder."""
image_extensions = "jpg jpeg png ppm pgm pbm pnm".split()
################################################################
# handle basic datatypes
################################################################
@ -129,7 +128,7 @@ def call_extension_handler(key, data, f, extensions):
target = target.split(".")
if len(target) > len(extension):
continue
if extension[-len(target):] == target:
if extension[-len(target) :] == target:
return f(data)
return None
@ -269,6 +268,7 @@ def imagehandler(imagespec, extensions=image_extensions):
################################################################
# torch video
################################################################
'''
def torch_video(key, data):
"""Decode video using the torchvideo library.
@ -289,6 +289,7 @@ def torch_video(key, data):
return torchvision.io.read_video(fname, pts_unit="sec")
'''
################################################################
# paddlespeech.audio
################################################################
@ -358,6 +359,7 @@ def gzfilter(key, data):
# decode entire training samples
################################################################
default_pre_handlers = [gzfilter]
default_post_handlers = [basichandlers]
@ -385,8 +387,7 @@ class Decoder:
pre = default_pre_handlers
if post is None:
post = default_post_handlers
assert all(callable(h)
for h in handlers), f"one of {handlers} not callable"
assert all(callable(h) for h in handlers), f"one of {handlers} not callable"
assert all(callable(h) for h in pre), f"one of {pre} not callable"
assert all(callable(h) for h in post), f"one of {post} not callable"
self.handlers = pre + handlers + post
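A small hedged sketch of how the `Decoder` assembled above is used on one sample; `pre=None`/`post=None` fall back to the gz and basic handlers as in the constructor shown, and the sample contents are invented:

```python
from paddlespeech.audio.streamdata import autodecode

decoder = autodecode.Decoder(
    [autodecode.imagehandler("rgb")],  # decode image extensions to RGB arrays
    pre=None,                          # None -> default_pre_handlers ([gzfilter])
    post=None,                         # None -> default_post_handlers ([basichandlers])
)
sample = {"__key__": "utt0001", "txt": b"some transcript"}
decoded = decoder(sample)              # basichandlers turns the .txt bytes into str
```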

@ -2,10 +2,7 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
import os
import random
import re
import sys
import itertools, os, random, re, sys
from urllib.parse import urlparse
from . import filters
@ -43,7 +40,7 @@ def lru_cleanup(cache_dir, cache_size, keyfn=os.path.getctime, verbose=False):
os.remove(fname)
def download(url, dest, chunk_size=1024**2, verbose=False):
def download(url, dest, chunk_size=1024 ** 2, verbose=False):
"""Download a file from `url` to `dest`."""
temp = dest + f".temp{os.getpid()}"
with gopen.gopen(url) as stream:
@ -72,7 +69,8 @@ def get_file_cached(
cache_size=-1,
cache_dir=None,
url_to_name=pipe_cleaner,
verbose=False, ):
verbose=False,
):
if cache_size == -1:
cache_size = default_cache_size
if cache_dir is None:
@ -116,7 +114,8 @@ def cached_url_opener(
url_to_name=pipe_cleaner,
validator=check_tar_format,
verbose=False,
always=False, ):
always=False,
):
"""Given a stream of url names (packaged in `dict(url=url)`), yield opened streams."""
verbose = verbose or verbose_cache
for sample in data:
@ -133,7 +132,8 @@ def cached_url_opener(
cache_size=cache_size,
cache_dir=cache_dir,
url_to_name=url_to_name,
verbose=verbose, )
verbose=verbose,
)
if verbose:
print("# opening %s" % dest, file=sys.stderr)
assert os.path.exists(dest)
@ -143,8 +143,9 @@ def cached_url_opener(
data = f.read(200)
os.remove(dest)
raise ValueError(
"%s (%s) is not a tar archive, but a %s, contains %s" %
(dest, url, ftype, repr(data)))
"%s (%s) is not a tar archive, but a %s, contains %s"
% (dest, url, ftype, repr(data))
)
try:
stream = open(dest, "rb")
sample.update(stream=stream)
@ -157,7 +158,7 @@ def cached_url_opener(
continue
raise exn
except Exception as exn:
exn.args = exn.args + (url, )
exn.args = exn.args + (url,)
if handler(exn):
continue
else:
@ -171,7 +172,8 @@ def cached_tarfile_samples(
cache_dir=None,
verbose=False,
url_to_name=pipe_cleaner,
always=False, ):
always=False,
):
streams = cached_url_opener(
src,
handler=handler,
@ -179,7 +181,8 @@ def cached_tarfile_samples(
cache_dir=cache_dir,
verbose=verbose,
url_to_name=url_to_name,
always=always, )
always=always,
)
samples = tar_file_and_group_expander(streams, handler=handler)
return samples

@ -2,17 +2,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
import yaml
from dataclasses import dataclass
from itertools import islice
from typing import List
import braceexpand, yaml
from . import autodecode
from . import cache
from . import filters
from . import shardlists
from . import tariterators
from . import cache, filters, shardlists, tariterators
from .filters import reraise_exception
from .paddle_utils import DataLoader
from .paddle_utils import IterableDataset
from .pipeline import DataPipeline
from .paddle_utils import DataLoader, IterableDataset
class FluidInterface:
@ -26,8 +26,7 @@ class FluidInterface:
return self.compose(filters.unbatched())
def listed(self, batchsize, partial=True):
return self.compose(
filters.batched(), batchsize=batchsize, collation_fn=None)
return self.compose(filters.batched(), batchsize=batchsize, collation_fn=None)
def unlisted(self):
return self.compose(filters.unlisted())
@ -44,19 +43,9 @@ class FluidInterface:
def map(self, f, handler=reraise_exception):
return self.compose(filters.map(f, handler=handler))
def decode(self,
*args,
pre=None,
post=None,
only=None,
partial=False,
handler=reraise_exception):
handlers = [
autodecode.ImageHandler(x) if isinstance(x, str) else x
for x in args
]
decoder = autodecode.Decoder(
handlers, pre=pre, post=post, only=only, partial=partial)
def decode(self, *args, pre=None, post=None, only=None, partial=False, handler=reraise_exception):
handlers = [autodecode.ImageHandler(x) if isinstance(x, str) else x for x in args]
decoder = autodecode.Decoder(handlers, pre=pre, post=post, only=only, partial=partial)
return self.map(decoder, handler=handler)
def map_dict(self, handler=reraise_exception, **kw):
@ -113,7 +102,6 @@ class FluidInterface:
def audio_cmvn(self, cmvn_file):
return self.compose(filters.audio_cmvn(cmvn_file))
class WebDataset(DataPipeline, FluidInterface):
"""Small fluid-interface wrapper for DataPipeline."""
@ -128,13 +116,13 @@ class WebDataset(DataPipeline, FluidInterface):
cache_dir=None,
detshuffle=False,
nodesplitter=shardlists.single_node_only,
verbose=False, ):
verbose=False,
):
super().__init__()
if isinstance(urls, IterableDataset):
assert not resampled
self.append(urls)
elif isinstance(urls, str) and (urls.endswith(".yaml") or
urls.endswith(".yml")):
elif isinstance(urls, str) and (urls.endswith(".yaml") or urls.endswith(".yml")):
with (open(urls)) as stream:
spec = yaml.safe_load(stream)
assert "datasets" in spec
@ -164,7 +152,9 @@ class WebDataset(DataPipeline, FluidInterface):
handler=handler,
verbose=verbose,
cache_size=cache_size,
cache_dir=cache_dir, ))
cache_dir=cache_dir,
)
)
class FluidWrapper(DataPipeline, FluidInterface):

@ -5,10 +5,20 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
"""Train PyTorch models directly from POSIX tar archive.
Code works locally or over HTTP connections.
"""
import itertools as itt
import os
import random
import sys
import braceexpand
from . import utils
from .paddle_utils import IterableDataset
from .utils import PipelineStage
@ -53,7 +63,8 @@ class repeatedly(IterableDataset, PipelineStage):
return utils.repeatedly(
source,
nepochs=self.nepochs,
nbatches=self.nbatches, )
nbatches=self.nbatches,
)
class with_epoch(IterableDataset):

@ -3,6 +3,7 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
# Modified from https://github.com/webdataset/webdataset
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""A collection of iterators for data transformations.
@ -11,29 +12,28 @@ These functions are plain iterator functions. You can find curried versions
in webdataset.filters, and you can find IterableDataset wrappers in
webdataset.processing.
"""
import io
import itertools
import os
import random
import re
import sys
import time
from fnmatch import fnmatch
from functools import reduce
import re
import itertools, os, random, sys, time
from functools import reduce, wraps
import paddle
import numpy as np
from . import autodecode
from . import utils
from .paddle_utils import PaddleTensor
from .utils import PipelineStage
from .. import backends
from ..compliance import kaldi
import paddle
from ..transform.cmvn import GlobalCMVN
from ..transform.spec_augment import freq_mask
from ..transform.spec_augment import time_mask
from ..transform.spec_augment import time_warp
from ..utils.tensor_utils import pad_sequence
from .utils import PipelineStage
from ..transform.spec_augment import time_warp
from ..transform.spec_augment import time_mask
from ..transform.spec_augment import freq_mask
class FilterFunction(object):
"""Helper class for currying pipeline stages.
@ -159,12 +159,10 @@ def transform_with(sample, transformers):
result[i] = f(sample[i])
return result
###
# Iterators
###
def _info(data, fmt=None, n=3, every=-1, width=50, stream=sys.stderr, name=""):
"""Print information about the samples that are passing through.
@ -280,16 +278,10 @@ def _log_keys(data, logfile=None):
log_keys = pipelinefilter(_log_keys)
def _minedecode(x):
if isinstance(x, str):
return autodecode.imagehandler(x)
else:
return x
def _decode(data, *args, handler=reraise_exception, **kw):
"""Decode data based on the decoding functions given as arguments."""
decoder = _minedecode
decoder = lambda x: autodecode.imagehandler(x) if isinstance(x, str) else x
handlers = [decoder(x) for x in args]
f = autodecode.Decoder(handlers, **kw)
@ -333,24 +325,15 @@ def _rename(data, handler=reraise_exception, keep=True, **kw):
for sample in data:
try:
if not keep:
yield {
k: getfirst(sample, v, missing_is_error=True)
for k, v in kw.items()
}
yield {k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()}
else:
def listify(v):
return v.split(";") if isinstance(v, str) else v
to_be_replaced = {x for v in kw.values() for x in listify(v)}
result = {
k: v
for k, v in sample.items() if k not in to_be_replaced
}
result.update({
k: getfirst(sample, v, missing_is_error=True)
for k, v in kw.items()
})
result = {k: v for k, v in sample.items() if k not in to_be_replaced}
result.update({k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()})
yield result
except Exception as exn:
if handler(exn):
@ -398,11 +381,7 @@ def _map_dict(data, handler=reraise_exception, **kw):
map_dict = pipelinefilter(_map_dict)
def _to_tuple(data,
*args,
handler=reraise_exception,
missing_is_error=True,
none_is_error=None):
def _to_tuple(data, *args, handler=reraise_exception, missing_is_error=True, none_is_error=None):
"""Convert dict samples to tuples."""
if none_is_error is None:
none_is_error = missing_is_error
@ -411,10 +390,7 @@ def _to_tuple(data,
for sample in data:
try:
result = tuple([
getfirst(sample, f, missing_is_error=missing_is_error)
for f in args
])
result = tuple([getfirst(sample, f, missing_is_error=missing_is_error) for f in args])
if none_is_error and any(x is None for x in result):
raise ValueError(f"to_tuple {args} got {sample.keys()}")
yield result
@ -487,28 +463,19 @@ rsample = pipelinefilter(_rsample)
slice = pipelinefilter(itertools.islice)
def _extract_keys(source,
*patterns,
duplicate_is_error=True,
ignore_missing=False):
def _extract_keys(source, *patterns, duplicate_is_error=True, ignore_missing=False):
for sample in source:
result = []
for pattern in patterns:
pattern = pattern.split(";") if isinstance(pattern,
str) else pattern
matches = [
x for x in sample.keys()
if any(fnmatch("." + x, p) for p in pattern)
]
pattern = pattern.split(";") if isinstance(pattern, str) else pattern
matches = [x for x in sample.keys() if any(fnmatch("." + x, p) for p in pattern)]
if len(matches) == 0:
if ignore_missing:
continue
else:
raise ValueError(
f"Cannot find {pattern} in sample keys {sample.keys()}.")
raise ValueError(f"Cannot find {pattern} in sample keys {sample.keys()}.")
if len(matches) > 1 and duplicate_is_error:
raise ValueError(
f"Multiple sample keys {sample.keys()} match {pattern}.")
raise ValueError(f"Multiple sample keys {sample.keys()} match {pattern}.")
value = sample[matches[0]]
result.append(value)
yield tuple(result)
@ -517,12 +484,7 @@ def _extract_keys(source,
extract_keys = pipelinefilter(_extract_keys)
def _rename_keys(source,
*args,
keep_unselected=False,
must_match=True,
duplicate_is_error=True,
**kw):
def _rename_keys(source, *args, keep_unselected=False, must_match=True, duplicate_is_error=True, **kw):
renamings = [(pattern, output) for output, pattern in args]
renamings += [(pattern, output) for output, pattern in kw.items()]
for sample in source:
@ -542,15 +504,11 @@ def _rename_keys(source,
continue
if new_name in new_sample:
if duplicate_is_error:
raise ValueError(
f"Duplicate value in sample {sample.keys()} after rename."
)
raise ValueError(f"Duplicate value in sample {sample.keys()} after rename.")
continue
new_sample[new_name] = value
if must_match and not all(matched.values()):
raise ValueError(
f"Not all patterns ({matched}) matched sample keys ({sample.keys()})."
)
raise ValueError(f"Not all patterns ({matched}) matched sample keys ({sample.keys()}).")
yield new_sample
@ -583,8 +541,7 @@ def find_decoder(decoders, path):
if fname.startswith("__"):
return lambda x: x
for pattern, fun in decoders[::-1]:
if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(),
pattern):
if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), pattern):
return fun
return None
@ -594,7 +551,8 @@ def _xdecode(
*args,
must_decode=True,
defaults=default_decoders,
**kw, ):
**kw,
):
decoders = list(defaults) + list(args)
decoders += [("*." + k, v) for k, v in kw.items()]
for sample in source:
@ -617,10 +575,10 @@ def _xdecode(
new_sample[path] = value
yield new_sample
xdecode = pipelinefilter(_xdecode)
def _audio_data_filter(source,
frame_shift=10,
max_length=10240,
@ -655,8 +613,7 @@ def _audio_data_filter(source,
assert 'wav' in sample
assert 'label' in sample
# sample['wav'] is paddle.Tensor, we have 100 frames every second (default)
num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (
1000 / frame_shift)
num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (1000 / frame_shift)
if num_frames < min_length:
continue
if num_frames > max_length:
@ -672,10 +629,8 @@ def _audio_data_filter(source,
continue
yield sample
audio_data_filter = pipelinefilter(_audio_data_filter)
def _audio_tokenize(source,
symbol_table,
bpe_model=None,
@ -738,10 +693,8 @@ def _audio_tokenize(source,
sample['label'] = label
yield sample
audio_tokenize = pipelinefilter(_audio_tokenize)
def _audio_resample(source, resample_rate=16000):
""" Resample data.
Inplace operation.
@ -760,17 +713,13 @@ def _audio_resample(source, resample_rate=16000):
waveform = sample['wav']
if sample_rate != resample_rate:
sample['sample_rate'] = resample_rate
sample['wav'] = paddle.to_tensor(
backends.soundfile_backend.resample(
waveform.numpy(),
src_sr=sample_rate,
target_sr=resample_rate))
sample['wav'] = paddle.to_tensor(backends.soundfile_backend.resample(
waveform.numpy(), src_sr = sample_rate, target_sr = resample_rate
))
yield sample
audio_resample = pipelinefilter(_audio_resample)
def _audio_compute_fbank(source,
num_mel_bins=80,
frame_length=25,
@ -797,8 +746,7 @@ def _audio_compute_fbank(source,
waveform = sample['wav']
waveform = waveform * (1 << 15)
# Only keep fname, feat, label
mat = kaldi.fbank(
waveform,
mat = kaldi.fbank(waveform,
n_mels=num_mel_bins,
frame_length=frame_length,
frame_shift=frame_shift,
@ -810,9 +758,7 @@ def _audio_compute_fbank(source,
audio_compute_fbank = pipelinefilter(_audio_compute_fbank)
def _audio_spec_aug(
source,
def _audio_spec_aug(source,
max_w=5,
w_inplace=True,
w_mode="PIL",
@ -823,7 +769,7 @@ def _audio_spec_aug(
max_t=40,
num_t_mask=2,
t_inplace=True,
t_replace_with_zero=False, ):
t_replace_with_zero=False,):
""" Do spec augmentation
Inplace operation
@ -847,23 +793,12 @@ def _audio_spec_aug(
for sample in source:
x = sample['feat']
x = x.numpy()
x = time_warp(x, max_time_warp=max_w, inplace=w_inplace, mode=w_mode)
x = freq_mask(
x,
F=max_f,
n_mask=num_f_mask,
inplace=f_inplace,
replace_with_zero=f_replace_with_zero)
x = time_mask(
x,
T=max_t,
n_mask=num_t_mask,
inplace=t_inplace,
replace_with_zero=t_replace_with_zero)
x = time_warp(x, max_time_warp=max_w, inplace = w_inplace, mode= w_mode)
x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = f_inplace, replace_with_zero = f_replace_with_zero)
x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = t_inplace, replace_with_zero = t_replace_with_zero)
sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32)
yield sample
audio_spec_aug = pipelinefilter(_audio_spec_aug)
@ -894,10 +829,8 @@ def _sort(source, sort_size=500):
for x in buf:
yield x
sort = pipelinefilter(_sort)
def _batched(source, batch_size=16):
""" Static batch the data by `batch_size`
@ -917,10 +850,8 @@ def _batched(source, batch_size=16):
if len(buf) > 0:
yield buf
batched = pipelinefilter(_batched)
def dynamic_batched(source, max_frames_in_batch=12000):
""" Dynamic batch the data until the total frames in batch
reach `max_frames_in_batch`
@ -961,8 +892,8 @@ def _audio_padding(source):
"""
for sample in source:
assert isinstance(sample, list)
feats_length = paddle.to_tensor(
[x['feat'].shape[0] for x in sample], dtype="int64")
feats_length = paddle.to_tensor([x['feat'].shape[0] for x in sample],
dtype="int64")
order = paddle.argsort(feats_length, descending=True)
feats_lengths = paddle.to_tensor(
[sample[i]['feat'].shape[0] for i in order], dtype="int64")
@ -971,20 +902,20 @@ def _audio_padding(source):
sorted_labels = [
paddle.to_tensor(sample[i]['label'], dtype="int32") for i in order
]
label_lengths = paddle.to_tensor(
[x.shape[0] for x in sorted_labels], dtype="int64")
padded_feats = pad_sequence(
sorted_feats, batch_first=True, padding_value=0)
padding_labels = pad_sequence(
sorted_labels, batch_first=True, padding_value=-1)
label_lengths = paddle.to_tensor([x.shape[0] for x in sorted_labels],
dtype="int64")
padded_feats = pad_sequence(sorted_feats,
batch_first=True,
padding_value=0)
padding_labels = pad_sequence(sorted_labels,
batch_first=True,
padding_value=-1)
yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
label_lengths)
audio_padding = pipelinefilter(_audio_padding)
def _audio_cmvn(source, cmvn_file):
global_cmvn = GlobalCMVN(cmvn_file)
for batch in source:
@ -995,13 +926,10 @@ def _audio_cmvn(source, cmvn_file):
yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
label_lengths)
audio_cmvn = pipelinefilter(_audio_cmvn)
def _placeholder(source):
for data in source:
yield data
placeholder = pipelinefilter(_placeholder)
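All of the `_xxx(source, ...)` generators above are wrapped with `pipelinefilter`, which curries them: calling the wrapped name with configuration returns a stage that is applied to the sample iterator later. A minimal sketch, using a placeholder sample whose rate already matches so the soundfile resampling backend is never hit:

```python
import paddle
from paddlespeech.audio.streamdata.filters import audio_resample

samples = [{"fname": "utt0001",
            "wav": paddle.zeros([1, 16000]),  # placeholder 1 x T waveform
            "sample_rate": 16000}]

stage = audio_resample(resample_rate=16000)   # configuration only, no data yet
for sample in stage(samples):                 # the curried stage consumes the iterator
    print(sample["sample_rate"])              # 16000; a mismatched rate would call
                                              # backends.soundfile_backend.resample
```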

@ -3,12 +3,12 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
"""Open URLs by calling subcommands."""
import os
import re
import sys
from subprocess import PIPE
from subprocess import Popen
import os, sys, re
from subprocess import PIPE, Popen
from urllib.parse import urlparse
# global used for printing additional node information during verbose output
@ -37,7 +37,8 @@ class Pipe:
timeout=7200.0,
ignore_errors=False,
ignore_status=[],
**kw, ):
**kw,
):
"""Create an IO Pipe."""
self.ignore_errors = ignore_errors
self.ignore_status = [0] + ignore_status
@ -74,7 +75,8 @@ class Pipe:
if verbose:
print(
f"pipe exit [{self.status} {os.getpid()}:{self.proc.pid}] {self.args} {info}",
file=sys.stderr, )
file=sys.stderr,
)
if self.status not in self.ignore_status and not self.ignore_errors:
raise Exception(f"{self.args}: exit {self.status} (read) {info}")
@ -112,11 +114,9 @@ class Pipe:
self.close()
def set_options(obj,
timeout=None,
ignore_errors=None,
ignore_status=None,
handler=None):
def set_options(
obj, timeout=None, ignore_errors=None, ignore_status=None, handler=None
):
"""Set options for Pipes.
This function can be called on any stream. It will set pipe options only
@ -168,14 +168,16 @@ def gopen_pipe(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141], ) # skipcq: BAN-B604
ignore_status=[141],
) # skipcq: BAN-B604
elif mode[0] == "w":
return Pipe(
cmd,
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141], ) # skipcq: BAN-B604
ignore_status=[141],
) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
@ -194,7 +196,8 @@ def gopen_curl(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141, 23], ) # skipcq: BAN-B604
ignore_status=[141, 23],
) # skipcq: BAN-B604
elif mode[0] == "w":
cmd = f"curl -s -L -T - '{url}'"
return Pipe(
@ -202,7 +205,8 @@ def gopen_curl(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141, 26], ) # skipcq: BAN-B604
ignore_status=[141, 26],
) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
@ -222,13 +226,15 @@ def gopen_htgs(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141, 23], ) # skipcq: BAN-B604
ignore_status=[141, 23],
) # skipcq: BAN-B604
elif mode[0] == "w":
raise ValueError(f"{mode}: cannot write")
else:
raise ValueError(f"{mode}: unknown mode")
def gopen_gsutil(url, mode="rb", bufsize=8192):
"""Open a URL with `curl`.
@ -243,7 +249,8 @@ def gopen_gsutil(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141, 23], ) # skipcq: BAN-B604
ignore_status=[141, 23],
) # skipcq: BAN-B604
elif mode[0] == "w":
cmd = f"gsutil cp - '{url}'"
return Pipe(
@ -251,11 +258,13 @@ def gopen_gsutil(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
ignore_status=[141, 26], ) # skipcq: BAN-B604
ignore_status=[141, 26],
) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
def gopen_error(url, *args, **kw):
"""Raise a value error.
@ -276,7 +285,8 @@ gopen_schemes = dict(
ftps=gopen_curl,
scp=gopen_curl,
gs=gopen_gsutil,
htgs=gopen_htgs, )
htgs=gopen_htgs,
)
def gopen(url, mode="rb", bufsize=8192, **kw):

@ -3,6 +3,7 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
"""Pluggable exception handlers.
These are functions that take an exception as an argument and then return...
@ -13,8 +14,8 @@ These are functions that take an exception as an argument and then return...
They are used as handler= arguments in much of the library.
"""
import time
import warnings
import time, warnings
def reraise_exception(exn):

@ -5,12 +5,17 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
"""Classes for mixing samples from multiple sources."""
import random
import itertools, os, random, time, sys
from functools import reduce, wraps
import numpy as np
from .paddle_utils import IterableDataset
from . import autodecode, utils
from .paddle_utils import PaddleTensor, IterableDataset
from .utils import PipelineStage
def round_robin_shortest(*sources):

@ -5,11 +5,12 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
"""Mock implementations of paddle interfaces when paddle is not available."""
try:
from paddle.io import DataLoader
from paddle.io import IterableDataset
from paddle.io import DataLoader, IterableDataset
except ModuleNotFoundError:
class IterableDataset:
@ -21,3 +22,12 @@ except ModuleNotFoundError:
"""Empty implementation of DataLoader when paddle is not available."""
pass
try:
from paddle import Tensor as PaddleTensor
except ModuleNotFoundError:
class PaddleTensor:
"""Empty implementation of PaddleTensor when paddle is not available."""
pass

@ -3,12 +3,15 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#%%
import copy
import sys
import copy, os, random, sys, time
from dataclasses import dataclass
from itertools import islice
from typing import List
from .paddle_utils import DataLoader
from .paddle_utils import IterableDataset
import braceexpand, yaml
from .handlers import reraise_exception
from .paddle_utils import DataLoader, IterableDataset
from .utils import PipelineStage
@ -19,7 +22,8 @@ def add_length_method(obj):
Combined = type(
obj.__class__.__name__ + "_Length",
(obj.__class__, IterableDataset),
{"__len__": length}, )
{"__len__": length},
)
obj.__class__ = Combined
return obj

@ -4,30 +4,28 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
# Modified from https://github.com/webdataset/webdataset
"""Train PyTorch models directly from POSIX tar archive.
Code works locally or over HTTP connections.
"""
import os
import random
import sys
import time
from dataclasses import dataclass
from dataclasses import field
import os, random, sys, time
from dataclasses import dataclass, field
from itertools import islice
from typing import List
import braceexpand
import yaml
import braceexpand, yaml
from . import utils
from ..utils.log import Logger
from .filters import pipelinefilter
from .paddle_utils import IterableDataset
logger = Logger(__name__)
from ..utils.log import Logger
logger = Logger(__name__)
def expand_urls(urls):
if isinstance(urls, str):
urllist = urls.split("::")
@ -66,8 +64,7 @@ class SimpleShardList(IterableDataset):
def split_by_node(src, group=None):
rank, world_size, worker, num_workers = utils.paddle_worker_info(
group=group)
rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group)
logger.info(f"world_size:{world_size}, rank:{rank}")
if world_size > 1:
for s in islice(src, rank, None, world_size):
@ -78,11 +75,9 @@ def split_by_node(src, group=None):
def single_node_only(src, group=None):
rank, world_size, worker, num_workers = utils.paddle_worker_info(
group=group)
rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group)
if world_size > 1:
raise ValueError(
"input pipeline needs to be reconfigured for multinode training")
raise ValueError("input pipeline needs to be reconfigured for multinode training")
for s in src:
yield s
@ -109,8 +104,7 @@ def resampled_(src, n=sys.maxsize):
rng = random.Random(seed)
print("# resampled loading", file=sys.stderr)
items = list(src)
print(
f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr)
print(f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr)
for i in range(n):
yield rng.choice(items)
@ -124,9 +118,7 @@ def non_empty(src):
yield s
count += 1
if count == 0:
raise ValueError(
"pipeline stage received no data at all and this was declared as an error"
)
raise ValueError("pipeline stage received no data at all and this was declared as an error")
@dataclass
@ -146,6 +138,10 @@ def expand(s):
return os.path.expanduser(os.path.expandvars(s))
class MultiShardSample(IterableDataset):
def __init__(self, fname):
"""Construct a shardlist from multiple sources using a YAML spec."""
self.epoch = -1
class MultiShardSample(IterableDataset):
def __init__(self, fname):
"""Construct a shardlist from multiple sources using a YAML spec."""
@ -160,23 +156,20 @@ class MultiShardSample(IterableDataset):
else:
with open(fname) as stream:
spec = yaml.safe_load(stream)
assert set(spec.keys()).issubset(
set("prefix datasets buckets".split())), list(spec.keys())
assert set(spec.keys()).issubset(set("prefix datasets buckets".split())), list(spec.keys())
prefix = expand(spec.get("prefix", ""))
self.sources = []
for ds in spec["datasets"]:
assert set(ds.keys()).issubset(
set("buckets name shards resample choose".split())), list(
ds.keys())
assert set(ds.keys()).issubset(set("buckets name shards resample choose".split())), list(
ds.keys()
)
buckets = ds.get("buckets", spec.get("buckets", []))
if isinstance(buckets, str):
buckets = [buckets]
buckets = [expand(s) for s in buckets]
if buckets == []:
buckets = [""]
assert len(
buckets
) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented"
assert len(buckets) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented"
bucket = buckets[0]
name = ds.get("name", "@" + bucket)
urls = ds["shards"]
@ -184,19 +177,15 @@ class MultiShardSample(IterableDataset):
urls = [urls]
# urls = [u for url in urls for u in braceexpand.braceexpand(url)]
urls = [
prefix + os.path.join(bucket, u)
for url in urls for u in braceexpand.braceexpand(expand(url))
prefix + os.path.join(bucket, u) for url in urls for u in braceexpand.braceexpand(expand(url))
]
resample = ds.get("resample", -1)
nsample = ds.get("choose", -1)
if nsample > len(urls):
raise ValueError(
f"perepoch {nsample} must be no greater than the number of shards"
)
raise ValueError(f"perepoch {nsample} must be no greater than the number of shards")
if (nsample > 0) and (resample > 0):
raise ValueError("specify only one of perepoch or choose")
entry = MSSource(
name=name, urls=urls, perepoch=nsample, resample=resample)
entry = MSSource(name=name, urls=urls, perepoch=nsample, resample=resample)
self.sources.append(entry)
print(f"# {name} {len(urls)} {nsample}", file=sys.stderr)
@ -214,7 +203,7 @@ class MultiShardSample(IterableDataset):
# sample without replacement
l = list(source.urls)
self.rng.shuffle(l)
l = l[:source.perepoch]
l = l[: source.perepoch]
else:
l = list(source.urls)
result += l
@ -242,7 +231,8 @@ class ResampledShards(IterableDataset):
urls,
nshards=sys.maxsize,
worker_seed=None,
deterministic=False, ):
deterministic=False,
):
"""Sample shards from the shard list with replacement.
:param urls: a list of URLs as a Python list or brace notation string
@ -262,8 +252,7 @@ class ResampledShards(IterableDataset):
if self.deterministic:
seed = utils.make_seed(self.worker_seed(), self.epoch)
else:
seed = utils.make_seed(self.worker_seed(), self.epoch,
os.getpid(), time.time_ns(), os.urandom(4))
seed = utils.make_seed(self.worker_seed(), self.epoch, os.getpid(), time.time_ns(), os.urandom(4))
if os.environ.get("WDS_SHOW_SEED", "0") == "1":
print(f"# ResampledShards seed {seed}")
self.rng = random.Random(seed)
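The assertions above pin down the YAML spec that `MultiShardSample` accepts: top-level keys drawn from prefix/datasets/buckets, per-dataset keys from buckets/name/shards/resample/choose. A hedged sketch of such a spec (paths and shard names are placeholders):

```python
import yaml
from paddlespeech.audio.streamdata import MultiShardSample

spec = {
    "prefix": "/data/shards/",
    "datasets": [
        # 'choose' draws N shards per epoch without replacement (N <= number of shards)
        {"name": "train_clean", "shards": "clean-{000..099}.tar", "choose": 50},
        # 'resample' draws with replacement; only one of choose/resample is allowed
        {"name": "train_other", "shards": ["other-000.tar", "other-001.tar"], "resample": 2},
    ],
}
with open("shards.yaml", "w") as f:
    yaml.safe_dump(spec, f)

shardlist = MultiShardSample("shards.yaml")  # usable as the first stage of a DataPipeline
```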

@ -3,12 +3,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Low level iteration functions for tar archives."""
import random
import re
import tarfile
import random, re, tarfile
import braceexpand
@ -26,7 +27,6 @@ import numpy as np
AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
def base_plus_ext(path):
"""Split off all file extensions.
@ -47,8 +47,12 @@ def valid_sample(sample):
:param sample: sample to be checked
"""
return (sample is not None and isinstance(sample, dict) and
len(list(sample.keys())) > 0 and not sample.get("__bad__", False))
return (
sample is not None
and isinstance(sample, dict)
and len(list(sample.keys())) > 0
and not sample.get("__bad__", False)
)
# FIXME: UNUSED
@ -75,16 +79,16 @@ def url_opener(data, handler=reraise_exception, **kw):
sample.update(stream=stream)
yield sample
except Exception as exn:
exn.args = exn.args + (url, )
exn.args = exn.args + (url,)
if handler(exn):
continue
else:
break
def tar_file_iterator(fileobj,
skip_meta=r"__[^/]*__($|/)",
handler=reraise_exception):
def tar_file_iterator(
fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception
):
"""Iterate over tar file, yielding filename, content pairs for the given tar stream.
:param fileobj: byte stream suitable for tarfile
@ -99,8 +103,11 @@ def tar_file_iterator(fileobj,
continue
if fname is None:
continue
if ("/" not in fname and fname.startswith(meta_prefix) and
fname.endswith(meta_suffix)):
if (
"/" not in fname
and fname.startswith(meta_prefix)
and fname.endswith(meta_suffix)
):
# skipping metadata for now
continue
if skip_meta is not None and re.match(skip_meta, fname):
@ -111,10 +118,8 @@ def tar_file_iterator(fileobj,
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if postfix == 'wav':
waveform, sample_rate = paddlespeech.audio.load(
stream.extractfile(tarinfo), normal=False)
result = dict(
fname=prefix, wav=waveform, sample_rate=sample_rate)
waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False)
result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate)
else:
txt = stream.extractfile(tarinfo).read().decode('utf8').strip()
result = dict(fname=prefix, txt=txt)
@ -123,17 +128,16 @@ def tar_file_iterator(fileobj,
stream.members = []
except Exception as exn:
if hasattr(exn, "args") and len(exn.args) > 0:
exn.args = (exn.args[0] + " @ " + str(fileobj), ) + exn.args[1:]
exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:]
if handler(exn):
continue
else:
break
del stream
def tar_file_and_group_iterator(fileobj,
skip_meta=r"__[^/]*__($|/)",
handler=reraise_exception):
def tar_file_and_group_iterator(
fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception
):
""" Expand a stream of open tar files into a stream of tar file contents.
and groups files with the same prefix
@ -163,11 +167,8 @@ def tar_file_and_group_iterator(fileobj,
if postfix == 'txt':
example['txt'] = file_obj.read().decode('utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
waveform, sample_rate = paddlespeech.audio.load(
file_obj, normal=False)
waveform = paddle.to_tensor(
np.expand_dims(np.array(waveform), 0),
dtype=paddle.float32)
waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False)
waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32)
example['wav'] = waveform
example['sample_rate'] = sample_rate
@ -175,8 +176,7 @@ def tar_file_and_group_iterator(fileobj,
example[postfix] = file_obj.read()
except Exception as exn:
if hasattr(exn, "args") and len(exn.args) > 0:
exn.args = (exn.args[0] + " @ " + str(fileobj),
) + exn.args[1:]
exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:]
if handler(exn):
continue
else:
@ -189,7 +189,6 @@ def tar_file_and_group_iterator(fileobj,
yield example
stream.close()
def tar_file_expander(data, handler=reraise_exception):
"""Expand a stream of open tar files into a stream of tar file contents.
@ -201,8 +200,9 @@ def tar_file_expander(data, handler=reraise_exception):
assert isinstance(source, dict)
assert "stream" in source
for sample in tar_file_iterator(source["stream"]):
assert (isinstance(sample, dict) and "data" in sample and
"fname" in sample)
assert (
isinstance(sample, dict) and "data" in sample and "fname" in sample
)
sample["__url__"] = url
yield sample
except Exception as exn:
@ -213,6 +213,8 @@ def tar_file_expander(data, handler=reraise_exception):
break
def tar_file_and_group_expander(data, handler=reraise_exception):
"""Expand a stream of open tar files into a stream of tar file contents.
@ -224,8 +226,9 @@ def tar_file_and_group_expander(data, handler=reraise_exception):
assert isinstance(source, dict)
assert "stream" in source
for sample in tar_file_and_group_iterator(source["stream"]):
assert (isinstance(sample, dict) and "wav" in sample and
"txt" in sample and "fname" in sample)
assert (
isinstance(sample, dict) and "wav" in sample and "txt" in sample and "fname" in sample
)
sample["__url__"] = url
yield sample
except Exception as exn:
@ -236,11 +239,7 @@ def tar_file_and_group_expander(data, handler=reraise_exception):
break
def group_by_keys(data,
keys=base_plus_ext,
lcase=True,
suffixes=None,
handler=None):
def group_by_keys(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
"""Return function over iterator that groups key, value pairs into samples.
:param keys: function that splits the key into key and extension (base_plus_ext)
@ -255,8 +254,8 @@ def group_by_keys(data,
print(
prefix,
suffix,
current_sample.keys()
if isinstance(current_sample, dict) else None, )
current_sample.keys() if isinstance(current_sample, dict) else None,
)
if prefix is None:
continue
if lcase:

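For reference, a hedged sketch of the tar layout these iterators expect: each utterance contributes `<key>.wav` and `<key>.txt` members in sequence, which `tar_file_and_group_iterator` merges into one sample carrying 'fname', 'wav', 'sample_rate' and 'txt'. The file names are placeholders and the wav payload is assumed to exist on disk:

```python
import io
import tarfile

def add_bytes(tar, name, payload):
    """Add an in-memory payload to the tar under the given member name."""
    info = tarfile.TarInfo(name=name)
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

with tarfile.open("train-000000.tar", "w") as tar:
    wav_bytes = open("utt0001.wav", "rb").read()              # placeholder wav file
    add_bytes(tar, "utt0001.wav", wav_bytes)
    add_bytes(tar, "utt0001.txt", "some transcript".encode("utf8"))
```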
@ -4,23 +4,22 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
# Modified from https://github.com/webdataset/webdataset
"""Miscellaneous utility functions."""
import importlib
import itertools as itt
import os
import re
import sys
from typing import Any
from typing import Callable
from typing import Iterator
from typing import Union
from typing import Any, Callable, Iterator, Optional, Union
from ..utils.log import Logger
logger = Logger(__name__)
def make_seed(*args):
seed = 0
for arg in args:
@ -38,7 +37,7 @@ def identity(x: Any) -> Any:
return x
def safe_eval(s: str, expr: str="{}"):
def safe_eval(s: str, expr: str = "{}"):
"""Evaluate the given expression more safely."""
if re.sub("[^A-Za-z0-9_]", "", s) != s:
raise ValueError(f"safe_eval: illegal characters in: '{s}'")
@ -55,9 +54,9 @@ def lookup_sym(sym: str, modules: list):
return None
def repeatedly0(loader: Iterator,
nepochs: int=sys.maxsize,
nbatches: int=sys.maxsize):
def repeatedly0(
loader: Iterator, nepochs: int = sys.maxsize, nbatches: int = sys.maxsize
):
"""Repeatedly returns batches from a DataLoader."""
for epoch in range(nepochs):
for sample in itt.islice(loader, nbatches):
@ -71,10 +70,11 @@ def guess_batchsize(batch: Union[tuple, list]):
def repeatedly(
source: Iterator,
nepochs: int=None,
nbatches: int=None,
nsamples: int=None,
batchsize: Callable[..., int]=guess_batchsize, ):
nepochs: int = None,
nbatches: int = None,
nsamples: int = None,
batchsize: Callable[..., int] = guess_batchsize,
):
"""Repeatedly yield samples from an iterator."""
epoch = 0
batch = 0
@ -93,7 +93,6 @@ def repeatedly(
if nepochs is not None and epoch >= nepochs:
return
def paddle_worker_info(group=None):
"""Return node and worker info for PyTorch and some distributed environments."""
rank = 0
@ -117,7 +116,7 @@ def paddle_worker_info(group=None):
else:
try:
from paddle.io import get_worker_info
worker_info = get_worker_info()
worker_info = paddle.io.get_worker_info()
if worker_info is not None:
worker = worker_info.id
num_workers = worker_info.num_workers
@ -127,7 +126,6 @@ def paddle_worker_info(group=None):
return rank, world_size, worker, num_workers
def paddle_worker_seed(group=None):
"""Compute a distinct, deterministic RNG seed for each worker and node."""
rank, world_size, worker, num_workers = paddle_worker_info(group=group)

@ -5,24 +5,18 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
"""Classes and functions for writing tar files and WebDataset files."""
import io
import json
import pickle
import re
import tarfile
import time
from typing import Any
from typing import Callable
from typing import Optional
from typing import Union
import io, json, pickle, re, tarfile, time
from typing import Any, Callable, Optional, Union
import numpy as np
from . import gopen
def imageencoder(image: Any, format: str="PNG"): # skipcq: PYL-W0622
def imageencoder(image: Any, format: str = "PNG"): # skipcq: PYL-W0622
"""Compress an image using PIL and return it as a string.
Can handle float or uint8 images.
@ -73,7 +67,6 @@ def bytestr(data: Any):
return data.encode("ascii")
return str(data).encode("ascii")
def paddle_dumps(data: Any):
"""Dump data into a bytestring using paddle.dumps.
@ -89,7 +82,6 @@ def paddle_dumps(data: Any):
paddle.save(data, stream)
return stream.getvalue()
def numpy_dumps(data: np.ndarray):
"""Dump data into a bytestring using numpy npy format.
@ -147,8 +139,9 @@ def add_handlers(d, keys, value):
def make_handlers():
"""Create a list of handlers for encoding data."""
handlers = {}
add_handlers(handlers, "cls cls2 class count index inx id",
lambda x: str(x).encode("ascii"))
add_handlers(
handlers, "cls cls2 class count index inx id", lambda x: str(x).encode("ascii")
)
add_handlers(handlers, "txt text transcript", lambda x: x.encode("utf-8"))
add_handlers(handlers, "html htm", lambda x: x.encode("utf-8"))
add_handlers(handlers, "pyd pickle", pickle.dumps)
@ -159,8 +152,7 @@ def make_handlers():
add_handlers(handlers, "json jsn", lambda x: json.dumps(x).encode("utf-8"))
add_handlers(handlers, "mp msgpack msg", mp_dumps)
add_handlers(handlers, "cbor", cbor_dumps)
add_handlers(handlers, "jpg jpeg img image",
lambda data: imageencoder(data, "jpg"))
add_handlers(handlers, "jpg jpeg img image", lambda data: imageencoder(data, "jpg"))
add_handlers(handlers, "png", lambda data: imageencoder(data, "png"))
add_handlers(handlers, "pbm", lambda data: imageencoder(data, "pbm"))
add_handlers(handlers, "pgm", lambda data: imageencoder(data, "pgm"))
@ -200,8 +192,7 @@ def encode_based_on_extension(sample: dict, handlers: dict):
:param handlers: handlers for encoding
"""
return {
k: encode_based_on_extension1(v, k, handlers)
for k, v in list(sample.items())
k: encode_based_on_extension1(v, k, handlers) for k, v in list(sample.items())
}
@ -269,12 +260,13 @@ class TarWriter:
def __init__(
self,
fileobj,
user: str="bigdata",
group: str="bigdata",
mode: int=0o0444,
compress: Optional[bool]=None,
encoder: Union[None, bool, Callable]=True,
keep_meta: bool=False, ):
user: str = "bigdata",
group: str = "bigdata",
mode: int = 0o0444,
compress: Optional[bool] = None,
encoder: Union[None, bool, Callable] = True,
keep_meta: bool = False,
):
"""Create a tar writer.
:param fileobj: stream to write data to
@ -338,7 +330,8 @@ class TarWriter:
continue
if not isinstance(v, (bytes, bytearray, memoryview)):
raise ValueError(
f"{k} doesn't map to a bytes after encoding ({type(v)})")
f"{k} doesn't map to a bytes after encoding ({type(v)})"
)
key = obj["__key__"]
for k in sorted(obj.keys()):
if k == "__key__":
@ -356,8 +349,7 @@ class TarWriter:
ti.uname = self.user
ti.gname = self.group
if not isinstance(v, (bytes, bytearray, memoryview)):
raise ValueError(
f"converter didn't yield bytes: {k}, {type(v)}")
raise ValueError(f"converter didn't yield bytes: {k}, {type(v)}")
stream = io.BytesIO(v)
self.tarstream.addfile(ti, stream)
total += ti.size
@ -370,11 +362,12 @@ class ShardWriter:
def __init__(
self,
pattern: str,
maxcount: int=100000,
maxsize: float=3e9,
post: Optional[Callable]=None,
start_shard: int=0,
**kw, ):
maxcount: int = 100000,
maxsize: float = 3e9,
post: Optional[Callable] = None,
start_shard: int = 0,
**kw,
):
"""Create a ShardWriter.
:param pattern: output file pattern
@ -407,7 +400,8 @@ class ShardWriter:
self.fname,
self.count,
"%.1f GB" % (self.size / 1e9),
self.total, )
self.total,
)
self.shard += 1
stream = open(self.fname, "wb")
self.tarstream = TarWriter(stream, **self.kw)
@ -419,8 +413,11 @@ class ShardWriter:
:param obj: sample to be written
"""
if (self.tarstream is None or self.count >= self.maxcount or
self.size >= self.maxsize):
if (
self.tarstream is None
or self.count >= self.maxcount
or self.size >= self.maxsize
):
self.next_stream()
size = self.tarstream.write(obj)
self.count += 1
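On the writing side, a hedged sketch of producing sharded tar datasets with `ShardWriter`; the file pattern and sample fields are placeholders, and each key is encoded by extension via the make_handlers() table above ('.txt' as UTF-8, '.cls' as an ASCII integer):

```python
from paddlespeech.audio.streamdata import ShardWriter

with ShardWriter("train-%06d.tar", maxcount=1000) as writer:
    for i in range(3000):
        writer.write({
            "__key__": f"utt{i:08d}",   # member name prefix inside the tar
            "txt": f"transcript {i}",   # stored as utt<i>.txt
            "cls": i % 10,              # stored as utt<i>.cls
        })
# -> train-000000.tar, train-000001.tar, train-000002.tar (1000 samples each)
```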

@ -17,7 +17,6 @@ from typing import Union
import sentencepiece as spm
from ..utils.log import Logger
from .utility import BLANK
from .utility import EOS
from .utility import load_dict
@ -25,6 +24,7 @@ from .utility import MASKCTC
from .utility import SOS
from .utility import SPACE
from .utility import UNK
from ..utils.log import Logger
logger = Logger(__name__)

@ -12,16 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import io
import os
import h5py
import librosa
import numpy
import numpy as np
import scipy
import soundfile
import io
import os
import h5py
import numpy as np
class SoundHDF5File():
"""Collecting sound files to a HDF5 file
@ -110,7 +109,6 @@ class SoundHDF5File():
def close(self):
self.file.close()
class SpeedPerturbation():
"""SpeedPerturbation
@ -560,3 +558,4 @@ class RIRConvolve():
[scipy.convolve(x, r, mode="same") for r in rir], axis=-1)
else:
return scipy.convolve(x, rir, mode="same")

@ -14,7 +14,6 @@
# Modified from espnet(https://github.com/espnet/espnet)
"""Spec Augment module for preprocessing i.e., data augmentation"""
import random
import numpy
from PIL import Image

@ -381,6 +381,36 @@ class LogMelSpectrogramKaldi():
mat = np.squeeze(mat.numpy())
return mat
class WavProcess():
def __init__(
self,
dither=0.1):
"""
Args:
dither (float): Dithering constant
Returns:
"""
self.dither = dither
def __call__(self, x, train):
"""
Args:
x (np.ndarray): shape (Ti,)
train (bool): True, train mode.
Raises:
ValueError: not support (Ti, C)
Returns:
np.ndarray: (T, D)
"""
dither = self.dither if train else 0.0
if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]")
waveform = np.expand_dims(x, -1)
return waveform
class LogMelSpectrogramKaldi_decay():
def __init__(

@ -41,6 +41,7 @@ import_alias = dict(
utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN",
fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram",
spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram",
wav_process="paddlespeech.audio.transform.spectrogram:WavProcess",
stft="paddlespeech.audio.transform.spectrogram:Stft",
istft="paddlespeech.audio.transform.spectrogram:IStft",
stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram",

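The new `wav_process` alias makes WavProcess addressable from a preprocess config. A hedged sketch of exercising it through Transformation, assuming the usual config layout (a 'process' list of typed entries) and that a dict config is accepted; the diff itself shows neither:

```python
import numpy as np
from paddlespeech.audio.transform.transformation import Transformation

preprocess_conf = {"process": [{"type": "wav_process", "dither": 0.0}]}
preprocessing = Transformation(preprocess_conf)

audio = np.zeros(16000, dtype=np.float32)     # placeholder: 1 s of silence at 16 kHz
waveform = preprocessing(audio, train=False)  # WavProcess expands (T,) -> (T, 1)
print(waveform.shape)                         # (16000, 1)
```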
@ -99,9 +99,8 @@ class ASRExecutor(BaseExecutor):
'-y',
action="store_true",
default=False,
help='No additional parameters required. \
Once set this parameter, it means accepting the request of the program by default, \
which includes transforming the audio sample rate')
help='No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate'
)
self.parser.add_argument(
'--rtf',
action="store_true",
@ -341,7 +340,7 @@ class ASRExecutor(BaseExecutor):
audio = np.round(audio).astype("int16")
return audio
def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False):
def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
self.sample_rate = sample_rate
if self.sample_rate != 16000 and self.sample_rate != 8000:
logger.error(
@ -435,17 +434,8 @@ class ASRExecutor(BaseExecutor):
for id_, input_ in task_source.items():
try:
res = self(
audio_file=input_,
model=model,
lang=lang,
sample_rate=sample_rate,
config=config,
ckpt_path=ckpt_path,
decode_method=decode_method,
force_yes=force_yes,
rtf=rtf,
device=device)
res = self(input_, model, lang, sample_rate, config, ckpt_path,
decode_method, force_yes, rtf, device)
task_results[id_] = res
except Exception as e:
has_exceptions = True

@ -70,14 +70,6 @@ class VectorExecutor(BaseExecutor):
type=str,
default=None,
help="Checkpoint file of model.")
self.parser.add_argument(
'--yes',
'-y',
action="store_true",
default=False,
help='No additional parameters required. \
Once set this parameter, it means accepting the request of the program by default, \
which includes transforming the audio sample rate')
self.parser.add_argument(
'--config',
type=str,
@ -117,7 +109,6 @@ class VectorExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
force_yes = parser_args.yes
device = parser_args.device
# stage 1: configurate the verbose flag
@ -137,14 +128,8 @@ class VectorExecutor(BaseExecutor):
# extract the speaker audio embedding
if parser_args.task == "spk":
logger.debug("do vector spk task")
res = self(
audio_file=input_,
model=model,
sample_rate=sample_rate,
config=config,
ckpt_path=ckpt_path,
force_yes=force_yes,
device=device)
res = self(input_, model, sample_rate, config, ckpt_path,
device)
task_result[id_] = res
elif parser_args.task == "score":
logger.debug("do vector score task")
@ -160,22 +145,10 @@ class VectorExecutor(BaseExecutor):
logger.debug(
f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}"
)
enroll_embedding = self(
audio_file=enroll_audio,
model=model,
sample_rate=sample_rate,
config=config,
ckpt_path=ckpt_path,
force_yes=force_yes,
device=device)
test_embedding = self(
audio_file=test_audio,
model=model,
sample_rate=sample_rate,
config=config,
ckpt_path=ckpt_path,
force_yes=force_yes,
device=device)
enroll_embedding = self(enroll_audio, model, sample_rate,
config, ckpt_path, device)
test_embedding = self(test_audio, model, sample_rate,
config, ckpt_path, device)
# get the score
res = self.get_embeddings_score(enroll_embedding,
@ -249,7 +222,6 @@ class VectorExecutor(BaseExecutor):
sample_rate: int=16000,
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
force_yes: bool=False,
device=paddle.get_device()):
"""Extract the audio embedding
@ -268,7 +240,7 @@ class VectorExecutor(BaseExecutor):
"""
# stage 0: check the audio format
audio_file = os.path.abspath(audio_file)
if not self._check(audio_file, sample_rate, force_yes):
if not self._check(audio_file, sample_rate):
sys.exit(-1)
# stage 1: set the paddle runtime host device
@ -446,7 +418,7 @@ class VectorExecutor(BaseExecutor):
logger.debug("audio extract the feat success")
def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False):
def _check(self, audio_file: str, sample_rate: int):
"""Check if the model sample match the audio sample rate
Args:
@ -490,34 +462,13 @@ class VectorExecutor(BaseExecutor):
logger.debug(f"The sample rate is {audio_sample_rate}")
if audio_sample_rate != self.sample_rate:
logger.debug("The sample rate of the input file is not {}.\n \
logger.error("The sample rate of the input file is not {}.\n \
The program will resample the wav file to {}.\n \
If the result does not meet your expectations\n \
Please input the 16k 16 bit 1 channel wav file. \
".format(self.sample_rate, self.sample_rate))
if force_yes is False:
while (True):
logger.debug(
"Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream."
)
content = input("Input(Y/N):")
if content.strip() == "Y" or content.strip(
) == "y" or content.strip() == "yes" or content.strip(
) == "Yes":
logger.debug(
"change the sampele rate, channel to 16k and 1 channel"
)
break
elif content.strip() == "N" or content.strip(
) == "n" or content.strip() == "no" or content.strip(
) == "No":
logger.debug("Exit the program")
return False
else:
logger.warning("Not regular input, please input again")
self.change_format = True
sys.exit(-1)
else:
logger.debug("The audio file format is right")
self.change_format = False
return True

@ -1363,11 +1363,5 @@ g2pw_onnx_models = {
'md5':
'7e049a55547da840502cf99e8a64f20e',
},
'1.1': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip',
'md5':
'f8b60501770bff92ed6ce90860a610e6',
},
},
}

@ -114,7 +114,6 @@ if not hasattr(paddle.Tensor, 'new_full'):
paddle.Tensor.new_full = new_full
paddle.static.Variable.new_full = new_full
def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
return xs

@ -20,8 +20,8 @@ import paddle
import soundfile
from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
@ -38,24 +38,24 @@ class DeepSpeech2Tester_hub():
self.args = args
self.config = config
self.audio_file = args.audio_file
self.preprocess_conf = config.preprocess_config
self.preprocess_args = {"train": False}
self.preprocessing = Transformation(self.preprocess_conf)
self.text_feature = TextFeaturizer(
unit_type=config.unit_type,
vocab=config.vocab_filepath,
spm_model_prefix=config.spm_model_prefix)
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
self.collate_fn_test = SpeechCollator.from_config(config)
self._text_featurizer = TextFeaturizer(
unit_type=config.unit_type, vocab=None)
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
decode_batch_size = cfg.decode_batch_size
self.model.decoder.init_decoder(
decode_batch_size, vocab_list, cfg.decoding_method,
cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size,
cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch)
result_transcripts = self.model.decode(audio, audio_len)
result_transcripts = self.model.decode(
audio,
audio_len,
vocab_list,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
beam_beta=cfg.beta,
beam_size=cfg.beam_size,
cutoff_prob=cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch)
return result_transcripts
@mp_tools.rank_zero_only
@ -64,23 +64,16 @@ class DeepSpeech2Tester_hub():
self.model.eval()
cfg = self.config
audio_file = self.audio_file
audio, sample_rate = soundfile.read(
self.audio_file, dtype="int16", always_2d=True)
audio = audio[:, 0]
logger.info(f"audio shape: {audio.shape}")
# fbank
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
audio_len = paddle.to_tensor(feat.shape[0])
audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
collate_fn_test = self.collate_fn_test
audio, _ = collate_fn_test.process_utterance(
audio_file=audio_file, transcript=" ")
audio_len = audio.shape[0]
audio = paddle.to_tensor(audio, dtype='float32')
audio_len = paddle.to_tensor(audio_len)
audio = paddle.unsqueeze(audio, axis=0)
vocab_list = collate_fn_test.vocab_list
result_transcripts = self.compute_result_transcripts(
audio, audio_len, self.text_feature.vocab_list, cfg.decode)
audio, audio_len, vocab_list, cfg.decode)
logger.info("result_transcripts: " + result_transcripts[0])
def run_test(self):
@ -116,9 +109,11 @@ class DeepSpeech2Tester_hub():
def setup_model(self):
config = self.config.clone()
with UpdateConfig(config):
config.input_dim = config.feat_dim
config.output_dim = self.text_feature.vocab_size
config.input_dim = self.collate_fn_test.feature_size
config.output_dim = self.collate_fn_test.vocab_size
model = DeepSpeech2Model.from_config(config)
self.model = model
def setup_checkpointer(self):

@ -25,6 +25,8 @@ import paddle
from paddle import distributed as dist
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.optimizer import OptimizerFactory
@ -107,8 +109,7 @@ class U2Trainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@ -135,8 +136,7 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1,
len(self.valid_loader))
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@ -157,8 +157,7 @@ class U2Trainer(Trainer):
self.before_train()
if not self.use_streamdata:
logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@ -226,18 +225,14 @@ class U2Trainer(Trainer):
config = self.config.clone()
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
self.train_loader = DataLoaderFactory.get_dataloader(
'train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader(
'valid', config, self.args)
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
self.args)
self.align_loader = DataLoaderFactory.get_dataloader(
'align', config, self.args)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
@ -250,8 +245,7 @@ class U2Trainer(Trainer):
model_conf.output_dim = self.train_loader.vocab_size
else:
model_conf.input_dim = self.test_loader.feat_dim
model_conf.output_dim = self.test_loader.vocab_size
model_conf.output_dim = 5538
model = U2Model.from_config(model_conf)
if self.parallel:
@ -316,6 +310,11 @@ class U2Tester(U2Trainer):
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.spm_model_prefix)
self.text_feature_test = TextFeaturizer(
unit_type=self.config.unit_type,
vocab='/home/zhangtianhao/workspace/PaddleSpeech/examples/aishell/asr1/data/lang_char/vocab.txt',
spm_model_prefix=self.config.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
@ -340,7 +339,7 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
target_transcripts = self.id2token(texts, texts_len, self.text_feature)
target_transcripts = self.id2token(texts, texts_len, self.text_feature_test)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,

@ -105,8 +105,7 @@ class U2Trainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@ -134,8 +133,7 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1,
len(self.valid_loader))
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@ -155,8 +153,7 @@ class U2Trainer(Trainer):
self.before_train()
if not self.use_streamdata:
logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@ -168,8 +165,8 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch : {}/{}, ".format(
batch_index + 1, len(self.train_loader))
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
@ -207,24 +204,21 @@ class U2Trainer(Trainer):
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
config = self.config.clone()
self.train_loader = DataLoaderFactory.get_dataloader(
'train', config, self.args)
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
config = self.config.clone()
config['preprocess_config'] = None
self.valid_loader = DataLoaderFactory.get_dataloader(
'valid', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
config = self.config.clone()
config['preprocess_config'] = None
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
self.args)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
config = self.config.clone()
config['preprocess_config'] = None
self.align_loader = DataLoaderFactory.get_dataloader(
'align', config, self.args)
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
config = self.config

@ -121,8 +121,7 @@ class U2STTrainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@ -156,8 +155,7 @@ class U2STTrainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1,
len(self.valid_loader))
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@ -177,8 +175,7 @@ class U2STTrainer(Trainer):
self.before_train()
if not self.use_streamdata:
logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@ -251,16 +248,14 @@ class U2STTrainer(Trainer):
config['load_transcript'] = load_transcript
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
self.train_loader = DataLoaderFactory.get_dataloader(
'train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader(
'valid', config, self.args)
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
self.args)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
logger.info("Setup test Dataloader!")
def setup_model(self):
config = self.config
model_conf = config

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,66 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
# TODO(hui zhang): dynamic load
def main_sp(config, args):
exp = Tester(config, args)
with exp.eval():
exp.setup()
exp.run_test()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
    # extra args: dict path and where to save the asr result
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.decode_cfg:
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file(args.decode_cfg)
config.decode = decode_confs
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats('test.profile')
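For orientation, a typical invocation of this entry point might look like the following; the exact flag names come from default_argument_parser plus the two added above, and every path and value is a placeholder.

# illustrative only -- script name, paths, checkpoint and config names are placeholders
# python test.py --ngpu 1 \
#     --config conf/wav2vec2ASR.yaml \
#     --decode_cfg conf/tuning/decode.yaml \
#     --checkpoint_path exp/wav2vec2ASR/checkpoints/avg_1 \
#     --result_file exp/wav2vec2ASR/decode/test_result.jsonl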

@ -0,0 +1,55 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Trainer for U2 model."""
import cProfile
import os
from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
def main_sp(config, args):
exp = Trainer(config, args)
exp.setup()
exp.run()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats(os.path.join(args.output, 'train.profile'))

@ -0,0 +1,465 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains wav2vec2 model."""
import json
import os
import time
from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
from paddlespeech.s2t.training.timer import Timer
from paddlespeech.s2t.training.trainer import Trainer
from paddlespeech.s2t.utils import error_rate
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import UpdateConfig
logger = Log(__name__).getlog()
class Wav2Vec2ASRTrainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
def train_batch(self, batch_index, batch, msg):
train_conf = self.config
start = time.time()
# forward
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
if train_conf.augment:
wav = self.speech_augmentation(wav, wavs_lens_rate)
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
# print(self.model.wav2vec2.feature_projection.projection.weight)
# print(self.model.wav2vec2.feature_extractor.conv_layers[0].conv.weight)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
losses_np = {'loss': float(loss) * train_conf.accum_grad}
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
# When using cpu w/o DDP, model does not have `no_sync`
context = self.model.no_sync if (hasattr(self.model, "no_sync") and
self.parallel) else nullcontext
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step old
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
self.lr_scheduler.step()
self.iteration += 1
# optimizer step new
# if (batch_index + 1) % train_conf.accum_grad == 0:
# self.optimizer.step()
# self.optimizer.clear_grad()
# self.iteration += 1
iteration_time = time.time() - start
for k, v in losses_np.items():
report(k, v)
report("batch_size", self.config.batch_size)
report("accum", train_conf.accum_grad)
report("step_cost", iteration_time)
if (batch_index + 1) % train_conf.accum_grad == 0:
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()})
for key, val in losses_np_v.items():
self.visualizer.add_scalar(
tag='train/' + key, value=val, step=self.iteration - 1)
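The accum_grad handling above is the usual accumulate-then-step pattern; a standalone toy sketch (illustrative only):

import paddle

model = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
accum_grad = 4
for step in range(16):
    x = paddle.randn([2, 8])
    loss = paddle.mean(model(x)) / accum_grad   # scale so the summed grads match one large batch
    loss.backward()                             # grads accumulate until the next clear_grad()
    if (step + 1) % accum_grad == 0:
        opt.step()
        opt.clear_grad()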
@paddle.no_grad()
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
for i, batch in enumerate(self.valid_loader):
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
num_seen_utts += num_utts
total_loss += float(loss) * num_utts
valid_losses['val_loss'].append(float(loss))
if (i + 1) % self.config.log_interval == 0:
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
valid_dump['val_history_loss'] = total_loss / num_seen_utts
# logging
msg = f"Valid: Rank: {dist.get_rank()}, "
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
logger.info('Rank {} Val info val_loss {}'.format(
dist.get_rank(), total_loss / num_seen_utts))
return total_loss, num_seen_utts
def do_train(self):
"""The training process control by step."""
# !!!IMPORTANT!!!
# Try to export the model by script, if fails, we should refine
# the code to satisfy the script export requirements
# script_model = paddle.jit.to_static(self.model)
# script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path)
self.before_train()
if not self.use_streamdata:
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train:"
observation = OrderedDict()
with ObsScope(observation):
report("Rank", dist.get_rank())
report("epoch", self.epoch)
report('step', self.iteration)
report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
if not self.use_streamdata:
report('total', len(self.train_loader))
report('reader_cost', dataload_time)
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips,samples/s'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k.split(',')[0]}: "
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += f" {k.split(',')[1]}" if len(
k.split(',')) == 2 else ""
msg += ","
msg = msg[:-1] # remove the last ","
if (batch_index + 1) % self.config.log_interval == 0:
logger.info(msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalar(
tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch()
def setup_dataloader(self):
config = self.config.clone()
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config
with UpdateConfig(model_conf):
if self.train:
model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size
else:
model_conf.input_dim = self.test_loader.feat_dim
model_conf.output_dim = self.test_loader.vocab_size
model = Wav2vec2ASR.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
# logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
self.model = model
logger.info("Setup model!")
if model_conf.augment:
self.speech_augmentation = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105])
if not self.train:
return
train_config = config
optim_type = train_config.model_optim
optim_conf = train_config.model_optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.dnn_neurons,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
def optimizer_args(
config,
parameters,
lr_scheduler=None, ):
train_config = config
optim_type = train_config.model_optim
optim_conf = train_config.model_optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
"grad_clip": train_config.global_grad_clip,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"epsilon": optim_conf.epsilon,
"rho": optim_conf.rho,
"parameters": parameters,
"epsilon": 1e-9 if optim_type == 'noam' else None,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
        optimizer_conf = optimizer_args(config, model.parameters(), lr_scheduler)
        optimizer = OptimizerFactory.from_args(optim_type, optimizer_conf)
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup optimizer/lr_scheduler!")
class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
def __init__(self, config, args):
super().__init__(config, args)
print(config)
self.text_featurizer = TextFeaturizer(
unit_type=config.unit_type, vocab=config.vocab_filepath)
self.vocab_list = self.text_featurizer.vocab_list
def id2token(self, texts, texts_len):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(
self.text_featurizer.defeaturize(ids.numpy().tolist()))
return trans
def compute_metrics(self,
utts,
audio,
audio_len,
texts,
texts_len,
fout=None):
decode_cfg = self.config.decode
errors_sum, len_refs, num_ins = 0.0, 0, 0
errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
target_transcripts = self.id2token(texts, texts_len)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
text_feature=self.text_featurizer,
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size)
decode_time = time.time() - start_time
for utt, target, result, rec_tids in zip(
utts, target_transcripts, result_transcripts, result_tokenids):
errors, len_ref = errors_func(target, result)
errors_sum += errors
len_refs += len_ref
num_ins += 1
if fout:
fout.write({
"utt": utt,
"refs": [target],
"hyps": [result],
"hyps_tokenid": [rec_tids],
})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % (
decode_cfg.error_rate_type, error_rate_func(target, result)))
return dict(
errors_sum=errors_sum,
len_refs=len_refs,
num_ins=num_ins, # num examples
error_rate=errors_sum / len_refs,
error_rate_type=decode_cfg.error_rate_type,
num_frames=audio_len.sum().numpy().item(),
decode_time=decode_time)
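A small illustration of the error-rate bookkeeping above; the return convention of error_rate.char_errors is assumed from its use here.

from paddlespeech.s2t.utils import error_rate

ref, hyp = "paddle speech", "padle speech"
errors, ref_len = error_rate.char_errors(ref, hyp)   # edit distance and reference length
print(errors / ref_len)                              # per-utterance CER
print(error_rate.cer(ref, hyp))                      # should give the same ratio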
@mp_tools.rank_zero_only
@paddle.no_grad()
def test(self):
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
self.model.eval()
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
num_time = 0.0
# Initialized the decoder in model
decode_cfg = self.config.decode
vocab_list = self.vocab_list
decode_batch_size = decode_cfg.decode_batch_size
# self.model.decoder.init_decoder(
# decode_batch_size, vocab_list, decode_cfg.decoding_method,
# decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
# decode_cfg.beam_size, decode_cfg.cutoff_prob,
# decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames']
num_time += metrics["decode_time"]
errors_sum += metrics['errors_sum']
len_refs += metrics['len_refs']
num_ins += metrics['num_ins']
error_rate_type = metrics['error_rate_type']
rtf = num_time / (num_frames)
logger.info(
"RTF: %f, Error rate [%s] (%d/?) = %f" %
(rtf, error_rate_type, num_ins, errors_sum / len_refs))
# logging
msg = "Test: "
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "Final error rate [%s] (%d/%d) = %f" % (
error_rate_type, num_ins, num_ins, errors_sum / len_refs)
logger.info(msg)
err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
err_type_str = "{}".format(error_rate_type)
with open(err_meta_path, 'w') as f:
data = json.dumps({
"epoch":
self.epoch,
"step":
self.iteration,
"rtf":
rtf,
error_rate_type:
errors_sum / len_refs,
"dataset_hour": (num_frames) / 1000.0 / 3600.0,
"process_hour":
num_time / 1000.0 / 3600.0,
"num_examples":
num_ins,
"err_sum":
errors_sum,
"ref_len":
len_refs,
"decode_method":
self.config.decode.decoding_method,
})
f.write(data + '\n')
@paddle.no_grad()
def export(self):
infer_model = DeepSpeech2InferModel.from_pretrained(
self.test_loader, self.config, self.args.checkpoint_path)
infer_model.eval()
static_model = infer_model.export()
logger.info(f"Export code: {static_model.forward.code}")
paddle.jit.save(static_model, self.args.export_path)

@ -22,16 +22,17 @@ import paddle
from paddle.io import BatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from yacs.config import CfgNode
import paddlespeech.audio.streamdata as streamdata
from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset
from paddlespeech.s2t.io.reader import LoadInputsAndTargets
from paddlespeech.s2t.utils.log import Log
import paddlespeech.audio.streamdata as streamdata
from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from yacs.config import CfgNode
__all__ = ["BatchDataLoader", "StreamDataLoader"]
logger = Log(__name__).getlog()
@ -60,7 +61,6 @@ def batch_collate(x):
"""
return x[0]
def read_preprocess_cfg(preprocess_conf_file):
augment_conf = dict()
preprocess_cfg = CfgNode(new_allowed=True)
@ -84,7 +84,6 @@ def read_preprocess_cfg(preprocess_conf_file):
augment_conf['t_replace_with_zero'] = process['replace_with_zero']
return augment_conf
class StreamDataLoader():
def __init__(self,
manifest_file: str,
@ -132,14 +131,10 @@ class StreamDataLoader():
world_size = paddle.distributed.get_world_size()
except Exception as e:
            logger.warning(e)
            logger.warning(
                "can not get world_size using paddle.distributed.get_world_size(), use world_size=1"
            )
        assert len(shardlist) >= world_size, \
            "the length of shard list should >= number of gpus/xpus/..."
            logger.warning("can not get world_size using paddle.distributed.get_world_size(), use world_size=1")
        assert len(shardlist) >= world_size, "the length of shard list should >= number of gpus/xpus/..."
update_n_iter_processes = int(
max(min(len(shardlist) / world_size - 1, self.n_iter_processes), 0))
update_n_iter_processes = int(max(min(len(shardlist)/world_size - 1, self.n_iter_processes), 0))
logger.info(f"update_n_iter_processes {update_n_iter_processes}")
if update_n_iter_processes != self.n_iter_processes:
self.n_iter_processes = update_n_iter_processes
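A worked instance of the clamp above, with illustrative numbers:

shards, world_size, n_iter_processes = 8, 2, 4
update = int(max(min(shards / world_size - 1, n_iter_processes), 0))
assert update == 3    # each rank keeps at most len(shardlist)/world_size - 1 workers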
@ -147,50 +142,44 @@ class StreamDataLoader():
if self.dist_sampler:
base_dataset = streamdata.DataPipeline(
streamdata.SimpleShardList(shardlist), streamdata.split_by_node
if train_mode else streamdata.placeholder(),
streamdata.SimpleShardList(shardlist),
streamdata.split_by_node if train_mode else streamdata.placeholder(),
streamdata.split_by_worker,
streamdata.tarfile_to_samples(streamdata.reraise_exception))
streamdata.tarfile_to_samples(streamdata.reraise_exception)
)
else:
base_dataset = streamdata.DataPipeline(
streamdata.SimpleShardList(shardlist),
streamdata.split_by_worker,
streamdata.tarfile_to_samples(streamdata.reraise_exception))
streamdata.tarfile_to_samples(streamdata.reraise_exception)
)
self.dataset = base_dataset.append_list(
streamdata.audio_tokenize(symbol_table),
streamdata.audio_data_filter(
frame_shift=frame_shift,
max_length=maxlen_in,
min_length=minlen_in,
token_max_length=maxlen_out,
token_min_length=minlen_out),
streamdata.audio_data_filter(frame_shift=frame_shift, max_length=maxlen_in, min_length=minlen_in, token_max_length=maxlen_out, token_min_length=minlen_out),
streamdata.audio_resample(resample_rate=resample_rate),
streamdata.audio_compute_fbank(
num_mel_bins=num_mel_bins,
frame_length=frame_length,
frame_shift=frame_shift,
dither=dither),
streamdata.audio_spec_aug(**augment_conf)
if train_mode else streamdata.placeholder(
), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80)
streamdata.audio_compute_fbank(num_mel_bins=num_mel_bins, frame_length=frame_length, frame_shift=frame_shift, dither=dither),
streamdata.audio_spec_aug(**augment_conf) if train_mode else streamdata.placeholder(), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80)
streamdata.shuffle(shuffle_size),
streamdata.sort(sort_size=sort_size),
streamdata.batched(batch_size),
streamdata.audio_padding(),
streamdata.audio_cmvn(cmvn_file))
streamdata.audio_cmvn(cmvn_file)
)
if paddle.__version__ >= '2.3.2':
self.loader = streamdata.WebLoader(
self.dataset,
num_workers=self.n_iter_processes,
prefetch_factor=self.prefetch_factor,
batch_size=None)
prefetch_factor = self.prefetch_factor,
batch_size=None
)
else:
self.loader = streamdata.WebLoader(
self.dataset,
num_workers=self.n_iter_processes,
batch_size=None)
batch_size=None
)
def __iter__(self):
return self.loader.__iter__()
@ -199,9 +188,7 @@ class StreamDataLoader():
return self.__iter__()
def __len__(self):
        logger.info(
            "Stream dataloader does not support calculating the length of the dataset"
        )
        logger.info("Stream dataloader does not support calculating the length of the dataset")
return -1
@ -371,9 +358,7 @@ class DataLoaderFactory():
config['maxlen_out'] = float('inf')
config['dist_sampler'] = False
else:
            raise KeyError(
                "Invalid mode type! Please input one of 'train', 'valid', 'test', 'align'."
            )
            raise KeyError("Invalid mode type! Please input one of 'train', 'valid', 'test', 'align'.")
return StreamDataLoader(
manifest_file=config.manifest,
train_mode=config.train_mode,
@ -395,7 +380,8 @@ class DataLoaderFactory():
prefetch_factor=config.prefetch_factor,
dist_sampler=config.dist_sampler,
cmvn_file=config.cmvn_file,
vocab_filepath=config.vocab_filepath, )
vocab_filepath=config.vocab_filepath,
)
else:
if mode == 'train':
config['manifest'] = config.train_manifest
@ -441,9 +427,7 @@ class DataLoaderFactory():
config['dist_sampler'] = False
config['shortest_first'] = False
else:
            raise KeyError(
                "Invalid mode type! Please input one of 'train', 'valid', 'test', 'align'."
            )
            raise KeyError("Invalid mode type! Please input one of 'train', 'valid', 'test', 'align'.")
return BatchDataLoader(
json_file=config.manifest,
@ -466,3 +450,4 @@ class DataLoaderFactory():
num_encs=config.num_encs,
dist_sampler=config.dist_sampler,
shortest_first=config.shortest_first)

@ -120,6 +120,7 @@ class LoadInputsAndTargets():
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
x_feats_dict.setdefault(inp["name"], []).append(x)
if self.load_output:
@ -236,6 +237,7 @@ class LoadInputsAndTargets():
:return:
:rtype: np.ndarray
"""
if filetype == "hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",

@ -271,7 +271,7 @@ class DeepSpeech2Model(nn.Layer):
enc_n_units=self.encoder.output_size,
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
reduction_type="sum", # sum
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)

@ -0,0 +1,20 @@
import paddle
import paddle.nn as nn
class Model(nn.Layer):
def __init__(self):
super().__init__()
self.linear = nn.Linear(1024,1024)
def forward(self, x):
return self.linear(x)
model = Model()
x = paddle.uniform([100,1024], dtype='float32')
out = model(x)
loss = paddle.mean(out)
loss.backward()
clip = nn.ClipGradByGlobalNorm(clip_norm=1.0)
optim = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=model.parameters(), grad_clip=clip)
optim.step()

@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk.
@ -864,7 +864,7 @@ class U2Model(U2DecodeModel):
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=dropout_rate,
reduction=True, # sum
reduction_type="sum", # sum
batch_average=True, # sum / batch_size
grad_norm_type=grad_norm_type)

@ -18,6 +18,7 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni
"""
import time
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
@ -25,8 +26,6 @@ import paddle
from paddle import jit
from paddle import nn
from paddlespeech.audio.utils.tensor_utils import add_sos_eos
from paddlespeech.audio.utils.tensor_utils import th_accuracy
from paddlespeech.s2t.frontend.utility import IGNORE_ID
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.modules.cmvn import GlobalCMVN
@ -39,6 +38,8 @@ from paddlespeech.s2t.modules.mask import subsequent_mask
from paddlespeech.s2t.utils import checkpoint
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.log import Log
from paddlespeech.audio.utils.tensor_utils import add_sos_eos
from paddlespeech.audio.utils.tensor_utils import th_accuracy
from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ["U2STModel", "U2STInferModel"]
@ -400,8 +401,8 @@ class U2STBaseModel(nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]),
cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk.
@ -434,8 +435,8 @@ class U2STBaseModel(nn.Layer):
paddle.Tensor: new conformer cnn cache required for next chunk, with
same shape as the original cnn_cache.
"""
return self.encoder.forward_chunk(xs, offset, required_cache_size,
att_cache, cnn_cache)
return self.encoder.forward_chunk(
xs, offset, required_cache_size, att_cache, cnn_cache)
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
@ -611,7 +612,7 @@ class U2STModel(U2STBaseModel):
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=dropout_rate,
reduction=True, # sum
reduction_type='sum', # sum
batch_average=True, # sum / batch_size
grad_norm_type=grad_norm_type)

@ -0,0 +1,175 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from packaging import version
from paddle import Tensor, nn
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
class NewGELUActivation(nn.Layer):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
class GELUActivation(nn.Layer):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def __init__(self, use_gelu_python: bool = False):
super().__init__()
self.act = nn.functional.gelu
def _gelu_python(self, input: Tensor) -> Tensor:
return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
def forward(self, input: Tensor) -> Tensor:
return self.act(input)
class FastGELUActivation(nn.Layer):
"""
Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
class QuickGELUActivation(nn.Layer):
"""
Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
return input * paddle.sigmoid(1.702 * input)
class ClippedGELUActivation(nn.Layer):
"""
Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
https://arxiv.org/abs/2004.09602.
Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
"""
def __init__(self, min: float, max: float):
if min > max:
raise ValueError(f"min should be < max (got min: {min}, max: {max})")
super().__init__()
self.min = min
self.max = max
def forward(self, x: Tensor) -> Tensor:
return paddle.clip(gelu(x), self.min, self.max)
class SiLUActivation(nn.Layer):
"""
See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
later.
"""
def __init__(self):
super().__init__()
self.act = nn.functional.silu
def _silu_python(self, input: Tensor) -> Tensor:
return input * paddle.sigmoid(input)
def forward(self, input: Tensor) -> Tensor:
return self.act(input)
class MishActivation(nn.Layer):
"""
See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
visit the official repository for the paper: https://github.com/digantamisra98/Mish
"""
def __init__(self):
super().__init__()
self.act = nn.functional.mish
def _mish_python(self, input: Tensor) -> Tensor:
return input * paddle.tanh(nn.functional.softplus(input))
def forward(self, input: Tensor) -> Tensor:
return self.act(input)
class LinearActivation(nn.Layer):
"""
Applies the linear activation function, i.e. forwarding input directly to output.
"""
def forward(self, input: Tensor) -> Tensor:
return input
ACT2FN = {
"gelu": GELUActivation(),
"gelu_10": ClippedGELUActivation(-10, 10),
"gelu_fast": FastGELUActivation(),
"gelu_new": NewGELUActivation(),
"gelu_python": GELUActivation(use_gelu_python=True),
"linear": LinearActivation(),
"mish": MishActivation(),
"quick_gelu": QuickGELUActivation(),
"relu": nn.ReLU(),
"sigmoid": nn.Sigmoid(),
"silu": SiLUActivation(),
"swish": SiLUActivation(),
"tanh": nn.Tanh(),
}
def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
# For backwards compatibility with: from activations import gelu_python
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
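A quick sanity check of the registry above, if appended at the bottom of this module; values are illustrative, and the tanh approximation should track the erf-based GELU closely.

x = paddle.to_tensor([-1.0, 0.0, 1.0])
print(gelu(x))        # erf-based GELU via nn.functional.gelu
print(gelu_new(x))    # tanh approximation from NewGELUActivation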

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,359 @@
import os
import paddle
import speechbrain as sb
from speechbrain.processing.speech_augmentation import (
SpeedPerturb,
DropFreq,
DropChunk,
)
class TimeDomainSpecAugment(paddle.nn.Layer):
"""A time-domain approximation of the SpecAugment algorithm.
This augmentation module implements three augmentations in
the time-domain.
1. Drop chunks of the audio (zero amplitude or white noise)
2. Drop frequency bands (with band-drop filters)
    3. Speed perturbation (via resampling to a slightly different rate)
Arguments
---------
perturb_prob : float from 0 to 1
The probability that a batch will have speed perturbation applied.
drop_freq_prob : float from 0 to 1
The probability that a batch will have frequencies dropped.
drop_chunk_prob : float from 0 to 1
The probability that a batch will have chunks dropped.
speeds : list of ints
A set of different speeds to use to perturb each batch.
See ``speechbrain.processing.speech_augmentation.SpeedPerturb``
sample_rate : int
Sampling rate of the input waveforms.
drop_freq_count_low : int
Lowest number of frequencies that could be dropped.
drop_freq_count_high : int
Highest number of frequencies that could be dropped.
drop_chunk_count_low : int
Lowest number of chunks that could be dropped.
drop_chunk_count_high : int
Highest number of chunks that could be dropped.
drop_chunk_length_low : int
Lowest length of chunks that could be dropped.
drop_chunk_length_high : int
Highest length of chunks that could be dropped.
drop_chunk_noise_factor : float
The noise factor used to scale the white noise inserted, relative to
the average amplitude of the utterance. Default 0 (no noise inserted).
Example
-------
    >>> inputs = paddle.randn([10, 16000])
    >>> feature_maker = TimeDomainSpecAugment(speeds=[80])
    >>> feats = feature_maker(inputs, paddle.ones([10]))
    >>> feats.shape
    [10, 12800]
"""
def __init__(
self,
perturb_prob=1.0,
drop_freq_prob=1.0,
drop_chunk_prob=1.0,
speeds=[95, 100, 105],
sample_rate=16000,
drop_freq_count_low=0,
drop_freq_count_high=3,
drop_chunk_count_low=0,
drop_chunk_count_high=5,
drop_chunk_length_low=1000,
drop_chunk_length_high=2000,
drop_chunk_noise_factor=0,
):
super().__init__()
self.speed_perturb = SpeedPerturb(
perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds
)
self.drop_freq = DropFreq(
drop_prob=drop_freq_prob,
drop_count_low=drop_freq_count_low,
drop_count_high=drop_freq_count_high,
)
self.drop_chunk = DropChunk(
drop_prob=drop_chunk_prob,
drop_count_low=drop_chunk_count_low,
drop_count_high=drop_chunk_count_high,
drop_length_low=drop_chunk_length_low,
drop_length_high=drop_chunk_length_high,
noise_factor=drop_chunk_noise_factor,
)
def forward(self, waveforms, lengths):
"""Returns the distorted waveforms.
Arguments
---------
waveforms : torch.Tensor
The waveforms to distort
"""
# Augmentation
with paddle.no_grad():
waveforms = self.speed_perturb(waveforms)
waveforms = self.drop_freq(waveforms)
waveforms = self.drop_chunk(waveforms, lengths)
        return waveforms
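A minimal usage sketch, mirroring how the trainer calls this module; shapes and values are illustrative and assume the paddle ports of SpeedPerturb, DropFreq and DropChunk are importable.

import paddle

augment = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105])
wavs = paddle.randn([4, 16000])    # [batch, time]
lengths = paddle.ones([4])         # relative lengths in [0, 1]
out = augment(wavs, lengths)       # distorted waveforms with the same layout as the input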
class DropFreq(torch.nn.Module):
"""This class drops a random frequency from the signal.
The purpose of this class is to teach models to learn to rely on all parts
of the signal, not just a few frequency bands.
Arguments
---------
drop_freq_low : float
The low end of frequencies that can be dropped,
as a fraction of the sampling rate / 2.
drop_freq_high : float
The high end of frequencies that can be
dropped, as a fraction of the sampling rate / 2.
drop_count_low : int
The low end of number of frequencies that could be dropped.
drop_count_high : int
The high end of number of frequencies that could be dropped.
drop_width : float
The width of the frequency band to drop, as
a fraction of the sampling_rate / 2.
drop_prob : float
The probability that the batch of signals will have a frequency
dropped. By default, every batch has frequencies dropped.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> dropper = DropFreq()
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> dropped_signal = dropper(signal.unsqueeze(0))
"""
def __init__(
self,
drop_freq_low=1e-14,
drop_freq_high=1,
drop_count_low=1,
drop_count_high=2,
drop_width=0.05,
drop_prob=1,
):
super().__init__()
self.drop_freq_low = drop_freq_low
self.drop_freq_high = drop_freq_high
self.drop_count_low = drop_count_low
self.drop_count_high = drop_count_high
self.drop_width = drop_width
self.drop_prob = drop_prob
def forward(self, waveforms):
"""
Arguments
---------
waveforms : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
Returns
-------
Tensor of shape `[batch, time]` or `[batch, time, channels]`.
"""
# Don't drop (return early) 1-`drop_prob` portion of the batches
dropped_waveform = waveforms.clone()
if torch.rand(1) > self.drop_prob:
return dropped_waveform
# Add channels dimension
if len(waveforms.shape) == 2:
dropped_waveform = dropped_waveform.unsqueeze(-1)
# Pick number of frequencies to drop
drop_count = torch.randint(
low=self.drop_count_low, high=self.drop_count_high + 1, size=(1,),
)
# Pick a frequency to drop
drop_range = self.drop_freq_high - self.drop_freq_low
drop_frequency = (
torch.rand(drop_count) * drop_range + self.drop_freq_low
)
# Filter parameters
filter_length = 101
pad = filter_length // 2
# Start with delta function
drop_filter = torch.zeros(1, filter_length, 1, device=waveforms.device)
drop_filter[0, pad, 0] = 1
# Subtract each frequency
for frequency in drop_frequency:
notch_kernel = notch_filter(
frequency, filter_length, self.drop_width,
).to(waveforms.device)
drop_filter = convolve1d(drop_filter, notch_kernel, pad)
# Apply filter
dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
# Remove channels dimension if added
return dropped_waveform.squeeze(-1)
class DropChunk(torch.nn.Module):
"""This class drops portions of the input signal.
    Using `DropChunk` as an augmentation strategy helps a model learn to rely
on all parts of the signal, since it can't expect a given part to be
present.
Arguments
---------
drop_length_low : int
The low end of lengths for which to set the
signal to zero, in samples.
drop_length_high : int
The high end of lengths for which to set the
signal to zero, in samples.
drop_count_low : int
The low end of number of times that the signal
can be dropped to zero.
drop_count_high : int
The high end of number of times that the signal
can be dropped to zero.
drop_start : int
The first index for which dropping will be allowed.
drop_end : int
The last index for which dropping will be allowed.
drop_prob : float
The probability that the batch of signals will
have a portion dropped. By default, every batch
has portions dropped.
noise_factor : float
The factor relative to average amplitude of an utterance
to use for scaling the white noise inserted. 1 keeps
the average amplitude the same, while 0 inserts all 0's.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> signal = signal.unsqueeze(0) # [batch, time, channels]
>>> length = torch.ones(1)
>>> dropped_signal = dropper(signal, length)
>>> float(dropped_signal[:, 150])
0.0
"""
def __init__(
self,
drop_length_low=100,
drop_length_high=1000,
drop_count_low=1,
drop_count_high=10,
drop_start=0,
drop_end=None,
drop_prob=1,
noise_factor=0.0,
):
super().__init__()
self.drop_length_low = drop_length_low
self.drop_length_high = drop_length_high
self.drop_count_low = drop_count_low
self.drop_count_high = drop_count_high
self.drop_start = drop_start
self.drop_end = drop_end
self.drop_prob = drop_prob
self.noise_factor = noise_factor
# Validate low < high
if drop_length_low > drop_length_high:
raise ValueError("Low limit must not be more than high limit")
if drop_count_low > drop_count_high:
raise ValueError("Low limit must not be more than high limit")
# Make sure the length doesn't exceed end - start
if drop_end is not None and drop_end >= 0:
if drop_start > drop_end:
raise ValueError("Low limit must not be more than high limit")
drop_range = drop_end - drop_start
self.drop_length_low = min(drop_length_low, drop_range)
self.drop_length_high = min(drop_length_high, drop_range)
def forward(self, waveforms, lengths):
"""
Arguments
---------
waveforms : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
lengths : tensor
Shape should be a single dimension, `[batch]`.
Returns
-------
Tensor of shape `[batch, time]` or
`[batch, time, channels]`
"""
# Reading input list
lengths = (lengths * waveforms.size(1)).long()
batch_size = waveforms.size(0)
dropped_waveform = waveforms.clone()
# Don't drop (return early) 1-`drop_prob` portion of the batches
if torch.rand(1) > self.drop_prob:
return dropped_waveform
# Store original amplitude for computing white noise amplitude
clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
# Pick a number of times to drop
drop_times = torch.randint(
low=self.drop_count_low,
high=self.drop_count_high + 1,
size=(batch_size,),
)
# Iterate batch to set mask
for i in range(batch_size):
if drop_times[i] == 0:
continue
# Pick lengths
length = torch.randint(
low=self.drop_length_low,
high=self.drop_length_high + 1,
size=(drop_times[i],),
)
# Compute range of starting locations
start_min = self.drop_start
if start_min < 0:
start_min += lengths[i]
start_max = self.drop_end
if start_max is None:
start_max = lengths[i]
if start_max < 0:
start_max += lengths[i]
start_max = max(0, start_max - length.max())
# Pick starting locations
start = torch.randint(
low=start_min, high=start_max + 1, size=(drop_times[i],),
)
end = start + length
# Update waveform
if not self.noise_factor:
for j in range(drop_times[i]):
dropped_waveform[i, start[j] : end[j]] = 0.0
else:
# Uniform distribution of -2 to +2 * avg amplitude should
# preserve the average for normalization
noise_max = 2 * clean_amplitude[i] * self.noise_factor
for j in range(drop_times[i]):
# zero-center the noise distribution
noise_vec = torch.rand(length[j], device=waveforms.device)
noise_vec = 2 * noise_max * noise_vec - noise_max
dropped_waveform[i, start[j] : end[j]] = noise_vec
        return dropped_waveform

@ -0,0 +1,45 @@
"""Vanilla Neural Network for simple tests.
Authors
* Elena Rastorgueva 2020
"""
import paddle
from paddlespeech.s2t.models.wav2vec2.speechbrain.nnet import containers
import paddlespeech.s2t.models.wav2vec2.speechbrain as sb
class VanillaNN(containers.Sequential):
"""A simple vanilla Deep Neural Network.
Arguments
---------
activation : paddle class
A class used for constructing the activation layers.
dnn_blocks : int
The number of linear neural blocks to include.
dnn_neurons : int
The number of neurons in the linear layers.
Example
-------
>>> inputs = paddle.rand([10, 120, 60])
>>> model = VanillaNN(input_shape=inputs.shape)
>>> outputs = model(inputs)
>>> outputs.shape
    [10, 120, 512]
"""
def __init__(
self,
input_shape,
activation=paddle.nn.LeakyReLU,
dnn_blocks=2,
dnn_neurons=512,
):
super().__init__(input_shape=input_shape)
for block_index in range(dnn_blocks):
self.append(
sb.nnet.linear.Linear,
n_neurons=dnn_neurons,
bias=True,
layer_name="linear",
)
self.append(activation(), layer_name="act")

@ -0,0 +1,2 @@
from . import linear
from . import containers

@ -0,0 +1,132 @@
import paddle
import inspect
import logging
import operator
import functools
class Sequential(paddle.nn.LayerDict):
"""A sequence of modules with potentially inferring shape on construction.
If layers are passed with names, these can be referenced with dot notation.
Arguments
---------
input_shape : iterable
A list or tuple of ints or None, representing the expected shape of an
input tensor. None represents a variable-length dimension. If no
``input_shape`` is passed, no shape inference will be performed.
*layers, **named_layers
The inputs are treated as a list of layers to be
applied in sequence. The output shape of each layer is used to
infer the shape of the following layer. If a tuple is returned,
only the shape of the first element is used to determine input
shape of the next layer (e.g. RNN returns output, hidden).
Example
-------
    >>> inputs = paddle.rand([10, 40, 50])
    >>> model = Sequential(input_shape=inputs.shape)
    >>> model.append(Linear, n_neurons=100, layer_name="layer1")
    >>> model.append(Linear, n_neurons=200, layer_name="layer2")
    >>> outputs = model(inputs)
    >>> outputs.shape
    [10, 40, 200]
    >>> outputs = model.layer1(inputs)
    >>> outputs.shape
    [10, 40, 100]
"""
def __init__(self, *layers, input_shape=None, **named_layers):
super().__init__()
# Make sure either layers or input_shape is passed
if not layers and input_shape is None and not named_layers:
raise ValueError("Must pass either layers or input shape")
# Keep track of what layers need "lengths" passed
self.length_layers = []
# Replace None dimensions with arbitrary value
self.input_shape = input_shape
if input_shape and None in input_shape:
self.input_shape = list(input_shape)
for i, dim in enumerate(self.input_shape):
# To reduce size of dummy tensors, use 1 for batch dim
if i == 0 and dim is None:
dim = 1
# Use 256 as a nice round arbitrary value, big enough that
# halving this dimension a few times doesn't reach 1
self.input_shape[i] = dim or 256
# Append non-named layers
for layer in layers:
self.append(layer)
# Append named layers
for name, layer in named_layers.items():
self.append(layer, layer_name=name)
def append(self, layer, *args, layer_name=None, **kwargs):
"""Add a layer to the list of layers, inferring shape if necessary.
Arguments
---------
layer : A paddle.nn.Layer class or object
If the layer is a class, it should accept an argument called
``input_shape`` which will be inferred and passed. If the layer
is a module object, it is added as-is.
layer_name : str
The name of the layer, for reference. If the name is in use,
``_{count}`` will be appended.
*args, **kwargs
These are passed to the layer if it is constructed.
"""
# Compute layer_name
if layer_name is None:
layer_name = str(len(self))
elif layer_name in self:
index = 0
while f"{layer_name}_{index}" in self:
index += 1
layer_name = f"{layer_name}_{index}"
# Check if it needs to be constructed with input shape
if self.input_shape:
argspec = inspect.getfullargspec(layer)
if "input_shape" in argspec.args + argspec.kwonlyargs:
input_shape = self.get_output_shape()
layer = layer(*args, input_shape=input_shape, **kwargs)
# Finally, append the layer.
try:
self[layer_name] = layer
# self.add_module(layer_name, layer)
except TypeError:
raise ValueError(
"Must pass `input_shape` at initialization and use "
"modules that take `input_shape` to infer shape when "
"using `append()`."
)
def get_output_shape(self):
"""Returns expected shape of the output.
Computed by passing dummy input constructed with the
``self.input_shape`` attribute.
"""
with paddle.no_grad():
dummy_input = paddle.zeros(self.input_shape)
dummy_output = self(dummy_input)
return dummy_output.shape
def forward(self, x):
"""Applies layers in sequence, passing only the first element of tuples.
Arguments
---------
x : paddle.Tensor
The input tensor to run through the network.
"""
for layer in self.values():
x = layer(x)
if isinstance(x, tuple):
x = x[0]
return x

@ -0,0 +1,73 @@
"""Library implementing linear transformation.
Authors
* Mirco Ravanelli 2020
* Davide Borra 2021
"""
import logging
import paddle
import paddle.nn as nn
from paddlespeech.s2t.modules import align
logger = logging.getLogger(__name__)
class Linear(paddle.nn.Layer):
"""Computes a linear transformation y = wx + b.
Arguments
---------
n_neurons : int
It is the number of output neurons (i.e, the dimensionality of the
output).
input_shape: tuple
It is the shape of the input tensor.
input_size: int
Size of the input tensor.
bias : bool
If True, the additive bias b is adopted.
combine_dims : bool
If True and the input is 4D, combine 3rd and 4th dimensions of input.
Example
-------
>>> inputs = paddle.rand([10, 50, 40])
>>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
>>> output = lin_t(inputs)
>>> output.shape
[10, 50, 100]
"""
def __init__(
self,
n_neurons,
input_shape=None,
input_size=None,
bias=True,
combine_dims=False,
):
super().__init__()
self.combine_dims = combine_dims
if input_shape is None and input_size is None:
raise ValueError("Expected one of input_shape or input_size")
if input_size is None:
input_size = input_shape[-1]
if len(input_shape) == 4 and self.combine_dims:
input_size = input_shape[2] * input_shape[3]
# Weights are initialized following paddle approach
self.w = align.Linear(input_size, n_neurons, bias_attr=bias)
def forward(self, x):
"""Returns the linear transformation of input tensor.
Arguments
---------
x : paddle.Tensor
Input to transform linearly.
"""
if len(x.shape) == 4 and self.combine_dims:
x = x.reshape([x.shape[0], x.shape[1], x.shape[2] * x.shape[3]])
wx = self.w(x)
return wx

@ -0,0 +1,256 @@
"""
Low level signal processing utilities
Authors
* Peter Plantinga 2020
* Francois Grondin 2020
* William Aris 2020
* Samuele Cornell 2020
* Sarthak Yadav 2022
"""
import paddle
import math
from packaging import version
import numpy as np
def blackman_window(window_length, periodic=True):
if window_length == 0:
return []
if window_length == 1:
return paddle.ones([1])
if periodic:
window_length += 1
window = paddle.arange(window_length) * (np.pi / (window_length - 1))
window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42
return window[:-1] if periodic else window
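# The expression above is the standard three-term Blackman window
# a0 - a1*cos(2*pi*n/(N-1)) + a2*cos(4*pi*n/(N-1)) with a0=0.42, a1=0.5, a2=0.08,
# written with the angle pre-scaled by pi/(N-1). For example,
# blackman_window(5, periodic=False) is approximately [0.0, 0.34, 1.0, 0.34, 0.0].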
def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
"""Compute amplitude of a batch of waveforms.
Arguments
---------
waveforms : tensor
The waveforms used for computing amplitude.
Shape should be `[time]` or `[batch, time]` or
`[batch, time, channels]`.
lengths : tensor
The lengths of the waveforms excluding the padding.
Shape should be a single dimension, `[batch]`.
amp_type : str
Whether to compute "avg" average or "peak" amplitude.
Choose between ["avg", "peak"].
scale : str
Whether to compute amplitude in "dB" or "linear" scale.
Choose between ["linear", "dB"].
Returns
-------
The average amplitude of the waveforms.
Example
-------
>>> signal = paddle.sin(paddle.arange(16000.0)).unsqueeze(0)
>>> compute_amplitude(signal, signal.shape[1])
tensor([[0.6366]])
"""
if len(waveforms.shape) == 1:
waveforms = waveforms.unsqueeze(0)
assert amp_type in ["avg", "peak"]
assert scale in ["linear", "dB"]
if amp_type == "avg":
if lengths is None:
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
else:
raise NotImplementedError
if scale == "linear":
return out
elif scale == "dB":
return paddle.clip(20 * paddle.log10(out), min=-80) # clamp zeros
else:
raise NotImplementedError
def convolve1d(
waveform,
kernel,
padding=0,
pad_type="constant",
stride=1,
groups=1,
use_fft=False,
rotation_index=0,
):
"""Use torch.nn.functional to perform 1d padding and conv.
Arguments
---------
waveform : tensor
The tensor to perform operations on.
kernel : tensor
The filter to apply during convolution.
padding : int or tuple
The padding (pad_left, pad_right) to apply.
If an integer is passed instead, this is passed
to the conv1d function and pad_type is ignored.
pad_type : str
The type of padding to use. Passed directly to
`paddle.nn.functional.pad`, see the Paddle documentation
for available options.
stride : int
The number of units to move each time convolution is applied.
Passed to conv1d. Has no effect if `use_fft` is True.
groups : int
This option is passed to `conv1d` to split the input into groups for
convolution. Input channels should be divisible by the number of groups.
use_fft : bool
When `use_fft` is passed `True`, then compute the convolution in the
spectral domain using complex multiply. This is more efficient on CPU
when the size of the kernel is large (e.g. reverberation). WARNING:
Without padding, circular convolution occurs. This makes little
difference in the case of reverberation, but may make more difference
with different kernels.
rotation_index : int
This option only applies if `use_fft` is true. If so, the kernel is
rolled by this amount before convolution to shift the output location.
Returns
-------
The convolved waveform.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> signal = signal.unsqueeze(0).unsqueeze(2)
>>> kernel = paddle.rand([1, 10, 1])
>>> signal = convolve1d(signal, kernel, padding=(9, 0))
"""
if len(waveform.shape) != 3:
raise ValueError("Convolve1D expects a 3-dimensional tensor")
# Move time dimension last, which pad and fft and conv expect.
waveform = waveform.transpose([0, 2, 1])
kernel = kernel.transpose([0, 2, 1])
# Padding can be a tuple (left_pad, right_pad) or an int
if isinstance(padding, tuple):
waveform = paddle.nn.functional.pad(
x=waveform, pad=padding, mode=pad_type,
)
# This approach uses FFT, which is more efficient if the kernel is large
if use_fft:
# Pad kernel to same length as signal, ensuring correct alignment
zero_length = waveform.shape[-1] - kernel.shape[-1]
# Handle case where signal is shorter
if zero_length < 0:
kernel = kernel[..., :zero_length]
zero_length = 0
# Perform rotation to ensure alignment
zeros = paddle.zeros(
[kernel.shape[0], kernel.shape[1], zero_length]
)
after_index = kernel[..., rotation_index:]
before_index = kernel[..., :rotation_index]
kernel = paddle.concat((after_index, zeros, before_index), axis=-1)
# Multiply in frequency domain to convolve in time domain
import paddle.fft as fft
result = fft.rfft(waveform) * fft.rfft(kernel)
convolved = fft.irfft(result, n=waveform.shape[-1])
# Otherwise use paddle's conv1d, which should be efficient on GPU
else:
convolved = paddle.nn.functional.conv1d(
x=waveform,
weight=kernel,
stride=stride,
groups=groups,
padding=padding if not isinstance(padding, tuple) else 0,
)
# Return time dimension to the second dimension.
return convolved.transpose([0, 2, 1])
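# Usage sketch (input laid out as [batch, time, channels], as in the docstring):
# signal = paddle.rand([1, 16000, 1])
# kernel = paddle.rand([1, 10, 1])
# out = convolve1d(signal, kernel, padding=(9, 0)) # causal FIR filtering
# With use_fft=True the rfft/irfft product implements a circular convolution;
# rotation_index rolls the kernel beforehand to control where the output lands.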
def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
"""Returns a notch filter constructed from a high-pass and low-pass filter.
(from https://tomroelandts.com/articles/
how-to-create-simple-band-pass-and-band-reject-filters)
Arguments
---------
notch_freq : float
frequency to put notch as a fraction of the
sampling rate / 2. The range of possible inputs is 0 to 1.
filter_width : int
Filter width in samples. Longer filters have
smaller transition bands, but are more inefficient.
notch_width : float
Width of the notch, as a fraction of the sampling_rate / 2.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> signal = signal.unsqueeze(0).unsqueeze(2)
>>> kernel = notch_filter(0.25)
>>> notched_signal = convolve1d(signal, kernel)
"""
# Check inputs
assert 0 < notch_freq <= 1
assert filter_width % 2 != 0
pad = filter_width // 2
inputs = paddle.arange(filter_width) - pad
# Avoid frequencies that are too low
notch_freq += notch_width
# Define sinc function, avoiding division by zero
def sinc(x):
"Computes the sinc function."
def _sinc(x):
return paddle.sin(x) / x
# The zero is at the middle index
return paddle.concat([_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1 :])])
# Compute a low-pass filter with cutoff frequency notch_freq.
hlpf = sinc(3 * (notch_freq - notch_width) * inputs)
hlpf *= blackman_window(filter_width)
hlpf /= paddle.sum(hlpf)
# Compute a high-pass filter with cutoff frequency notch_freq by
# spectrally inverting a low-pass filter (negate and add a delta at the center).
hhpf = sinc(3 * (notch_freq + notch_width) * inputs)
hhpf *= blackman_window(filter_width)
hhpf /= -paddle.sum(hhpf)
hhpf[pad] += 1
# Adding filters creates notch filter
return (hlpf + hhpf).reshape([1, -1, 1])

@ -0,0 +1,741 @@
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.signal_processing import (
compute_amplitude,
convolve1d,
notch_filter)
class SpeedPerturb(nn.Layer):
"""Slightly speed up or slow down an audio signal.
Resample the audio signal at a rate that is similar to the original rate,
to achieve a slightly slower or slightly faster signal. This technique is
outlined in the paper: "Audio Augmentation for Speech Recognition"
Arguments
---------
orig_freq : int
The frequency of the original signal.
speeds : list
The speeds that the signal should be changed to, as a percentage of the
original signal (i.e. `speeds` is divided by 100 to get a ratio).
perturb_prob : float
The chance that the batch will be speed-
perturbed. By default, every batch is perturbed.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90])
>>> clean = signal.unsqueeze(0)
>>> perturbed = perturbator(clean)
>>> clean.shape
[1, 52173]
>>> perturbed.shape
[1, 46956]
"""
def __init__(
self, orig_freq, speeds=[90, 100, 110], perturb_prob=1.0,
):
super().__init__()
self.orig_freq = orig_freq
self.speeds = speeds
self.perturb_prob = perturb_prob
# Initialize index of perturbation
self.samp_index = 0
# Initialize resamplers
self.resamplers = []
for speed in self.speeds:
config = {
"orig_freq": self.orig_freq,
"new_freq": self.orig_freq * speed // 100,
}
self.resamplers.append(Resample(**config))
def forward(self, waveform):
"""
Arguments
---------
waveform : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
Returns
-------
Tensor of shape `[batch, time]` or `[batch, time, channels]`.
"""
# Don't perturb (return early) 1-`perturb_prob` portion of the batches
if paddle.rand([1]) > self.perturb_prob:
return waveform.clone()
# Perform a random perturbation
self.samp_index = paddle.randint(len(self.speeds), shape=(1,))[0]
perturbed_waveform = self.resamplers[self.samp_index](waveform)
return perturbed_waveform
class Resample(nn.Layer):
"""This class resamples an audio signal using sinc-based interpolation.
It is a modification of the `resample` function from torchaudio
(https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html)
Arguments
---------
orig_freq : int
the sampling frequency of the input signal.
new_freq : int
the new sampling frequency after this operation is performed.
lowpass_filter_width : int
Controls the sharpness of the filter, larger numbers result in a
sharper filter, but they are less efficient. Values from 4 to 10 are allowed.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> signal = signal.unsqueeze(0) # [batch, time, channels]
>>> resampler = Resample(orig_freq=16000, new_freq=8000)
>>> resampled = resampler(signal)
>>> signal.shape
[1, 52173]
>>> resampled.shape
[1, 26087]
"""
def __init__(
self, orig_freq=16000, new_freq=16000, lowpass_filter_width=6,
):
super().__init__()
self.orig_freq = orig_freq
self.new_freq = new_freq
self.lowpass_filter_width = lowpass_filter_width
# Compute rate for striding
self._compute_strides()
assert self.orig_freq % self.conv_stride == 0
assert self.new_freq % self.conv_transpose_stride == 0
def _compute_strides(self):
"""Compute the phases in polyphase filter.
(almost directly from torchaudio.compliance.kaldi)
"""
# Compute new unit based on ratio of in/out frequencies
base_freq = math.gcd(self.orig_freq, self.new_freq)
input_samples_in_unit = self.orig_freq // base_freq
self.output_samples = self.new_freq // base_freq
# Store the appropriate stride based on the new units
self.conv_stride = input_samples_in_unit
self.conv_transpose_stride = self.output_samples
def forward(self, waveforms):
"""
Arguments
---------
waveforms : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
Returns
-------
Tensor of shape `[batch, time]` or `[batch, time, channels]`.
"""
if not hasattr(self, "first_indices"):
self._indices_and_weights(waveforms)
# Don't do anything if the frequencies are the same
if self.orig_freq == self.new_freq:
return waveforms
unsqueezed = False
if len(waveforms.shape) == 2:
waveforms = waveforms.unsqueeze(1)
unsqueezed = True
elif len(waveforms.shape) == 3:
waveforms = waveforms.transpose([0, 2, 1])
else:
raise ValueError("Input must be 2 or 3 dimensions")
# Do resampling
resampled_waveform = self._perform_resample(waveforms)
if unsqueezed:
resampled_waveform = resampled_waveform.squeeze(1)
else:
resampled_waveform = resampled_waveform.transpose([0, 2, 1])
return resampled_waveform
def _perform_resample(self, waveforms):
"""Resamples the waveform at the new frequency.
This matches Kaldi's OfflineFeatureTpl ResampleWaveform which uses a
LinearResample (resample a signal at linearly spaced intervals to
up/downsample a signal). LinearResample (LR) means that the output
signal is at linearly spaced intervals (i.e the output signal has a
frequency of `new_freq`). It uses sinc/bandlimited interpolation to
upsample/downsample the signal.
(almost directly from torchaudio.compliance.kaldi)
https://ccrma.stanford.edu/~jos/resample/
Theory_Ideal_Bandlimited_Interpolation.html
https://github.com/kaldi-asr/kaldi/blob/master/src/feat/resample.h#L56
Arguments
---------
waveforms : tensor
The batch of audio signals to resample.
Returns
-------
The waveforms at the new frequency.
"""
# Compute output size and initialize
batch_size, num_channels, wave_len = waveforms.shape
window_size = self.weights.shape[1]
tot_output_samp = self._output_samples(wave_len)
resampled_waveform = paddle.zeros(
(batch_size, num_channels, tot_output_samp)
)
# eye size: (num_channels, num_channels, 1)
eye = paddle.eye(num_channels).unsqueeze(2)
# Iterate over the phases in the polyphase filter
for i in range(self.first_indices.shape[0]):
wave_to_conv = waveforms
first_index = int(self.first_indices[i].item())
if first_index >= 0:
# trim the signal as the filter will not be applied
# before the first_index
wave_to_conv = wave_to_conv[..., first_index:]
# pad the right of the signal to allow partial convolutions
# meaning compute values for partial windows (e.g. end of the
# window is outside the signal length)
max_index = (tot_output_samp - 1) // self.output_samples
end_index = max_index * self.conv_stride + window_size
current_wave_len = wave_len - first_index
right_padding = max(0, end_index + 1 - current_wave_len)
left_padding = max(0, -first_index)
wave_to_conv = paddle.nn.functional.pad(
wave_to_conv, (left_padding, right_padding), data_format='NCL'
)
conv_wave = paddle.nn.functional.conv1d(
x=wave_to_conv,
weight=self.weights[i].tile([num_channels, 1, 1]),
stride=self.conv_stride,
groups=num_channels,
)
# we want conv_wave[:, i] to be at
# output[:, i + n*conv_transpose_stride]
dilated_conv_wave = paddle.nn.functional.conv1d_transpose(
conv_wave, eye, stride=self.conv_transpose_stride
)
# pad dilated_conv_wave so it reaches the output length if needed.
left_padding = i
previous_padding = left_padding + dilated_conv_wave.shape[-1]
right_padding = max(0, tot_output_samp - previous_padding)
dilated_conv_wave = paddle.nn.functional.pad(
dilated_conv_wave, (left_padding, right_padding), data_format='NCL'
)
dilated_conv_wave = dilated_conv_wave[..., :tot_output_samp]
resampled_waveform += dilated_conv_wave
return resampled_waveform
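# Note on the loop above: this is a polyphase resampler. Phase i of the filter
# bank produces output samples i, i + conv_transpose_stride, i + 2*conv_transpose_stride, ...
# so the strided conv1d followed by conv1d_transpose scatters each phase's
# results to the right positions, and the per-phase contributions are summed.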
def _output_samples(self, input_num_samp):
"""Based on LinearResample::GetNumOutputSamples.
LinearResample (LR) means that the output signal is at
linearly spaced intervals (i.e the output signal has a
frequency of ``new_freq``). It uses sinc/bandlimited
interpolation to upsample/downsample the signal.
(almost directly from torchaudio.compliance.kaldi)
Arguments
---------
input_num_samp : int
The number of samples in each example in the batch.
Returns
-------
Number of samples in the output waveform.
"""
# For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
# where tick_freq is the least common multiple of samp_in and
# samp_out.
samp_in = int(self.orig_freq)
samp_out = int(self.new_freq)
tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out)
ticks_per_input_period = tick_freq // samp_in
# work out the number of ticks in the time interval
# [ 0, input_num_samp/samp_in ).
interval_length = input_num_samp * ticks_per_input_period
if interval_length <= 0:
return 0
ticks_per_output_period = tick_freq // samp_out
# Get the last output-sample in the closed interval,
# i.e. replacing [ ) with [ ]. Note: integer division rounds down.
# See http://en.wikipedia.org/wiki/Interval_(mathematics) for an
# explanation of the notation.
last_output_samp = interval_length // ticks_per_output_period
# We need the last output-sample in the open interval, so if it
# takes us to the end of the interval exactly, subtract one.
if last_output_samp * ticks_per_output_period == interval_length:
last_output_samp -= 1
# First output-sample index is zero, so the number of output samples
# is the last output-sample plus one.
num_output_samp = last_output_samp + 1
return num_output_samp
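# Worked example: for orig_freq=16000 and new_freq=8000, tick_freq = lcm = 16000,
# ticks_per_input_period = 1 and ticks_per_output_period = 2. An input of 52173
# samples gives interval_length = 52173 and last_output_samp = 26086; since
# 26086 * 2 != 52173 no correction is applied, so 26087 samples are produced
# (matching the Resample docstring example).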
def _indices_and_weights(self, waveforms):
"""Based on LinearResample::SetIndexesAndWeights
Retrieves the weights for resampling as well as the indices in which
they are valid. LinearResample (LR) means that the output signal is at
linearly spaced intervals (i.e the output signal has a frequency
of ``new_freq``). It uses sinc/bandlimited interpolation to
upsample/downsample the signal.
Returns
-------
- the place where each filter should start being applied
- the filters to be applied to the signal for resampling
"""
# Lowpass filter frequency depends on smaller of two frequencies
min_freq = min(self.orig_freq, self.new_freq)
lowpass_cutoff = 0.99 * 0.5 * min_freq
assert lowpass_cutoff * 2 <= min_freq
window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
output_t = paddle.arange(
start=0.0, end=self.output_samples
)
output_t /= self.new_freq
min_t = output_t - window_width
max_t = output_t + window_width
min_input_index = paddle.ceil(min_t * self.orig_freq)
max_input_index = paddle.floor(max_t * self.orig_freq)
num_indices = max_input_index - min_input_index + 1
max_weight_width = num_indices.max()
j = paddle.arange(max_weight_width)
input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0)
delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1)
weights = paddle.zeros_like(delta_t)
inside_window_indices = delta_t.abs() < (window_width)
# raised-cosine (Hanning) window with width `window_width`
weights[inside_window_indices] = 0.5 * (
1
+ paddle.cos(
2
* math.pi
* lowpass_cutoff
/ self.lowpass_filter_width
* delta_t[inside_window_indices]
)
)
t_eq_zero_indices = delta_t == 0.0
t_not_eq_zero_indices = ~t_eq_zero_indices
# sinc filter function
weights[t_not_eq_zero_indices] *= paddle.sin(
2 * math.pi * lowpass_cutoff * delta_t[t_not_eq_zero_indices]
) / (math.pi * delta_t[t_not_eq_zero_indices])
# limit of the function at t = 0
weights[t_eq_zero_indices] *= 2 * lowpass_cutoff
# size (output_samples, max_weight_width)
weights /= self.orig_freq
self.first_indices = min_input_index
self.weights = weights
class DropFreq(nn.Layer):
"""This class drops a random frequency from the signal.
The purpose of this class is to teach models to learn to rely on all parts
of the signal, not just a few frequency bands.
Arguments
---------
drop_freq_low : float
The low end of frequencies that can be dropped,
as a fraction of the sampling rate / 2.
drop_freq_high : float
The high end of frequencies that can be
dropped, as a fraction of the sampling rate / 2.
drop_count_low : int
The low end of number of frequencies that could be dropped.
drop_count_high : int
The high end of number of frequencies that could be dropped.
drop_width : float
The width of the frequency band to drop, as
a fraction of the sampling_rate / 2.
drop_prob : float
The probability that the batch of signals will have a frequency
dropped. By default, every batch has frequencies dropped.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> dropper = DropFreq()
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> dropped_signal = dropper(signal.unsqueeze(0))
"""
def __init__(
self,
drop_freq_low=1e-14,
drop_freq_high=1,
drop_count_low=1,
drop_count_high=2,
drop_width=0.05,
drop_prob=1,
):
super().__init__()
self.drop_freq_low = drop_freq_low
self.drop_freq_high = drop_freq_high
self.drop_count_low = drop_count_low
self.drop_count_high = drop_count_high
self.drop_width = drop_width
self.drop_prob = drop_prob
def forward(self, waveforms):
"""
Arguments
---------
waveforms : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
Returns
-------
Tensor of shape `[batch, time]` or `[batch, time, channels]`.
"""
# Don't drop (return early) 1-`drop_prob` portion of the batches
dropped_waveform = waveforms.clone()
if paddle.rand([1]) > self.drop_prob:
return dropped_waveform
# Add channels dimension
if len(waveforms.shape) == 2:
dropped_waveform = dropped_waveform.unsqueeze(-1)
# Pick number of frequencies to drop
drop_count = paddle.randint(
low=self.drop_count_low, high=self.drop_count_high + 1, shape=(1,),
)
# Pick a frequency to drop
drop_range = self.drop_freq_high - self.drop_freq_low
drop_frequency = (
paddle.rand(drop_count) * drop_range + self.drop_freq_low
)
# Filter parameters
filter_length = 101
pad = filter_length // 2
# Start with delta function
drop_filter = paddle.zeros([1, filter_length, 1])
drop_filter[0, pad, 0] = 1
# Subtract each frequency
for frequency in drop_frequency:
notch_kernel = notch_filter(
frequency, filter_length, self.drop_width,
)
drop_filter = convolve1d(drop_filter, notch_kernel, pad)
# Apply filter
dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
# Remove channels dimension if added
return dropped_waveform.squeeze(-1)
class DropChunk(nn.Layer):
"""This class drops portions of the input signal.
Using `DropChunk` as an augmentation strategy helps models learn to rely
on all parts of the signal, since it can't expect a given part to be
present.
Arguments
---------
drop_length_low : int
The low end of lengths for which to set the
signal to zero, in samples.
drop_length_high : int
The high end of lengths for which to set the
signal to zero, in samples.
drop_count_low : int
The low end of number of times that the signal
can be dropped to zero.
drop_count_high : int
The high end of number of times that the signal
can be dropped to zero.
drop_start : int
The first index for which dropping will be allowed.
drop_end : int
The last index for which dropping will be allowed.
drop_prob : float
The probability that the batch of signals will
have a portion dropped. By default, every batch
has portions dropped.
noise_factor : float
The factor relative to average amplitude of an utterance
to use for scaling the white noise inserted. 1 keeps
the average amplitude the same, while 0 inserts all 0's.
Example
-------
>>> from speechbrain.dataio.dataio import read_audio
>>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
>>> signal = read_audio('tests/samples/single-mic/example1.wav')
>>> signal = signal.unsqueeze(0) # [batch, time, channels]
>>> length = paddle.ones([1])
>>> dropped_signal = dropper(signal, length)
>>> float(dropped_signal[:, 150])
0.0
"""
def __init__(
self,
drop_length_low=100,
drop_length_high=1000,
drop_count_low=1,
drop_count_high=10,
drop_start=0,
drop_end=None,
drop_prob=1,
noise_factor=0.0,
):
super().__init__()
self.drop_length_low = drop_length_low
self.drop_length_high = drop_length_high
self.drop_count_low = drop_count_low
self.drop_count_high = drop_count_high
self.drop_start = drop_start
self.drop_end = drop_end
self.drop_prob = drop_prob
self.noise_factor = noise_factor
# Validate low < high
if drop_length_low > drop_length_high:
raise ValueError("Low limit must not be more than high limit")
if drop_count_low > drop_count_high:
raise ValueError("Low limit must not be more than high limit")
# Make sure the length doesn't exceed end - start
if drop_end is not None and drop_end >= 0:
if drop_start > drop_end:
raise ValueError("Low limit must not be more than high limit")
drop_range = drop_end - drop_start
self.drop_length_low = min(drop_length_low, drop_range)
self.drop_length_high = min(drop_length_high, drop_range)
def forward(self, waveforms, lengths):
"""
Arguments
---------
waveforms : tensor
Shape should be `[batch, time]` or `[batch, time, channels]`.
lengths : tensor
Shape should be a single dimension, `[batch]`.
Returns
-------
Tensor of shape `[batch, time]` or
`[batch, time, channels]`
"""
# Reading input list
lengths = (lengths * waveforms.shape[1]).astype('int64')
batch_size = waveforms.shape[0]
dropped_waveform = waveforms.clone()
# Don't drop (return early) 1-`drop_prob` portion of the batches
if paddle.rand([1]) > self.drop_prob:
return dropped_waveform
# Store original amplitude for computing white noise amplitude
clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
# Pick a number of times to drop
drop_times = paddle.randint(
low=self.drop_count_low,
high=self.drop_count_high + 1,
shape=(batch_size,),
)
# Iterate batch to set mask
for i in range(batch_size):
if drop_times[i] == 0:
continue
# Pick lengths
length = paddle.randint(
low=self.drop_length_low,
high=self.drop_length_high + 1,
shape=(drop_times[i],),
)
# Compute range of starting locations
start_min = self.drop_start
if start_min < 0:
start_min += lengths[i]
start_max = self.drop_end
if start_max is None:
start_max = lengths[i]
if start_max < 0:
start_max += lengths[i]
start_max = max(0, start_max - length.max())
# Pick starting locations
start = paddle.randint(
low=start_min, high=start_max + 1, shape=(drop_times[i],),
)
end = start + length
# Update waveform
if not self.noise_factor:
for j in range(drop_times[i]):
dropped_waveform[i, start[j] : end[j]] = 0.0
else:
# Uniform distribution of -2 to +2 * avg amplitude should
# preserve the average for normalization
noise_max = 2 * clean_amplitude[i] * self.noise_factor
for j in range(drop_times[i]):
# zero-center the noise distribution
noise_vec = paddle.rand([int(length[j])])
noise_vec = 2 * noise_max * noise_vec - noise_max
dropped_waveform[i, start[j] : end[j]] = noise_vec
return dropped_waveform
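# Why this preserves loudness: noise_vec is uniform on [-noise_max, noise_max]
# with noise_max = 2 * clean_amplitude * noise_factor, so its expected magnitude
# is noise_max / 2 = clean_amplitude * noise_factor. With noise_factor=1 the
# filled chunks keep the utterance's average amplitude; with 0 they are silenced.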
class TimeDomainSpecAugment(nn.Layer):
"""A time-domain approximation of the SpecAugment algorithm.
This augmentation module implements three augmentations in
the time-domain.
1. Drop chunks of the audio (zero amplitude or white noise)
2. Drop frequency bands (with band-drop filters)
3. Speed perturbation (via resampling to slightly different rate)
Arguments
---------
perturb_prob : float from 0 to 1
The probability that a batch will have speed perturbation applied.
drop_freq_prob : float from 0 to 1
The probability that a batch will have frequencies dropped.
drop_chunk_prob : float from 0 to 1
The probability that a batch will have chunks dropped.
speeds : list of ints
A set of different speeds to use to perturb each batch.
See ``speechbrain.processing.speech_augmentation.SpeedPerturb``
sample_rate : int
Sampling rate of the input waveforms.
drop_freq_count_low : int
Lowest number of frequencies that could be dropped.
drop_freq_count_high : int
Highest number of frequencies that could be dropped.
drop_chunk_count_low : int
Lowest number of chunks that could be dropped.
drop_chunk_count_high : int
Highest number of chunks that could be dropped.
drop_chunk_length_low : int
Lowest length of chunks that could be dropped.
drop_chunk_length_high : int
Highest length of chunks that could be dropped.
drop_chunk_noise_factor : float
The noise factor used to scale the white noise inserted, relative to
the average amplitude of the utterance. Default 0 (no noise inserted).
Example
-------
>>> inputs = paddle.randn([10, 16000])
>>> feature_maker = TimeDomainSpecAugment(speeds=[80])
>>> feats = feature_maker(inputs, paddle.ones([10]))
>>> feats.shape
[10, 12800]
"""
def __init__(
self,
perturb_prob=1.0,
drop_freq_prob=1.0,
drop_chunk_prob=1.0,
speeds=[95, 100, 105],
sample_rate=16000,
drop_freq_count_low=0,
drop_freq_count_high=3,
drop_chunk_count_low=0,
drop_chunk_count_high=5,
drop_chunk_length_low=1000,
drop_chunk_length_high=2000,
drop_chunk_noise_factor=0,
):
super().__init__()
self.speed_perturb = SpeedPerturb(
perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds
)
self.drop_freq = DropFreq(
drop_prob=drop_freq_prob,
drop_count_low=drop_freq_count_low,
drop_count_high=drop_freq_count_high,
)
self.drop_chunk = DropChunk(
drop_prob=drop_chunk_prob,
drop_count_low=drop_chunk_count_low,
drop_count_high=drop_chunk_count_high,
drop_length_low=drop_chunk_length_low,
drop_length_high=drop_chunk_length_high,
noise_factor=drop_chunk_noise_factor,
)
def forward(self, waveforms, lengths):
"""Returns the distorted waveforms.
Arguments
---------
waveforms : paddle.Tensor
The waveforms to distort.
lengths : paddle.Tensor
The relative lengths of the waveforms in the batch.
"""
# Augmentation
with paddle.no_grad():
waveforms = self.speed_perturb(waveforms)
waveforms = self.drop_freq(waveforms)
waveforms = self.drop_chunk(waveforms, lengths)
return waveforms
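# Usage sketch, mirroring the docstring example with paddle tensors (shapes are
# illustrative; speed 80 resamples 16 kHz audio to 12.8 kHz, so 16000 samples
# become 12800):
# inputs = paddle.randn([10, 16000])
# augment = TimeDomainSpecAugment(sample_rate=16000, speeds=[80])
# feats = augment(inputs, paddle.ones([10]))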

@ -0,0 +1,14 @@
import paddle
import numpy as np
def blackman_window(window_length, periodic=True):
if window_length == 0:
return []
if window_length == 1:
return paddle.ones([1])
if periodic:
window_length += 1
window = paddle.arange(window_length) * (np.pi / (window_length - 1))
window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42
return window[:-1] if periodic else window

@ -0,0 +1,287 @@
import numpy as np
import os
import sys
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ConfigPure
from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
from paddlespeech.s2t.modules.mask import make_pad_mask
from paddlespeech.s2t.utils.utility import log_add
from collections import defaultdict
from paddlespeech.s2t.models.wav2vec2.speechbrain.lobes.models.VanillaNN import VanillaNN
from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
from yacs.config import CfgNode
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
class Wav2vec2ASR(nn.Layer):
def __init__(self, config: dict):
super().__init__()
wav2vec2_config = Wav2Vec2ConfigPure()
wav2vec2 = Wav2Vec2Model(wav2vec2_config)
model_dict = paddle.load(config.wav2vec2_params_path)
wav2vec2.set_state_dict(model_dict)
wav2vec2.eval()
self.normalize_wav = config.normalize_wav
self.output_norm = config.output_norm
if config.freeze_wav2vec2:
for parm in wav2vec2.parameters():
parm.trainable = False
self.wav2vec2 = wav2vec2
self.enc = VanillaNN(
input_shape=[None, None, wav2vec2_config.hidden_size],
activation=nn.LeakyReLU,
dnn_blocks=config.dnn_blocks,
dnn_neurons=config.dnn_neurons)
self.ctc = CTC(
odim=config.output_dim,
enc_n_units=config.dnn_neurons,
blank_id=config.blank_id,
dropout_rate=config.ctc_dropout_rate,
reduction_type="mean")
def train_batch(self):
wav, wavs_lens_rate, target, target_lens_rate = self._get_data()
ctc_loss = self(wav, wavs_lens_rate, target, target_lens_rate)
def forward(self, wav, wavs_lens_rate, target, target_lens_rate):
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.wav2vec2(wav)[0]
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape[1:])
feats = out
x = self.enc(feats)
x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
target_lens = (target_lens_rate * target.shape[1]).round().astype(paddle.int64)
ctc_loss = self.ctc(x, x_lens, target, target_lens)
return ctc_loss
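# Shape walk-through of forward(): wav [B, T] is normalized, encoded by wav2vec2
# into [B, T', hidden_size], projected by the VanillaNN enc to [B, T', dnn_neurons],
# and scored by the CTC head. The *_lens_rate inputs are relative lengths in [0, 1]
# and are converted to absolute frame/token counts before computing the CTC loss.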
@paddle.no_grad()
def decode(self,
feats: paddle.Tensor,
feats_lengths: paddle.Tensor,
text_feature: Dict[str, int],
decoding_method: str,
beam_size: int):
batch_size = feats.shape[0]
if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
logger.error(
f'decoding mode {decoding_method} must be running with batch_size == 1'
)
logger.error(f"current batch_size is {batch_size}")
sys.exit(1)
if decoding_method == 'ctc_greedy_search':
hyps = self.ctc_greedy_search(feats, feats_lengths)
res = [text_feature.defeaturize(hyp) for hyp in hyps]
res_tokenids = [hyp for hyp in hyps]
# ctc_prefix_beam_search and attention_rescoring only return one
# result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode
elif decoding_method == 'ctc_prefix_beam_search':
assert feats.shape[0] == 1
hyp = self.ctc_prefix_beam_search(
feats,
feats_lengths,
beam_size)
res = [text_feature.defeaturize(hyp)]
res_tokenids = [hyp]
else:
raise ValueError(f"wav2vec2 not support decoding method: {decoding_method}")
return res, res_tokenids
@classmethod
def from_config(cls, config):
model = cls(config)
return model
def ctc_greedy_search(
self, wav, wavs_lens_rate) -> List[List[int]]:
""" Apply CTC greedy search
Args:
speech (paddle.Tensor): (batch, max_len)
speech_length (paddle.Tensor): (batch, )
Returns:
List[List[int]]: best path result
"""
batch_size = wav.shape[0]
wav = wav[:,:,0]
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.wav2vec2(wav)[0]
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape[1:])
feats = out
x = self.enc(feats)
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
# pad_mask = make_pad_mask(x_lens) # (B, maxlen)
# topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
return hyps
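# Example of the greedy post-processing above (assuming blank_id == 0): a
# frame-wise argmax sequence [5, 5, 0, 0, 7, 7, 0, 9] collapses repeats and drops
# blanks, so remove_duplicates_and_blank returns [5, 7, 9].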
def _ctc_prefix_beam_search(
self,
wav,
wavs_lens_rate,
beam_size,
blank_id: int=0) -> Tuple[List[Tuple[int, float]], paddle.Tensor]:
""" CTC prefix beam search inner implementation
Args:
speech (paddle.Tensor): (batch, max_len, feat_dim)
speech_length (paddle.Tensor): (batch, )
beam_size (int): beam size for beam search
decoding_chunk_size (int): decoding chunk for dynamic chunk
trained model.
<0: for decoding, use full chunk.
>0: for decoding, use fixed chunk size as set.
0: used for training, it's prohibited here
simulate_streaming (bool): whether do encoder forward in a
streaming fashion
Returns:
List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood)
paddle.Tensor: encoder output, (1, max_len, encoder_dim),
it will be used for rescoring in attention rescoring mode
"""
wav = wav[:,:,0]
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.wav2vec2(wav)[0]
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape[1:])
feats = out
x = self.enc(feats)
maxlen = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size)
ctc_probs = ctc_probs.squeeze(0)
# cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
# blank_ending_score and none_blank_ending_score in ln domain
cur_hyps = [(tuple(), (0.0, -float('inf')))]
# 2. CTC beam search step by step
for t in range(0, maxlen):
logp = ctc_probs[t] # (vocab_size,)
# key: prefix, value (pb, pnb), default value(-inf, -inf)
next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
# 2.1 First beam prune: select topk best
top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,)
for s in top_k_index:
s = s.item()
ps = logp[s].item()
for prefix, (pb, pnb) in cur_hyps:
last = prefix[-1] if len(prefix) > 0 else None
if s == blank_id: # blank
n_pb, n_pnb = next_hyps[prefix]
n_pb = log_add([n_pb, pb + ps, pnb + ps])
next_hyps[prefix] = (n_pb, n_pnb)
elif s == last:
# Update *ss -> *s;
n_pb, n_pnb = next_hyps[prefix]
n_pnb = log_add([n_pnb, pnb + ps])
next_hyps[prefix] = (n_pb, n_pnb)
# Update *s-s -> *ss, - is for blank
n_prefix = prefix + (s, )
n_pb, n_pnb = next_hyps[n_prefix]
n_pnb = log_add([n_pnb, pb + ps])
next_hyps[n_prefix] = (n_pb, n_pnb)
else:
n_prefix = prefix + (s, )
n_pb, n_pnb = next_hyps[n_prefix]
n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
next_hyps[n_prefix] = (n_pb, n_pnb)
# 2.2 Second beam prune
next_hyps = sorted(
next_hyps.items(),
key=lambda x: log_add(list(x[1])),
reverse=True)
cur_hyps = next_hyps[:beam_size]
hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
return hyps
def ctc_prefix_beam_search(self, wav, wavs_lens_rate, beam_size) -> List[int]:
""" Apply CTC prefix beam search
Args:
speech (paddle.Tensor): (batch, max_len, feat_dim)
speech_length (paddle.Tensor): (batch, )
beam_size (int): beam size for beam search
decoding_chunk_size (int): decoding chunk for dynamic chunk
trained model.
<0: for decoding, use full chunk.
>0: for decoding, use fixed chunk size as set.
0: used for training, it's prohibited here
simulate_streaming (bool): whether do encoder forward in a
streaming fashion
Returns:
List[int]: CTC prefix beam search nbest results
"""
hyps = self._ctc_prefix_beam_search(
wav, wavs_lens_rate, beam_size)
return hyps[0][0]
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
Args:
xs (paddle.Tensor): encoder output, (B, T, D)
Returns:
paddle.Tensor: activation before ctc
"""
return self.ctc.log_softmax(xs)
def _get_data(self):
data_dir = "data"
wavs = np.load(os.path.join(data_dir, "wavs.npy"))
wavs_lens = np.load(os.path.join(data_dir, "wavs_lens.npy"))
tokens = np.load(os.path.join(data_dir, "tokens.npy"))
tokens_lens = np.load(os.path.join(data_dir, "tokens_lens.npy"))
batch = (paddle.to_tensor(wavs), paddle.to_tensor(wavs_lens, dtype='float32'),
paddle.to_tensor(tokens, dtype='int32'), paddle.to_tensor(tokens_lens, dtype='float32'))
return batch
if __name__ == "__main__":
# wav2vec2_asr = Wav2vec2ASR(config={})
# wav2vec2_asr.train_batch()
freeze = True
config = Wav2Vec2ConfigPure()
model = Wav2Vec2Model(config)
model_dict = model.state_dict()
revise_params_path = "exp/torch_to_paddle_revise.pdparams"
model_dict_revise = paddle.load(revise_params_path)
model.set_state_dict(model_dict_revise)
model.training = True
model.eval()
if freeze:
for parm in model.parameters():
parm.trainable = False
# get enc()
enc = VanillaNN(input_shape=[None,None,1024], activation=paddle.nn.LeakyReLU, dnn_blocks=2, dnn_neurons=1024)
ctc = CTC(odim=30, enc_n_units=1024, blank_id=0, dropout_rate=0.0)
input_values = np.load("input_values.npy")
input_values = paddle.to_tensor(input_values)
feats = model(input_values).last_hidden_state
x = enc(feats)
ctc_loss = ctc(enc, target)

@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn
import math
"""
To align the initializer between paddle and torch,
the APIs below set a default initializer with priority higher than the global initializer.
@ -82,18 +81,10 @@ class Linear(nn.Linear):
name=None):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
weight_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
bias_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Linear, self).__init__(in_features, out_features, weight_attr,
bias_attr, name)
@ -113,18 +104,10 @@ class Conv1D(nn.Conv1D):
data_format='NCL'):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
weight_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
bias_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Conv1D, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
groups, padding_mode, weight_attr, bias_attr, data_format)
@ -145,18 +128,10 @@ class Conv2D(nn.Conv2D):
data_format='NCHW'):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
weight_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
bias_attr = paddle.ParamAttr(
initializer=nn.initializer.KaimingUniform(
fan_in=None,
negative_slope=math.sqrt(5),
nonlinearity='leaky_relu'))
bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Conv2D, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
groups, padding_mode, weight_attr, bias_attr, data_format)

@ -15,6 +15,7 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
from typing import Optional
from typing import Tuple
import paddle
@ -82,11 +83,10 @@ class MultiHeadedAttention(nn.Layer):
return q, k, v
def forward_attention(
self,
def forward_attention(self,
value: paddle.Tensor,
scores: paddle.Tensor,
mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool),
) -> paddle.Tensor:
"""Compute attention context vector.
Args:
@ -127,14 +127,13 @@ class MultiHeadedAttention(nn.Layer):
return self.linear_out(x) # (batch, time1, d_model)
def forward(
self,
def forward(self,
query: paddle.Tensor,
key: paddle.Tensor,
value: paddle.Tensor,
mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
pos_emb: paddle.Tensor, # paddle.empty([0])
cache: paddle.Tensor # paddle.zeros([0,0,0,0])
mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
pos_emb: paddle.Tensor = paddle.empty([0]),
cache: paddle.Tensor = paddle.zeros([0,0,0,0])
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute scaled dot product attention.
Args:
@ -244,14 +243,13 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
return x
def forward(
self,
def forward(self,
query: paddle.Tensor,
key: paddle.Tensor,
value: paddle.Tensor,
mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
pos_emb: paddle.Tensor, # paddle.empty([0])
cache: paddle.Tensor # paddle.zeros([0,0,0,0])
mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
pos_emb: paddle.Tensor = paddle.empty([0]),
cache: paddle.Tensor = paddle.zeros([0,0,0,0])
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:

@ -14,6 +14,7 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""ConvolutionModule definition."""
from typing import Optional
from typing import Tuple
import paddle
@ -105,11 +106,10 @@ class ConvolutionModule(nn.Layer):
)
self.activation = activation
def forward(
self,
def forward(self,
x: paddle.Tensor,
mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
cache: paddle.Tensor # paddle.zeros([0,0,0,0])
mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool),
cache: paddle.Tensor= paddle.zeros([0,0,0]),
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute convolution module.
Args:

@ -53,7 +53,7 @@ class CTCDecoderBase(nn.Layer):
enc_n_units,
blank_id=0,
dropout_rate: float=0.0,
reduction: bool=True,
reduction_type: str="sum",
batch_average: bool=True,
grad_norm_type: Union[str, None]=None):
"""CTC decoder
@ -73,7 +73,7 @@ class CTCDecoderBase(nn.Layer):
self.odim = odim
self.dropout = nn.Dropout(dropout_rate)
self.ctc_lo = Linear(enc_n_units, self.odim)
reduction_type = "sum" if reduction else "none"
reduction_type = reduction_type if reduction_type else "none"
self.criterion = CTCLoss(
blank=self.blank_id,
reduction=reduction_type,

@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer):
if self.concat_after:
tgt_concat = paddle.cat(
(tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
paddle.empty([0]),
paddle.zeros([0, 0, 0, 0]))[0]),
dim=-1)
(tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1)
x = residual + self.concat_linear1(tgt_concat)
else:
x = residual + self.dropout(
self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[
0])
self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
if not self.normalize_before:
x = self.norm1(x)
@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer):
x = self.norm2(x)
if self.concat_after:
x_concat = paddle.cat(
(x, self.src_attn(x, memory, memory, memory_mask,
paddle.empty([0]),
paddle.zeros([0, 0, 0, 0]))[0]),
dim=-1)
(x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1)
x = residual + self.concat_linear2(x_concat)
else:
x = residual + self.dropout(
self.src_attn(x, memory, memory, memory_mask,
paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0])
self.src_attn(x, memory, memory, memory_mask)[0])
if not self.normalize_before:
x = self.norm2(x)

@ -14,6 +14,8 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Encoder definition."""
from typing import List
from typing import Optional
from typing import Tuple
import paddle
@ -175,9 +177,7 @@ class BaseEncoder(nn.Layer):
decoding_chunk_size, self.static_chunk_size,
num_decoding_left_chunks)
for layer in self.encoders:
xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad,
paddle.zeros([0, 0, 0, 0]),
paddle.zeros([0, 0, 0, 0]))
xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
if self.normalize_before:
xs = self.after_norm(xs)
# Here we assume the mask is not changed in encoder layers, so just
@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0])
cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]),
att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]),
cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]),
att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Forward just one chunk
Args:
@ -252,16 +252,13 @@ class BaseEncoder(nn.Layer):
# att_cache[i:i+1] = (1, head, cache_t1, d_k*2)
# cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2)
xs, _, new_att_cache, new_cnn_cache = layer(
xs,
att_mask,
pos_emb,
mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool),
att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
cnn_cache=cnn_cache[i:i + 1]
if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
xs, att_mask, pos_emb,
att_cache=att_cache[i:i+1] if elayers > 0 else att_cache,
cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache,
)
# new_att_cache = (1, head, attention_key_size, d_k*2)
# new_cnn_cache = (B=1, hidden-dim, cache_t2)
r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
r_att_cache.append(new_att_cache[:,:, next_cache_start:, :])
r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim
if self.normalize_before:
@ -273,6 +270,7 @@ class BaseEncoder(nn.Layer):
r_cnn_cache = paddle.concat(r_cnn_cache, axis=0)
return xs, r_att_cache, r_cnn_cache
def forward_chunk_by_chunk(
self,
xs: paddle.Tensor,
@ -317,8 +315,8 @@ class BaseEncoder(nn.Layer):
num_frames = xs.shape[1]
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0])
att_cache: paddle.Tensor = paddle.zeros([0,0,0,0])
cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0])
outputs = []
offset = 0
@ -328,8 +326,7 @@ class BaseEncoder(nn.Layer):
chunk_xs = xs[:, cur:end, :]
(y, att_cache, cnn_cache) = self.forward_chunk(
chunk_xs, offset, required_cache_size, att_cache, cnn_cache,
paddle.ones([0, 0, 0], dtype=paddle.bool))
chunk_xs, offset, required_cache_size, att_cache, cnn_cache)
outputs.append(y)
offset += y.shape[1]

@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
mask_pad: paddle.
Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:
@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer):
if self.normalize_before:
x = self.norm1(x)
x_att, new_att_cache = self.self_attn(
x, x, x, mask, paddle.empty([0]), cache=att_cache)
x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache)
if self.concat_after:
x_concat = paddle.concat((x, x_att), axis=-1)
@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool)
att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:

@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
class DefaultInitializerContext(object):
"""

@ -103,6 +103,8 @@ class OptimizerFactory():
grad_clip = ClipGradByGlobalNormWithLog(
args['grad_clip']) if "grad_clip" in args else None
# grad_clip = paddle.nn.ClipGradByGlobalNorm(
# args['grad_clip']) if "grad_clip" in args else None
weight_decay = L2Decay(
args['weight_decay']) if "weight_decay" in args else None
if weight_decay:

@ -106,6 +106,59 @@ class ConstantLR(LRScheduler):
def get_lr(self):
return self.base_lr
@register_scheduler
class NewBobScheduler(LRScheduler):
"""
Args:
learning_rate (float): The initial learning rate. It is a python float number.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
Returns:
``ConstantLR`` instance to schedule learning rate.
"""
def __init__(
self,
learning_rate,
annealing_factor=0.5,
improvement_threshold=0.0025,
patient=0,
):
self.hyperparam_value = learning_rate
self.annealing_factor = annealing_factor
self.improvement_threshold = improvement_threshold
self.patient = patient
self.metric_values = []
self.current_patient = self.patient
def __call__(self, metric_value):
"""Returns the current and new value for the hyperparameter.
Arguments
---------
metric_value : float
A number for determining whether to change the hyperparameter value.
"""
old_value = new_value = self.hyperparam_value
if len(self.metric_values) > 0:
prev_metric = self.metric_values[-1]
# Update value if improvement too small and patience is 0
if prev_metric == 0: # Prevent division by zero
improvement = 0
else:
improvement = (prev_metric - metric_value) / prev_metric
if improvement < self.improvement_threshold:
if self.current_patient == 0:
new_value *= self.annealing_factor
self.current_patient = self.patient
else:
self.current_patient -= 1
# Store relevant info
self.metric_values.append(metric_value)
self.hyperparam_value = new_value
return old_value, new_value
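# Usage sketch with illustrative values (defaults: annealing_factor=0.5,
# improvement_threshold=0.0025, patient=0):
# scheduler = NewBobScheduler(learning_rate=1.0)
# scheduler(10.0) # -> (1.0, 1.0) first value, nothing to compare against
# scheduler(9.0) # -> (1.0, 1.0) 10% relative improvement, keep the rate
# scheduler(8.999) # -> (1.0, 0.5) ~0.01% improvement < threshold, anneal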
def dynamic_import_scheduler(module):
"""Import Scheduler class dynamically.

@ -19,6 +19,8 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
dist.init_parallel_env()
from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope
@ -130,7 +132,9 @@ class Trainer():
latest_n=self.config.checkpoint.latest_n)
# set random seed if needed
print(args.seed)
if args.seed:
print('***********')
seed_all(args.seed)
logger.info(f"Set seed {args.seed}")
@ -176,7 +180,7 @@ class Trainer():
def init_parallel(self):
"""Init environment for multiprocess training.
"""
dist.init_parallel_env()
# dist.init_parallel_env()
@mp_tools.rank_zero_only
def save(self, tag=None, infos: dict=None):

@ -25,7 +25,6 @@ asr_python:
cfg_path: # [optional]
ckpt_path: # [optional]
decode_method: 'attention_rescoring'
num_decoding_left_chunks: -1
force_yes: True
device: # set 'gpu:id' or 'cpu'
@ -39,7 +38,6 @@ asr_inference:
lang: 'zh'
sample_rate: 16000
cfg_path:
num_decoding_left_chunks: -1
decode_method:
force_yes: True

@ -103,9 +103,7 @@ class OnlineCTCEndpoint:
assert self.num_frames_decoded >= self.trailing_silence_frames
assert self.frame_shift_in_ms > 0
decoding_something = (
self.num_frames_decoded > self.trailing_silence_frames
) and decoding_something
decoding_something = (self.num_frames_decoded > self.trailing_silence_frames) and decoding_something
utterance_length = self.num_frames_decoded * self.frame_shift_in_ms
trailing_silence = self.trailing_silence_frames * self.frame_shift_in_ms
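For reference, the end-point bookkeeping above works in decoded frames and converts to milliseconds; a small worked example (numbers are illustrative, not from any config):
frame_shift_in_ms = 10
num_frames_decoded = 120
trailing_silence_frames = 30
decoding_something = num_frames_decoded > trailing_silence_frames  # True: more frames than trailing silence
utterance_length = num_frames_decoded * frame_shift_in_ms          # 1200 ms
trailing_silence = trailing_silence_frames * frame_shift_in_ms     # 300 ms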

@ -21,12 +21,12 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils import onnx_infer

@ -21,10 +21,10 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.utility import UpdateConfig

@ -21,10 +21,10 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
@ -130,8 +130,8 @@ class PaddleASRConnectionHanddler:
## conformer
# cache for conformer online
self.att_cache = paddle.zeros([0, 0, 0, 0])
self.cnn_cache = paddle.zeros([0, 0, 0, 0])
self.att_cache = paddle.zeros([0,0,0,0])
self.cnn_cache = paddle.zeros([0,0,0,0])
self.encoder_out = None
# conformer decoding state
@ -474,14 +474,9 @@ class PaddleASRConnectionHanddler:
# cur chunk
chunk_xs = self.cached_feat[:, cur:end, :]
# forward chunk
(y, self.att_cache,
self.cnn_cache) = self.model.encoder.forward_chunk(
chunk_xs,
self.offset,
required_cache_size,
att_cache=self.att_cache,
cnn_cache=self.cnn_cache,
att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool))
(y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
chunk_xs, self.offset, required_cache_size,
self.att_cache, self.cnn_cache)
outputs.append(y)
# update the global offset, in decoding frame unit
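A self-contained sketch of the cache threading that the condensed forward_chunk call above relies on; FakeEncoder is a stand-in and all shapes are made up, the real call goes through self.model.encoder.forward_chunk:
import paddle

class FakeEncoder:
    def forward_chunk(self, xs, offset, required_cache_size, att_cache, cnn_cache):
        # a real encoder returns encoded frames plus updated attention / conv caches
        return xs, att_cache, cnn_cache

encoder = FakeEncoder()
cached_feat = paddle.randn([1, 67, 80])  # (B, T, feat_dim), made-up
att_cache = paddle.zeros([0, 0, 0, 0])
cnn_cache = paddle.zeros([0, 0, 0, 0])
offset, stride, outputs = 0, 16, []
for cur in range(0, cached_feat.shape[1] - stride, stride):
    chunk_xs = cached_feat[:, cur:cur + stride, :]
    y, att_cache, cnn_cache = encoder.forward_chunk(
        chunk_xs, offset, 16, att_cache, cnn_cache)
    outputs.append(y)
    offset += y.shape[1]  # advance the global offset in decoding frames
encoder_out = paddle.concat(outputs, axis=1)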

@ -68,12 +68,9 @@ class ASREngine(BaseEngine):
return False
self.executor._init_from_path(
model_type=self.config.model,
lang=self.config.lang,
sample_rate=self.config.sample_rate,
cfg_path=self.config.cfg_path,
decode_method=self.config.decode_method,
ckpt_path=self.config.ckpt_path)
self.config.model, self.config.lang, self.config.sample_rate,
self.config.cfg_path, self.config.decode_method,
self.config.ckpt_path)
logger.info("Initialize ASR server engine successfully on device: %s." %
(self.device))

@ -105,8 +105,7 @@ class PaddleVectorConnectionHandler:
# we cannot reuse the cached io.BytesIO(audio) data,
# because soundfile seeks the io.BytesIO(audio) to the end,
# so we should convert the base64 string to io.BytesIO whenever we need the audio data
if not self.executor._check(
io.BytesIO(audio), sample_rate, force_yes=True):
if not self.executor._check(io.BytesIO(audio), sample_rate):
logger.debug("check the audio sample rate occurs error")
return np.array([0.0])

@ -11,12 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Collection
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
import paddle
from paddlespeech.t2s.datasets.batch import batch_sequences
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.modules.nets_utils import get_seg_pos
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import pad_list
from paddlespeech.t2s.modules.nets_utils import phones_masking
from paddlespeech.t2s.modules.nets_utils import phones_text_masking
@ -485,56 +492,180 @@ def vits_single_spk_batch_fn(examples):
return batch
def vits_multi_spk_batch_fn(examples):
"""
Returns:
Dict[str, Any]:
- text (Tensor): Text index tensor (B, T_text).
- text_lengths (Tensor): Text length tensor (B,).
- feats (Tensor): Feature tensor (B, T_feats, aux_channels).
- feats_lengths (Tensor): Feature length tensor (B,).
- speech (Tensor): Speech waveform tensor (B, T_wav).
- spk_id (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
- spk_emb (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
"""
# fields = ["text", "text_lengths", "feats", "feats_lengths", "speech", "spk_id"/"spk_emb"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
speech = [np.array(item["wave"], dtype=np.float32) for item in examples]
text_lengths = [
np.array(item["text_lengths"], dtype=np.int64) for item in examples
]
feats_lengths = [
np.array(item["feats_lengths"], dtype=np.int64) for item in examples
]
# for ERNIE SAT
class MLMCollateFn:
"""Functor class of common_collate_fn()"""
text = batch_sequences(text)
feats = batch_sequences(feats)
speech = batch_sequences(speech)
def __init__(
self,
feats_extract,
mlm_prob: float=0.8,
mean_phn_span: int=8,
seg_emb: bool=False,
text_masking: bool=False,
attention_window: int=0,
not_sequence: Collection[str]=(), ):
self.mlm_prob = mlm_prob
self.mean_phn_span = mean_phn_span
self.feats_extract = feats_extract
self.not_sequence = set(not_sequence)
self.attention_window = attention_window
self.seg_emb = seg_emb
self.text_masking = text_masking
# convert each batch to paddle.Tensor
text = paddle.to_tensor(text)
def __call__(self, data: Collection[Tuple[str, Dict[str, np.ndarray]]]
) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
return mlm_collate_fn(
data,
feats_extract=self.feats_extract,
mlm_prob=self.mlm_prob,
mean_phn_span=self.mean_phn_span,
seg_emb=self.seg_emb,
text_masking=self.text_masking,
not_sequence=self.not_sequence)
def mlm_collate_fn(
data: Collection[Tuple[str, Dict[str, np.ndarray]]],
feats_extract=None,
mlm_prob: float=0.8,
mean_phn_span: int=8,
seg_emb: bool=False,
text_masking: bool=False,
pad_value: int=0,
not_sequence: Collection[str]=(),
) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
uttids = [u for u, _ in data]
data = [d for _, d in data]
assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching"
assert all(not k.endswith("_lens")
for k in data[0]), f"*_lens is reserved: {list(data[0])}"
output = {}
for key in data[0]:
array_list = [d[key] for d in data]
# Assume the first axis is length:
# tensor_list: Batch x (Length, ...)
tensor_list = [paddle.to_tensor(a) for a in array_list]
# tensor: (Batch, Length, ...)
tensor = pad_list(tensor_list, pad_value)
output[key] = tensor
# lens: (Batch,)
if key not in not_sequence:
lens = paddle.to_tensor(
[d[key].shape[0] for d in data], dtype=paddle.int64)
output[key + "_lens"] = lens
feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0]))
feats = paddle.to_tensor(feats)
text_lengths = paddle.to_tensor(text_lengths)
feats_lengths = paddle.to_tensor(feats_lengths)
print("feats.shape:", feats.shape)
feats_lens = paddle.shape(feats)[0]
feats = paddle.unsqueeze(feats, 0)
batch = {
"text": text,
"text_lengths": text_lengths,
"feats": feats,
"feats_lengths": feats_lengths,
"speech": speech
}
# spk_emb has a higher priority than spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = batch_sequences(spk_emb)
spk_emb = paddle.to_tensor(spk_emb)
batch["spk_emb"] = spk_emb
elif "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
text = output["text"]
text_lens = output["text_lens"]
align_start = output["align_start"]
align_start_lens = output["align_start_lens"]
align_end = output["align_end"]
max_tlen = max(text_lens)
max_slen = max(feats_lens)
speech_pad = feats[:, :max_slen]
text_pad = text
text_mask = make_non_pad_mask(
text_lens, text_pad, length_dim=1).unsqueeze(-2)
speech_mask = make_non_pad_mask(
feats_lens, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2)
span_bdy = None
if 'span_bdy' in output.keys():
span_bdy = output['span_bdy']
# dual_mask: when mixing Chinese and English, mask speech and text at the same time
# ernie sat masks both when doing cross-lingual synthesis
if text_masking:
masked_pos, text_masked_pos = phones_text_masking(
xs_pad=speech_pad,
src_mask=speech_mask,
text_pad=text_pad,
text_mask=text_mask,
align_start=align_start,
align_end=align_end,
align_start_lens=align_start_lens,
mlm_prob=mlm_prob,
mean_phn_span=mean_phn_span,
span_bdy=span_bdy)
# for training pure Chinese or pure English -> a3t does not mask phonemes, it only masks speech
# the main difference between a3t and ernie sat is in how the masking is done
else:
masked_pos = phones_masking(
xs_pad=speech_pad,
src_mask=speech_mask,
align_start=align_start,
align_end=align_end,
align_start_lens=align_start_lens,
mlm_prob=mlm_prob,
mean_phn_span=mean_phn_span,
span_bdy=span_bdy)
text_masked_pos = paddle.zeros(paddle.shape(text_pad))
output_dict = {}
speech_seg_pos, text_seg_pos = get_seg_pos(
speech_pad=speech_pad,
text_pad=text_pad,
align_start=align_start,
align_end=align_end,
align_start_lens=align_start_lens,
seg_emb=seg_emb)
output_dict['speech'] = speech_pad
output_dict['text'] = text_pad
output_dict['masked_pos'] = masked_pos
output_dict['text_masked_pos'] = text_masked_pos
output_dict['speech_mask'] = speech_mask
output_dict['text_mask'] = text_mask
output_dict['speech_seg_pos'] = speech_seg_pos
output_dict['text_seg_pos'] = text_seg_pos
output = (uttids, output_dict)
return output
def build_mlm_collate_fn(
sr: int=24000,
n_fft: int=2048,
hop_length: int=300,
win_length: int=None,
n_mels: int=80,
fmin: int=80,
fmax: int=7600,
mlm_prob: float=0.8,
mean_phn_span: int=8,
seg_emb: bool=False,
epoch: int=-1, ):
feats_extract_class = LogMelFBank
feats_extract = feats_extract_class(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
n_mels=n_mels,
fmin=fmin,
fmax=fmax)
if epoch == -1:
mlm_prob_factor = 1
else:
mlm_prob_factor = 0.8
return MLMCollateFn(
feats_extract=feats_extract,
mlm_prob=mlm_prob * mlm_prob_factor,
mean_phn_span=mean_phn_span,
seg_emb=seg_emb)
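A minimal construction sketch for the collate function above (values mirror the defaults; the expected sample keys are an assumption read off mlm_collate_fn, not a documented contract):
collate_fn = build_mlm_collate_fn(
    sr=24000, n_fft=2048, hop_length=300, n_mels=80,
    mlm_prob=0.8, mean_phn_span=8, seg_emb=False)
# it is called with an iterable of (utt_id, sample_dict) pairs, where each sample_dict
# carries at least "text", "speech", "align_start" and "align_end" arrays; it returns
# (utt_ids, output_dict) with padded tensors, masks and "*_lens" entries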

@ -1,9 +1,8 @@
import paddle
import math
import numpy as np
from paddle.io import BatchSampler
class ErnieSATSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such case, each process can pass a DistributedBatchSampler instance
@ -71,7 +70,7 @@ class ErnieSATSampler(BatchSampler):
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
from paddle.distributed import ParallelEnv
from paddle.fluid.dygraph.parallel import ParallelEnv
if num_replicas is not None:
assert isinstance(num_replicas, int) and num_replicas > 0, \
@ -111,8 +110,8 @@ class ErnieSATSampler(BatchSampler):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
subsampled_indices.extend(
indices[self.local_rank * last_local_batch_size:(
subsampled_indices.extend(indices[
self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices

@ -19,9 +19,9 @@ import librosa
import numpy as np
import pypinyin
from praatio import textgrid
from paddlespeech.t2s.exps.ernie_sat.utils import get_dict
from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
from paddlespeech.t2s.exps.ernie_sat.utils import get_dict
DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'
@ -30,7 +30,6 @@ MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
MFA_PATH = 'tools/montreal-forced-aligner/bin'
os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
def _get_max_idx(dic):
return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1]
@ -111,7 +110,7 @@ def alignment(wav_path: str,
tmpbase = './tmp_dir/' + tmp_name
tmpbase = Path(tmpbase)
tmpbase.mkdir(parents=True, exist_ok=True)
print("tmp_name in alignment:", tmp_name)
print("tmp_name in alignment:",tmp_name)
shutil.copyfile(wav_path, tmpbase / wav_name)
txt_name = utt + '.txt'
@ -341,7 +340,7 @@ def get_phns_spans(wav_path: str,
if __name__ == '__main__':
text = "For that reason cover should not be given."
phn, dur, word2phns = alignment("source/p243_313.wav", text, lang='en')
phn, dur, word2phns = alignment("exp/p243_313.wav", text, lang='en')
print(phn, dur)
print(word2phns)
print("---------------------------------")
@ -353,7 +352,7 @@ if __name__ == '__main__':
style=pypinyin.Style.TONE3,
tone_sandhi=True)
text_zh = " ".join(text_zh)
phn, dur, word2phns = alignment("source/000001.wav", text_zh, lang='zh')
phn, dur, word2phns = alignment("exp/000001.wav", text_zh, lang='zh')
print(phn, dur)
print(word2phns)
print("---------------------------------")
@ -368,7 +367,7 @@ if __name__ == '__main__':
print("---------------------------------")
outs = get_phns_spans(
wav_path="source/p243_313.wav",
wav_path="exp/p243_313.wav",
old_str="For that reason cover should not be given.",
new_str="for that reason cover is impossible to be given.")

@ -11,41 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
from typing import List
import librosa
import numpy as np
import paddle
import pypinyin
import soundfile as sf
import yaml
from pypinyin_dict.phrase_pinyin_data import large_pinyin
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.exps.ernie_sat.align import get_phns_spans
from paddlespeech.t2s.exps.ernie_sat.utils import eval_durs
from paddlespeech.t2s.exps.ernie_sat.utils import get_dur_adj_factor
from paddlespeech.t2s.exps.ernie_sat.utils import get_span_bdy
from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.exps.syn_utils import norm
from paddlespeech.t2s.utils import str2bool
large_pinyin.load()
from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
def _p2id(phonemes: List[str]) -> np.ndarray:
def _p2id(self, phonemes: List[str]) -> np.ndarray:
# replace unk phone with sp
phonemes = [phn if phn in vocab_phones else "sp" for phn in phonemes]
phonemes = [
phn if phn in vocab_phones else "sp" for phn in phonemes
]
phone_ids = [vocab_phones[item] for item in phonemes]
return np.array(phone_ids, np.int64)
def prep_feats_with_dur(wav_path: str,
old_str: str='',
new_str: str='',
@ -73,12 +67,12 @@ def prep_feats_with_dur(wav_path: str,
fs=fs,
n_shift=n_shift)
mfa_start = phns_spans_outs['mfa_start']
mfa_end = phns_spans_outs['mfa_end']
old_phns = phns_spans_outs['old_phns']
new_phns = phns_spans_outs['new_phns']
span_to_repl = phns_spans_outs['span_to_repl']
span_to_add = phns_spans_outs['span_to_add']
mfa_start = phns_spans_outs["mfa_start"]
mfa_end = phns_spans_outs["mfa_end"]
old_phns = phns_spans_outs["old_phns"]
new_phns = phns_spans_outs["new_phns"]
span_to_repl = phns_spans_outs["span_to_repl"]
span_to_add = phns_spans_outs["span_to_add"]
# Chinese phns are not necessarily all in the fastspeech2 dict, replace missing ones with sp
if target_lang in {'en', 'zh'}:
@ -138,7 +132,7 @@ def prep_feats_with_dur(wav_path: str,
[wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
# the audio is masked as expected
sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
sf.write(str("new_wav.wav"), new_wav, samplerate=fs)
# 4. get old and new mel span to be mask
old_span_bdy = get_span_bdy(
@ -158,6 +152,8 @@ def prep_feats_with_dur(wav_path: str,
return outs
def prep_feats(wav_path: str,
old_str: str='',
new_str: str='',
@ -167,7 +163,7 @@ def prep_feats(wav_path: str,
fs: int=24000,
n_shift: int=300):
with_dur_outs = prep_feats_with_dur(
outs = prep_feats_with_dur(
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
@ -180,14 +176,15 @@ def prep_feats(wav_path: str,
wav_name = os.path.basename(wav_path)
utt_id = wav_name.split('.')[0]
wav = with_dur_outs['new_wav']
phns = with_dur_outs['new_phns']
mfa_start = with_dur_outs['new_mfa_start']
mfa_end = with_dur_outs['new_mfa_end']
old_span_bdy = with_dur_outs['old_span_bdy']
new_span_bdy = with_dur_outs['new_span_bdy']
wav = outs['new_wav']
phns = outs['new_phns']
mfa_start = outs['new_mfa_start']
mfa_end = outs['new_mfa_end']
old_span_bdy = outs['old_span_bdy']
new_span_bdy = outs['new_span_bdy']
span_bdy = np.array(new_span_bdy)
text = _p2id(phns)
mel = mel_extractor.get_log_mel_fbank(wav)
erniesat_mean, erniesat_std = np.load(erniesat_stat)
normed_mel = norm(mel, erniesat_mean, erniesat_std)
@ -195,225 +192,122 @@ def prep_feats(wav_path: str,
tmpbase = './tmp_dir/' + tmp_name
tmpbase = Path(tmpbase)
tmpbase.mkdir(parents=True, exist_ok=True)
print("tmp_name in synthesize_e2e:",tmp_name)
mel_path = tmpbase / 'mel.npy'
np.save(mel_path, normed_mel)
print("mel_path:",mel_path)
np.save(mel_path, logmel)
durations = [e - s for e, s in zip(mfa_end, mfa_start)]
text = _p2id(phns)
datum = {
datum={
"utt_id": utt_id,
"spk_id": 0,
"text": text,
"text_lengths": len(text),
"speech_lengths": len(normed_mel),
"speech_lengths": 115,
"durations": durations,
"speech": np.load(mel_path),
"speech": mel_path,
"align_start": mfa_start,
"align_end": mfa_end,
"span_bdy": span_bdy
}
batch = collate_fn([datum])
outs = dict()
outs['batch'] = batch
outs['old_span_bdy'] = old_span_bdy
outs['new_span_bdy'] = new_span_bdy
return outs
print("batch:",batch)
return batch, old_span_bdy, new_span_bdy
def get_mlm_output(wav_path: str,
def decode_with_model(mlm_model: nn.Layer,
collate_fn,
wav_path: str,
old_str: str='',
new_str: str='',
source_lang: str='en',
target_lang: str='en',
use_teacher_forcing: bool=False,
duration_adjust: bool=True,
fs: int=24000,
n_shift: int=300):
prep_feats_outs = prep_feats(
n_shift: int=300,
token_list: List[str]=[]):
batch, old_span_bdy, new_span_bdy = prep_feats(
source_lang=source_lang,
target_lang=target_lang,
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
source_lang=source_lang,
target_lang=target_lang,
duration_adjust=duration_adjust,
fs=fs,
n_shift=n_shift)
n_shift=n_shift,
token_list=token_list)
batch = prep_feats_outs['batch']
new_span_bdy = prep_feats_outs['new_span_bdy']
old_span_bdy = prep_feats_outs['old_span_bdy']
feats = collate_fn(batch)[1]
out_mels = erniesat_inference(
speech=batch['speech'],
text=batch['text'],
masked_pos=batch['masked_pos'],
speech_mask=batch['speech_mask'],
text_mask=batch['text_mask'],
speech_seg_pos=batch['speech_seg_pos'],
text_seg_pos=batch['text_seg_pos'],
span_bdy=new_span_bdy)
if 'text_masked_pos' in feats.keys():
feats.pop('text_masked_pos')
output = mlm_model.inference(
text=feats['text'],
speech=feats['speech'],
masked_pos=feats['masked_pos'],
speech_mask=feats['speech_mask'],
text_mask=feats['text_mask'],
speech_seg_pos=feats['speech_seg_pos'],
text_seg_pos=feats['text_seg_pos'],
span_bdy=new_span_bdy,
use_teacher_forcing=use_teacher_forcing)
# concatenate the audio features
output_feat = paddle.concat(x=out_mels, axis=0)
output_feat = paddle.concat(x=output, axis=0)
wav_org, _ = librosa.load(wav_path, sr=fs)
outs = dict()
outs['wav_org'] = wav_org
outs['output_feat'] = output_feat
outs['old_span_bdy'] = old_span_bdy
outs['new_span_bdy'] = new_span_bdy
return outs
return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length
def get_wav(wav_path: str,
source_lang: str='en',
target_lang: str='en',
old_str: str='',
new_str: str='',
duration_adjust: bool=True,
fs: int=24000,
n_shift: int=300):
if __name__ == '__main__':
fs = 24000
n_shift = 300
wav_path = "exp/p243_313.wav"
old_str = "For that reason cover should not be given."
# for edit
# new_str = "for that reason cover is impossible to be given."
# for synthesize
append_str = "do you love me i love you so much"
new_str = old_str + append_str
outs = get_mlm_output(
'''
outs = prep_feats_with_dur(
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
source_lang=source_lang,
target_lang=target_lang,
duration_adjust=duration_adjust,
fs=fs,
n_shift=n_shift)
wav_org = outs['wav_org']
output_feat = outs['output_feat']
new_wav = outs['new_wav']
new_phns = outs['new_phns']
new_mfa_start = outs['new_mfa_start']
new_mfa_end = outs['new_mfa_end']
old_span_bdy = outs['old_span_bdy']
new_span_bdy = outs['new_span_bdy']
masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]]
with paddle.no_grad():
alt_wav = voc_inference(masked_feat)
alt_wav = np.squeeze(alt_wav)
old_time_bdy = [n_shift * x for x in old_span_bdy]
wav_replaced = np.concatenate(
[wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
wav_dict = {"origin": wav_org, "output": wav_replaced}
return wav_dict
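A worked example of the frame-to-sample conversion above (numbers are illustrative):
n_shift, fs = 300, 24000
old_span_bdy = [40, 60]                              # mel frame indices (made up)
old_time_bdy = [n_shift * x for x in old_span_bdy]   # [12000, 18000] samples
# i.e. the vocoded segment replaces 0.5 s to 0.75 s of wav_org at fs = 24000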
def parse_args():
# parse args and config
parser = argparse.ArgumentParser(
description="Synthesize with acoustic model & vocoder")
# ernie sat
parser.add_argument(
'--erniesat_config',
type=str,
default=None,
help='Config of acoustic model.')
parser.add_argument(
'--erniesat_ckpt',
type=str,
default=None,
help='Checkpoint file of acoustic model.')
parser.add_argument(
"--erniesat_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
# vocoder
parser.add_argument(
'--voc',
type=str,
default='pwgan_csmsc',
choices=[
'pwgan_aishell3',
'pwgan_vctk',
'hifigan_aishell3',
'hifigan_vctk',
],
help='Choose vocoder type of tts task.')
parser.add_argument(
'--voc_config', type=str, default=None, help='Config of voc.')
parser.add_argument(
'--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
parser.add_argument(
"--voc_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training voc."
)
# other
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
# ernie sat related
parser.add_argument("--task_name", type=str, help="task name")
parser.add_argument("--wav_path", type=str, help="path of old wav")
parser.add_argument("--old_str", type=str, help="old string")
parser.add_argument("--new_str", type=str, help="new string")
parser.add_argument(
"--source_lang", type=str, default="en", help="source language")
parser.add_argument(
"--target_lang", type=str, default="en", help="target language")
parser.add_argument(
"--duration_adjust",
type=str2bool,
default=True,
help="whether to adjust duration.")
parser.add_argument("--output_name", type=str, default="output.wav")
args = parser.parse_args()
return args
print("---------------------------------")
print("new_wav:", new_wav)
print("new_phns:", new_phns)
print("new_mfa_start:", new_mfa_start)
print("new_mfa_end:", new_mfa_end)
print("old_span_bdy:", old_span_bdy)
print("new_span_bdy:", new_span_bdy)
print("---------------------------------")
'''
if __name__ == '__main__':
args = parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should >= 0 !")
erniesat_config = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/local/default.yaml"
# evaluate(args)
with open(args.erniesat_config) as f:
with open(erniesat_config) as f:
erniesat_config = CfgNode(yaml.safe_load(f))
old_str = args.old_str
new_str = args.new_str
# convert Chinese characters to pinyin
if args.source_lang == 'zh':
old_str = pypinyin.lazy_pinyin(
old_str,
neutral_tone_with_five=True,
style=pypinyin.Style.TONE3,
tone_sandhi=True)
old_str = ' '.join(old_str)
if args.target_lang == 'zh':
new_str = pypinyin.lazy_pinyin(
new_str,
neutral_tone_with_five=True,
style=pypinyin.Style.TONE3,
tone_sandhi=True)
new_str = ' '.join(new_str)
if args.task_name == 'edit':
new_str = new_str
elif args.task_name == 'synthesize':
new_str = old_str + new_str
else:
new_str = old_str + new_str
print("new_str:", new_str)
erniesat_stat = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/train/speech_stats.npy"
# Extractor
mel_extractor = LogMelFBank(
@ -426,50 +320,27 @@ if __name__ == '__main__':
fmin=erniesat_config.fmin,
fmax=erniesat_config.fmax)
collate_fn = build_erniesat_collate_fn(
mlm_prob=erniesat_config.mlm_prob,
mean_phn_span=erniesat_config.mean_phn_span,
seg_emb=erniesat_config.model['enc_input_layer'] == 'sega_mlm',
text_masking=False)
phones_dict='/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/phone_id_map.txt'
vocab_phones = {}
with open(args.phones_dict, 'rt') as f:
with open(phones_dict, 'rt') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
vocab_phones[phn] = int(id)
# ernie sat model
erniesat_inference = get_am_inference(
am='erniesat_dataset',
am_config=erniesat_config,
am_ckpt=args.erniesat_ckpt,
am_stat=args.erniesat_stat,
phones_dict=args.phones_dict)
with open(args.voc_config) as f:
voc_config = CfgNode(yaml.safe_load(f))
# vocoder
voc_inference = get_voc_inference(
voc=args.voc,
voc_config=voc_config,
voc_ckpt=args.voc_ckpt,
voc_stat=args.voc_stat)
erniesat_stat = args.erniesat_stat
wav_dict = get_wav(
wav_path=args.wav_path,
source_lang=args.source_lang,
target_lang=args.target_lang,
prep_feats(wav_path=wav_path,
old_str=old_str,
new_str=new_str,
duration_adjust=args.duration_adjust,
fs=erniesat_config.fs,
n_shift=erniesat_config.n_shift)
sf.write(
args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)
print(
f"\033[1;32;m Generated audio saved into {args.output_name} ! \033[0m")
fs=fs,
n_shift=n_shift)

@ -25,6 +25,7 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode

@ -11,35 +11,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os
from pathlib import Path
from typing import Dict
from typing import List
from typing import Union
import os
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
import hashlib
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
def _get_user():
return os.path.expanduser('~').split('/')[-1]
def str2md5(string):
md5_val = hashlib.md5(string.encode('utf8')).hexdigest()
return md5_val
def get_tmp_name(text: str):
def get_tmp_name(text:str):
return _get_user() + '_' + str(os.getpid()) + '_' + str2md5(text)
def get_dict(dictfile: str):
word2phns_dict = {}
with open(dictfile, 'r') as fid:

@ -82,10 +82,6 @@ def denorm(data, mean, std):
return data * std + mean
def norm(data, mean, std):
return (data - mean) / std
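A quick round-trip check of the normalization pair above (the array is illustrative):
import numpy as np
x = np.array([1.0, 2.0, 3.0])
mean, std = x.mean(), x.std()
assert np.allclose(denorm(norm(x, mean, std), mean, std), x)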
def get_chunks(data, block_size: int, pad_size: int):
data_len = data.shape[1]
chunks = []
@ -298,8 +294,8 @@ def am_to_static(am_inference,
am_name = am[:am.rindex('_')]
am_dataset = am[am.rindex('_') + 1:]
if am_name == 'fastspeech2':
if am_dataset in {"aishell3", "vctk",
"mix"} and speaker_dict is not None:
if am_dataset in {"aishell3", "vctk", "mix"
} and speaker_dict is not None:
am_inference = jit.to_static(
am_inference,
input_spec=[
@ -311,8 +307,8 @@ def am_to_static(am_inference,
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
elif am_name == 'speedyspeech':
if am_dataset in {"aishell3", "vctk",
"mix"} and speaker_dict is not None:
if am_dataset in {"aishell3", "vctk", "mix"
} and speaker_dict is not None:
am_inference = jit.to_static(
am_inference,
input_spec=[

@ -15,7 +15,6 @@ import argparse
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
@ -24,7 +23,6 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.vits import VITS
from paddlespeech.t2s.utils import str2bool
def evaluate(args):
@ -42,26 +40,8 @@ def evaluate(args):
print(config)
fields = ["utt_id", "text"]
converters = {}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker vits!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Evaluating voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker vits!")
print("spk_num:", spk_num)
test_dataset = DataTable(
data=test_metadata,
fields=fields,
converters=converters, )
test_dataset = DataTable(data=test_metadata, fields=fields)
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
@ -69,7 +49,6 @@ def evaluate(args):
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
config["model"]["generator_params"]["spks"] = spk_num
vits = VITS(idim=vocab_size, odim=odim, **config["model"])
vits.set_state_dict(paddle.load(args.ckpt)["main_params"])
@ -86,15 +65,7 @@ def evaluate(args):
phone_ids = paddle.to_tensor(datum["text"])
with timer() as t:
with paddle.no_grad():
spk_emb = None
spk_id = None
# multi speaker
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
out = vits.inference(
text=phone_ids, sids=spk_id, spembs=spk_emb)
out = vits.inference(text=phone_ids)
wav = out["wav"]
wav = wav.numpy()
N += wav.size
@ -119,13 +90,6 @@ def parse_args():
'--ckpt', type=str, default=None, help='Checkpoint file of VITS.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether training voice cloning model.")
# other
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")

@ -42,23 +42,12 @@ def evaluate(args):
# frontend
frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker vits!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
else:
print("single speaker vits!")
print("spk_num:", spk_num)
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
config["model"]["generator_params"]["spks"] = spk_num
vits = VITS(idim=vocab_size, odim=odim, **config["model"])
vits.set_state_dict(paddle.load(args.ckpt)["main_params"])
@ -89,10 +78,7 @@ def evaluate(args):
flags = 0
for i in range(len(phone_ids)):
part_phone_ids = phone_ids[i]
spk_id = None
if spk_num is not None:
spk_id = paddle.to_tensor(args.spk_id)
out = vits.inference(text=part_phone_ids, sids=spk_id)
out = vits.inference(text=part_phone_ids)
wav = out["wav"]
if flags == 0:
wav_all = wav
@ -123,13 +109,6 @@ def parse_args():
'--ckpt', type=str, default=None, help='Checkpoint file of VITS.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
'--spk_id',
type=int,
default=0,
help='spk id for multi speaker acoustic model')
# other
parser.add_argument(
'--lang',

@ -28,7 +28,6 @@ from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.vits import VITS
@ -44,7 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import scheduler_classes
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
@ -74,23 +72,6 @@ def train_sp(args, config):
"wave": np.load,
"feats": np.load,
}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker vits!")
collate_fn = vits_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = vits_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker vits!")
collate_fn = vits_single_spk_batch_fn
print("spk_num:", spk_num)
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
@ -119,16 +100,18 @@ def train_sp(args, config):
drop_last=False)
print("samplers done!")
train_batch_fn = vits_single_spk_batch_fn
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=collate_fn,
collate_fn=train_batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
batch_sampler=dev_sampler,
collate_fn=collate_fn,
collate_fn=train_batch_fn,
num_workers=config.num_workers)
print("dataloaders done!")
@ -138,7 +121,6 @@ def train_sp(args, config):
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
config["model"]["generator_params"]["spks"] = spk_num
model = VITS(idim=vocab_size, odim=odim, **config["model"])
gen_parameters = model.generator.parameters()
dis_parameters = model.discriminator.parameters()
@ -258,17 +240,6 @@ def main():
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict",
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether training voice cloning model.")
args = parser.parse_args()

@ -21,28 +21,13 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.cli.vector import VectorExecutor
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.utils import str2bool
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def gen_random_embed(use_ecapa: bool=False):
if use_ecapa:
# Randomly generate numbers of -25 ~ 25, 192 is the dim of spk_emb
random_spk_emb = (-1 + 2 * np.random.rand(192)) * 25
# GE2E
else:
# Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
random_spk_emb = np.random.rand(256) * 0.2
random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32')
return random_spk_emb
def voice_cloning(args):
# Init body.
with open(args.am_config) as f:
@ -56,20 +41,7 @@ def voice_cloning(args):
print(am_config)
print(voc_config)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = Path(args.input_dir)
# speaker encoder
if args.use_ecapa:
vec_executor = VectorExecutor()
# warm up
vec_executor(
audio_file=input_dir / os.listdir(input_dir)[0], force_yes=True)
print("ECAPA-TDNN Done!")
# use GE2E
else:
p = SpeakerVerificationPreprocessor(
sampling_rate=16000,
audio_norm_target_dBFS=-30,
@ -93,10 +65,6 @@ def voice_cloning(args):
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
sentence = args.text
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"][0]
# acoustic model
am_inference = get_am_inference(
am=args.am,
@ -112,19 +80,26 @@ def voice_cloning(args):
voc_ckpt=args.voc_ckpt,
voc_stat=args.voc_stat)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = Path(args.input_dir)
sentence = args.text
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"][0]
for name in os.listdir(input_dir):
utt_id = name.split(".")[0]
ref_audio_path = input_dir / name
if args.use_ecapa:
spk_emb = vec_executor(audio_file=ref_audio_path, force_yes=True)
spk_emb = paddle.to_tensor(spk_emb)
# GE2E
else:
mel_sequences = p.extract_mel_partials(
p.preprocess_wav(ref_audio_path))
mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
# print("mel_sequences: ", mel_sequences.shape)
with paddle.no_grad():
spk_emb = speaker_encoder.embed_utterance(
paddle.to_tensor(mel_sequences))
# print("spk_emb shape: ", spk_emb.shape)
with paddle.no_grad():
wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb))
@ -133,15 +108,14 @@ def voice_cloning(args):
wav.numpy(),
samplerate=am_config.fs)
print(f"{utt_id} done!")
# generate 5 random_spk_emb
for i in range(5):
random_spk_emb = gen_random_embed(args.use_ecapa)
# Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
random_spk_emb = np.random.rand(256) * 0.2
random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32')
utt_id = "random_spk_emb"
with paddle.no_grad():
wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb))
sf.write(
str(output_dir / (utt_id + "_" + str(i) + ".wav")),
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
samplerate=am_config.fs)
print(f"{utt_id} done!")
@ -197,15 +171,13 @@ def parse_args():
type=str,
default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。",
help="text to synthesize, a line")
parser.add_argument(
"--ge2e_params_path", type=str, help="ge2e params path.")
parser.add_argument(
"--use_ecapa",
type=str2bool,
default=False,
help="whether to use ECAPA-TDNN as speaker encoder.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
parser.add_argument(
"--input-dir",
type=str,

@ -1 +1,2 @@
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter

@ -81,12 +81,12 @@ def prepare_onnx_input(tokenizer,
position_ids.append(position_id)
outputs = {
'input_ids': np.array(input_ids).astype(np.int64),
'token_type_ids': np.array(token_type_ids).astype(np.int64),
'attention_masks': np.array(attention_masks).astype(np.int64),
'input_ids': np.array(input_ids),
'token_type_ids': np.array(token_type_ids),
'attention_masks': np.array(attention_masks),
'phoneme_masks': np.array(phoneme_masks).astype(np.float32),
'char_ids': np.array(char_ids).astype(np.int64),
'position_ids': np.array(position_ids).astype(np.int64),
'char_ids': np.array(char_ids),
'position_ids': np.array(position_ids),
}
return outputs

@ -34,7 +34,7 @@ from paddlespeech.t2s.frontend.g2pw.utils import load_config
from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified
from paddlespeech.utils.env import MODEL_HOME
model_version = '1.1'
model_version = '1.0'
def predict(session, onnx_input, labels):

@ -61,11 +61,7 @@ class MixFrontend():
return False
def is_end(self, before_char, after_char) -> bool:
flag = 0
for char in (before_char, after_char):
if self.is_alphabet(char) or char == " ":
flag += 1
if flag == 2:
if ((self.is_alphabet(before_char) or before_char == " ") and (self.is_alphabet(after_char) or after_char == " ")):
return True
else:
return False
@ -90,11 +86,10 @@ class MixFrontend():
if point_index == 0 or point_index == len(text) - 1:
new_text = text
else:
if not self.is_end(text[point_index - 1], text[point_index +
1]):
if not self.is_end(text[point_index - 1], text[point_index + 1]):
new_text = text
else:
new_text = text[:point_index] + "" + text[point_index + 1:]
new_text = text[: point_index] + "" + text[point_index + 1:]
elif len(point_indexs) == 2:
first_index = point_indexs[0]
@ -102,8 +97,7 @@ class MixFrontend():
# first
if first_index != 0:
if not self.is_end(text[first_index - 1], text[first_index +
1]):
if not self.is_end(text[first_index - 1], text[first_index + 1]):
new_text += (text[:first_index] + ".")
else:
new_text += (text[:first_index] + "")
@ -112,10 +106,9 @@ class MixFrontend():
# last
if end_index != len(text) - 1:
if not self.is_end(text[end_index - 1], text[end_index + 1]):
new_text += text[point_indexs[-2] + 1:]
new_text += text[point_indexs[-2] + 1 : ]
else:
new_text += (text[point_indexs[-2] + 1:end_index] + "" +
text[end_index + 1:])
new_text += (text[point_indexs[-2] + 1 : end_index] + "" + text[end_index + 1 : ])
else:
new_text += "."
@ -124,8 +117,7 @@ class MixFrontend():
end_index = point_indexs[-1]
# first
if first_index != 0:
if not self.is_end(text[first_index - 1], text[first_index +
1]):
if not self.is_end(text[first_index - 1], text[first_index + 1]):
new_text += (text[:first_index] + ".")
else:
new_text += (text[:first_index] + "")
@ -134,20 +126,16 @@ class MixFrontend():
# middle
for j in range(1, len(point_indexs) - 1):
point_index = point_indexs[j]
if not self.is_end(text[point_index - 1], text[point_index +
1]):
new_text += (
text[point_indexs[j - 1] + 1:point_index] + ".")
if not self.is_end(text[point_index - 1], text[point_index + 1]):
new_text += (text[point_indexs[j-1] + 1 : point_index] + ".")
else:
new_text += (
text[point_indexs[j - 1] + 1:point_index] + "")
new_text += (text[point_indexs[j-1] + 1 : point_index] + "")
# last
if end_index != len(text) - 1:
if not self.is_end(text[end_index - 1], text[end_index + 1]):
new_text += text[point_indexs[-2] + 1:]
new_text += text[point_indexs[-2] + 1 : ]
else:
new_text += (text[point_indexs[-2] + 1:end_index] + "" +
text[end_index + 1:])
new_text += (text[point_indexs[-2] + 1 : end_index] + "" + text[end_index + 1 : ])
else:
new_text += "."
@ -236,7 +224,7 @@ class MixFrontend():
def get_input_ids(self,
sentence: str,
merge_sentences: bool=False,
merge_sentences: bool=True,
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
@ -244,29 +232,28 @@ class MixFrontend():
sentences = self._split(sentence)
phones_list = []
result = {}
for text in sentences:
phones_seg = []
segments = self._distinguish(text)
for seg in segments:
content = seg[0]
lang = seg[1]
if content != '':
if lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=True, to_tensor=to_tensor)
else:
if lang == "zh":
input_ids = self.zh_frontend.get_input_ids(
content,
merge_sentences=True,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
elif lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=True, to_tensor=to_tensor)
phones_seg.append(input_ids["phone_ids"][0])
if add_sp:
phones_seg.append(self.sp_id_tensor)
if phones_seg == []:
phones_seg.append(self.sp_id_tensor)
phones = paddle.concat(phones_seg)
phones_list.append(phones)

@ -42,8 +42,3 @@ polyphonic:
咖喱: ['ga1','li5']
时分: ['shi2','fen1']
蚌埠: ['beng4','bu4']
驯服: ['xun4','fu2']
幸免于难: ['xing4','mian3','yu2','nan4']
恶行: ['e4','xing2']
: ['ai4']

@ -42,7 +42,7 @@ class ToneSandhi():
'木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
'收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
'抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
'戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人',
'戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨',
'快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
'帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
'尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
@ -60,7 +60,7 @@ class ToneSandhi():
'邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
'扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
'糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄',
'将军'
'将军', '别人'
}
self.must_not_neural_tone_words = {
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
@ -84,7 +84,7 @@ class ToneSandhi():
if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("个")
if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶":
if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶":
finals[-1] = finals[-1][:-1] + "5"
elif len(word) >= 1 and word[-1] in "的地得":
finals[-1] = finals[-1][:-1] + "5"
@ -169,7 +169,6 @@ class ToneSandhi():
return new_word_list
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
if len(word) == 2 and self._all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
@ -347,7 +346,6 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]:
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)

@ -28,7 +28,7 @@ UNITS = OrderedDict({
8: '亿',
})
COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
# fraction expressions
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')

@ -13,3 +13,4 @@
# limitations under the License.
from .ernie_sat import *
from .ernie_sat_updater import *
from .mlm import *

@ -389,7 +389,7 @@ class MLM(nn.Layer):
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor,
span_bdy: List[int],
use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
@ -657,7 +657,7 @@ class ErnieSAT(nn.Layer):
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor,
span_bdy: List[int],
use_teacher_forcing: bool=True, ) -> Dict[str, paddle.Tensor]:
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
return self.model.inference(
speech=speech,
text=text,

@ -0,0 +1,579 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from typing import Dict
from typing import List
from typing import Optional
import paddle
import yaml
from paddle import nn
from yacs.config import CfgNode
from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.masked_fill import masked_fill
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
# MLM -> Mask Language Model
class mySequential(nn.Sequential):
def forward(self, *inputs):
for module in self._sub_layers.values():
if type(inputs) == tuple:
inputs = module(*inputs)
else:
inputs = module(inputs)
return inputs
class MaskInputLayer(nn.Layer):
def __init__(self, out_features: int) -> None:
super().__init__()
self.mask_feature = paddle.create_parameter(
shape=(1, 1, out_features),
dtype=paddle.float32,
default_initializer=paddle.nn.initializer.Assign(
paddle.normal(shape=(1, 1, out_features))))
def forward(self, input: paddle.Tensor,
masked_pos: paddle.Tensor=None) -> paddle.Tensor:
masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input)
masked_input = masked_fill(input, masked_pos, 0) + masked_fill(
paddle.expand_as(self.mask_feature, input), ~masked_pos, 0)
return masked_input
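A small shape sketch for the layer above (shapes are made up; it assumes the paddle and masked_fill imports at the top of this file):
layer = MaskInputLayer(out_features=4)
feats = paddle.randn([2, 3, 4])                       # (B, T, D)
masked_pos = paddle.to_tensor([[True, False, False],
                               [False, True, True]])  # (B, T) bool
out = layer(feats, masked_pos)  # masked frames become the learned mask_feature, others pass through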
class MLMEncoder(nn.Layer):
"""Conformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimension of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of encoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernel size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
"""
def __init__(self,
idim: int,
vocab_size: int=0,
pre_speech_layer: int=0,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
text_masking: bool=False):
"""Construct an Encoder object."""
super().__init__()
self._output_size = attention_dim
self.text_masking = text_masking
if self.text_masking:
self.text_masking_layer = MaskInputLayer(attention_dim)
activation = get_activation(activation_type)
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "legacy_rel_pos":
pos_enc_class = LegacyRelPositionalEncoding
assert selfattention_layer_type == "legacy_rel_selfattn"
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
self.embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "mlm":
self.segment_emb = None
self.speech_embed = mySequential(
MaskInputLayer(idim),
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate))
self.text_embed = nn.Sequential(
nn.Embedding(
vocab_size, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "sega_mlm":
self.segment_emb = nn.Embedding(
500, attention_dim, padding_idx=padding_idx)
self.speech_embed = mySequential(
MaskInputLayer(idim),
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate))
self.text_embed = nn.Sequential(
nn.Embedding(
vocab_size, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
# self-attention module definition
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "legacy_rel_selfattn":
assert pos_enc_layer_type == "legacy_rel_pos"
encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
# feed-forward module definition
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation, )
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
            raise NotImplementedError(
                "positionwise_layer_type must be 'linear', 'conv1d' or 'conv1d-linear'.")
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.pre_speech_layer = pre_speech_layer
self.pre_speech_encoders = repeat(
self.pre_speech_layer,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ),
)
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
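    # Forward pass (sketch of the flow below): speech and text are embedded
    # separately (speech through `speech_embed`, text through `text_embed`),
    # optionally combined with segment embeddings, the speech stream may pass
    # the pre-speech encoder layers, and the two streams are then concatenated
    # along the time axis and fed through the shared encoder blocks.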
def forward(self,
speech: paddle.Tensor,
text: paddle.Tensor,
masked_pos: paddle.Tensor,
speech_mask: paddle.Tensor=None,
text_mask: paddle.Tensor=None,
speech_seg_pos: paddle.Tensor=None,
text_seg_pos: paddle.Tensor=None):
"""Encode input sequence.
"""
if masked_pos is not None:
speech = self.speech_embed(speech, masked_pos)
else:
speech = self.speech_embed(speech)
if text is not None:
text = self.text_embed(text)
if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb:
speech_seg_emb = self.segment_emb(speech_seg_pos)
text_seg_emb = self.segment_emb(text_seg_pos)
text = (text[0] + text_seg_emb, text[1])
speech = (speech[0] + speech_seg_emb, speech[1])
if self.pre_speech_encoders:
speech, _ = self.pre_speech_encoders(speech, speech_mask)
if text is not None:
xs = paddle.concat([speech[0], text[0]], axis=1)
xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1)
masks = paddle.concat([speech_mask, text_mask], axis=-1)
else:
xs = speech[0]
xs_pos_emb = speech[1]
masks = speech_mask
xs, masks = self.encoders((xs, xs_pos_emb), masks)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
class MLMDecoder(MLMEncoder):
def forward(self, xs: paddle.Tensor, masks: paddle.Tensor):
"""Encode input sequence.
Args:
xs (paddle.Tensor): Input tensor (#batch, time, idim).
masks (paddle.Tensor): Mask tensor (#batch, time).
Returns:
paddle.Tensor: Output tensor (#batch, time, attention_dim).
paddle.Tensor: Mask tensor (#batch, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
# encoder and decoder are nn.Layer instances, not strings
class MLM(nn.Layer):
def __init__(self,
odim: int,
encoder: nn.Layer,
decoder: Optional[nn.Layer],
postnet_layers: int=0,
postnet_chans: int=0,
postnet_filts: int=0,
text_masking: bool=False):
super().__init__()
self.odim = odim
self.encoder = encoder
self.decoder = decoder
self.vocab_size = encoder.text_embed[0]._num_embeddings
if self.decoder is None or not (hasattr(self.decoder,
'output_layer') and
self.decoder.output_layer is not None):
self.sfc = nn.Linear(self.encoder._output_size, odim)
else:
self.sfc = None
if text_masking:
            # text output head; reuses the text embedding's weight attribute
            self.text_sfc = nn.Linear(
self.encoder.text_embed[0]._embedding_dim,
self.vocab_size,
weight_attr=self.encoder.text_embed[0]._weight_attr)
else:
self.text_sfc = None
self.postnet = (None if postnet_layers == 0 else Postnet(
idim=self.encoder._output_size,
odim=odim,
n_layers=postnet_layers,
n_chans=postnet_chans,
n_filts=postnet_filts,
use_batch_norm=True,
dropout_rate=0.5, ))
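    # Output heads: `sfc` projects speech hidden states to `odim`, `text_sfc`
    # (only with text masking) projects text hidden states back to the
    # vocabulary, and the optional Postnet refines the speech prediction.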
def inference(
self,
speech: paddle.Tensor,
text: paddle.Tensor,
masked_pos: paddle.Tensor,
speech_mask: paddle.Tensor,
text_mask: paddle.Tensor,
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor,
span_bdy: List[int],
            use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
text (paddle.Tensor): input text (1, Tmax2).
masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
span_bdy (List[int]): masked mel boundary of input speech (2,)
use_teacher_forcing (bool): whether to use teacher forcing
Returns:
List[Tensor]:
                e.g.:
[Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
'''
z_cache = None
if use_teacher_forcing:
before_outs, zs, *_ = self.forward(
speech=speech,
text=text,
masked_pos=masked_pos,
speech_mask=speech_mask,
text_mask=text_mask,
speech_seg_pos=speech_seg_pos,
text_seg_pos=text_seg_pos)
if zs is None:
zs = before_outs
speech = speech.squeeze(0)
outs = [speech[:span_bdy[0]]]
outs += [zs[0][span_bdy[0]:span_bdy[1]]]
outs += [speech[span_bdy[1]:]]
return outs
return None
class MLMEncAsDecoder(MLM):
def forward(self,
speech: paddle.Tensor,
text: paddle.Tensor,
masked_pos: paddle.Tensor,
speech_mask: paddle.Tensor,
text_mask: paddle.Tensor,
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor):
# feats: (Batch, Length, Dim)
# -> encoder_out: (Batch, Length2, Dim2)
encoder_out, h_masks = self.encoder(
speech=speech,
text=text,
masked_pos=masked_pos,
speech_mask=speech_mask,
text_mask=text_mask,
speech_seg_pos=speech_seg_pos,
text_seg_pos=text_seg_pos)
if self.decoder is not None:
zs, _ = self.decoder(encoder_out, h_masks)
else:
zs = encoder_out
speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
if self.sfc is not None:
before_outs = paddle.reshape(
self.sfc(speech_hidden_states),
(paddle.shape(speech_hidden_states)[0], -1, self.odim))
else:
before_outs = speech_hidden_states
if self.postnet is not None:
after_outs = before_outs + paddle.transpose(
self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
[0, 2, 1])
else:
after_outs = None
        return before_outs, after_outs, None  # no text logits in this variant
class MLMDualMaksing(MLM):
def forward(self,
speech: paddle.Tensor,
text: paddle.Tensor,
masked_pos: paddle.Tensor,
speech_mask: paddle.Tensor,
text_mask: paddle.Tensor,
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor):
# feats: (Batch, Length, Dim)
# -> encoder_out: (Batch, Length2, Dim2)
encoder_out, h_masks = self.encoder(
speech=speech,
text=text,
masked_pos=masked_pos,
speech_mask=speech_mask,
text_mask=text_mask,
speech_seg_pos=speech_seg_pos,
text_seg_pos=text_seg_pos)
if self.decoder is not None:
zs, _ = self.decoder(encoder_out, h_masks)
else:
zs = encoder_out
speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
        text_outs = None
        if self.text_sfc:
            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
            text_outs = paddle.reshape(
                self.text_sfc(text_hidden_states),
                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
if self.sfc is not None:
before_outs = paddle.reshape(
self.sfc(speech_hidden_states),
(paddle.shape(speech_hidden_states)[0], -1, self.odim))
else:
before_outs = speech_hidden_states
if self.postnet is not None:
after_outs = before_outs + paddle.transpose(
self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
[0, 2, 1])
else:
after_outs = None
return before_outs, after_outs, text_outs
def build_model_from_file(config_file, model_file):
state_dict = paddle.load(model_file)
model_class = MLMDualMaksing if 'conformer_combine_vctk_aishell3_dual_masking' in config_file \
else MLMEncAsDecoder
    # build the model from the config
with open(config_file) as f:
conf = CfgNode(yaml.safe_load(f))
model = build_model(conf, model_class)
model.set_state_dict(state_dict)
return model, conf
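# Usage sketch (paths are illustrative only, not shipped with this repo):
#   model, conf = build_model_from_file(
#       config_file="exp/ernie_sat/default/config.yaml",
#       model_file="exp/ernie_sat/default/checkpoints/snapshot.pdz")
#   model.eval()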
# select encoder and decoder here
def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
if isinstance(args.token_list, str):
with open(args.token_list, encoding="utf-8") as f:
token_list = [line.rstrip() for line in f]
        # Overwrite token_list so the saved config stays portable.
args.token_list = list(token_list)
elif isinstance(args.token_list, (tuple, list)):
token_list = list(args.token_list)
else:
raise RuntimeError("token_list must be str or list")
vocab_size = len(token_list)
    # output feature dimension (80-dim mel-spectrogram)
    odim = 80
# Encoder
encoder_class = MLMEncoder
if 'text_masking' in args.model_conf.keys() and args.model_conf[
'text_masking']:
args.encoder_conf['text_masking'] = True
else:
args.encoder_conf['text_masking'] = False
encoder = encoder_class(
args.input_size, vocab_size=vocab_size, **args.encoder_conf)
# Decoder
if args.decoder != 'no_decoder':
decoder_class = MLMDecoder
decoder = decoder_class(
idim=0,
input_layer=None,
**args.decoder_conf, )
else:
decoder = None
# Build model
model = model_class(
odim=odim,
encoder=encoder,
decoder=decoder,
**args.model_conf, )
# Initialize
if args.init is not None:
initialize(model, args.init)
return model
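# Usage sketch: `args` only needs the attributes read above; the values here
# are illustrative, not defaults from any released config.
#   args = argparse.Namespace(
#       token_list="phone_id_map.txt", input_size=80, decoder="mlm",
#       encoder_conf={}, decoder_conf={}, model_conf={}, init=None)
#   model = build_model(args, model_class=MLMEncAsDecoder)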

@ -522,82 +522,6 @@ class VITSGenerator(nn.Layer):
return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)
def voice_conversion(
self,
feats: paddle.Tensor=None,
feats_lengths: paddle.Tensor=None,
sids_src: Optional[paddle.Tensor]=None,
sids_tgt: Optional[paddle.Tensor]=None,
spembs_src: Optional[paddle.Tensor]=None,
spembs_tgt: Optional[paddle.Tensor]=None,
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
feats_lengths (Tensor): Feature length tensor (B,).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1).
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim).
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim).
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
Returns:
Tensor: Generated waveform tensor (B, T_wav).
"""
# encoder
g_src = None
g_tgt = None
if self.spks is not None:
# (B, global_channels, 1)
g_src = self.global_emb(
paddle.reshape(sids_src, [-1])).unsqueeze(-1)
g_tgt = self.global_emb(
paddle.reshape(sids_tgt, [-1])).unsqueeze(-1)
if self.spk_embed_dim is not None:
# (B, global_channels, 1)
g_src_ = self.spemb_proj(
F.normalize(spembs_src.unsqueeze(0))).unsqueeze(-1)
if g_src is None:
g_src = g_src_
else:
g_src = g_src + g_src_
# (B, global_channels, 1)
g_tgt_ = self.spemb_proj(
F.normalize(spembs_tgt.unsqueeze(0))).unsqueeze(-1)
if g_tgt is None:
g_tgt = g_tgt_
else:
g_tgt = g_tgt + g_tgt_
if self.langs is not None:
# (B, global_channels, 1)
g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1)
if g_src is None:
g_src = g_
else:
g_src = g_src + g_
if g_tgt is None:
g_tgt = g_
else:
g_tgt = g_tgt + g_
# forward posterior encoder
z, m_q, logs_q, y_mask = self.posterior_encoder(
feats, feats_lengths, g=g_src)
# forward flow
# (B, H, T_feats)
z_p = self.flow(z, y_mask, g=g_src)
# decoder
z_hat = self.flow(z_p, y_mask, g=g_tgt, inverse=True)
wav = self.decoder(z_hat * y_mask, g=g_tgt)
return wav.squeeze(1)
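    # Voice conversion above works by encoding the source features under the
    # source speaker conditioning (posterior encoder + forward flow), then
    # inverting the flow and decoding under the target speaker conditioning.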
def _generate_path(self, dur: paddle.Tensor,
mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention.

@ -381,7 +381,7 @@ class VITS(nn.Layer):
if use_teacher_forcing:
assert feats is not None
feats = feats[None].transpose([0, 2, 1])
feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
wav, att_w, dur = self.generator.inference(
text=text,
text_lengths=text_lengths,
@ -406,43 +406,3 @@ class VITS(nn.Layer):
max_len=max_len, )
return dict(
wav=paddle.reshape(wav, [-1]), att_w=att_w[0], duration=dur[0])
def voice_conversion(
self,
feats: paddle.Tensor,
sids_src: Optional[paddle.Tensor]=None,
sids_tgt: Optional[paddle.Tensor]=None,
spembs_src: Optional[paddle.Tensor]=None,
spembs_tgt: Optional[paddle.Tensor]=None,
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
"""Run voice conversion.
Args:
feats (Tensor): Feature tensor (T_feats, aux_channels).
sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,).
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,).
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,).
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,).
lids (Optional[Tensor]): Language index tensor (1,).
Returns:
Dict[str, Tensor]:
* wav (Tensor): Generated waveform tensor (T_wav,).
"""
assert feats is not None
feats = feats[None].transpose([0, 2, 1])
feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
sids_none = sids_src is None and sids_tgt is None
spembs_none = spembs_src is None and spembs_tgt is None
assert not sids_none or not spembs_none
wav = self.generator.voice_conversion(
feats,
feats_lengths,
sids_src,
sids_tgt,
spembs_src,
spembs_tgt,
lids, )
return dict(wav=paddle.reshape(wav, [-1]))

@ -111,8 +111,6 @@ class VITSUpdater(StandardUpdater):
text_lengths=batch["text_lengths"],
feats=batch["feats"],
feats_lengths=batch["feats_lengths"],
sids=batch.get("spk_id", None),
spembs=batch.get("spk_emb", None),
forward_generator=turn == "generator")
# Generator
if turn == "generator":
@ -270,8 +268,6 @@ class VITSEvaluator(StandardEvaluator):
text_lengths=batch["text_lengths"],
feats=batch["feats"],
feats_lengths=batch["feats_lengths"],
sids=batch.get("spk_id", None),
spembs=batch.get("spk_emb", None),
forward_generator=turn == "generator")
# Generator
if turn == "generator":

@ -24,11 +24,10 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from timer import timer
from paddlespeech.t2s.datasets.sampler import ErnieSATSampler
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updater import UpdaterBase
from paddlespeech.t2s.training.updater import UpdaterState
from paddlespeech.t2s.datasets.sampler import ErnieSATSampler
class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but
