parent 4c7fefd4e3
commit f0184352f5
@@ -1,123 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell Mandarin dataset.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os

import soundfile
from data_utils.utility import download
from data_utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://www.openslr.org/resources/33'
# mirror of the same OpenSLR resource
URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for type in data_types:
        del json_lines[:]
        audio_dir = os.path.join(data_dir, 'wav', type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.join(subfolder, fname)
                audio_id = fname[:-4]
                # skip audio files that have no transcription
                if audio_id not in transcript_dict:
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data)) / samplerate
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'audio_filepath': audio_path,
                            'duration': duration,
                            'text': text
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + type
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, 'data_aishell')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'wav')
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for ftar in filelist:
                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix)


if __name__ == '__main__':
    main()
@@ -1,159 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Librispeech ASR datasets.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import distutils.util
import io
import json
import os

import soundfile
from data_utils.utility import download
from data_utils.utility import unpack

URL_ROOT = "http://www.openslr.org/resources/12"
# mirror of the same OpenSLR resource
URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default='~/.cache/paddle/dataset/speech/libri',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--full_download",
    default="True",
    type=distutils.util.strtobool,
    help="Download all datasets for Librispeech."
    " If False, only download a minimal requirement (test-clean, dev-clean,"
    " train-clean-100). (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            text_filepath = os.path.join(subfolder, text_filelist[0])
            for line in io.open(text_filepath, encoding="utf8"):
                segments = line.strip().split()
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': audio_filepath,
                        'duration': duration,
                        'text': text
                    }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file."""
    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
        # download
        filepath = download(url, md5sum, target_dir)
        # unpack
        unpack(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=URL_TEST_CLEAN,
        md5sum=MD5_TEST_CLEAN,
        target_dir=os.path.join(args.target_dir, "test-clean"),
        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=os.path.join(args.target_dir, "dev-clean"),
        manifest_path=args.manifest_prefix + ".dev-clean")
    if args.full_download:
        prepare_dataset(
            url=URL_TRAIN_CLEAN_100,
            md5sum=MD5_TRAIN_CLEAN_100,
            target_dir=os.path.join(args.target_dir, "train-clean-100"),
            manifest_path=args.manifest_prefix + ".train-clean-100")
        prepare_dataset(
            url=URL_TEST_OTHER,
            md5sum=MD5_TEST_OTHER,
            target_dir=os.path.join(args.target_dir, "test-other"),
            manifest_path=args.manifest_prefix + ".test-other")
        prepare_dataset(
            url=URL_DEV_OTHER,
            md5sum=MD5_DEV_OTHER,
            target_dir=os.path.join(args.target_dir, "dev-other"),
            manifest_path=args.manifest_prefix + ".dev-other")
        prepare_dataset(
            url=URL_TRAIN_CLEAN_360,
            md5sum=MD5_TRAIN_CLEAN_360,
            target_dir=os.path.join(args.target_dir, "train-clean-360"),
            manifest_path=args.manifest_prefix + ".train-clean-360")
        prepare_dataset(
            url=URL_TRAIN_OTHER_500,
            md5sum=MD5_TRAIN_OTHER_500,
            target_dir=os.path.join(args.target_dir, "train-other-500"),
            manifest_path=args.manifest_prefix + ".train-other-500")


if __name__ == '__main__':
    main()
@@ -1,139 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare CHiME3 background data.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import io
import json
import os
import tarfile
import zipfile

import soundfile
import wget
from paddle.v2.dataset.common import md5file

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/chime3_background",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_filepath",
    default="manifest.chime3.background",
    type=str,
    help="Filepath for output manifests. (default: %(default)s)")
args = parser.parse_args()


def download(url, md5sum, target_dir, filename=None):
    """Download file from url to target_dir, and check md5sum."""
    if filename is None:
        filename = url.split("/")[-1]
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, filename)
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    if filepath.endswith('.zip'):
        zip = zipfile.ZipFile(filepath, 'r')
        zip.extractall(target_dir)
        zip.close()
    elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
        tar = tarfile.open(filepath)
        tar.extractall(target_dir)
        tar.close()
    else:
        raise ValueError("File format is not supported for unpacking.")


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        for filename in filelist:
            if filename.endswith('.wav'):
                filepath = os.path.join(subfolder, filename)
                audio_data, samplerate = soundfile.read(filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': filepath,
                        'duration': duration,
                        'text': ''
                    }))
    with io.open(manifest_path, mode='w', encoding='utf8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_chime3(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file."""
    if not os.path.exists(os.path.join(target_dir, "CHiME3")):
        # download
        filepath = download(url, md5sum, target_dir,
                            "myairbridge-AG0Y3DNBE5IWRRTV.zip")
        # unpack
        unpack(filepath, target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    prepare_chime3(
        url=URL,
        md5sum=MD5,
        target_dir=args.target_dir,
        manifest_path=args.manifest_filepath)


if __name__ == '__main__':
    main()
@@ -1,16 +0,0 @@
#! /usr/bin/env bash

# download data, generate manifests
PYTHONPATH=../../:$PYTHONPATH python voxforge.py \
--manifest_prefix='./manifest' \
--target_dir='./dataset/VoxForge' \
--is_merge_dialect=True \
--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'

if [ $? -ne 0 ]; then
    echo "Prepare VoxForge failed. Terminated."
    exit 1
fi

echo "VoxForge Data preparation done."
exit 0
@@ -1,234 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare VoxForge dataset.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import datetime
import distutils.util
import json
import os
import shutil
import subprocess

import soundfile
from data_utils.utility import download_multi
from data_utils.utility import getfile_insensitive
from data_utils.utility import unpack

DATA_HOME = './dataset'

DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
           'Audio/Main/16kHz_16bit'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/VoxForge",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--dialects",
    default=[
        'american', 'british', 'australian', 'european', 'irish', 'canadian',
        'indian'
    ],
    nargs='+',
    type=str,
    help="Dialect types. (default: %(default)s)")
parser.add_argument(
    "--is_merge_dialect",
    default=True,
    type=distutils.util.strtobool,
    help="If set True, manifests of american dialect and canadian dialect will "
    "be merged to american-canadian dialect; manifests of british "
    "dialect, irish dialect and australian dialect will be merged to "
    "commonwealth dialect. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def download_and_unpack(target_dir, url):
    wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
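    # wget flags: quiet (-q), recursion depth 1 (-r -l 1), timestamping (-N),
    # no directory hierarchy (-nd), resume partial downloads (-c), ignore
    # robots.txt (-e robots=off), accept only .tgz files (-A tgz), and never
    # ascend to the parent directory (-np).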
    tgz_dir = os.path.join(target_dir, 'tgz')
    exit_code = download_multi(url, tgz_dir, wget_args)
    if exit_code != 0:
        print('Download tgz audio files failed with exit code %d.' % exit_code)
    else:
        print('Download done, start unpacking ...')
        audio_dir = os.path.join(target_dir, 'audio')
        for root, dirs, files in os.walk(tgz_dir):
            for file in files:
                print(file)
                if file.endswith('.tgz'):
                    unpack(os.path.join(root, file), audio_dir)


def select_dialects(target_dir, dialect_list):
    """Classify audio files by dialect."""
    dialect_root_dir = os.path.join(target_dir, 'dialect')
    if os.path.exists(dialect_root_dir):
        shutil.rmtree(dialect_root_dir)
    os.mkdir(dialect_root_dir)
    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
    for dialect in dialect_list:
        # filter files by dialect
        command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \
                  "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect)
        p = subprocess.Popen(
            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p.communicate()
        dialect_dir = os.path.join(dialect_root_dir, dialect)
        if os.path.exists(dialect_dir):
            shutil.rmtree(dialect_dir)
        os.mkdir(dialect_dir)
        for path in output.decode('utf-8').splitlines():
            src_dir = os.path.dirname(os.path.dirname(path))
            link = os.path.basename(os.path.normpath(src_dir))
            os.symlink(src_dir, os.path.join(dialect_dir, link))


def generate_manifest(data_dir, manifest_path):
    json_lines = []

    for path in os.listdir(data_dir):
        audio_link = os.path.join(data_dir, path)
        assert os.path.islink(
            audio_link), '%s should be symbolic link.' % audio_link
        actual_audio_dir = os.path.abspath(os.readlink(audio_link))

        audio_type = ''
        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
            audio_type = 'wav'
        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
            audio_type = 'flac'
        else:
            print('Unknown audio type, skipped processing %s.' %
                  actual_audio_dir)
            continue

        etc_dir = os.path.join(actual_audio_dir, 'etc')
        prompts_file = os.path.join(etc_dir, 'PROMPTS')
        if not os.path.isfile(prompts_file):
            print('PROMPTS file missing, skip processing %s.' %
                  actual_audio_dir)
            continue

        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
        if readme_file is None:
            print('README file missing, skip processing %s.' % actual_audio_dir)
            continue

        for line in open(prompts_file):
            u, trans = line.strip().split(None, 1)
            u_parts = u.split('/')

            # try to format the date time
            try:
                speaker, date, sfx = u_parts[-3].split('-')
                obj = datetime.datetime.strptime(date, '%y.%m.%d')
                formatted = obj.strftime('%Y%m%d')
                u_parts[-3] = '-'.join([speaker, formatted, sfx])
            except Exception as e:
                pass

            if len(u_parts) < 2:
                u_parts = [audio_type] + u_parts
            u_parts[-2] = audio_type
            u_parts[-1] += '.' + audio_type
            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))

            if not os.path.isfile(u):
                print('Audio file missing, skip processing %s.' % u)
                continue

            if os.stat(u).st_size == 0:
                print('Empty audio file, skip processing %s.' % u)
                continue

            trans = trans.strip().replace('-', ' ')
            if not trans.isupper() or \
                    not trans.strip().replace(' ', '').replace("'", "").isalpha():
                print("Transcript not normalized properly, skip processing %s."
                      % u)
                continue

            audio_data, samplerate = soundfile.read(u)
            duration = float(len(audio_data)) / samplerate
            json_lines.append(
                json.dumps({
                    'audio_filepath': u,
                    'duration': duration,
                    'text': trans.lower()
                }))

    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
        for line in json_lines:
            fout.write(line + '\n')


def merge_manifests(manifest_files, save_path):
    lines = []
    for manifest_file in manifest_files:
        line = codecs.open(manifest_file, 'r', 'utf-8').readlines()
        lines += line

    with codecs.open(save_path, 'w', 'utf-8') as fout:
        for line in lines:
            fout.write(line)


def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
    download_and_unpack(target_dir, url)
    select_dialects(target_dir, dialects)
    american_canadian_manifests = []
    commonwealth_manifests = []
    for dialect in dialects:
        dialect_dir = os.path.join(target_dir, 'dialect', dialect)
        manifest_fpath = manifest_prefix + '.' + dialect
        if dialect == 'american' or dialect == 'canadian':
            american_canadian_manifests.append(manifest_fpath)
        if dialect == 'australian' \
                or dialect == 'british' \
                or dialect == 'irish':
            commonwealth_manifests.append(manifest_fpath)
        generate_manifest(dialect_dir, manifest_fpath)

    if is_merge:
        if len(american_canadian_manifests) > 0:
            manifest_fpath = manifest_prefix + '.american-canadian'
            merge_manifests(american_canadian_manifests, manifest_fpath)
        if len(commonwealth_manifests) > 0:
            manifest_fpath = manifest_prefix + '.commonwealth'
            merge_manifests(commonwealth_manifests, manifest_fpath)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
                    args.manifest_prefix, args.is_merge_dialect)


if __name__ == '__main__':
    main()
@@ -1,695 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the audio segment class."""
import copy
import io
import random
import re
import struct

import numpy as np
import resampy
import soundfile
from scipy import signal


class AudioSegment(object):
    """Monaural audio segment abstraction.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate):
        """Create audio segment from samples.

        Samples are converted to float32 internally, with ints scaled to
        [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return human-readable representation of segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
                                self.duration, self.rms_db))

    @classmethod
    def from_file(cls, file):
        """Create audio segment from audio file.

        :param file: Filepath or file object to audio file.
        :type file: str|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
            return cls.from_sequence_file(file)
        else:
            samples, sample_rate = soundfile.read(file, dtype='float32')
            return cls(samples, sample_rate)

    @classmethod
    def slice_from_file(cls, file, start=None, end=None):
        """Loads a small section of an audio file without having to load
        the entire file into memory, which can be incredibly wasteful.

        :param file: Input audio filepath or file object.
        :type file: str|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: AudioSegment instance of the specified slice of the input
                 audio file.
        :rtype: AudioSegment
        :raise ValueError: If start or end is incorrectly set, e.g. out of
                           bounds in time.
        """
        sndfile = soundfile.SoundFile(file)
        sample_rate = sndfile.samplerate
        duration = float(len(sndfile)) / sample_rate
        start = 0. if start is None else start
        end = duration if end is None else end
        if start < 0.0:
            start += duration
        if end < 0.0:
            end += duration
        if start < 0.0:
            raise ValueError("The slice start position (%f s) is out of "
                             "bounds." % start)
        if end < 0.0:
            raise ValueError("The slice end position (%f s) is out of bounds." %
                             end)
        if start > end:
            raise ValueError("The slice start position (%f s) is later than "
                             "the slice end position (%f s)." % (start, end))
        if end > duration:
            raise ValueError("The slice end position (%f s) is out of bounds "
                             "(> %f s)" % (end, duration))
        start_frame = int(start * sample_rate)
        end_frame = int(end * sample_rate)
        sndfile.seek(start_frame)
        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
        return cls(data, sample_rate)
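
    # e.g. AudioSegment.slice_from_file('long.wav', start=1.5, end=3.0) reads
    # only the frames between 1.5 s and 3.0 s from disk (path illustrative).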

    @classmethod
    def from_sequence_file(cls, filepath):
        """Create audio segment from sequence file. Sequence file is a binary
        file containing a collection of multiple audio files, with several
        header bytes in the head indicating the offsets of each audio byte data
        chunk.

        The format is:

            4 bytes (int, version),
            4 bytes (int, num of utterance),
            4 bytes (int, bytes per header),
            [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio),
            audio_bytes_data_of_1st_utterance,
            audio_bytes_data_of_2nd_utterance,
            ......

        Sequence file name must end with ".seqbin". And the filename of the 5th
        utterance's audio file in sequence file "xxx.seqbin" must be
        "xxx.seqbin_5", with "5" indicating the utterance index within this
        sequence file (starting from 1).

        :param filepath: Filepath of sequence file.
        :type filepath: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        # parse filepath
        matches = re.match(r"(.+\.seqbin)_(\d+)", filepath)
        if matches is None:
            raise IOError("File type of %s is not supported" % filepath)
        filename = matches.group(1)
        fileno = int(matches.group(2))

        # read headers (binary mode, so no text encoding applies)
        f = io.open(filename, mode='rb')
        version = f.read(4)
        num_utterances = struct.unpack("i", f.read(4))[0]
        bytes_per_header = struct.unpack("i", f.read(4))[0]
        header_bytes = f.read(bytes_per_header * (num_utterances + 1))
        header = [
            struct.unpack("i", header_bytes[bytes_per_header * i:
                                            bytes_per_header * (i + 1)])[0]
            for i in range(num_utterances + 1)
        ]

        # read audio bytes
        f.seek(header[fileno - 1])
        audio_bytes = f.read(header[fileno] - header[fileno - 1])
        f.close()

        # create audio segment
        try:
            return cls.from_bytes(audio_bytes)
        except Exception as e:
            samples = np.frombuffer(audio_bytes, dtype='int16')
            return cls(samples=samples, sample_rate=8000)

    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.

        :param bytes: Byte string containing audio samples.
        :type bytes: bytes
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples, sample_rate = soundfile.read(
            io.BytesIO(bytes), dtype='float32')
        return cls(samples, sample_rate)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of audio segments together.

        :param *segments: Input audio segments to be concatenated.
        :type *segments: tuple of AudioSegment
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any segments does not match.
        :raises TypeError: If any segment is not AudioSegment instance.
        """
        # Perform basic sanity-checks.
        if len(segments) == 0:
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only audio segments of the same type "
                                "can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent audio segment of the given duration and sample rate.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silent AudioSegment instance of the given duration.
        :rtype: AudioSegment
        """
        samples = np.zeros(int(duration * sample_rate))
        return cls(samples, sample_rate)

    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.

        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: str|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

    def superimpose(self, other):
        """Add samples from another segment to those of this segment
        (sample-wise addition, not segment concatenation).

        Note that this is an in-place transformation.

        :param other: Segment containing samples to be added in.
        :type other: AudioSegment
        :raise TypeError: If type of two segments don't match.
        :raise ValueError: If the sample rates of the two segments are not
                           equal, or if the lengths of segments don't match.
        """
        if not isinstance(other, type(self)):
            raise TypeError("Cannot add segments of different types: %s "
                            "and %s." % (type(self), type(other)))
        if self._sample_rate != other._sample_rate:
            raise ValueError("Sample rates must match to add segments.")
        if len(self._samples) != len(other._samples):
            raise ValueError("Segment lengths must match to add segments.")
        self._samples += other._samples

    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: bytes
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tobytes()

    def gain_db(self, gain):
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.

        :param gain: Gain in decibels to apply to samples.
        :type gain: float|1darray
        """
        self._samples *= 10.**(gain / 20.)

    def change_speed(self, speed_rate):
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.

        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples)

    def normalize(self, target_db=-20, max_gain_db=300.0):
        """Normalize audio to be of the desired RMS value in decibels.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels. This value should be
                          less than 0.0 as 0.0 is full-scale audio.
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
                            normalization. This is to prevent nans when
                            attempting to normalize a signal consisting of
                            all zeros.
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
        """
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
                "Unable to normalize segment to %f dB because the required "
                "gain exceeds max_gain_db (%f dB)" % (target_db, max_gain_db))
        self.gain_db(gain)

    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
        """Normalize audio using a production-compatible online/causal
        algorithm. This uses an exponential likelihood and gamma prior to
        make online estimates of the RMS even when there are very few samples.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels.
        :type target_db: float
        :param prior_db: Prior RMS estimate in decibels.
        :type prior_db: float
        :param prior_samples: Prior strength in number of samples.
        :type prior_samples: float
        :param startup_delay: Default 0.0s. If provided, this function will
                              accrue statistics for the first startup_delay
                              seconds before applying online normalization.
        :type startup_delay: float
        """
        # Estimate total RMS online.
        startup_sample_idx = min(self.num_samples - 1,
                                 int(self.sample_rate * startup_delay))
        prior_mean_squared = 10.**(prior_db / 10.)
        prior_sum_of_squares = prior_mean_squared * prior_samples
        cumsum_of_squares = np.cumsum(self.samples**2)
        sample_count = np.arange(self.num_samples) + 1
        if startup_sample_idx > 0:
            cumsum_of_squares[:startup_sample_idx] = \
                cumsum_of_squares[startup_sample_idx]
            sample_count[:startup_sample_idx] = \
                sample_count[startup_sample_idx]
        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                 (sample_count + prior_samples))
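        # Posterior mean-square estimate at each sample: the observed
        # cumulative sum of squares plus the prior's pseudo sum of squares,
        # divided by the combined (observed + prior) sample count.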
        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
        # Compute required time-varying gain.
        gain_db = target_db - rms_estimate_db
        self.gain_db(gain_db)

    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param filter: The resampling filter to use, one of {'kaiser_best',
                       'kaiser_fast'}.
        :type filter: str
        """
        self._samples = resampy.resample(
            self.samples, self.sample_rate, target_sample_rate, filter=filter)
        self._sample_rate = target_sample_rate

    def pad_silence(self, duration, sides='both'):
        """Pad this audio sample with a period of silence.

        Note that this is an in-place transformation.

        :param duration: Length of silence in seconds to pad.
        :type duration: float
        :param sides: Position for padding:
                      'beginning' - adds silence in the beginning;
                      'end' - adds silence in the end;
                      'both' - adds silence in both the beginning and the end.
        :type sides: str
        :raises ValueError: If sides is not supported.
        """
        if duration == 0.0:
            return self
        cls = type(self)
        silence = self.make_silence(duration, self._sample_rate)
        if sides == "beginning":
            padded = cls.concatenate(silence, self)
        elif sides == "end":
            padded = cls.concatenate(self, silence)
        elif sides == "both":
            padded = cls.concatenate(silence, self, silence)
        else:
            raise ValueError("Unknown value for the sides %s" % sides)
        self._samples = padded._samples

    def shift(self, shift_ms):
        """Shift the audio in time. If `shift_ms` is positive, shift with time
        advance; if negative, shift with time delay. Silence is padded to
        keep the duration unchanged.

        Note that this is an in-place transformation.

        :param shift_ms: Shift time in milliseconds. If positive, shift with
                         time advance; if negative, shift with time delay.
        :type shift_ms: float
        :raises ValueError: If shift_ms is longer than audio duration.
        """
        if abs(shift_ms) / 1000.0 > self.duration:
            raise ValueError("Absolute value of shift_ms should be smaller "
                             "than audio duration.")
        shift_samples = int(shift_ms * self._sample_rate / 1000)
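        # e.g. shift_ms=500 at a 16000 Hz sample rate shifts by 8000 samples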
        if shift_samples > 0:
            # time advance
            self._samples[:-shift_samples] = self._samples[shift_samples:]
            self._samples[-shift_samples:] = 0
        elif shift_samples < 0:
            # time delay
            self._samples[-shift_samples:] = self._samples[:shift_samples]
            self._samples[:-shift_samples] = 0

    def subsegment(self, start_sec=None, end_sec=None):
        """Cut the AudioSegment between given boundaries.

        Note that this is an in-place transformation.

        :param start_sec: Beginning of subsegment in seconds.
        :type start_sec: float
        :param end_sec: End of subsegment in seconds.
        :type end_sec: float
        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
                           of bounds in time.
        """
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        if start_sec < 0.0:
            raise ValueError("The slice start position (%f s) is out of "
                             "bounds." % start_sec)
        if end_sec < 0.0:
            raise ValueError("The slice end position (%f s) is out of bounds." %
                             end_sec)
        if start_sec > end_sec:
            raise ValueError("The slice start position (%f s) is later than "
                             "the end position (%f s)." % (start_sec, end_sec))
        if end_sec > self.duration:
            raise ValueError("The slice end position (%f s) is out of bounds "
                             "(> %f s)" % (end_sec, self.duration))
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        self._samples = self._samples[start_sample:end_sample]

    def random_subsegment(self, subsegment_length, rng=None):
        """Cut the specified length of the audio segment randomly.

        Note that this is an in-place transformation.

        :param subsegment_length: Subsegment length in seconds.
        :type subsegment_length: float
        :param rng: Random number generator state.
        :type rng: random.Random
        :raises ValueError: If the length of subsegment is greater than
                            the original segment.
        """
        rng = random.Random() if rng is None else rng
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = rng.uniform(0.0, self.duration - subsegment_length)
        self.subsegment(start_time, start_time + subsegment_length)

    def convolve(self, impulse_segment, allow_resample=False):
        """Convolve this audio segment with the given impulse segment.

        Note that this is an in-place transformation.

        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        :raises ValueError: If the sample rates of the two audio segments do
                            not match and resampling is not allowed.
        """
        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
            impulse_segment.resample(self.sample_rate)
        if self.sample_rate != impulse_segment.sample_rate:
            raise ValueError("Impulse segment's sample rate (%d Hz) is not "
                             "equal to base signal sample rate (%d Hz)." %
                             (impulse_segment.sample_rate, self.sample_rate))
        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
                                     "full")
        self._samples = samples

    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.

        Note that this is an in-place transformation.

        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        """
        target_db = self.rms_db
        self.convolve(impulse_segment, allow_resample=allow_resample)
        self.normalize(target_db)

    def add_noise(self,
                  noise,
                  snr_dB,
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
        """Add the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.

        Note that this is an in-place transformation.

        :param noise: Noise signal to add.
        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
        :type snr_dB: float
        :param allow_downsampling: Whether to allow the noise signal to be
                                   downsampled to match the base signal sample
                                   rate.
        :type allow_downsampling: bool
        :param max_gain_db: Maximum amount of gain to apply to noise signal
                            before adding it in. This is to prevent attempting
                            to apply infinite gain to a zero signal.
        :type max_gain_db: float
        :param rng: Random number generator state.
        :type rng: None|random.Random
        :raises ValueError: If the sample rates of the two audio segments do
                            not match when downsampling is not allowed, or if
                            the duration of the noise segment is shorter than
                            the original audio segment.
        """
        rng = random.Random() if rng is None else rng
        if allow_downsampling and noise.sample_rate > self.sample_rate:
            # resample() works in place, so downsample a copy to leave the
            # caller's noise segment untouched
            noise = copy.deepcopy(noise)
            noise.resample(self.sample_rate)
        if noise.sample_rate != self.sample_rate:
            raise ValueError("Noise sample rate (%d Hz) is not equal to base "
                             "signal sample rate (%d Hz)." % (noise.sample_rate,
                                                              self.sample_rate))
        if noise.duration < self.duration:
            raise ValueError("Noise signal (%f sec) must be at least as long as"
                             " base signal (%f sec)." %
                             (noise.duration, self.duration))
        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
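        # After applying this gain the noise RMS becomes
        # noise.rms_db + noise_gain_db, so signal_rms - noise_rms equals the
        # requested snr_dB (unless capped by max_gain_db).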
        noise_new = copy.deepcopy(noise)
        noise_new.random_subsegment(self.duration, rng=rng)
        noise_new.gain_db(noise_gain_db)
        self.superimpose(noise_new)

    @property
    def samples(self):
        """Return audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
        return self._samples.copy()

    @property
    def sample_rate(self):
        """Return audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
        return self._sample_rate

    @property
    def num_samples(self):
        """Return number of samples.

        :return: Number of samples.
        :rtype: int
        """
        return self._samples.shape[0]

    @property
    def duration(self):
        """Return audio duration.

        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)

    @property
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # mean square is already the squared amplitude, so multiply by 10
        # instead of 20 for decibels
        mean_square = np.mean(self._samples**2)
        return 10 * np.log10(mean_square)

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
|
|
||||||
elif samples.dtype in np.sctypes['float']:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise TypeError("Unsupported sample type: %s." % samples.dtype)
|
|
||||||
return float32_samples
|
|
||||||
|
|
||||||
def _convert_samples_from_float32(self, samples, dtype):
|
|
||||||
"""Convert sample type from float32 to dtype.
|
|
||||||
|
|
||||||
Audio sample type is usually integer or float-point. For integer
|
|
||||||
type, float32 will be rescaled from [-1, 1] to the maximum range
|
|
||||||
supported by the integer type.
|
|
||||||
|
|
||||||
This is for writing a audio file.
|
|
||||||
"""
|
|
||||||
dtype = np.dtype(dtype)
|
|
||||||
output_samples = samples.copy()
|
|
||||||
if dtype in np.sctypes['int']:
|
|
||||||
bits = np.iinfo(dtype).bits
|
|
||||||
output_samples *= (2**(bits - 1) / 1.)
|
|
||||||
min_val = np.iinfo(dtype).min
|
|
||||||
max_val = np.iinfo(dtype).max
|
|
||||||
output_samples[output_samples > max_val] = max_val
|
|
||||||
output_samples[output_samples < min_val] = min_val
|
|
||||||
elif samples.dtype in np.sctypes['float']:
|
|
||||||
min_val = np.finfo(dtype).min
|
|
||||||
max_val = np.finfo(dtype).max
|
|
||||||
output_samples[output_samples > max_val] = max_val
|
|
||||||
output_samples[output_samples < min_val] = min_val
|
|
||||||
else:
|
|
||||||
raise TypeError("Unsupported sample type: %s." % samples.dtype)
|
|
||||||
return output_samples.astype(dtype)
|
|
||||||
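# Illustrative sketch (not part of the original file): the int <-> float32
# scaling the two converters above implement, shown standalone with numpy.
# An int16 sample of -32768 maps to -1.0, and the round trip clips to the
# integer type's range before casting back.
import numpy as np

int16_samples = np.array([-32768, 0, 16384], dtype='int16')
float_samples = int16_samples.astype('float32') / 2**15  # [-1.0, 0.0, 0.5]
restored = np.clip(float_samples * 2**15,
                   np.iinfo('int16').min,
                   np.iinfo('int16').max).astype('int16')
assert (restored == int16_samples).all()
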
@@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -1,134 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the data augmentation pipeline."""
import json
import random

from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
from data_utils.augmentor.online_bayesian_normalization import \
    OnlineBayesianNormalizationAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor


class AugmentationPipeline(object):
    """Build a pre-processing pipeline with various augmentation models. Such
    a data augmentation pipeline is often leveraged to augment the training
    samples, making the model invariant to certain types of perturbation in
    the real world and improving its generalization ability.

    The pipeline is built according to the augmentation configuration in a
    json string, e.g.

    .. code-block::

        [ {
                "type": "noise",
                "params": {"min_snr_dB": 10,
                           "max_snr_dB": 20,
                           "noise_manifest_path": "datasets/manifest.noise"},
                "prob": 0.0
            },
            {
                "type": "speed",
                "params": {"min_speed_rate": 0.9,
                           "max_speed_rate": 1.1},
                "prob": 1.0
            },
            {
                "type": "shift",
                "params": {"min_shift_ms": -5,
                           "max_shift_ms": 5},
                "prob": 1.0
            },
            {
                "type": "volume",
                "params": {"min_gain_dBFS": -10,
                           "max_gain_dBFS": 10},
                "prob": 0.0
            },
            {
                "type": "bayesian_normal",
                "params": {"target_db": -20,
                           "prior_db": -20,
                           "prior_samples": 100},
                "prob": 0.0
            }
        ]

    This configuration inserts five augmentation models into the pipeline,
    but only the SpeedPerturbAugmentor and the ShiftPerturbAugmentor ever
    fire: "prob" is the probability of each augmentor taking effect, and it
    is zero for the other three.

    :param augmentation_config: Augmentation configuration in json string.
    :type augmentation_config: str
    :param random_seed: Random seed.
    :type random_seed: int
    :raises ValueError: If the augmentation json config is in an incorrect
                        format.
    """

    def __init__(self, augmentation_config, random_seed=0):
        self._rng = random.Random(random_seed)
        self._augmentors, self._rates = self._parse_pipeline_from(
            augmentation_config)

    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            if self._rng.uniform(0., 1.) < rate:
                augmentor.transform_audio(audio_segment)

    def _parse_pipeline_from(self, config_json):
        """Parse the config json to build an augmentation pipeline."""
        try:
            configs = json.loads(config_json)
            augmentors = [
                self._get_augmentor(config["type"], config["params"])
                for config in configs
            ]
            rates = [config["prob"] for config in configs]
        except Exception as e:
            raise ValueError("Failed to parse the augmentation config json: "
                             "%s" % str(e))
        return augmentors, rates

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params."""
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
        elif augmentor_type == "shift":
            return ShiftPerturbAugmentor(self._rng, **params)
        elif augmentor_type == "speed":
            return SpeedPerturbAugmentor(self._rng, **params)
        elif augmentor_type == "resample":
            return ResampleAugmentor(self._rng, **params)
        elif augmentor_type == "bayesian_normal":
            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
        elif augmentor_type == "noise":
            return NoisePerturbAugmentor(self._rng, **params)
        elif augmentor_type == "impulse":
            return ImpulseResponseAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)

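# Usage sketch (illustrative, not part of the original file; the wav path is
# a placeholder): build a pipeline that always applies speed and shift
# perturbation, then run it over one segment. Augmentation happens in place.
import json

from data_utils.audio import AudioSegment
from data_utils.augmentor.augmentation import AugmentationPipeline

config = json.dumps([
    {"type": "speed", "params": {"min_speed_rate": 0.9,
                                 "max_speed_rate": 1.1}, "prob": 1.0},
    {"type": "shift", "params": {"min_shift_ms": -5,
                                 "max_shift_ms": 5}, "prob": 1.0},
])
pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
segment = AudioSegment.from_file('sample.wav')
pipeline.transform_audio(segment)  # segment is modified in place
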
@@ -1,43 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the abstract base class for augmentation models."""
from abc import ABCMeta
from abc import abstractmethod


class AugmentorBase(metaclass=ABCMeta):
    """Abstract base class for augmentation model (augmentor) classes.
    All augmentor classes should inherit from this class and implement the
    following abstract methods.
    """

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def transform_audio(self, audio_segment):
        """Adds various effects to the input audio segment. Such effects
        will augment the training data to make the model invariant to certain
        types of perturbations in the real world, improving the model's
        generalization ability.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        pass

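# Illustrative sketch (not part of the original file): a minimal custom
# augmentor following the AugmentorBase contract. gain_db() is assumed to be
# the AudioSegment method used elsewhere in this pipeline (e.g. by
# VolumePerturbAugmentor).
class RandomGainAugmentor(AugmentorBase):
    """Toy augmentor: apply a uniformly sampled gain in [-6, 6] dB."""

    def __init__(self, rng):
        self._rng = rng

    def transform_audio(self, audio_segment):
        # in-place, like every augmentor in this pipeline
        audio_segment.gain_db(self._rng.uniform(-6.0, 6.0))
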
@@ -1,43 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the impulse response augmentation model."""
from data_utils.audio import AudioSegment
from data_utils.augmentor.base import AugmentorBase
from data_utils.utility import read_manifest


class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentation model for adding impulse response effect.

    :param rng: Random generator object.
    :type rng: random.Random
    :param impulse_manifest_path: Manifest path for impulse audio data.
    :type impulse_manifest_path: str
    """

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
        self._impulse_manifest = read_manifest(impulse_manifest_path)

    def transform_audio(self, audio_segment):
        """Add impulse response effect.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
        audio_segment.convolve(impulse_segment, allow_resample=True)

@@ -1,58 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the noise perturb augmentation model."""
from data_utils.audio import AudioSegment
from data_utils.augmentor.base import AugmentorBase
from data_utils.utility import read_manifest


class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_snr_dB: Minimum signal-to-noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximum signal-to-noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
    :type noise_manifest_path: str
    """

    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)

    def transform_audio(self, audio_segment):
        """Add background noise audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        noise_json = self._rng.sample(self._noise_manifest, 1)[0]
        if noise_json['duration'] < audio_segment.duration:
            raise RuntimeError("The duration of the sampled noise audio is "
                               "smaller than the audio segment to add effects "
                               "to.")
        diff_duration = noise_json['duration'] - audio_segment.duration
        start = self._rng.uniform(0, diff_duration)
        end = start + audio_segment.duration
        noise_segment = AudioSegment.slice_from_file(
            noise_json['audio_filepath'], start=start, end=end)
        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)

@@ -1,57 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the online Bayesian normalization augmentation model."""
from data_utils.augmentor.base import AugmentorBase


class OnlineBayesianNormalizationAugmentor(AugmentorBase):
    """Augmentation model for adding online Bayesian normalization.

    :param rng: Random generator object.
    :type rng: random.Random
    :param target_db: Target RMS value in decibels.
    :type target_db: float
    :param prior_db: Prior RMS estimate in decibels.
    :type prior_db: float
    :param prior_samples: Prior strength in number of samples.
    :type prior_samples: int
    :param startup_delay: Default 0.0s. If provided, accrue statistics for
                          the first startup_delay seconds before applying
                          online normalization.
    :type startup_delay: float
    """

    def __init__(self,
                 rng,
                 target_db,
                 prior_db,
                 prior_samples,
                 startup_delay=0.0):
        self._target_db = target_db
        self._prior_db = prior_db
        self._prior_samples = prior_samples
        self._rng = rng
        self._startup_delay = startup_delay

    def transform_audio(self, audio_segment):
        """Normalize the input audio using the online Bayesian approach.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        audio_segment.normalize_online_bayesian(self._target_db,
                                                self._prior_db,
                                                self._prior_samples,
                                                self._startup_delay)

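# Illustrative sketch (not part of the original file): one plausible form of
# the running level estimate behind normalize_online_bayesian(), whose actual
# implementation lives in data_utils.audio. The prior acts like
# `prior_samples` pseudo-observations at `prior_db`, so early samples are
# pulled toward the prior and the estimate converges to the data over time.
import numpy as np

def bayesian_rms_db_estimate(samples, prior_db, prior_samples):
    prior_mean_sq = 10.0**(prior_db / 10.0)
    cumsum_sq = np.cumsum(samples**2)
    counts = np.arange(1, len(samples) + 1)
    mean_sq = (cumsum_sq + prior_samples * prior_mean_sq) / (
        counts + prior_samples)
    return 10.0 * np.log10(mean_sq)  # per-sample running RMS in dB

# The per-sample gain to apply is then target_db minus this estimate.
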
@@ -1,42 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the resample augmentation model."""
from data_utils.augmentor.base import AugmentorBase


class ResampleAugmentor(AugmentorBase):
    """Augmentation model for resampling.

    See more info here:
    https://ccrma.stanford.edu/~jos/resample/index.html

    :param rng: Random generator object.
    :type rng: random.Random
    :param new_sample_rate: New sample rate in Hz.
    :type new_sample_rate: int
    """

    def __init__(self, rng, new_sample_rate):
        self._new_sample_rate = new_sample_rate
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Resample the input audio to the target sample rate.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        audio_segment.resample(self._new_sample_rate)

@@ -1,43 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the shift perturb augmentation model."""
from data_utils.augmentor.base import AugmentorBase


class ShiftPerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random shift perturbation.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_shift_ms: Minimum shift in milliseconds.
    :type min_shift_ms: float
    :param max_shift_ms: Maximum shift in milliseconds.
    :type max_shift_ms: float
    """

    def __init__(self, rng, min_shift_ms, max_shift_ms):
        self._min_shift_ms = min_shift_ms
        self._max_shift_ms = max_shift_ms
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Shift audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
        audio_segment.shift(shift_ms)

@@ -1,56 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speed perturbation augmentation model."""
from data_utils.augmentor.base import AugmentorBase


class SpeedPerturbAugmentor(AugmentorBase):
    """Augmentation model for adding speed perturbation.

    See reference paper here:
    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_speed_rate: Lower bound of the new speed rate to sample;
                           should not be smaller than 0.9.
    :type min_speed_rate: float
    :param max_speed_rate: Upper bound of the new speed rate to sample;
                           should not be larger than 1.1.
    :type max_speed_rate: float
    """

    def __init__(self, rng, min_speed_rate, max_speed_rate):
        if min_speed_rate < 0.9:
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        if max_speed_rate > 1.1:
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
        self._min_speed_rate = min_speed_rate
        self._max_speed_rate = max_speed_rate
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Sample a new speed rate from the given range and change the speed
        of the given audio clip.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        sampled_speed = self._rng.uniform(self._min_speed_rate,
                                          self._max_speed_rate)
        audio_segment.change_speed(sampled_speed)

@@ -1,49 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the volume perturb augmentation model."""
from data_utils.augmentor.base import AugmentorBase


class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_gain_dBFS: Minimum gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximum gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Change audio loudness.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.gain_db(gain)

@@ -1,380 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the data generator for organizing various audio data preprocessing
pipelines and offering a data reader interface meeting PaddlePaddle's
requirements.
"""
import random
import tarfile
from threading import local

import numpy as np
import paddle.fluid as fluid
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.normalizer import FeatureNormalizer
from data_utils.speech import SpeechSegment
from data_utils.utility import read_manifest


class DataGenerator(object):
    """
    DataGenerator provides a basic audio data preprocessing pipeline and
    offers data reader interfaces meeting PaddlePaddle's requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: str
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|str
    :param augmentation_config: Augmentation configuration in json string.
                                For details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                 before extracting the features.
    :type use_dB_normalization: bool
    :param random_seed: Random seed.
    :type random_seed: int
    :param keep_transcription_text: If set to True, transcription text will
                                    be passed forward directly without
                                    converting to index sequence.
    :type keep_transcription_text: bool
    :param place: The place to run the program.
    :type place: CPUPlace or CUDAPlace
    :param is_training: If set to True, generate text data for training;
                        otherwise, generate text data for inference.
    :type is_training: bool
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 place=fluid.CPUPlace(),
                 is_training=True):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        self._epoch = 0
        self._is_training = is_training
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}
        self._place = place

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of the transcription
                 part, where the transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: str
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to: If set to -1, the maximum shape in the batch
                           will be used as the target shape for padding.
                           Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flattened to a
                        1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch to speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                               '' or None: no shuffle.
                               'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                                                put into minibatches, and the
                                                minibatches are then shuffled
                                                batch-wise. For more details,
                                                please see
                                                ``_batch_shuffle.__doc__``.
                               'batch_shuffle_clipped': 'batch_shuffle' with
                                                        head shift and tail
                                                        clipping. For more
                                                        details, please see
                                                        ``_batch_shuffle``.
                               If sortagrad is True, shuffle is disabled
                               for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = read_manifest(
                manifest_path=manifest_path,
                max_duration=self._max_duration,
                min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(
                        manifest, batch_size, clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(
                        manifest, batch_size, clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            batch = []
            instance_reader = self._instance_reader_creator(manifest)

            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= 1:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

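    # Usage sketch (illustrative, not part of the original class; file paths
    # are placeholders):
    #
    #   generator = DataGenerator(vocab_filepath='vocab.txt',
    #                             mean_std_filepath='mean_std.npz')
    #   reader = generator.batch_reader_creator(
    #       manifest_path='manifest.train', batch_size=16, sortagrad=True)
    #   for padded_audios, texts, audio_lens, masks in reader():
    #       ...  # feed one zero-padded batch to the network
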
    @property
    def feeding(self):
        """Return the data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1}
        return feeding_dict

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _parse_tar(self, file):
        """Parse a tar file into a tarfile object and a dict mapping member
        names to their tarinfo objects.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get a subfile object from a tar archive.

        Returns a file object for the requested member and caches the tar
        file's info for subsequent reading requests.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            tar_object, infos = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infos
            self._local_data.tar2object[tarpath] = tar_object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        An instance is a tuple of an ndarray of audio spectrogram and a list
        of token indices for the transcript.
        """

        def reader():
            for instance in manifest:
                inst = self.process_utterance(instance["audio_filepath"],
                                              instance["text"])
                yield inst

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flattened to a 1darray.
        """
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        padded_audios = []
        texts, text_lens = [], []
        audio_lens = []
        masks = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            if self._is_training:
                texts += text
            else:
                texts.append(text)
            text_lens.append(len(text))
            audio_lens.append(audio.shape[1])
            # the mask mirrors the downstream conv feature map: frequency
            # axis subsampled by 2, time axis by 3, replicated across 32
            # channels (shapes assumed by the network's first conv block)
            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
            mask_max_len = (max_length - 1) // 3 + 1
            mask_ones = np.ones((mask_shape0, mask_shape1))
            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
            mask = np.repeat(
                np.reshape(
                    np.concatenate((mask_ones, mask_zeros), axis=1),
                    (1, mask_shape0, mask_max_len)),
                32,
                axis=0)
            masks.append(mask)
        padded_audios = np.array(padded_audios).astype('float32')
        if self._is_training:
            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
            texts = fluid.create_lod_tensor(
                texts, recursive_seq_lens=[text_lens], place=self._place)
        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
        masks = np.array(masks).astype('float32')
        return padded_audios, texts, audio_lens, masks

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better
        efficiency, and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generating
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
        self._rng.shuffle(batch_manifest)
        batch_manifest = [item for batch in batch_manifest for item in batch]
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            # res_len may be 0, and manifest[-0:] would re-append the entire
            # manifest instead of an empty tail, so guard against it
            if res_len > 0:
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest

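    # Worked example (illustrative): with batch_size=4, a sorted manifest of
    # 11 clips [a..k], and a sampled shift_len of 2, zip(*[iter(...)]*4)
    # forms the minibatches [c,d,e,f] and [g,h,i,j]; the two batches are then
    # shuffled as units; with clipped=False the tail [k] and the head [a,b]
    # are appended back, so no utterance is dropped.
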
@@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -1,194 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the audio featurizer class."""
import numpy as np
from python_speech_features import delta
from python_speech_features import mfcc


class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it supports feature types of linear spectrogram and mfcc.

    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of the mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Audio is resampled (if upsampling or
                               downsampling is allowed) to this rate before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibel level before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If the audio sample rate is not supported.
        """
        # upsampling or downsampling
        if ((audio_segment.sample_rate > self._target_sample_rate and
             allow_downsampling) or
            (audio_segment.sample_rate < self._target_sample_rate and
             allow_upsampling)):
            audio_segment.resample(self._target_sample_rate)
        if audio_segment.sample_rate != self._target_sample_rate:
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow_upsampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    def _compute_specgram(self, samples, sample_rate):
        """Extract various audio features."""
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        elif self._specgram_type == 'mfcc':
            return self._compute_mfcc(samples, sample_rate, self._stride_ms,
                                      self._window_ms, self._max_freq)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: 'linear', 'mfcc'." %
                             self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy."""
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal."""
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)
        fft = fft**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

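    # Sanity-check sketch (illustrative, not part of the original class): a
    # 1 kHz sine at 16 kHz should concentrate energy near bin
    # round(1000 / (sample_rate / window_size)). With window_ms=20 the window
    # is 320 samples, so the bin spacing is 50 Hz and the peak lands at
    # index 20:
    #
    #   featurizer = AudioFeaturizer()  # defaults: 'linear', 20 ms window
    #   t = np.arange(16000) / 16000.0
    #   sine = np.sin(2 * np.pi * 1000 * t)
    #   spec = featurizer._compute_linear_specgram(sine, 16000)
    #   assert np.argmax(spec.mean(axis=1)) == 20
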
    def _compute_mfcc(self,
                      samples,
                      sample_rate,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None):
        """Compute mfcc from samples."""
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute the 13 cepstral coefficients; the first one is replaced
        # by log(frame energy)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            highfreq=max_freq)
        # deltas
        d_mfcc_feat = delta(mfcc_feat, 2)
        # delta-deltas
        dd_mfcc_feat = delta(d_mfcc_feat, 2)
        # transpose to (feature, time)
        mfcc_feat = np.transpose(mfcc_feat)
        d_mfcc_feat = np.transpose(d_mfcc_feat)
        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
        # concatenate the three feature blocks along the feature axis
        concat_mfcc_feat = np.concatenate(
            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
        return concat_mfcc_feat

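    # Shape sketch (illustrative, not part of the original class): the mfcc
    # path stacks 13 cepstra, 13 deltas, and 13 delta-deltas, so each frame
    # becomes a 39-dimensional column. For one second of 16 kHz audio with a
    # 10 ms stride, _compute_mfcc returns roughly a (39, ~100) array, versus
    # (161, ~100) for the 'linear' spectrogram with 20 ms windows and
    # max_freq equal to the 8000 Hz Nyquist frequency.
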
@ -1,107 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech featurizer class."""
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.featurizer.text_featurizer import TextFeaturizer


class SpeechFeaturizer(object):
    """Speech featurizer, for extracting features from both the audio and the
    transcript contents of a SpeechSegment.

    Currently, for audio parts, it supports feature types of linear
    spectrogram and mfcc; for transcript parts, it only supports char-level
    tokenizing and conversion into a list of token indices. Note that the
    token indexing order follows the given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: str
    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of the mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Speech is resampled (if upsampling or
                               downsampling is allowed) to this rate before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibel level before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 vocab_filepath,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._audio_featurizer = AudioFeaturizer(
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB)
        self._text_featurizer = TextFeaturizer(vocab_filepath)

    def featurize(self, speech_segment, keep_transcription_text):
        """Extract features for a speech segment.

        1. For audio parts, extract the audio features.
        2. For transcript parts, keep the original text or convert the text
           string to a list of char-level token indices.

        :param speech_segment: Speech segment to extract features from.
        :type speech_segment: SpeechSegment
        :param keep_transcription_text: Whether to return the raw transcript
                                        text instead of token indices.
        :type keep_transcription_text: bool
        :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of
                 char-level token indices (or the raw transcript text).
        :rtype: tuple
        """
        audio_feature = self._audio_featurizer.featurize(speech_segment)
        if keep_transcription_text:
            return audio_feature, speech_segment.transcript
        text_ids = self._text_featurizer.featurize(speech_segment.transcript)
        return audio_feature, text_ids

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._text_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._text_featurizer.vocab_list
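# Usage sketch (illustrative addition, not part of the original file;
# 'vocab.txt' is a hypothetical one-token-per-line vocabulary file).
if __name__ == '__main__':
    featurizer = SpeechFeaturizer(
        vocab_filepath='vocab.txt',
        specgram_type='linear',
        stride_ms=10.0,
        window_ms=20.0)
    # A CTC output layer is typically sized vocab_size + 1 (for the blank).
    print(featurizer.vocab_size, featurizer.vocab_list[:5])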
@ -1,76 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the text featurizer class."""
import codecs


class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: str
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert a text string to a list of char-level token indices. Note
        that the token indexing order follows the given vocabulary file.

        :param text: Text to process.
        :type text: str
        :return: List of char-level token indices.
        :rtype: list
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            [(token, id) for (id, token) in enumerate(vocab_list)])
        return vocab_dict, vocab_list
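# Usage sketch (illustrative addition, not part of the original file). The
# vocabulary file is one token per line; indices follow line order, so a
# file containing "a\nb\nc\n" maps 'a'->0, 'b'->1, 'c'->2.
if __name__ == '__main__':
    with codecs.open('vocab_demo.txt', 'w', 'utf-8') as f:
        f.write('a\nb\nc\n')
    featurizer = TextFeaturizer('vocab_demo.txt')
    assert featurizer.featurize('cab') == [2, 0, 1]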
@ -1,97 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains feature normalizers."""
import random

import numpy as np
from data_utils.audio import AudioSegment
from data_utils.utility import read_manifest


class FeatureNormalizer(object):
    """Feature normalizer. Normalize features to be of zero mean and unit
    stddev.

    If mean_std_filepath is provided (not None), the normalizer will directly
    initialize from the file. Otherwise, both manifest_path and featurize_func
    should be given for on-the-fly mean and stddev computing.

    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|str
    :param manifest_path: Manifest of instances for computing mean and stddev.
    :type manifest_path: None|str
    :param featurize_func: Function to extract features. It should be callable
                           with ``featurize_func(audio_segment)``.
    :type featurize_func: None|callable
    :param num_samples: Number of random samples for computing mean and stddev.
    :type num_samples: int
    :param random_seed: Random seed for sampling instances.
    :type random_seed: int
    :raises ValueError: If both mean_std_filepath and manifest_path
                        (or both mean_std_filepath and featurize_func) are None.
    """

    def __init__(self,
                 mean_std_filepath,
                 manifest_path=None,
                 featurize_func=None,
                 num_samples=500,
                 random_seed=0):
        if not mean_std_filepath:
            if not (manifest_path and featurize_func):
                raise ValueError("If mean_std_filepath is None, manifest_path "
                                 "and featurize_func should not be None.")
            self._rng = random.Random(random_seed)
            self._compute_mean_std(manifest_path, featurize_func, num_samples)
        else:
            self._read_mean_std_from_file(mean_std_filepath)

    def apply(self, features, eps=1e-14):
        """Normalize features to be of zero mean and unit stddev.

        :param features: Input features to be normalized.
        :type features: ndarray
        :param eps: Added to stddev to provide numerical stability.
        :type eps: float
        :return: Normalized features.
        :rtype: ndarray
        """
        return (features - self._mean) / (self._std + eps)

    def write_to_file(self, filepath):
        """Write the mean and stddev to the file.

        :param filepath: File to write mean and stddev.
        :type filepath: str
        """
        np.savez(filepath, mean=self._mean, std=self._std)

    def _read_mean_std_from_file(self, filepath):
        """Load mean and std from file."""
        npzfile = np.load(filepath)
        self._mean = npzfile["mean"]
        self._std = npzfile["std"]

    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
        """Compute mean and std from randomly sampled instances."""
        manifest = read_manifest(manifest_path)
        sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in sampled_manifest:
            features.append(
                featurize_func(
                    AudioSegment.from_file(instance["audio_filepath"])))
        features = np.hstack(features)
        self._mean = np.mean(features, axis=1).reshape([-1, 1])
        self._std = np.std(features, axis=1).reshape([-1, 1])
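# Usage sketch (illustrative addition, not part of the original file;
# 'manifest.train' and 'mean_std.npz' are hypothetical paths, and the
# demo featurize function stands in for e.g. AudioFeaturizer.featurize;
# it assumes the manifest lists at least num_samples instances). Stats
# are computed once over sampled instances, written to disk, and reloaded.
if __name__ == '__main__':
    def _demo_featurize(audio_segment):
        # Features must be [feature_dim x time]; instances are hstacked.
        return np.vstack([audio_segment.samples, audio_segment.samples])

    normalizer = FeatureNormalizer(
        mean_std_filepath=None,
        manifest_path='manifest.train',
        featurize_func=_demo_featurize,
        num_samples=500)
    normalizer.write_to_file('mean_std.npz')
    normalizer = FeatureNormalizer(mean_std_filepath='mean_std.npz')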
@ -1,153 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech segment class."""
import numpy as np
from data_utils.audio import AudioSegment


class SpeechSegment(AudioSegment):
    """Speech segment abstraction, a subclass of AudioSegment,
    with an additional transcript.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :param transcript: Transcript text for the speech.
    :type transcript: str
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate, transcript):
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript

    def __eq__(self, other):
        """Return whether two objects are equal."""
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls, filepath, transcript):
        """Create speech segment from audio file and corresponding transcript.

        :param filepath: Filepath or file object to audio file.
        :type filepath: str|file
        :param transcript: Transcript text for the speech.
        :type transcript: str
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.from_file(filepath)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def from_bytes(cls, bytes, transcript):
        """Create speech segment from a byte string and corresponding
        transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text for the speech.
        :type transcript: str
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of speech segments together; both
        audio and transcript will be concatenated.

        :param *segments: Input speech segments to be concatenated.
        :type *segments: tuple of SpeechSegment
        :return: Speech segment instance.
        :rtype: SpeechSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any two segments does not match.
        :raises TypeError: If any segment is not a SpeechSegment instance.
        """
        if len(segments) == 0:
            raise ValueError("No speech segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        transcripts = ""
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only speech segments of the same type "
                                "can be concatenated.")
            transcripts += seg._transcript
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate, transcripts)

    @classmethod
    def slice_from_file(cls, filepath, transcript, start=None, end=None):
        """Load a small section of a speech file without having to load
        the entire file into memory, which can be incredibly wasteful.

        :param filepath: Filepath or file object to audio file.
        :type filepath: str|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :param transcript: Transcript text for the speech. If not provided,
                           the default is an empty string.
        :type transcript: str
        :return: SpeechSegment instance of the specified slice of the input
                 speech file.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Create a silent speech segment of the given duration and
        sample rate; the transcript will be an empty string.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silence of the given duration.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")

    @property
    def transcript(self):
        """Return the transcript text.

        :return: Transcript text for the speech.
        :rtype: str
        """
        return self._transcript
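# Usage sketch (illustrative addition, not part of the original file;
# 'a.wav' and 'b.wav' are hypothetical). Concatenation joins samples and
# transcripts alike.
if __name__ == '__main__':
    seg_a = SpeechSegment.from_file('a.wav', transcript='hello ')
    seg_b = SpeechSegment.from_file('b.wav', transcript='world')
    joined = SpeechSegment.concatenate(seg_a, seg_b)
    assert joined.transcript == 'hello world'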
@ -1,98 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data helper functions."""
import codecs
import json
import os
import tarfile

from paddle.dataset.common import md5file


def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    Instances with durations outside [min_duration, max_duration] will be
    filtered out.

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: str
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        if (json_data["duration"] <= max_duration and
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest


def getfile_insensitive(path):
    """Get the actual file path when given a case-insensitive filename."""
    directory, filename = os.path.split(path)
    directory, filename = (directory or '.'), filename.lower()
    for f in os.listdir(directory):
        newpath = os.path.join(directory, f)
        if os.path.isfile(newpath) and f.lower() == filename:
            return newpath


def download_multi(url, target_dir, extra_args):
    """Download multiple files from url to target_dir."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading %s ..." % url)
    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
                         target_dir)
    return ret_code


def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        os.system("wget -c " + url + " -P " + target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()
    if rm_tar is True:
        os.remove(filepath)


class XmapEndSignal():
    pass
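# Usage sketch (illustrative addition, not part of the original file).
# Each manifest line is a standalone JSON object, e.g.:
#   {"audio_filepath": "a.wav", "duration": 3.2, "text": "..."}
if __name__ == '__main__':
    with codecs.open('manifest_demo', 'w', 'utf-8') as f:
        f.write(json.dumps({
            "audio_filepath": "a.wav", "duration": 3.2, "text": "demo"
        }) + '\n')
    entries = read_manifest('manifest_demo', max_duration=10.0)
    assert len(entries) == 1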
@ -1,3 +0,0 @@
# Reference
* [Sequence Modeling With CTC](https://distill.pub/2017/ctc/)
* [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/pdf/1408.2873.pdf)
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,248 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains various CTC decoders."""
import multiprocessing
from itertools import groupby
from math import log

import numpy as np


def ctc_greedy_decoder(probs_seq, vocabulary):
    """CTC greedy (best path) decoder.

    The path consisting of the most probable tokens is further post-processed
    to remove consecutive repetitions and all blanks.

    :param probs_seq: 2-D list of probabilities over the vocabulary for each
                      character. Each element is a list of float probabilities
                      for one character.
    :type probs_seq: list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :return: Decoding result string.
    :rtype: str
    """
    # dimension verification
    for probs in probs_seq:
        if not len(probs) == len(vocabulary) + 1:
            raise ValueError("probs_seq dimension mismatched with vocabulary")
    # argmax to get the best index for each time step
    max_index_list = list(np.array(probs_seq).argmax(axis=1))
    # remove consecutive duplicate indexes
    index_list = [index_group[0] for index_group in groupby(max_index_list)]
    # remove blank indexes
    blank_index = len(vocabulary)
    index_list = [index for index in index_list if index != blank_index]
    # convert index list to string
    return ''.join([vocabulary[index] for index in index_list])


def ctc_beam_search_decoder(probs_seq,
                            beam_size,
                            vocabulary,
                            cutoff_prob=1.0,
                            cutoff_top_n=40,
                            ext_scoring_func=None,
                            nproc=False):
    """CTC beam search decoder.

    It utilizes beam search to approximately select the top best decoding
    labels and returns results in descending order.
    The implementation is based on Prefix Beam Search
    (https://arxiv.org/abs/1408.2873), with the unclear parts redesigned.
    Two important modifications: 1) in the iterative computation of
    probabilities, the assignment operation is changed to accumulation
    because one prefix may come from different paths; 2) the if condition
    "if l^+ not in A_prev then" after the probabilities' computation is
    removed, since it is hard to understand and seems unnecessary.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over vocabulary and blank.
    :type probs_seq: 2-D list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                         characters with the highest probabilities in the
                         vocabulary will be used in beam search, default 40.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                             partially decoded sentence, e.g. word count
                             or language model.
    :type ext_scoring_func: callable
    :param nproc: Whether the decoder is used in multiprocessing.
    :type nproc: bool
    :return: List of tuples of log probability and sentence as decoding
             results, in descending order of the probability.
    :rtype: list
    """
    # dimension check
    for prob_list in probs_seq:
        if not len(prob_list) == len(vocabulary) + 1:
            raise ValueError("The shape of prob_seq does not match with the "
                             "shape of the vocabulary.")

    # blank_id assign
    blank_id = len(vocabulary)

    # If the decoder is called in multiprocessing, use the global scorer
    # instantiated in ctc_beam_search_decoder_batch().
    if nproc is True:
        global ext_nproc_scorer
        ext_scoring_func = ext_nproc_scorer

    # initialize
    # prefix_set_prev: the set containing selected prefixes
    # probs_b_prev: prefixes' probability ending with blank in previous step
    # probs_nb_prev: prefixes' probability ending with non-blank in previous step
    prefix_set_prev = {'\t': 1.0}
    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}

    # extend prefix in loop
    for time_step in range(len(probs_seq)):
        # prefix_set_next: the set containing candidate prefixes
        # probs_b_cur: prefixes' probability ending with blank in current step
        # probs_nb_cur: prefixes' probability ending with non-blank in current step
        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}

        prob_idx = list(enumerate(probs_seq[time_step]))
        cutoff_len = len(prob_idx)
        # If pruning is enabled
        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
            cutoff_len, cum_prob = 0, 0.0
            for i in range(len(prob_idx)):
                cum_prob += prob_idx[i][1]
                cutoff_len += 1
                if cum_prob >= cutoff_prob:
                    break
            cutoff_len = min(cutoff_len, cutoff_top_n)
            prob_idx = prob_idx[0:cutoff_len]

        for l in prefix_set_prev:
            if l not in prefix_set_next:
                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0

            # extend prefix by traversing prob_idx
            for index in range(cutoff_len):
                c, prob_c = prob_idx[index][0], prob_idx[index][1]

                if c == blank_id:
                    probs_b_cur[l] += prob_c * (
                        probs_b_prev[l] + probs_nb_prev[l])
                else:
                    last_char = l[-1]
                    new_char = vocabulary[c]
                    l_plus = l + new_char
                    if l_plus not in prefix_set_next:
                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0

                    if new_char == last_char:
                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
                    elif new_char == ' ':
                        if (ext_scoring_func is None) or (len(l) == 1):
                            score = 1.0
                        else:
                            prefix = l[1:]
                            score = ext_scoring_func(prefix)
                        probs_nb_cur[l_plus] += score * prob_c * (
                            probs_b_prev[l] + probs_nb_prev[l])
                    else:
                        probs_nb_cur[l_plus] += prob_c * (
                            probs_b_prev[l] + probs_nb_prev[l])
                    # add l_plus into prefix_set_next
                    prefix_set_next[l_plus] = probs_nb_cur[
                        l_plus] + probs_b_cur[l_plus]
            # add l into prefix_set_next
            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
        # update probs
        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur

        # store top beam_size prefixes
        prefix_set_prev = sorted(
            prefix_set_next.items(), key=lambda asd: asd[1], reverse=True)
        if beam_size < len(prefix_set_prev):
            prefix_set_prev = prefix_set_prev[:beam_size]
        prefix_set_prev = dict(prefix_set_prev)

    beam_result = []
    for seq, prob in prefix_set_prev.items():
        if prob > 0.0 and len(seq) > 1:
            result = seq[1:]
            # score last word by external scorer
            if (ext_scoring_func is not None) and (result[-1] != ' '):
                prob = prob * ext_scoring_func(result)
            log_prob = log(prob)
            beam_result.append((log_prob, result))
        else:
            beam_result.append((float('-inf'), ''))

    # output top beam_size decoding results
    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
    return beam_result


def ctc_beam_search_decoder_batch(probs_split,
                                  beam_size,
                                  vocabulary,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
                                  ext_scoring_func=None):
    """CTC beam search decoder using multiple processes.

    :param probs_split: 3-D list with each element as an instance of 2-D list
                        of probabilities used by ctc_beam_search_decoder().
    :type probs_split: 3-D list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                         characters with the highest probabilities in the
                         vocabulary will be used in beam search, default 40.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                             partially decoded sentence, e.g. word count
                             or language model.
    :type ext_scoring_func: callable
    :return: List of tuples of log probability and sentence as decoding
             results, in descending order of the probability.
    :rtype: list
    """
    if not num_processes > 0:
        raise ValueError("Number of processes must be positive!")

    # use global variable to pass the external scorer to beam search decoder
    global ext_nproc_scorer
    ext_nproc_scorer = ext_scoring_func
    nproc = True

    pool = multiprocessing.Pool(processes=num_processes)
    results = []
    for i, probs_list in enumerate(probs_split):
        args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
                None, nproc)
        results.append(pool.apply_async(ctc_beam_search_decoder, args))

    pool.close()
    pool.join()
    beam_search_results = [result.get() for result in results]
    return beam_search_results
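# Worked example (illustrative addition, not part of the original file):
# a 2-token vocabulary plus blank (id 2). Greedy decoding collapses
# repeats and removes blanks, so [A, A, blank, B] -> "AB".
if __name__ == '__main__':
    vocab = ['A', 'B']
    probs = [[0.6, 0.1, 0.3],   # argmax: A
             [0.6, 0.1, 0.3],   # argmax: A (repeat, collapsed)
             [0.1, 0.1, 0.8],   # argmax: blank (removed)
             [0.1, 0.8, 0.1]]   # argmax: B
    assert ctc_greedy_decoder(probs, vocab) == 'AB'
    beams = ctc_beam_search_decoder(probs, beam_size=4, vocabulary=vocab)
    print(beams[0])  # (log probability, best sentence)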
@ -1,78 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""External Scorer for Beam Search Decoder."""
import os

import kenlm
import numpy as np


class Scorer(object):
    """External scorer to evaluate a prefix or whole sentence in
    beam search decoding, including the score from an n-gram language
    model and a word count.

    :param alpha: Parameter associated with language model. Don't use
                  language model when alpha = 0.
    :type alpha: float
    :param beta: Parameter associated with word count. Don't use word
                 count when beta = 0.
    :type beta: float
    :param model_path: Path to load language model.
    :type model_path: str
    """

    def __init__(self, alpha, beta, model_path):
        self._alpha = alpha
        self._beta = beta
        if not os.path.isfile(model_path):
            raise IOError("Invalid language model path: %s" % model_path)
        self._language_model = kenlm.LanguageModel(model_path)

    # n-gram language model scoring
    def _language_model_score(self, sentence):
        # log10 prob of last word
        log_cond_prob = list(
            self._language_model.full_scores(sentence, eos=False))[-1][0]
        return np.power(10, log_cond_prob)

    # word insertion term
    def _word_count(self, sentence):
        words = sentence.strip().split(' ')
        return len(words)

    # reset alpha and beta
    def reset_params(self, alpha, beta):
        self._alpha = alpha
        self._beta = beta

    # execute evaluation
    def __call__(self, sentence, log=False):
        """Evaluation function, gathering all the different scores
        and returning the final one.

        :param sentence: The input sentence for evaluation.
        :type sentence: str
        :param log: Whether to return the score in log representation.
        :type log: bool
        :return: Evaluation score, in decimal or log representation.
        :rtype: float
        """
        lm = self._language_model_score(sentence)
        word_cnt = self._word_count(sentence)
        if log is False:
            score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta)
        else:
            score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt)
        return score
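# Worked example (illustrative addition, not part of the original file):
# with alpha=2.0 and beta=1.0, an LM probability of 1e-3 and a 4-word
# sentence give, in the linear domain,
#   score = lm**alpha * word_cnt**beta = (1e-3)**2 * 4 = 4e-6,
# or alpha*log(lm) + beta*log(word_cnt) in the log domain. Running the
# snippet requires a real kenlm binary model, so the path is hypothetical.
#
#     scorer = Scorer(alpha=2.0, beta=1.0, model_path='lm.binary')
#     print(scorer('hello world how are', log=True))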
@ -1,134 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrapper for various CTC decoders in SWIG."""
import swig_decoders


class Scorer(swig_decoders.Scorer):
    """Wrapper for Scorer.

    :param alpha: Parameter associated with language model. Don't use
                  language model when alpha = 0.
    :type alpha: float
    :param beta: Parameter associated with word count. Don't use word
                 count when beta = 0.
    :type beta: float
    :param model_path: Path to load language model.
    :type model_path: str
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    """

    def __init__(self, alpha, beta, model_path, vocabulary):
        swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)


def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
    """Wrapper for the CTC best path decoder in SWIG.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over vocabulary and blank.
    :type probs_seq: 2-D list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param blank_id: ID of the blank token.
    :type blank_id: int
    :return: Decoding result string.
    :rtype: str
    """
    result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
                                              blank_id)
    return result


def ctc_beam_search_decoder(probs_seq,
                            vocabulary,
                            beam_size,
                            cutoff_prob=1.0,
                            cutoff_top_n=40,
                            ext_scoring_func=None,
                            blank_id=0):
    """Wrapper for the CTC beam search decoder.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over vocabulary and blank.
    :type probs_seq: 2-D list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                         characters with the highest probabilities in the
                         vocabulary will be used in beam search, default 40.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                             partially decoded sentence, e.g. word count
                             or language model.
    :type ext_scoring_func: callable
    :param blank_id: ID of the blank token.
    :type blank_id: int
    :return: List of tuples of log probability and sentence as decoding
             results, in descending order of the probability.
    :rtype: list
    """
    beam_results = swig_decoders.ctc_beam_search_decoder(
        probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
        ext_scoring_func, blank_id)
    beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
    return beam_results


def ctc_beam_search_decoder_batch(probs_split,
                                  vocabulary,
                                  beam_size,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
                                  ext_scoring_func=None,
                                  blank_id=0):
    """Wrapper for the batched CTC beam search decoder.

    :param probs_split: 3-D list with each element as an instance of 2-D list
                        of probabilities used by ctc_beam_search_decoder().
    :type probs_split: 3-D list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in vocabulary pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                         characters with the highest probabilities in the
                         vocabulary will be used in beam search, default 40.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                             partially decoded sentence, e.g. word count
                             or language model.
    :type ext_scoring_func: callable
    :param blank_id: ID of the blank token.
    :type blank_id: int
    :return: List of tuples of log probability and sentence as decoding
             results, in descending order of the probability.
    :rtype: list
    """
    probs_split = [probs_seq.tolist() for probs_seq in probs_split]

    batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
        probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
        cutoff_top_n, ext_scoring_func, blank_id)
    batch_beam_results = [[(res[0], res[1]) for res in beam_results]
                          for beam_results in batch_beam_results]
    return batch_beam_results
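# Usage sketch (illustrative addition, not part of the original file):
# decoding a batch of two utterances, where 'probs_a'/'probs_b' are assumed
# to be numpy softmax outputs of shape [num_time_steps, vocab_size + 1] and
# 'vocab_list' a matching vocabulary.
#
#     beams = ctc_beam_search_decoder_batch(
#         probs_split=[probs_a, probs_b],
#         vocabulary=vocab_list,
#         beam_size=500,
#         num_processes=8,
#         cutoff_prob=1.0,
#         cutoff_top_n=40)
#     best_transcripts = [b[0][1] for b in beams]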
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,721 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the audio segment class."""
import copy
import io
import random
import re
import struct

import numpy as np
import resampy
import soundfile
import soxbindings as sox
from scipy import signal


class AudioSegment(object):
    """Monaural audio segment abstraction.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate):
        """Create audio segment from samples.

        Samples are converted to float32 internally, with ints scaled to
        [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return a human-readable representation of the segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
                                self.duration, self.rms_db))

    @classmethod
    def from_file(cls, file):
        """Create audio segment from audio file.

        :param file: Filepath or file object to audio file.
        :type file: str|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
            return cls.from_sequence_file(file)
        else:
            samples, sample_rate = soundfile.read(file, dtype='float32')
            return cls(samples, sample_rate)

    @classmethod
    def slice_from_file(cls, file, start=None, end=None):
        """Load a small section of an audio file without having to load
        the entire file into memory, which can be incredibly wasteful.

        :param file: Input audio filepath or file object.
        :type file: str|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: AudioSegment instance of the specified slice of the input
                 audio file.
        :rtype: AudioSegment
        :raise ValueError: If start or end is incorrectly set, e.g. out of
                           bounds in time.
        """
        sndfile = soundfile.SoundFile(file)
        sample_rate = sndfile.samplerate
        duration = float(len(sndfile)) / sample_rate
        start = 0. if start is None else start
        end = duration if end is None else end
        if start < 0.0:
            start += duration
        if end < 0.0:
            end += duration
        if start < 0.0:
            raise ValueError("The slice start position (%f s) is out of "
                             "bounds." % start)
        if end < 0.0:
            raise ValueError("The slice end position (%f s) is out of bounds." %
                             end)
        if start > end:
            raise ValueError("The slice start position (%f s) is later than "
                             "the slice end position (%f s)." % (start, end))
        if end > duration:
            raise ValueError("The slice end position (%f s) is out of bounds "
                             "(> %f s)" % (end, duration))
        start_frame = int(start * sample_rate)
        end_frame = int(end * sample_rate)
        sndfile.seek(start_frame)
        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
        return cls(data, sample_rate)

    @classmethod
    def from_sequence_file(cls, filepath):
        """Create audio segment from a sequence file. A sequence file is a
        binary file containing a collection of multiple audio files, with
        several header bytes at the head indicating the offsets of each
        audio byte data chunk.

        The format is:

            4 bytes (int, version),
            4 bytes (int, num of utterance),
            4 bytes (int, bytes per header),
            [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio),
            audio_bytes_data_of_1st_utterance,
            audio_bytes_data_of_2nd_utterance,
            ......

        A sequence file name must end with ".seqbin", and the filename of the
        5th utterance's audio file in sequence file "xxx.seqbin" must be
        "xxx.seqbin_5", with "5" indicating the utterance index within this
        sequence file (starting from 1).

        :param filepath: Filepath of sequence file.
        :type filepath: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        # parse filepath
        matches = re.match(r"(.+\.seqbin)_(\d+)", filepath)
        if matches is None:
            raise IOError("File type of %s is not supported" % filepath)
        filename = matches.group(1)
        fileno = int(matches.group(2))

        # read headers (binary mode, so no text encoding is involved)
        f = io.open(filename, mode='rb')
        version = f.read(4)
        num_utterances = struct.unpack("i", f.read(4))[0]
        bytes_per_header = struct.unpack("i", f.read(4))[0]
        header_bytes = f.read(bytes_per_header * (num_utterances + 1))
        header = [
            struct.unpack("i", header_bytes[bytes_per_header * i:
                                            bytes_per_header * (i + 1)])[0]
            for i in range(num_utterances + 1)
        ]

        # read audio bytes
        f.seek(header[fileno - 1])
        audio_bytes = f.read(header[fileno] - header[fileno - 1])
        f.close()

        # create audio segment
        try:
            return cls.from_bytes(audio_bytes)
        except Exception as e:
            samples = np.frombuffer(audio_bytes, dtype='int16')
            return cls(samples=samples, sample_rate=8000)

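    # Usage sketch (illustrative addition, not part of the original file):
    # "audio.seqbin" is a hypothetical sequence file; the "_5" suffix
    # selects the 5th utterance (1-based), which from_file() routes to
    # from_sequence_file().
    #
    #     seg = AudioSegment.from_file('audio.seqbin_5')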
    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples, sample_rate = soundfile.read(
            io.BytesIO(bytes), dtype='float32')
        return cls(samples, sample_rate)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of audio segments together.

        :param *segments: Input audio segments to be concatenated.
        :type *segments: tuple of AudioSegment
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any two segments does not match.
        :raises TypeError: If any segment is not an AudioSegment instance.
        """
        # Perform basic sanity-checks.
        if len(segments) == 0:
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only audio segments of the same type "
                                "can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Create a silent audio segment of the given duration and sample rate.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silent AudioSegment instance of the given duration.
        :rtype: AudioSegment
        """
        samples = np.zeros(int(duration * sample_rate))
        return cls(samples, sample_rate)

    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.

        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: str|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

    def superimpose(self, other):
        """Add samples from another segment to those of this segment
        (sample-wise addition, not segment concatenation).

        Note that this is an in-place transformation.

        :param other: Segment containing samples to be added in.
        :type other: AudioSegment
        :raise TypeError: If the types of the two segments don't match.
        :raise ValueError: If the sample rates of the two segments are not
                           equal, or if the lengths of the segments don't
                           match.
        """
        # The type check must raise only when the types do NOT match.
        if not isinstance(other, type(self)):
            raise TypeError("Cannot add segments of different types: %s "
                            "and %s." % (type(self), type(other)))
        if self._sample_rate != other._sample_rate:
            raise ValueError("Sample rates must match to add segments.")
        if len(self._samples) != len(other._samples):
            raise ValueError("Segment lengths must match to add segments.")
        self._samples += other._samples

    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: str
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tobytes()

    def to(self, dtype='int16'):
        """Create a `dtype` audio content.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'int16'.
        :type dtype: str
        :return: np.ndarray containing `dtype` audio content.
        :rtype: ndarray
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples

    def gain_db(self, gain):
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.

        :param gain: Gain in decibels to apply to samples.
        :type gain: float|1darray
        """
        self._samples *= 10.**(gain / 20.)

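    # Worked example (illustrative addition, not part of the original file):
    # gain is converted from decibels to a linear amplitude factor via
    # 10 ** (gain / 20), so gain_db(6.0) scales samples by
    # 10 ** (6 / 20) ~= 1.995, roughly doubling the amplitude, while
    # gain_db(-6.0) roughly halves it.
    #
    #     seg.gain_db(6.0)   # samples *= 10 ** (6 / 20) ~= 1.995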
def change_speed(self, speed_rate):
|
|
||||||
"""Change the audio speed by linear interpolation.
|
|
||||||
|
|
||||||
Note that this is an in-place transformation.
|
|
||||||
|
|
||||||
:param speed_rate: Rate of speed change:
|
|
||||||
speed_rate > 1.0, speed up the audio;
|
|
||||||
speed_rate = 1.0, unchanged;
|
|
||||||
speed_rate < 1.0, slow down the audio;
|
|
||||||
speed_rate <= 0.0, not allowed, raise ValueError.
|
|
||||||
:type speed_rate: float
|
|
||||||
:raises ValueError: If speed_rate <= 0.0.
|
|
||||||
"""
|
|
||||||
if speed_rate == 1.0:
|
|
||||||
return
|
|
||||||
if speed_rate <= 0:
|
|
||||||
raise ValueError("speed_rate should be greater than zero.")
|
|
||||||
|
|
||||||
# numpy
|
|
||||||
# old_length = self._samples.shape[0]
|
|
||||||
# new_length = int(old_length / speed_rate)
|
|
||||||
# old_indices = np.arange(old_length)
|
|
||||||
# new_indices = np.linspace(start=0, stop=old_length, num=new_length)
|
|
||||||
# self._samples = np.interp(new_indices, old_indices, self._samples)
|
|
||||||
|
|
||||||
# sox, slow
|
|
||||||
tfm = sox.Transformer()
|
|
||||||
tfm.set_globals(multithread=False)
|
|
||||||
tfm.speed(speed_rate)
|
|
||||||
self._samples = tfm.build_array(
|
|
||||||
input_array=self._samples,
|
|
||||||
sample_rate_in=self._sample_rate).squeeze(-1).astype(
|
|
||||||
np.float32).copy()
|
|
||||||
|
|
||||||
    def normalize(self, target_db=-20, max_gain_db=300.0):
        """Normalize audio to be of the desired RMS value in decibels.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels. This value should be
                          less than 0.0 as 0.0 is full-scale audio.
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
                            normalization. This is to prevent nans when
                            attempting to normalize a signal consisting of
                            all zeros.
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
        """
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
                "Unable to normalize segment to %f dB because the required "
                "gain exceeds max_gain_db (%f dB)." %
                (target_db, max_gain_db))
        self.gain_db(gain)
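
    # Worked check (hypothetical): for a segment with rms_db == -26.0,
    # normalize(target_db=-20) applies a gain of -20 - (-26) = +6 dB. An
    # all-zero segment has rms_db == -inf, so the required gain would be
    # infinite and the max_gain_db guard raises ValueError instead.
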
    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
        """Normalize audio using a production-compatible online/causal
        algorithm. This uses an exponential likelihood and gamma prior to
        make online estimates of the RMS even when there are very few samples.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels.
        :type target_db: float
        :param prior_db: Prior RMS estimate in decibels.
        :type prior_db: float
        :param prior_samples: Prior strength in number of samples.
        :type prior_samples: float
        :param startup_delay: Default 0.0s. If provided, this function will
                              accrue statistics for the first startup_delay
                              seconds before applying online normalization.
        :type startup_delay: float
        """
        # Estimate total RMS online.
        startup_sample_idx = min(self.num_samples - 1,
                                 int(self.sample_rate * startup_delay))
        prior_mean_squared = 10.**(prior_db / 10.)
        prior_sum_of_squares = prior_mean_squared * prior_samples
        cumsum_of_squares = np.cumsum(self.samples**2)
        sample_count = np.arange(self.num_samples) + 1
        if startup_sample_idx > 0:
            cumsum_of_squares[:startup_sample_idx] = \
                cumsum_of_squares[startup_sample_idx]
            sample_count[:startup_sample_idx] = \
                sample_count[startup_sample_idx]
        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                 (sample_count + prior_samples))
        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
        # Compute required time-varying gain.
        gain_db = target_db - rms_estimate_db
        self.gain_db(gain_db)
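
    # The estimate above is the posterior mean of the signal power under a
    # conjugate prior: at sample n,
    #   ms[n] = (sum_{i<=n} x_i**2 + prior_ms * prior_samples)
    #           / (n + prior_samples)
    # so the first frames are dominated by prior_db and later frames by the
    # observed data.
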
    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param filter: The resampling filter to use, one of {'kaiser_best',
                       'kaiser_fast'}.
        :type filter: str
        """
        self._samples = resampy.resample(
            self.samples, self.sample_rate, target_sample_rate, filter=filter)
        self._sample_rate = target_sample_rate
    def pad_silence(self, duration, sides='both'):
        """Pad this audio sample with a period of silence.

        Note that this is an in-place transformation.

        :param duration: Length of silence in seconds to pad.
        :type duration: float
        :param sides: Position for padding:
                      'beginning' - adds silence in the beginning;
                      'end' - adds silence in the end;
                      'both' - adds silence in both the beginning and the end.
        :type sides: str
        :raises ValueError: If sides is not supported.
        """
        if duration == 0.0:
            return self
        cls = type(self)
        silence = self.make_silence(duration, self._sample_rate)
        if sides == "beginning":
            padded = cls.concatenate(silence, self)
        elif sides == "end":
            padded = cls.concatenate(self, silence)
        elif sides == "both":
            padded = cls.concatenate(silence, self, silence)
        else:
            raise ValueError("Unknown value for sides: %s" % sides)
        self._samples = padded._samples
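
    # Hypothetical sketch: padding 0.5 s on both sides grows the duration by
    # 1.0 s in total.
    #
    #   before = seg.duration
    #   seg.pad_silence(0.5, sides='both')
    #   assert abs(seg.duration - (before + 1.0)) < 1e-6
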
    def shift(self, shift_ms):
        """Shift the audio in time. If `shift_ms` is positive, shift with time
        advance; if negative, shift with time delay. Silence is padded to
        keep the duration unchanged.

        Note that this is an in-place transformation.

        :param shift_ms: Shift time in milliseconds. If positive, shift with
                         time advance; if negative, shift with time delay.
        :type shift_ms: float
        :raises ValueError: If shift_ms is longer than audio duration.
        """
        if abs(shift_ms) / 1000.0 > self.duration:
            raise ValueError("Absolute value of shift_ms should be smaller "
                             "than audio duration.")
        shift_samples = int(shift_ms * self._sample_rate / 1000)
        if shift_samples > 0:
            # time advance
            self._samples[:-shift_samples] = self._samples[shift_samples:]
            self._samples[-shift_samples:] = 0
        elif shift_samples < 0:
            # time delay
            self._samples[-shift_samples:] = self._samples[:shift_samples]
            self._samples[:-shift_samples] = 0
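
    # Worked check (hypothetical): at 16 kHz, shift(5.0) advances the audio
    # by 5 ms = 80 samples; samples[:-80] take the old samples[80:] and the
    # last 80 samples become silence.
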
    def subsegment(self, start_sec=None, end_sec=None):
        """Cut the AudioSegment between given boundaries.

        Note that this is an in-place transformation.

        :param start_sec: Beginning of subsegment in seconds.
        :type start_sec: float
        :param end_sec: End of subsegment in seconds.
        :type end_sec: float
        :raises ValueError: If start_sec or end_sec is incorrectly set,
                            e.g. out of bounds in time.
        """
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        if start_sec < 0.0:
            raise ValueError("The slice start position (%f s) is out of "
                             "bounds." % start_sec)
        if end_sec < 0.0:
            raise ValueError("The slice end position (%f s) is out of bounds." %
                             end_sec)
        if start_sec > end_sec:
            raise ValueError("The slice start position (%f s) is later than "
                             "the end position (%f s)." % (start_sec, end_sec))
        if end_sec > self.duration:
            raise ValueError("The slice end position (%f s) is out of bounds "
                             "(> %f s)." % (end_sec, self.duration))
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        self._samples = self._samples[start_sample:end_sample]
    def random_subsegment(self, subsegment_length, rng=None):
        """Randomly cut a subsegment of the specified length out of the
        audio segment.

        Note that this is an in-place transformation.

        :param subsegment_length: Subsegment length in seconds.
        :type subsegment_length: float
        :param rng: Random number generator state.
        :type rng: random.Random
        :raises ValueError: If the length of the subsegment is greater than
                            the original segment.
        """
        rng = random.Random() if rng is None else rng
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = rng.uniform(0.0, self.duration - subsegment_length)
        self.subsegment(start_time, start_time + subsegment_length)
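
    # Hypothetical sketch: with a seeded generator the crop is reproducible.
    #
    #   import random
    #   seg.random_subsegment(1.0, rng=random.Random(0))
    #   assert abs(seg.duration - 1.0) <= 1.0 / seg.sample_rate
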
    def convolve(self, impulse_segment, allow_resample=False):
        """Convolve this audio segment with the given impulse segment.

        Note that this is an in-place transformation.

        :param impulse_segment: Impulse response segment.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        :raises ValueError: If the sample rates of the two audio segments do
                            not match and resampling is not allowed.
        """
        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
            impulse_segment.resample(self.sample_rate)
        if self.sample_rate != impulse_segment.sample_rate:
            raise ValueError("Impulse segment's sample rate (%d Hz) is not "
                             "equal to base signal sample rate (%d Hz)." %
                             (impulse_segment.sample_rate, self.sample_rate))
        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
                                     "full")
        self._samples = samples
    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.

        Note that this is an in-place transformation.

        :param impulse_segment: Impulse response segment.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        """
        target_db = self.rms_db
        self.convolve(impulse_segment, allow_resample=allow_resample)
        self.normalize(target_db)
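
    # Hypothetical usage sketch: reverberate with a room impulse response
    # while keeping the perceived level roughly constant (the file path is
    # illustrative only).
    #
    #   rir = AudioSegment.from_file('rir.wav')
    #   seg.convolve_and_normalize(rir, allow_resample=True)
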
    def add_noise(self,
                  noise,
                  snr_dB,
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
        """Add the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.

        Note that this is an in-place transformation.

        :param noise: Noise signal to add.
        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
        :type snr_dB: float
        :param allow_downsampling: Whether to allow the noise signal to be
                                   downsampled to match the base signal sample
                                   rate.
        :type allow_downsampling: bool
        :param max_gain_db: Maximum amount of gain to apply to noise signal
                            before adding it in. This is to prevent attempting
                            to apply infinite gain to a zero signal.
        :type max_gain_db: float
        :param rng: Random number generator state.
        :type rng: None|random.Random
        :raises ValueError: If the sample rates do not match between the two
                            audio segments when downsampling is not allowed,
                            or if the noise segment is shorter than this
                            segment.
        """
        rng = random.Random() if rng is None else rng
        if allow_downsampling and noise.sample_rate > self.sample_rate:
            # resample() works in place and returns None, so copy the noise
            # first instead of assigning its return value.
            noise = copy.deepcopy(noise)
            noise.resample(self.sample_rate)
        if noise.sample_rate != self.sample_rate:
            raise ValueError("Noise sample rate (%d Hz) is not equal to base "
                             "signal sample rate (%d Hz)." % (noise.sample_rate,
                                                              self.sample_rate))
        if noise.duration < self.duration:
            raise ValueError("Noise signal (%f sec) must be at least as long as"
                             " base signal (%f sec)." %
                             (noise.duration, self.duration))
        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
        noise_new = copy.deepcopy(noise)
        noise_new.random_subsegment(self.duration, rng=rng)
        noise_new.gain_db(noise_gain_db)
        self.superimpose(noise_new)
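
    # Hypothetical usage sketch: mix in background noise at a fixed SNR, as
    # NoisePerturbAugmentor below does with a randomly sampled SNR (the file
    # path is illustrative only).
    #
    #   noise = AudioSegment.from_file('noise.wav')
    #   seg.add_noise(noise, snr_dB=15.0, allow_downsampling=True)
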
    @property
    def samples(self):
        """Return audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
        return self._samples.copy()

    @property
    def sample_rate(self):
        """Return audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
        return self._sample_rate

    @property
    def num_samples(self):
        """Return number of samples.

        :return: Number of samples.
        :rtype: int
        """
        return self._samples.shape[0]

    @property
    def duration(self):
        """Return audio duration.

        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)

    @property
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # mean_square is already a power quantity, so convert with
        # 10 * log10 instead of 20 * log10.
        mean_square = np.mean(self._samples**2)
        return 10 * np.log10(mean_square)
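
    # Worked check (hypothetical): a full-scale square wave has samples of
    # +/-1.0, so mean_square == 1.0 and rms_db == 10 * log10(1.0) == 0.0 dB;
    # halving the amplitude gives mean_square == 0.25 and rms_db ~= -6.02 dB.
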
    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or floating-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples
    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.

        Audio sample type is usually integer or floating-point. For integer
        types, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.

        This is used when writing an audio file.
        """
        dtype = np.dtype(dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2**(bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % dtype)
        return output_samples.astype(dtype)
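
    # Hypothetical check of the rescaling: a float32 value of 0.5 exported as
    # 'int16' becomes int(0.5 * 2**15) == 16384, and values outside [-1, 1]
    # are clipped to the int16 range [-32768, 32767].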
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,218 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the data augmentation pipeline."""
import json
from collections.abc import Sequence
from inspect import signature

import numpy as np

from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.log import Log

__all__ = ["AugmentationPipeline"]

logger = Log(__name__).getlog()

import_alias = dict(
    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )

class AugmentationPipeline():
    """Build a pre-processing pipeline with various augmentation models. Such a
    data augmentation pipeline is often leveraged to augment the training
    samples to make the model invariant to certain types of perturbations in
    the real world, improving the model's generalization ability.

    The pipeline is built according to the augmentation configuration in a
    JSON string, e.g.

    .. code-block::

        [ {
                "type": "noise",
                "params": {"min_snr_dB": 10,
                           "max_snr_dB": 20,
                           "noise_manifest_path": "datasets/manifest.noise"},
                "prob": 0.0
            },
            {
                "type": "speed",
                "params": {"min_speed_rate": 0.9,
                           "max_speed_rate": 1.1},
                "prob": 1.0
            },
            {
                "type": "shift",
                "params": {"min_shift_ms": -5,
                           "max_shift_ms": 5},
                "prob": 1.0
            },
            {
                "type": "volume",
                "params": {"min_gain_dBFS": -10,
                           "max_gain_dBFS": 10},
                "prob": 0.0
            },
            {
                "type": "bayesian_normal",
                "params": {"target_db": -20,
                           "prior_db": -20,
                           "prior_samples": 100},
                "prob": 0.0
            }
        ]

    This example configuration registers five augmentation models. "prob"
    indicates the probability that an augmentor takes effect for a given
    sample; if "prob" is zero, the augmentor never takes effect, so here only
    the SpeedPerturbAugmentor and ShiftPerturbAugmentor are actually applied.

    Params:
        augmentation_config(str): Augmentation configuration in JSON string.
        random_seed(int): Random seed.
        train(bool): whether in training mode.

    Raises:
        ValueError: If the augmentation JSON config is in an incorrect format.
    """

    SPEC_TYPES = {'specaug'}

    def __init__(self, augmentation_config: str, random_seed: int=0):
        self._rng = np.random.RandomState(random_seed)
        self.conf = {'mode': 'sequential', 'process': []}
        if augmentation_config:
            process = json.loads(augmentation_config)
            self.conf['process'] += process

        self._augmentors, self._rates = self._parse_pipeline_from('all')
        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
            'audio')
        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
            'feature')
    def __call__(self, xs, uttid_list=None, **kwargs):
        if not isinstance(xs, Sequence):
            is_batch = False
            xs = [xs]
        else:
            is_batch = True

        if isinstance(uttid_list, str):
            uttid_list = [uttid_list for _ in range(len(xs))]

        if self.conf.get("mode", "sequential") == "sequential":
            for idx, (func, rate) in enumerate(
                    zip(self._augmentors, self._rates), 0):
                if self._rng.uniform(0., 1.) >= rate:
                    continue

                # Pass only the kwargs that the func accepts.
                try:
                    param = signature(func).parameters
                except ValueError:
                    # Some functions, e.g. built-in functions, do not
                    # support signature inspection.
                    param = {}
                _kwargs = {k: v for k, v in kwargs.items() if k in param}

                try:
                    if uttid_list is not None and "uttid" in param:
                        xs = [
                            func(x, u, **_kwargs)
                            for x, u in zip(xs, uttid_list)
                        ]
                    else:
                        xs = [func(x, **_kwargs) for x in xs]
                except Exception:
                    logger.fatal("Caught an exception from the {}th func: {}".
                                 format(idx, func))
                    raise
        else:
            raise NotImplementedError(
                "Not supporting mode={}".format(self.conf["mode"]))

        if is_batch:
            return xs
        else:
            return xs[0]
    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
            if self._rng.uniform(0., 1.) < rate:
                augmentor.transform_audio(audio_segment)

    def transform_feature(self, spec_segment):
        """Spectrogram augmentation.

        Args:
            spec_segment (np.ndarray): audio feature, (D, T).
        """
        for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
            if self._rng.uniform(0., 1.) < rate:
                spec_segment = augmentor.transform_feature(spec_segment)
        return spec_segment

    def _parse_pipeline_from(self, aug_type='all'):
        """Parse the config json to build an augmentation pipeline."""
        assert aug_type in ('audio', 'feature', 'all'), aug_type
        audio_confs = []
        feature_confs = []
        all_confs = []
        for config in self.conf['process']:
            all_confs.append(config)
            if config["type"] in self.SPEC_TYPES:
                feature_confs.append(config)
            else:
                audio_confs.append(config)

        if aug_type == 'audio':
            aug_confs = audio_confs
        elif aug_type == 'feature':
            aug_confs = feature_confs
        else:
            aug_confs = all_confs

        augmentors = [
            self._get_augmentor(config["type"], config["params"])
            for config in aug_confs
        ]
        rates = [config["prob"] for config in aug_confs]
        return augmentors, rates

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params."""
        class_obj = dynamic_import(augmentor_type, import_alias)
        assert issubclass(class_obj, AugmentorBase)
        try:
            obj = class_obj(self._rng, **params)
        except Exception:
            raise ValueError("Invalid config for augmentor type [%s]." %
                             augmentor_type)
        return obj
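
# Hypothetical usage sketch (the config values are illustrative only):
#
#   config = ('[{"type": "shift", '
#             '"params": {"min_shift_ms": -5, "max_shift_ms": 5}, '
#             '"prob": 1.0}]')
#   pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
#   pipeline.transform_audio(audio_segment)    # audio-level augmentors
#   feat = pipeline.transform_feature(feat)    # feature-level augmentors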
@ -1,59 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the abstract base class for augmentation models."""
from abc import ABCMeta
from abc import abstractmethod


class AugmentorBase(metaclass=ABCMeta):
    """Abstract base class for augmentation model (augmentor) classes.
    All augmentor classes should inherit from this class, and implement the
    following abstract methods.
    """

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def __call__(self, xs):
        raise NotImplementedError("AugmentorBase: Not impl __call__")

    @abstractmethod
    def transform_audio(self, audio_segment):
        """Adds various effects to the input audio segment. Such effects
        will augment the training data to make the model invariant to certain
        types of perturbations in the real world, improving the model's
        generalization ability.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        raise NotImplementedError("AugmentorBase: Not impl transform_audio")

    @abstractmethod
    def transform_feature(self, spec_segment):
        """Adds various effects to the input audio feature segment. Such
        effects will augment the training data to make the model invariant to
        certain types of time_mask or freq_mask in the real world, improving
        the model's generalization ability.

        Args:
            spec_segment (Spectrogram): Spectrogram segment to add effects to.
        """
        raise NotImplementedError("AugmentorBase: Not impl transform_feature")
@ -1,50 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the impulse response augmentation model."""
from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.frontend.utility import read_manifest


class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentation model for adding impulse response effect.

    :param rng: Random generator object.
    :type rng: np.random.RandomState
    :param impulse_manifest_path: Manifest path for impulse audio data.
    :type impulse_manifest_path: str
    """

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
        self._impulse_manifest = read_manifest(impulse_manifest_path)

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Add impulse response effect.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        impulse_json = self._rng.choice(
            self._impulse_manifest, 1, replace=False)[0]
        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
        audio_segment.convolve(impulse_segment, allow_resample=True)
@ -1,64 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the noise perturb augmentation model."""
from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.frontend.utility import read_manifest


class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.

    :param rng: Random generator object.
    :type rng: np.random.RandomState
    :param min_snr_dB: Minimal signal-to-noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximal signal-to-noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
    :type noise_manifest_path: str
    """

    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Add background noise audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        noise_json = self._rng.choice(self._noise_manifest, 1, replace=False)[0]
        if noise_json['duration'] < audio_segment.duration:
            raise RuntimeError("The duration of the sampled noise audio is "
                               "shorter than that of the audio segment to "
                               "add effects to.")
        diff_duration = noise_json['duration'] - audio_segment.duration
        start = self._rng.uniform(0, diff_duration)
        end = start + audio_segment.duration
        noise_segment = AudioSegment.slice_from_file(
            noise_json['audio_filepath'], start=start, end=end)
        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
@ -1,63 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the online Bayesian normalization augmentation model."""
from deepspeech.frontend.augmentor.base import AugmentorBase


class OnlineBayesianNormalizationAugmentor(AugmentorBase):
    """Augmentation model for adding online Bayesian normalization.

    :param rng: Random generator object.
    :type rng: random.Random
    :param target_db: Target RMS value in decibels.
    :type target_db: float
    :param prior_db: Prior RMS estimate in decibels.
    :type prior_db: float
    :param prior_samples: Prior strength in number of samples.
    :type prior_samples: int
    :param startup_delay: Default 0.0s. If provided, this function will
                          accrue statistics for the first startup_delay
                          seconds before applying online normalization.
    :type startup_delay: float
    """

    def __init__(self,
                 rng,
                 target_db,
                 prior_db,
                 prior_samples,
                 startup_delay=0.0):
        self._target_db = target_db
        self._prior_db = prior_db
        self._prior_samples = prior_samples
        self._rng = rng
        self._startup_delay = startup_delay

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Normalizes the input audio using the online Bayesian approach.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
                                                self._prior_samples,
                                                self._startup_delay)
@ -1,48 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the resample augmentation model."""
from deepspeech.frontend.augmentor.base import AugmentorBase


class ResampleAugmentor(AugmentorBase):
    """Augmentation model for resampling.

    See more info here:
    https://ccrma.stanford.edu/~jos/resample/index.html

    :param rng: Random generator object.
    :type rng: random.Random
    :param new_sample_rate: New sample rate in Hz.
    :type new_sample_rate: int
    """

    def __init__(self, rng, new_sample_rate):
        self._new_sample_rate = new_sample_rate
        self._rng = rng

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Resamples the input audio to a target sample rate.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        audio_segment.resample(self._new_sample_rate)
@ -1,49 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the shift perturb augmentation model."""
from deepspeech.frontend.augmentor.base import AugmentorBase


class ShiftPerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random shift perturbation.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_shift_ms: Minimal shift in milliseconds.
    :type min_shift_ms: float
    :param max_shift_ms: Maximal shift in milliseconds.
    :type max_shift_ms: float
    """

    def __init__(self, rng, min_shift_ms, max_shift_ms):
        self._min_shift_ms = min_shift_ms
        self._max_shift_ms = max_shift_ms
        self._rng = rng

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Shift audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
        audio_segment.shift(shift_ms)
@ -1,256 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the SpecAugment augmentation model."""
import random

import numpy as np
from PIL import Image
from PIL.Image import BICUBIC

from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()


class SpecAugmentor(AugmentorBase):
    """Augmentation model for time warping, frequency masking and time masking.

    SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
    https://arxiv.org/abs/1904.08779

    SpecAugment on Large Scale Datasets
    https://arxiv.org/abs/1912.05533
    """

    def __init__(self,
                 rng,
                 F,
                 T,
                 n_freq_masks,
                 n_time_masks,
                 p=1.0,
                 W=40,
                 adaptive_number_ratio=0,
                 adaptive_size_ratio=0,
                 max_n_time_masks=20,
                 replace_with_zero=True,
                 warp_mode='PIL'):
        """SpecAugment class.
        Args:
            rng (random.Random): random generator object.
            F (int): parameter for frequency masking
            T (int): parameter for time masking
            n_freq_masks (int): number of frequency masks
            n_time_masks (int): number of time masks
            p (float): parameter for upperbound of the time mask
            W (int): parameter for time warping
            adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
            adaptive_size_ratio (float): adaptive size ratio for time masking
            max_n_time_masks (int): maximum number of time masks
            replace_with_zero (bool): pad zero on mask if true else use mean
            warp_mode (str): "PIL" (default, fast, not differentiable)
                             or "sparse_image_warp" (slow, differentiable)
        """
        super().__init__()
        self._rng = rng
        self.inplace = True
        self.replace_with_zero = replace_with_zero

        self.mode = warp_mode
        self.W = W
        self.F = F
        self.T = T
        self.n_freq_masks = n_freq_masks
        self.n_time_masks = n_time_masks
        self.p = p

        # adaptive SpecAugment
        self.adaptive_number_ratio = adaptive_number_ratio
        self.adaptive_size_ratio = adaptive_size_ratio
        self.max_n_time_masks = max_n_time_masks

        if adaptive_number_ratio > 0:
            self.n_time_masks = 0
            logger.info('n_time_masks is set to zero for adaptive SpecAugment.')
        if adaptive_size_ratio > 0:
            self.T = 0
            logger.info('T is set to zero for adaptive SpecAugment.')

        self._freq_mask = None
        self._time_mask = None
    def librispeech_basic(self):
        self.W = 80
        self.F = 27
        self.T = 100
        self.n_freq_masks = 1
        self.n_time_masks = 1
        self.p = 1.0

    def librispeech_double(self):
        self.W = 80
        self.F = 27
        self.T = 100
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 1.0

    def switchboard_mild(self):
        self.W = 40
        self.F = 15
        self.T = 70
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 0.2

    def switchboard_strong(self):
        self.W = 40
        self.F = 27
        self.T = 70
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 0.2

    @property
    def freq_mask(self):
        return self._freq_mask

    @property
    def time_mask(self):
        return self._time_mask

    def __repr__(self):
        return "specaug: F-{}, T-{}, F-n-{}, T-n-{}".format(
            self.F, self.T, self.n_freq_masks, self.n_time_masks)
    def time_warp(self, x, mode='PIL'):
        """Time warp for SpecAugment:
        move a random center frame by a random width ~ uniform(-window, window).

        Args:
            x (np.ndarray): spectrogram (time, freq)
            mode (str): PIL or sparse_image_warp

        Raises:
            NotImplementedError: If mode is 'sparse_image_warp', which is not
                implemented yet.
            NotImplementedError: If mode is not a supported warp mode.

        Returns:
            np.ndarray: time warped spectrogram (time, freq)
        """
        window = max_time_warp = self.W
        if window == 0:
            return x

        if mode == "PIL":
            t = x.shape[0]
            if t - window <= window:
                return x
            # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
            center = random.randrange(window, t - window)
            warped = random.randrange(center - window, center +
                                      window) + 1  # 1 ... t - 1

            left = Image.fromarray(x[:center]).resize((x.shape[1], warped),
                                                      BICUBIC)
            right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped),
                                                       BICUBIC)
            if self.inplace:
                x[:warped] = left
                x[warped:] = right
                return x
            return np.concatenate((left, right), 0)
        elif mode == "sparse_image_warp":
            raise NotImplementedError('sparse_image_warp')
        else:
            raise NotImplementedError(
                "unknown resize mode: " + mode +
                ", choose one from (PIL, sparse_image_warp).")
    def mask_freq(self, x, replace_with_zero=False):
        """Frequency masking.

        Args:
            x (np.ndarray): spectrogram (time, freq)
            replace_with_zero (bool, optional): Defaults to False.

        Returns:
            np.ndarray: frequency-masked spectrogram (time, freq)
        """
        n_bins = x.shape[1]
        for i in range(0, self.n_freq_masks):
            f = int(self._rng.uniform(low=0, high=self.F))
            f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
            assert f_0 <= f_0 + f
            if replace_with_zero:
                x[:, f_0:f_0 + f] = 0
            else:
                x[:, f_0:f_0 + f] = x.mean()
            self._freq_mask = (f_0, f_0 + f)
        return x
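
    # Hypothetical sketch: with F=27 and an 80-bin spectrogram, each mask
    # zeroes (or mean-fills) a band of f ~ U(0, 27) consecutive bins starting
    # at f_0 ~ U(0, 80 - f); time masking below works the same way along the
    # time axis, with the mask width additionally capped at p * n_frames.
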
    def mask_time(self, x, replace_with_zero=False):
        """Time masking.

        Args:
            x (np.ndarray): spectrogram (time, freq)
            replace_with_zero (bool, optional): Defaults to False.

        Returns:
            np.ndarray: time-masked spectrogram (time, freq)
        """
        n_frames = x.shape[0]

        if self.adaptive_number_ratio > 0:
            n_masks = int(n_frames * self.adaptive_number_ratio)
            n_masks = min(n_masks, self.max_n_time_masks)
        else:
            n_masks = self.n_time_masks

        if self.adaptive_size_ratio > 0:
            T = self.adaptive_size_ratio * n_frames
        else:
            T = self.T

        for i in range(n_masks):
            t = int(self._rng.uniform(low=0, high=T))
            t = min(t, int(n_frames * self.p))
            t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
            assert t_0 <= t_0 + t
            if replace_with_zero:
                x[t_0:t_0 + t, :] = 0
            else:
                x[t_0:t_0 + t, :] = x.mean()
            self._time_mask = (t_0, t_0 + t)
        return x
    def __call__(self, x, train=True):
        if not train:
            return x
        return self.transform_feature(x)

    def transform_feature(self, x: np.ndarray):
        """
        Args:
            x (np.ndarray): `[T, F]`
        Returns:
            x (np.ndarray): `[T, F]`
        """
        assert isinstance(x, np.ndarray)
        assert x.ndim == 2
        x = self.time_warp(x, self.mode)
        x = self.mask_freq(x, self.replace_with_zero)
        x = self.mask_time(x, self.replace_with_zero)
        return x
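
# Hypothetical usage sketch (values follow the librispeech_basic policy):
#
#   aug = SpecAugmentor(np.random.RandomState(0), F=27, T=100,
#                       n_freq_masks=1, n_time_masks=1, p=1.0, W=80)
#   spec = aug.transform_feature(spec)  # spec: np.ndarray of shape [T, F]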
@ -1,106 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speed perturbation augmentation model."""
import numpy as np

from deepspeech.frontend.augmentor.base import AugmentorBase


class SpeedPerturbAugmentor(AugmentorBase):
    """Augmentation model for adding speed perturbation."""

    def __init__(self, rng, min_speed_rate=0.9, max_speed_rate=1.1,
                 num_rates=3):
        """Speed perturbation.

        The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
        and sox-speed simply resamples the input,
        i.e. both pitch and tempo are changed.

        "Why use speed option instead of tempo -s in SoX for speed perturbation"
        https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8

        Sox speed:
        https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer

        See reference paper here:
        http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

        Espnet:
        https://espnet.github.io/espnet/_modules/espnet/transform/perturb.html

        Nemo:
        https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/perturb.py#L92

        Args:
            rng (random.Random): Random generator object.
            min_speed_rate (float): Lower bound of the new speed rate to
                sample; should not be smaller than 0.9.
            max_speed_rate (float): Upper bound of the new speed rate to
                sample; should not be larger than 1.1.
            num_rates (int, optional): Number of discrete rates to allow.
                Can be a positive or negative integer. Defaults to 3.
                If a positive integer greater than 0 is provided, the range of
                speed rates will be discretized into `num_rates` values.
                If a negative integer or 0 is provided, the full range of
                speed rates will be sampled uniformly.
                Note: If a positive integer is provided and the resultant
                discretized range of rates contains the value '1.0', then
                those samples with rate=1.0 will not be augmented at all and
                simply skipped. This avoids unnecessary augmentation and
                increased computation time. The effective augmentation chance
                in such a case is `prob * ((num_rates - 1) / num_rates) * 100`%,
                where `prob` is the global probability of a sample being
                augmented.

        Raises:
            ValueError: when the speed rate bounds are invalid
        """
        if min_speed_rate < 0.9:
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        if max_speed_rate > 1.1:
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
        self._min_rate = min_speed_rate
        self._max_rate = max_speed_rate
        self._rng = rng
        self._num_rates = num_rates
        if num_rates > 0:
            self._rates = np.linspace(
                self._min_rate, self._max_rate, self._num_rates, endpoint=True)

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Sample a new speed rate from the given range and
        change the speed of the given audio clip.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # num_rates <= 0 means sampling from the continuous range; the
        # original check only covered negative values, which would leave
        # self._rates undefined for num_rates == 0.
        if self._num_rates <= 0:
            speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
        else:
            speed_rate = self._rng.choice(self._rates)

        # Skip perturbation in case of identity speed rate
        if speed_rate == 1.0:
            return

        audio_segment.change_speed(speed_rate)
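
# Hypothetical usage sketch: num_rates=3 yields the discrete rates
# {0.9, 1.0, 1.1}, matching kaldi-style 3-way speed perturbation.
#
#   aug = SpeedPerturbAugmentor(np.random.RandomState(0),
#                               min_speed_rate=0.9, max_speed_rate=1.1,
#                               num_rates=3)
#   aug.transform_audio(audio_segment)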
@ -1,55 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the volume perturb augmentation model."""
from deepspeech.frontend.augmentor.base import AugmentorBase


class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def __call__(self, x, uttid=None, train=True):
        if not train:
            return x
        self.transform_audio(x)
        return x

    def transform_audio(self, audio_segment):
        """Change audio loudness.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.gain_db(gain)
@ -1,16 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio_featurizer import AudioFeaturizer  #noqa: F401
from .speech_featurizer import SpeechFeaturizer
from .text_featurizer import TextFeaturizer
@ -1,363 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the audio featurizer class."""
import numpy as np
from python_speech_features import delta
from python_speech_features import logfbank
from python_speech_features import mfcc


class AudioFeaturizer():
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it supports feature types of linear spectrogram, mfcc and
    fbank.

    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc',
                          'fbank'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Audio is resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibel level before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 specgram_type: str='linear',
                 feat_dim: int=None,
                 delta_delta: bool=False,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20,
                 dither=1.0):
        self._specgram_type = specgram_type
        # mfcc and fbank use `feat_dim`
        self._feat_dim = feat_dim
        # mfcc and fbank use `delta_delta`
        self._delta_delta = delta_delta
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB
        self._fft_point = n_fft
        self._dither = dither

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If audio sample rate is not supported.
        """
        # upsampling or downsampling
        if ((audio_segment.sample_rate > self._target_sample_rate and
             allow_downsampling) or
                (audio_segment.sample_rate < self._target_sample_rate and
                 allow_upsampling)):
            audio_segment.resample(self._target_sample_rate)
        if audio_segment.sample_rate != self._target_sample_rate:
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow_upsampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment)

    @property
    def stride_ms(self):
        return self._stride_ms

    @property
    def feature_size(self):
        """audio feature size"""
        feat_dim = 0
        if self._specgram_type == 'linear':
            fft_point = self._window_ms if self._fft_point is None else self._fft_point
            feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                           1)
        elif self._specgram_type == 'mfcc':
            # mfcc, delta, delta-delta
            feat_dim = int(self._feat_dim *
                           3) if self._delta_delta else int(self._feat_dim)
        elif self._specgram_type == 'fbank':
            # fbank, delta, delta-delta
            feat_dim = int(self._feat_dim *
                           3) if self._delta_delta else int(self._feat_dim)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear, mfcc, fbank." %
                             self._specgram_type)
        return feat_dim

    def _compute_specgram(self, audio_segment):
        """Extract various audio features."""
        sample_rate = audio_segment.sample_rate
        if self._specgram_type == 'linear':
            samples = audio_segment.samples
            return self._compute_linear_specgram(
                samples,
                sample_rate,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq)
        elif self._specgram_type == 'mfcc':
            samples = audio_segment.to('int16')
            return self._compute_mfcc(
                samples,
                sample_rate,
                feat_dim=self._feat_dim,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
                dither=self._dither,
                delta_delta=self._delta_delta)
        elif self._specgram_type == 'fbank':
            samples = audio_segment.to('int16')
            return self._compute_fbank(
                samples,
                sample_rate,
                feat_dim=self._feat_dim,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
                dither=self._dither,
                delta_delta=self._delta_delta)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear, mfcc, fbank." %
                             self._specgram_type)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal."""
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
        fft = np.fft.rfft(windows * weighting, n=None, axis=0)
        fft = np.absolute(fft)
        fft = fft**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy.

        Args:
            samples (np.ndarray): audio samples.
            sample_rate (float): sample rate of the samples, in Hz.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
            window_ms (float, optional): window length in ms. Defaults to 20.0.
            max_freq (float, optional): only FFT bins for frequencies in
                [0, max_freq] are returned. Defaults to None (half of the
                sample rate).
            eps (float, optional): small constant added before taking the log.
                Defaults to 1e-14.

        Raises:
            ValueError: If max_freq is greater than half of the sample rate.
            ValueError: If stride_ms is greater than window_ms.

        Returns:
            np.ndarray: log spectrogram, (time, freq)
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        # (freq, time)
        spec = np.log(specgram[:ind, :] + eps)
        return np.transpose(spec)

    def _concat_delta_delta(self, feat):
        """Append delta and delta-delta features.

        Args:
            feat (np.ndarray): (T, D)

        Returns:
            np.ndarray: feat with delta-delta, (T, 3*D)
        """
        # Deltas
        d_feat = delta(feat, 2)
        # Deltas-Deltas: the delta of the delta, not of the raw feature
        dd_feat = delta(d_feat, 2)
        # concat above three features
        concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
        return concat_feat
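A quick numpy illustration of what _concat_delta_delta produces (python_speech_features.delta computes a regression-based derivative over a window of ±N frames):

import numpy as np
from python_speech_features import delta

feat = np.random.randn(100, 13)  # (T, D) base features
d_feat = delta(feat, 2)          # first-order derivative, (T, D)
dd_feat = delta(d_feat, 2)       # second-order derivative, (T, D)
full = np.concatenate((feat, d_feat, dd_feat), axis=1)
assert full.shape == (100, 39)   # (T, 3 * D)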

    def _compute_mfcc(self,
                      samples,
                      sample_rate,
                      feat_dim=13,
                      stride_ms=10.0,
                      window_ms=25.0,
                      max_freq=None,
                      dither=1.0,
                      delta_delta=True):
        """Compute mfcc from samples.

        Args:
            samples (np.ndarray, np.int16): the audio signal from which to compute features.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of cepstra to return, default 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
            window_ms (float, optional): window length in ms. Defaults to 25.0.
            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
            dither (float, optional): dithering constant; 0.0 disables dithering. Defaults to 1.0.
            delta_delta (bool, optional): whether to append delta and delta-delta features. Defaults to True.

        Raises:
            ValueError: max_freq > samplerate/2
            ValueError: stride_ms > window_ms

        Returns:
            np.ndarray: mfcc feature, (T, D).
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute the 13 cepstral coefficients, and the first one is replaced
        # by log(frame energy), (T, D)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            numcep=feat_dim,
            nfilt=23,
            nfft=512,
            lowfreq=20,
            highfreq=max_freq,
            dither=dither,
            remove_dc_offset=True,
            preemph=0.97,
            ceplifter=22,
            useEnergy=True,
            winfunc='povey')
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)
        return mfcc_feat

    def _compute_fbank(self,
                       samples,
                       sample_rate,
                       feat_dim=40,
                       stride_ms=10.0,
                       window_ms=25.0,
                       max_freq=None,
                       dither=1.0,
                       delta_delta=False):
        """Compute logfbank from samples.

        Args:
            samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of mel filterbanks to return, default 40.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
            window_ms (float, optional): window length in ms. Defaults to 25.0.
            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
            dither (float, optional): dithering constant; 0.0 disables dithering. Defaults to 1.0.
            delta_delta (bool, optional): whether to append delta and delta-delta features. Defaults to False.

        Raises:
            ValueError: max_freq > samplerate/2
            ValueError: stride_ms > window_ms

        Returns:
            np.ndarray: fbank feature, (T, D).
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # (T, D)
        fbank_feat = logfbank(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            nfilt=feat_dim,
            nfft=512,
            lowfreq=20,
            highfreq=max_freq,
            dither=dither,
            remove_dc_offset=True,
            preemph=0.97,
            wintype='povey')
        if delta_delta:
            fbank_feat = self._concat_delta_delta(fbank_feat)
        return fbank_feat
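As a cross-check on the strided-window spectrogram above, the same computation can be written with an explicit frame loop (a simplified sketch using the same Hanning window and power scaling; hypothetical random input):

import numpy as np

def linear_specgram(samples, sample_rate, stride_ms=10.0, window_ms=20.0, eps=1e-14):
    stride = int(0.001 * sample_rate * stride_ms)
    window = int(0.001 * sample_rate * window_ms)
    weighting = np.hanning(window)
    scale = np.sum(weighting ** 2) * sample_rate
    frames = []
    for start in range(0, len(samples) - window + 1, stride):
        spec = np.abs(np.fft.rfft(samples[start:start + window] * weighting)) ** 2
        spec[1:-1] *= 2.0 / scale
        spec[[0, -1]] /= scale
        frames.append(np.log(spec + eps))
    return np.array(frames)  # (time, freq)

sig = np.random.randn(16000).astype('float32')   # 1 second of noise at 16 kHz
print(linear_specgram(sig, 16000).shape)         # (99, 161) with the defaults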
@ -1,153 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech featurizer class."""
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer


class SpeechFeaturizer():
    """Speech featurizer, for extracting features from both audio and transcript
    contents of SpeechSegment.

    Currently, for audio parts, it supports feature types of linear
    spectrogram, mfcc and fbank; for transcript parts, it supports
    char/word/sentence-piece level tokenizing and conversion into a list of
    token indices. Note that the token indexing order follows the given
    vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: str
    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Speech is resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibel level before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 unit_type,
                 vocab_filepath,
                 spm_model_prefix=None,
                 specgram_type='linear',
                 feat_dim=None,
                 delta_delta=False,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20,
                 dither=1.0):
        self._audio_featurizer = AudioFeaturizer(
            specgram_type=specgram_type,
            feat_dim=feat_dim,
            delta_delta=delta_delta,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB,
            dither=dither)
        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
                                               spm_model_prefix)

    def featurize(self, speech_segment, keep_transcription_text):
        """Extract features for speech segment.

        1. For audio parts, extract the audio features.
        2. For transcript parts, keep the original text or convert the text
           string to a list of token indices.

        Args:
            speech_segment (SpeechSegment): Speech segment to extract features from.
            keep_transcription_text (bool): True to keep the transcript text, False to return token ids.

        Returns:
            tuple: 1) spectrogram audio feature in 2darray, 2) list of token indices.
        """
        spec_feature = self._audio_featurizer.featurize(speech_segment)
        if keep_transcription_text:
            return spec_feature, speech_segment.transcript
        if speech_segment.has_token:
            text_ids = speech_segment.token_ids
        else:
            text_ids = self._text_featurizer.featurize(
                speech_segment.transcript)
        return spec_feature, text_ids

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        Returns:
            int: Vocabulary size.
        """
        return self._text_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        Returns:
            List[str]: vocabulary tokens.
        """
        return self._text_featurizer.vocab_list

    @property
    def vocab_dict(self):
        """Return the vocabulary in dict.

        Returns:
            Dict[str, int]: token-to-id mapping.
        """
        return self._text_featurizer.vocab_dict

    @property
    def feature_size(self):
        """Return the audio feature size.

        Returns:
            int: audio feature size.
        """
        return self._audio_featurizer.feature_size

    @property
    def stride_ms(self):
        """Time length in `ms` unit per frame.

        Returns:
            float: time(ms)/frame
        """
        return self._audio_featurizer.stride_ms

    @property
    def text_feature(self):
        """Return the text featurizer object.

        Returns:
            TextFeaturizer: object.
        """
        return self._text_featurizer
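A sketch of driving the featurizer end to end (hypothetical file paths and vocabulary; SpeechSegment.from_file is defined in the speech segment class later in this diff, and the module import paths are assumptions):

from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.speech_segment import SpeechSegment  # assumed path

featurizer = SpeechFeaturizer(
    unit_type='char',
    vocab_filepath='data/vocab.txt',  # hypothetical vocab, one token per line
    specgram_type='fbank',
    feat_dim=40)
seg = SpeechSegment.from_file('data/utt_0001.wav', transcript='今天天气很好')
feat, token_ids = featurizer.featurize(seg, keep_transcription_text=False)
print(feat.shape, len(token_ids))  # (T, 40) and the token count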
@ -1,202 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the text featurizer class."""
import sentencepiece as spm

from ..utility import EOS
from ..utility import load_dict
from ..utility import UNK

__all__ = ["TextFeaturizer"]


class TextFeaturizer():
    def __init__(self,
                 unit_type,
                 vocab_filepath,
                 spm_model_prefix=None,
                 maskctc=False):
        """Text featurizer, for processing or extracting features from text.

        Currently, it supports char/word/sentence-piece level tokenizing and conversion into
        a list of token indices. Note that the token indexing order follows the
        given vocabulary file.

        Args:
            unit_type (str): unit type, e.g. char, word, spm
            vocab_filepath (str): Filepath to load vocabulary for token indices conversion.
            spm_model_prefix (str, optional): spm model prefix. Defaults to None.
            maskctc (bool, optional): whether to reserve the <mask> token for Mask-CTC. Defaults to False.
        """
        assert unit_type in ('char', 'spm', 'word')
        self.unit_type = unit_type
        self.unk = UNK
        self.maskctc = maskctc

        if vocab_filepath:
            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
                vocab_filepath, maskctc)
            self.vocab_size = len(self.vocab_list)

        if unit_type == 'spm':
            spm_model = spm_model_prefix + '.model'
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(spm_model)

    def tokenize(self, text):
        if self.unit_type == 'char':
            tokens = self.char_tokenize(text)
        elif self.unit_type == 'word':
            tokens = self.word_tokenize(text)
        else:  # spm
            tokens = self.spm_tokenize(text)
        return tokens

    def detokenize(self, tokens):
        if self.unit_type == 'char':
            text = self.char_detokenize(tokens)
        elif self.unit_type == 'word':
            text = self.word_detokenize(tokens)
        else:  # spm
            text = self.spm_detokenize(tokens)
        return text

    def featurize(self, text):
        """Convert text string to a list of token indices.

        Args:
            text (str): Text.

        Returns:
            List[int]: List of token indices.
        """
        tokens = self.tokenize(text)
        ids = []
        for token in tokens:
            token = token if token in self.vocab_dict else self.unk
            ids.append(self.vocab_dict[token])
        return ids

    def defeaturize(self, idxs):
        """Convert a list of token indices to a text string,
        ignoring indices after eos_id.

        Args:
            idxs (List[int]): List of token indices.

        Returns:
            str: Text.
        """
        tokens = []
        for idx in idxs:
            if idx == self.eos_id:
                break
            tokens.append(self._id2token[idx])
        text = self.detokenize(tokens)
        return text

    def char_tokenize(self, text):
        """Character tokenizer.

        Args:
            text (str): text string.

        Returns:
            List[str]: tokens.
        """
        return list(text.strip())

    def char_detokenize(self, tokens):
        """Character detokenizer.

        Args:
            tokens (List[str]): tokens.

        Returns:
            str: text string.
        """
        return "".join(tokens)

    def word_tokenize(self, text):
        """Word tokenizer, separated by <space>."""
        return text.strip().split()

    def word_detokenize(self, tokens):
        """Word detokenizer, separated by <space>."""
        return " ".join(tokens)

    def spm_tokenize(self, text):
        """spm tokenize.

        Args:
            text (str): text string.

        Returns:
            List[str]: sentence pieces str code
        """
        stats = {"num_empty": 0, "num_filtered": 0}

        def valid(line):
            return True

        def encode(line):
            return self.sp.EncodeAsPieces(line)

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        enc_line = encode_line(text)
        return enc_line

    def spm_detokenize(self, tokens, input_format='piece'):
        """spm detokenize.

        Args:
            tokens (List[str]): tokens.
            input_format (str, optional): 'piece' or 'id'. Defaults to 'piece'.

        Returns:
            str: text
        """
        if input_format == "piece":

            def decode(parts):
                return "".join(self.sp.DecodePieces(parts))
        elif input_format == "id":

            def decode(parts):
                return "".join(self.sp.DecodeIds(parts))

        return decode(tokens)

    def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
        """Load vocabulary from file."""
        vocab_list = load_dict(vocab_filepath, maskctc)
        assert vocab_list is not None

        id2token = dict(
            [(idx, token) for (idx, token) in enumerate(vocab_list)])
        token2id = dict(
            [(token, idx) for (idx, token) in enumerate(vocab_list)])

        unk_id = vocab_list.index(UNK)
        eos_id = vocab_list.index(EOS)
        return token2id, id2token, vocab_list, unk_id, eos_id
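A round-trip sketch of the char-level mapping that featurize/defeaturize implement (hypothetical five-token vocabulary; <unk> catches out-of-vocabulary characters):

vocab_list = ['<blank>', '今', '天', '<unk>', '<eos>']
token2id = {token: idx for idx, token in enumerate(vocab_list)}
id2token = {idx: token for idx, token in enumerate(vocab_list)}

text = '今天'
ids = [token2id.get(tok, token2id['<unk>']) for tok in list(text)]
assert ids == [1, 2]
assert ''.join(id2token[i] for i in ids) == text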
@ -1,199 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains feature normalizers."""
import json

import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import Dataset

from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log

__all__ = ["FeatureNormalizer"]

logger = Log(__name__).getlog()


# https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object):
    def __init__(self, feature_func):
        self.feature_func = feature_func

    def __call__(self, batch):
        mean_stat = None
        var_stat = None
        number = 0
        for item in batch:
            audioseg = AudioSegment.from_file(item['feat'])
            feat = self.feature_func(audioseg)  # (T, D)

            sums = np.sum(feat, axis=0)
            if mean_stat is None:
                mean_stat = sums
            else:
                mean_stat += sums

            square_sums = np.sum(np.square(feat), axis=0)
            if var_stat is None:
                var_stat = square_sums
            else:
                var_stat += square_sums

            number += feat.shape[0]
        return number, mean_stat, var_stat


class AudioDataset(Dataset):
    def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
        self._rng = rng if rng else np.random.RandomState(random_seed)
        manifest = read_manifest(manifest_path)
        if num_samples == -1:
            sampled_manifest = manifest
        else:
            sampled_manifest = self._rng.choice(
                manifest, num_samples, replace=False)
        self.items = sampled_manifest

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


class FeatureNormalizer(object):
    """Feature normalizer. Normalize features to be of zero mean and unit
    stddev.

    If mean_std_filepath is provided (not None), the normalizer will directly
    initialize from the file. Otherwise, both manifest_path and featurize_func
    should be given for on-the-fly mean and stddev computing.

    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|str
    :param manifest_path: Manifest of instances for computing mean and stddev.
    :type manifest_path: None|str
    :param featurize_func: Function to extract features. It should be callable
                           with ``featurize_func(audio_segment)``.
    :type featurize_func: None|callable
    :param num_samples: Number of random samples for computing mean and stddev.
    :type num_samples: int
    :param random_seed: Random seed for sampling instances.
    :type random_seed: int
    :raises ValueError: If both mean_std_filepath and manifest_path
                        (or both mean_std_filepath and featurize_func) are None.
    """

    def __init__(self,
                 mean_std_filepath,
                 manifest_path=None,
                 featurize_func=None,
                 num_samples=500,
                 num_workers=0,
                 random_seed=0):
        if not mean_std_filepath:
            if not (manifest_path and featurize_func):
                raise ValueError("If mean_std_filepath is None, manifest_path "
                                 "and featurize_func should not be None.")
            self._rng = np.random.RandomState(random_seed)
            self._compute_mean_std(manifest_path, featurize_func, num_samples,
                                   num_workers)
        else:
            self._read_mean_std_from_file(mean_std_filepath)

    def apply(self, features):
        """Normalize features to be of zero mean and unit stddev.

        :param features: Input features to be normalized.
        :type features: ndarray, shape (T, D)
        :return: Normalized features.
        :rtype: ndarray
        """
        return (features - self._mean) * self._istd

    def _read_mean_std_from_file(self, filepath, eps=1e-20):
        """Load mean and std from file."""
        mean, istd = load_cmvn(filepath, filetype='json')
        self._mean = np.expand_dims(mean, axis=0)
        self._istd = np.expand_dims(istd, axis=0)

    def write_to_file(self, filepath):
        """Write the mean and stddev to the file.

        :param filepath: File to write mean and stddev.
        :type filepath: str
        """
        with open(filepath, 'w') as fout:
            fout.write(json.dumps(self.cmvn_info))

    def _compute_mean_std(self,
                          manifest_path,
                          featurize_func,
                          num_samples,
                          num_workers,
                          batch_size=64,
                          eps=1e-20):
        """Compute mean and std from randomly sampled instances."""
        paddle.set_device('cpu')

        collate_func = CollateFunc(featurize_func)
        dataset = AudioDataset(manifest_path, num_samples, self._rng)
        data_loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            collate_fn=collate_func)

        with paddle.no_grad():
            all_mean_stat = None
            all_var_stat = None
            all_number = 0
            wav_number = 0
            for i, batch in enumerate(data_loader):
                number, mean_stat, var_stat = batch
                if i == 0:
                    all_mean_stat = mean_stat
                    all_var_stat = var_stat
                else:
                    all_mean_stat += mean_stat
                    all_var_stat += var_stat
                all_number += number
                # approximate: the last batch may hold fewer than batch_size wavs
                wav_number += batch_size

                if wav_number % 1000 == 0:
                    logger.info(
                        f'process {wav_number} wavs,{all_number} frames.')

        self.cmvn_info = {
            'mean_stat': list(all_mean_stat.tolist()),
            'var_stat': list(all_var_stat.tolist()),
            'frame_num': all_number,
        }

        return self.cmvn_info
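The cmvn_info dict above stores raw sums; turning it into the mean and inverse stddev that apply() expects is a small computation (mirroring _load_json_cmvn defined later in this diff; toy numbers):

import numpy as np

cmvn_info = {'mean_stat': [10.0, 20.0], 'var_stat': [60.0, 250.0], 'frame_num': 5}
mean = np.array(cmvn_info['mean_stat']) / cmvn_info['frame_num']          # [2., 4.]
var = np.array(cmvn_info['var_stat']) / cmvn_info['frame_num'] - mean**2  # [8., 34.]
istd = 1.0 / np.sqrt(np.maximum(var, 1e-20))  # variance floored for stability
normalized = (np.array([[1.0, 2.0]]) - mean) * istd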
@ -1,217 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech segment class."""
import numpy as np

from deepspeech.frontend.audio import AudioSegment


class SpeechSegment(AudioSegment):
    """Speech Segment with Text

    Args:
        AudioSegment (AudioSegment): Audio Segment
    """

    def __init__(self,
                 samples,
                 sample_rate,
                 transcript,
                 tokens=None,
                 token_ids=None):
        """Speech segment abstraction, a subclass of AudioSegment,
        with an additional transcript.

        Args:
            samples (ndarray.float32): Audio samples [num_samples x num_channels].
            sample_rate (int): Audio sample rate.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): Transcript tokens for the speech.
            token_ids (List[int], optional): Transcript token ids for the speech.
        """
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript
        # must init `tokens` with `token_ids` at the same time
        self._tokens = tokens
        self._token_ids = token_ids

    def __eq__(self, other):
        """Return whether two objects are equal.

        Returns:
            bool: True, when equal to other
        """
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        if self.has_token and other.has_token:
            if self._tokens != other._tokens:
                return False
            if self._token_ids != other._token_ids:
                return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls, filepath, transcript, tokens=None, token_ids=None):
        """Create speech segment from audio file and corresponding transcript.

        Args:
            filepath (str|file): Filepath or file object to audio file.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_file(filepath)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None):
        """Create speech segment from a byte string and corresponding
        transcript.

        Args:
            bytes (bytes): Byte string containing audio samples.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of speech segments together, both
        audio and transcript will be concatenated.

        :param *segments: Input speech segments to be concatenated.
        :type *segments: tuple of SpeechSegment
        :return: Speech segment instance.
        :rtype: SpeechSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any two segments does not match.
        :raises TypeError: If any segment is not SpeechSegment instance.
        """
        if len(segments) == 0:
            raise ValueError("No speech segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        transcripts = ""
        tokens = []
        token_ids = []
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only speech segments of the same type "
                                "instance can be concatenated.")
            transcripts += seg._transcript
            # `seg`, not `self`: this is a classmethod
            if seg.has_token:
                tokens += seg._tokens
                token_ids += seg._token_ids
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate, transcripts, tokens, token_ids)

    @classmethod
    def slice_from_file(cls,
                        filepath,
                        transcript,
                        tokens=None,
                        token_ids=None,
                        start=None,
                        end=None):
        """Loads a small section of a speech file without having to load
        the entire file into memory, which can be incredibly wasteful.

        :param filepath: Filepath or file object to audio file.
        :type filepath: str|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :param transcript: Transcript text for the speech. If not provided,
                           the default is an empty string.
        :type transcript: str
        :return: SpeechSegment instance of the specified slice of the input
                 speech file.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent speech segment of the given duration and
        sample rate, transcript will be an empty string.

        Args:
            duration (float): Length of silence in seconds.
            sample_rate (float): Sample rate.

        Returns:
            SpeechSegment: Silence of the given duration.
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")

    @property
    def has_token(self):
        if self._tokens and self._token_ids:
            return True
        return False

    @property
    def transcript(self):
        """Return the transcript text.

        Returns:
            str: Transcript text for the speech.
        """
        return self._transcript

    @property
    def tokens(self):
        """Return the transcript text tokens.

        Returns:
            List[str]: text tokens.
        """
        return self._tokens

    @property
    def token_ids(self):
        """Return the transcript text token ids.

        Returns:
            List[int]: text token ids.
        """
        return self._token_ids
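A short sketch of the concatenation semantics: transcripts are joined and audio is appended (hypothetical 16 kHz segments; the module import path is an assumption):

import numpy as np
from deepspeech.frontend.speech_segment import SpeechSegment  # assumed path

sr = 16000
a = SpeechSegment(np.zeros(sr, dtype='float32'), sr, transcript='hello ')
b = SpeechSegment(np.zeros(sr, dtype='float32'), sr, transcript='world')
ab = SpeechSegment.concatenate(a, b)
print(ab.transcript)         # 'hello world'
print(len(ab.samples) / sr)  # ~2.0 seconds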
@ -1,289 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data helper functions."""
import codecs
import json
import math
import sys
from typing import List
from typing import Optional
from typing import Text

import numpy as np

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = [
    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
    "EOS", "UNK", "BLANK", "MASKCTC"
]

IGNORE_ID = -1
# `sos` and `eos` using same token
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"


def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
    if dict_path is None:
        return None

    with open(dict_path, "r") as f:
        dictionary = f.readlines()
    char_list = [entry.strip().split(" ")[0] for entry in dictionary]
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
        char_list.append(EOS)
    # for non-autoregressive maskctc model
    if maskctc and MASKCTC not in char_list:
        char_list.append(MASKCTC)
    return char_list


def read_manifest(
        manifest_path,
        max_input_len=float('inf'),
        min_input_len=0.0,
        max_output_len=float('inf'),
        min_output_len=0.0,
        max_output_input_ratio=float('inf'),
        min_output_input_ratio=0.0, ):
    """Load and parse manifest file.

    Args:
        manifest_path (str): Manifest file to load and parse.
        max_input_len (float, optional): maximum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to float('inf').
        min_input_len (float, optional): minimum input seq length,
            in seconds for raw wav, in frame numbers for feature data.
            Defaults to 0.0.
        max_output_len (float, optional): maximum output seq length,
            in modeling units. Defaults to float('inf').
        min_output_len (float, optional): minimum output seq length,
            in modeling units. Defaults to 0.0.
        max_output_input_ratio (float, optional):
            maximum output seq length / input seq length ratio.
            Defaults to float('inf').
        min_output_input_ratio (float, optional):
            minimum output seq length / input seq length ratio.
            Defaults to 0.0.

    Raises:
        IOError: If failed to parse the manifest.

    Returns:
        List[dict]: Manifest parsing results.
    """
    manifest = []
    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))

        feat_len = json_data["feat_shape"][
            0] if 'feat_shape' in json_data else 1.0
        token_len = json_data["token_shape"][
            0] if 'token_shape' in json_data else 1.0
        conditions = [
            feat_len >= min_input_len,
            feat_len <= max_input_len,
            token_len >= min_output_len,
            token_len <= max_output_len,
            token_len / feat_len >= min_output_input_ratio,
            token_len / feat_len <= max_output_input_ratio,
        ]
        if all(conditions):
            manifest.append(json_data)
    return manifest
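For orientation, a manifest line that would pass these filters might look like the following (a hypothetical entry; the field layout is inferred from the keys read above, with feat_shape[0] in seconds for raw wav):

{"utt": "utt_0001", "feat": "/data/wav/utt_0001.wav", "feat_shape": [5.85], "text": "今天天气很好", "token_shape": [6]}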


def rms_to_db(rms: float):
    """Root Mean Square to dB.

    Args:
        rms (float): root mean square

    Returns:
        float: dB
    """
    return 20.0 * math.log10(max(1e-16, rms))


def rms_to_dbfs(rms: float):
    """Root Mean Square to dBFS.
    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
    Audio is treated as a mix of sine waves; a full-scale sine wave with
    amplitude 1.0 has an RMS of 0.7071, i.e. -3.0103 dB, hence:

        dB = dBFS + 3.0103
        dBFS = dB - 3.0103
        e.g. 0 dB = -3.0103 dBFS

    Args:
        rms (float): root mean square

    Returns:
        float: dBFS
    """
    return rms_to_db(rms) - 3.0103
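A worked instance of the convention above, matching the docstring's own example:

import math

rms = 1.0
db = 20.0 * math.log10(max(1e-16, rms))  # 0.0 dB
dbfs = db - 3.0103                       # -3.0103 dBFS
print(db, dbfs)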


def max_dbfs(sample_data: np.ndarray):
    """Peak dBFS based on the maximum energy sample.
    Will prevent overdrive if used for normalization.

    Args:
        sample_data (np.ndarray): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))


def mean_dbfs(sample_data):
    """Mean dBFS based on the RMS energy.

    Args:
        sample_data (np.ndarray): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    return rms_to_dbfs(
        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))


def gain_db_to_ratio(gain_db: float):
    """dB to ratio

    Args:
        gain_db (float): gain in dB

    Returns:
        float: scale in amp
    """
    return math.pow(10.0, gain_db / 20.0)


def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
    """Normalize audio to dBFS.

    Args:
        sample_data (np.ndarray): input wave samples, [-1, 1].
        dbfs (float, optional): target dBFS. Defaults to -3.0103.

    Returns:
        np.ndarray: normalized wave
    """
    return np.maximum(
        np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)),
                   1.0), -1.0)


def _load_json_cmvn(json_cmvn_file):
    """Load the json format cmvn stats file and calculate cmvn.

    Args:
        json_cmvn_file: cmvn stats file in json format

    Returns:
        a numpy array of [means, vars]
    """
    with open(json_cmvn_file) as f:
        cmvn_stats = json.load(f)

    means = cmvn_stats['mean_stat']
    variance = cmvn_stats['var_stat']
    count = cmvn_stats['frame_num']
    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn


def _load_kaldi_cmvn(kaldi_cmvn_file):
    """Load the kaldi format cmvn stats file and calculate cmvn.

    Args:
        kaldi_cmvn_file: kaldi text style global cmvn file, which
            is generated by:
            compute-cmvn-stats --binary=false scp:feats.scp global_cmvn

    Returns:
        a numpy array of [means, vars]
    """
    means = []
    variance = []
    with open(kaldi_cmvn_file, 'r') as fid:
        # kaldi binary file start with '\0B'
        if fid.read(2) == '\0B':
            logger.error('kaldi cmvn binary file is not supported, please '
                         'recompute it by: compute-cmvn-stats --binary=false '
                         ' scp:feats.scp global_cmvn')
            sys.exit(1)
        fid.seek(0)
        arr = fid.read().split()
        assert (arr[0] == '[')
        assert (arr[-2] == '0')
        assert (arr[-1] == ']')
        feat_dim = int((len(arr) - 2 - 2) / 2)
        for i in range(1, feat_dim + 1):
            means.append(float(arr[i]))
        count = float(arr[feat_dim + 1])
        for i in range(feat_dim + 2, 2 * feat_dim + 2):
            variance.append(float(arr[i]))

    for i in range(len(means)):
        means[i] /= count
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn


def load_cmvn(cmvn_file: str, filetype: str):
    """Load cmvn from file.

    Args:
        cmvn_file (str): cmvn path.
        filetype (str): file type, one of [npz, json, kaldi].

    Raises:
        ValueError: file type not supported.

    Returns:
        Tuple[np.ndarray, np.ndarray]: mean, istd
    """
    assert filetype in ['npz', 'json', 'kaldi'], filetype
    filetype = filetype.lower()
    if filetype == "json":
        cmvn = _load_json_cmvn(cmvn_file)
    elif filetype == "kaldi":
        cmvn = _load_kaldi_cmvn(cmvn_file)
    else:
        # 'npz' passes the assert above but has no loader here yet
        raise ValueError(f"cmvn file type not supported: {filetype}")
    return cmvn[0], cmvn[1]
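A quick check of normalize_audio on a quiet signal (assumes the functions above are in scope; a 440 Hz tone with peak 0.1):

import numpy as np

quiet = 0.1 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
loud = normalize_audio(quiet, dbfs=-3.0103)
print(max_dbfs(quiet))  # about -23.01 dBFS
print(max_dbfs(loud))   # about -3.01 dBFS, the requested target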
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,469 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from deepspeech.utils.log import Log
|
|
||||||
|
|
||||||
__all__ = ["make_batchset"]
|
|
||||||
|
|
||||||
logger = Log(__name__).getlog()
|
|
||||||
|
|
||||||
|
|
||||||
def batchfy_by_seq(
        sorted_data,
        batch_size,
        max_length_in,
        max_length_out,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        iaxis=0,
        okey="output",
        oaxis=0, ):
    """Make batch set from json dictionary

    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
    :param int batch_size: batch size
    :param int max_length_in: maximum length of input to decide adaptive batch size
    :param int max_length_out: maximum length of output to decide adaptive batch size
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str ikey: key to access input
        (for ASR ikey="input", for TTS, MT ikey="output".)
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
    :param str okey: key to access output
        (for ASR, MT okey="output". for TTS okey="input".)
    :param int oaxis: dimension to access output
        (for ASR, TTS, MT oaxis=0; reserved for future research, -1 means all axes.)
    :return: List[List[Tuple[str, dict]]] list of batches
    """
    if batch_size <= 0:
        raise ValueError(f"Invalid batch_size={batch_size}")

    # check #utts is more than min_batch_size
    if len(sorted_data) < min_batch_size:
        raise ValueError(
            f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
        )

    # make list of minibatches
    minibatches = []
    start = 0
    while True:
        _, info = sorted_data[start]
        ilen = int(info[ikey][iaxis]["shape"][0])
        olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
                max(map(lambda x: int(x["shape"][0]), info[okey])))
        factor = max(int(ilen / max_length_in), int(olen / max_length_out))
        # change batch size depending on the input and output length:
        # if ilen = 1000 and max_length_in = 800
        # then b = batchsize / 2
        # and max(min_batch_size, .) avoids batch size = 0
        bs = max(min_batch_size, int(batch_size / (1 + factor)))
        end = min(len(sorted_data), start + bs)
        minibatch = sorted_data[start:end]
        if shortest_first:
            minibatch.reverse()

        # check each batch is more than minimum batch size
        if len(minibatch) < min_batch_size:
            mod = min_batch_size - len(minibatch) % min_batch_size
            additional_minibatch = [
                sorted_data[i] for i in np.random.randint(0, start, mod)
            ]
            if shortest_first:
                additional_minibatch.reverse()
            minibatch.extend(additional_minibatch)
        minibatches.append(minibatch)

        if end == len(sorted_data):
            break
        start = end

    # batch: List[List[Tuple[str, dict]]]
    return minibatches


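# Worked example of the adaptive sizing above (a sketch, not part of the
# original module): with batch_size=32, max_length_in=800 and an utterance
# of ilen=1600 frames, factor = max(1600 // 800, 0) = 2, so the effective
# batch shrinks to max(min_batch_size, 32 // (1 + 2)) = 10 sequences.
def _example_adaptive_batch_size(batch_size=32, max_length_in=800, ilen=1600,
                                 max_length_out=150, olen=30, min_batch_size=1):
    factor = max(int(ilen / max_length_in), int(olen / max_length_out))
    return max(min_batch_size, int(batch_size / (1 + factor)))  # -> 10

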
def batchfy_by_bin(
        sorted_data,
        batch_bins,
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        okey="output", ):
    """Make variably sized batch set, which maximizes

    the number of bins up to `batch_bins`.

    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
    :param int batch_bins: Maximum frames of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse

    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)

    :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
    """
    if batch_bins <= 0:
        raise ValueError(f"invalid batch_bins={batch_bins}")
    length = len(sorted_data)
    idim = int(sorted_data[0][1][ikey][0]["shape"][1])
    odim = int(sorted_data[0][1][okey][0]["shape"][1])
    logger.info("# utts: " + str(len(sorted_data)))
    minibatches = []
    start = 0
    n = 0
    while True:
        # Dynamic batch size depending on size of samples
        b = 0
        next_size = 0
        max_olen = 0
        while next_size < batch_bins and (start + b) < length:
            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
            olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
            if olen > max_olen:
                max_olen = olen
            next_size = (max_olen + ilen) * (b + 1)
            if next_size <= batch_bins:
                b += 1
            elif next_size == 0:
                raise ValueError(
                    f"Can't fit one sample in batch_bins ({batch_bins}): "
                    f"Please increase the value")
        end = min(length, start + max(min_batch_size, b))
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)
        # Check for min_batch_size and fix the batches if needed
        i = -1
        while len(minibatches[i]) < min_batch_size:
            missing = min_batch_size - len(minibatches[i])
            if -i == len(minibatches):
                minibatches[i + 1].extend(minibatches[i])
                minibatches = minibatches[1:]
                break
            else:
                minibatches[i].extend(minibatches[i - 1][:missing])
                minibatches[i - 1] = minibatches[i - 1][missing:]
                i -= 1
        if end == length:
            break
        start = end
        n += 1
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(x) for x in minibatches]
    logger.info(
        str(len(minibatches)) + " batches containing from " + str(min(lengths))
        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
            int(np.mean(lengths))) + " samples).")
    return minibatches


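# Worked example of the bin counting above (a sketch): a "bin" is one
# frame-by-dimension cell, so with idim=83, odim=1, an utterance of 500
# input frames and 20 output tokens costs roughly 500*83 + 20*1 bins;
# batch_bins=800000 would then admit about 19 such utterances per batch.
def _example_bin_budget(batch_bins=800000, ilen=500, idim=83, olen=20, odim=1):
    per_utt = ilen * idim + olen * odim
    return batch_bins // per_utt  # -> 19

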
def batchfy_by_frame(
        sorted_data,
        max_frames_in,
        max_frames_out,
        max_frames_inout,
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        okey="output", ):
    """Make variable batch set, which maximizes the number of frames up to the batch-frame limits.

    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
    :param int max_frames_in: Maximum input frames of a batch
    :param int max_frames_out: Maximum output frames of a batch
    :param int max_frames_inout: Maximum input+output frames of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse

    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)

    :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
    """
    if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
        raise ValueError(
            "At least one of `--batch-frames-in`, `--batch-frames-out` or "
            "`--batch-frames-inout` should be > 0")
    length = len(sorted_data)
    minibatches = []
    start = 0
    end = 0
    while end != length:
        # Dynamic batch size depending on size of samples
        b = 0
        max_olen = 0
        max_ilen = 0
        while (start + b) < length:
            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
            if ilen > max_frames_in and max_frames_in != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
                    f"Please increase the value")
            olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
            if olen > max_frames_out and max_frames_out != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
                    f"Please increase the value")
            if ilen + olen > max_frames_inout and max_frames_inout != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-inout ({max_frames_inout}): "
                    f"Please increase the value")
            max_olen = max(max_olen, olen)
            max_ilen = max(max_ilen, ilen)
            in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
            out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
            inout_ok = (max_ilen + max_olen) * (
                b + 1) <= max_frames_inout or max_frames_inout == 0
            if in_ok and out_ok and inout_ok:
                # add more seq in the minibatch
                b += 1
            else:
                # no more seq in the minibatch
                break
        end = min(length, start + b)
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)
        # Check for min_batch_size and fix the batches if needed
        i = -1
        while len(minibatches[i]) < min_batch_size:
            missing = min_batch_size - len(minibatches[i])
            if -i == len(minibatches):
                minibatches[i + 1].extend(minibatches[i])
                minibatches = minibatches[1:]
                break
            else:
                minibatches[i].extend(minibatches[i - 1][:missing])
                minibatches[i - 1] = minibatches[i - 1][missing:]
                i -= 1
        start = end
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(x) for x in minibatches]
    logger.info(
        str(len(minibatches)) + " batches containing from " + str(min(lengths))
        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
            int(np.mean(lengths))) + " samples).")

    return minibatches


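# A sketch of how the three frame caps interact (hypothetical numbers):
# with max_frames_in=3000 and utterances of at most 400 input frames,
# in_ok holds while 400 * (b + 1) <= 3000, i.e. up to 7 sequences;
# whichever of the in/out/inout caps is hit first closes the batch.
def _example_frame_cap(max_frames_in=3000, max_ilen=400):
    return max_frames_in // max_ilen  # -> 7 sequences per batch

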
def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
                    shortest_first):
    import random

    logger.info("use shuffled batch.")
    # list() needed: random.sample requires a sequence, not a dict view
    sorted_data = random.sample(list(data.items()), len(data))
    logger.info("# utts: " + str(len(sorted_data)))
    # make list of minibatches
    minibatches = []
    start = 0
    while True:
        end = min(len(sorted_data), start + batch_size)
        minibatch = sorted_data[start:end]
        if shortest_first:
            minibatch.reverse()
        # check each batch is more than minimum batch size
        if len(minibatch) < min_batch_size:
            mod = min_batch_size - len(minibatch) % min_batch_size
            additional_minibatch = [
                sorted_data[i] for i in np.random.randint(0, start, mod)
            ]
            if shortest_first:
                additional_minibatch.reverse()
            minibatch.extend(additional_minibatch)
        minibatches.append(minibatch)
        if end == len(sorted_data):
            break
        start = end

    # for debugging
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    logger.info("# minibatches: " + str(len(minibatches)))
    return minibatches


BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]


def make_batchset(
        data,
        batch_size=0,
        max_length_in=float("inf"),
        max_length_out=float("inf"),
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        batch_sort_key="input",
        count="auto",
        batch_bins=0,
        batch_frames_in=0,
        batch_frames_out=0,
        batch_frames_inout=0,
        iaxis=0,
        oaxis=0, ):
    """Make batch set from json dictionary

    if utts have "category" value,

    >>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
    ...         {'category': 'B', 'input': ..., 'utt':'utt2'},
    ...         {'category': 'B', 'input': ..., 'utt':'utt3'},
    ...         {'category': 'A', 'input': ..., 'utt':'utt4'}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]

    Note that if any utt doesn't have a "category" value,
    this performs the same as batchfy_by_{count}.

    :param List[Dict[str, Any]] data: dictionary loaded from data.json
    :param int batch_size: maximum number of sequences in a minibatch.
    :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
    :param int batch_frames_in: maximum number of input frames in a minibatch.
    :param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
    :param str count: strategy to count maximum size of batch.
        For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES

    :param int max_length_in: maximum length of input to decide adaptive batch size
    :param int max_length_out: maximum length of output to decide adaptive batch size
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str batch_sort_key: how to sort data before creating minibatches
        ["input", "output", "shuffle"]
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
    :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
        reserved for future research, -1 means all axes.)
    :return: List[List[Tuple[str, dict]]] list of batches
    """
    # check args
    if count not in BATCH_COUNT_CHOICES:
        raise ValueError(
            f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
    if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
        raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
                         f"one of {BATCH_SORT_KEY_CHOICES}")

    ikey = "input"
    okey = "output"
    batch_sort_axis = 0  # index of list
    if count == "auto":
        if batch_size != 0:
            count = "seq"
        elif batch_bins != 0:
            count = "bin"
        elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
            count = "frame"
        else:
            raise ValueError(
                f"cannot detect `count`; manually set one of {BATCH_COUNT_CHOICES}"
            )
        logger.info(f"count is auto detected as {count}")

    if count != "seq" and batch_sort_key == "shuffle":
        raise ValueError(
            "batch_sort_key=shuffle is only available if batch_count=seq")

    category2data = {}  # Dict[str, dict]
    for v in data:
        k = v['utt']
        category2data.setdefault(v.get("category"), {})[k] = v

    batches_list = []  # List[List[List[Tuple[str, dict]]]]
    for d in category2data.values():
        if batch_sort_key == "shuffle":
            batches = batchfy_shuffle(d, batch_size, min_batch_size,
                                      num_batches, shortest_first)
            batches_list.append(batches)
            continue

        # sort it by input lengths (long to short)
        sorted_data = sorted(
            d.items(),
            key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
            reverse=not shortest_first, )
        logger.info("# utts: " + str(len(sorted_data)))

        if count == "seq":
            batches = batchfy_by_seq(
                sorted_data,
                batch_size=batch_size,
                max_length_in=max_length_in,
                max_length_out=max_length_out,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                iaxis=iaxis,
                okey=okey,
                oaxis=oaxis, )
        if count == "bin":
            batches = batchfy_by_bin(
                sorted_data,
                batch_bins=batch_bins,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey, )
        if count == "frame":
            batches = batchfy_by_frame(
                sorted_data,
                max_frames_in=batch_frames_in,
                max_frames_out=batch_frames_out,
                max_frames_inout=batch_frames_inout,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey, )
        batches_list.append(batches)

    if len(batches_list) == 1:
        batches = batches_list[0]
    else:
        # Concat list. This way is faster than "sum(batch_list, [])"
        batches = list(itertools.chain(*batches_list))

    # for debugging
    if num_batches > 0:
        batches = batches[:num_batches]
        logger.info("# minibatches: " + str(len(batches)))

    # batch: List[List[Tuple[str, dict]]]
    return batches
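

# A minimal end-to-end sketch (not from the original module): the `data`
# records mirror espnet-style data.json entries; the "shape" fields
# ([length, dim]) are what the batchfy_* functions read.
def _example_make_batchset():
    data = [{'utt': f'utt{i}',
             'input': [{"shape": [100 + 10 * i, 83]}],
             'output': [{"shape": [10 + i, 5002]}]} for i in range(8)]
    # 4 sequences per batch; count="auto" detects "seq" from batch_size
    return make_batchset(data, batch_size=4)
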
@ -1,321 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import tarfile  # used by SpeechCollator._parse_tar; missing in the original
from collections import namedtuple
from typing import Optional

import numpy as np
from yacs.config import CfgNode

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log

__all__ = ["SpeechCollator"]

logger = Log(__name__).getlog()

# namedtuple needs to be defined at module level for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])


class SpeechCollator():
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                mean_std_filepath="",
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
                feat_dim=0,  # 'mfcc', 'fbank'
                delta_delta=False,  # 'mfcc', 'fbank'
                stride_ms=10.0,  # ms
                window_ms=20.0,  # ms
                n_fft=None,  # fft points
                max_freq=None,  # None for samplerate/2
                target_sample_rate=16000,  # target sample rate
                use_dB_normalization=True,
                target_dB=-20,
                dither=1.0,  # feature dither
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a SpeechCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            SpeechCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'mean_std_filepath' in config.collator
        assert 'vocab_filepath' in config.collator
        assert 'specgram_type' in config.collator
        assert 'n_fft' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            mean_std_filepath=config.collator.mean_std_filepath,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            specgram_type=config.collator.specgram_type,
            feat_dim=config.collator.feat_dim,
            delta_delta=config.collator.delta_delta,
            stride_ms=config.collator.stride_ms,
            window_ms=config.collator.window_ms,
            n_fft=config.collator.n_fft,
            max_freq=config.collator.max_freq,
            target_sample_rate=config.collator.target_sample_rate,
            use_dB_normalization=config.collator.use_dB_normalization,
            target_dB=config.collator.target_dB,
            dither=config.collator.dither,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(
            self,
            aug_file,
            mean_std_filepath,
            vocab_filepath,
            spm_model_prefix,
            random_seed=0,
            unit_type="char",
            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
            feat_dim=0,  # 'mfcc', 'fbank'
            delta_delta=False,  # 'mfcc', 'fbank'
            stride_ms=10.0,  # ms
            window_ms=20.0,  # ms
            n_fft=None,  # fft points
            max_freq=None,  # None for samplerate/2
            target_sample_rate=16000,  # target sample rate
            use_dB_normalization=True,
            target_dB=-20,
            dither=1.0,
            keep_transcription_text=True):
        """SpeechCollator Collator

        Args:
            unit_type (str): token unit type, e.g. char, word, spm
            vocab_filepath (str): vocab file path.
            mean_std_filepath (str): mean and std file path, with suffix *.npy
            spm_model_prefix (str): spm model prefix, required if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
            window_ms (float, optional): window size in ms. Defaults to 20.0.
            n_fft (int, optional): fft points for rfft. Defaults to None.
            max_freq (int, optional): max cut freq. Defaults to None.
            target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000.
            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
            feat_dim (int, optional): audio feature dim, used by 'mfcc' or 'fbank'. Defaults to 0.
            delta_delta (bool, optional): audio feature with delta-delta, used by 'fbank' or 'mfcc'. Defaults to False.
            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
            target_dB (int, optional): target dB. Defaults to -20.
            random_seed (int, optional): for random generator. Defaults to 0.
            keep_transcription_text (bool, optional): if True (typically outside
                training), skip tokenization and keep the raw transcript string;
                if False, text is converted to token ids. Defaults to True.

        Do augmentations
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._normalizer = FeatureNormalizer(
            mean_std_filepath) if mean_std_filepath else None

        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate

        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
            spm_model_prefix=spm_model_prefix,
            specgram_type=specgram_type,
            feat_dim=feat_dim,
            delta_delta=delta_delta,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB,
            dither=dither)

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map of its TarInfo objects.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get subfile object from tar.

        It will return a subfile object from tar file
        and cache tar file info for the next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        # NOTE: TarLocalData is an immutable namedtuple whose fields are
        # always set in __init__, so these guards are vestigial; on newer
        # Pythons namedtuple instances also lack __dict__, so this branch
        # may itself raise.
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            tar_object, infos = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infos
            self._local_data.tar2object[tarpath] = tar_object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

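    # The expected audio_file URI for tar members (a sketch; the paths are
    # hypothetical): "tar:" + archive path + "#" + member name, matching the
    # split(':', 1)[1].split('#', 1) parsing above.
    #
    #     uri = 'tar:/data/aishell/wavs.tar#BAC009S0002W0122.wav'
    #     fileobj = collator._subfile_from_tar(uri)
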
    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)

        # audio augment
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        return specgram, transcript_part

    def __call__(self, batch):
        """batch examples

        Args:
            batch (List): each item is a (utt, audio, text) tuple,
                audio (np.ndarray): shape (T, D)
                text (List[int] or str): shape (U,)

        Returns:
            tuple(utts, audio, audio_lens, text, text_lens): batched data.
                audio : (B, Tmax, D)
                audio_lens: (B)
                text : (B, Umax)
                text_lens: (B)
        """
        audios = []
        audio_lens = []
        texts = []
        text_lens = []
        utts = []
        for utt, audio, text in batch:
            audio, text = self.process_utterance(audio, text)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # else text is a string, converted to unicode ords
            tokens = []
            if self._keep_transcription_text:
                assert isinstance(text, str), (type(text), text)
                tokens = [ord(t) for t in text]
            else:
                tokens = text  # token ids
            tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
                tokens, dtype=np.int64)
            texts.append(tokens)
            text_lens.append(tokens.shape[0])

        # [B, T, D]
        xs_pad = pad_list(audios, 0.0).astype(np.float32)
        ilens = np.array(audio_lens).astype(np.int64)
        ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
        olens = np.array(text_lens).astype(np.int64)
        return utts, xs_pad, ilens, ys_pad, olens

    @property
    def manifest(self):
        # NOTE: self._manifest is never assigned in __init__, so accessing
        # this property raises AttributeError in the original code.
        return self._manifest

    @property
    def vocab_size(self):
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        return self._speech_featurizer.vocab_list

    @property
    def vocab_dict(self):
        return self._speech_featurizer.vocab_dict

    @property
    def text_feature(self):
        return self._speech_featurizer.text_feature

    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size

    @property
    def stride_ms(self):
        return self._speech_featurizer.stride_ms
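

# Typical usage of the collator (a sketch with hypothetical names; the
# DataLoader call assumes the paddle.io-style API this repo trains with):
#
#     from paddle.io import DataLoader
#     collator = SpeechCollator.from_config(config)  # yacs CfgNode
#     loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=collator)
#     for utts, xs_pad, ilens, ys_pad, olens in loader:
#         ...  # xs_pad: (B, Tmax, D), ys_pad: (B, Umax)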
@ -1,631 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import tarfile  # used by SpeechCollator._parse_tar; missing in the original
from collections import namedtuple
from typing import Optional

import kaldiio
import numpy as np
from yacs.config import CfgNode

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log

__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"]

logger = Log(__name__).getlog()

# namedtuple needs to be defined at module level for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])


class SpeechCollator():
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                mean_std_filepath="",
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
                feat_dim=0,  # 'mfcc', 'fbank'
                delta_delta=False,  # 'mfcc', 'fbank'
                stride_ms=10.0,  # ms
                window_ms=20.0,  # ms
                n_fft=None,  # fft points
                max_freq=None,  # None for samplerate/2
                target_sample_rate=16000,  # target sample rate
                use_dB_normalization=True,
                target_dB=-20,
                dither=1.0,  # feature dither
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a SpeechCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            SpeechCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'mean_std_filepath' in config.collator
        assert 'vocab_filepath' in config.collator
        assert 'specgram_type' in config.collator
        assert 'n_fft' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            mean_std_filepath=config.collator.mean_std_filepath,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            specgram_type=config.collator.specgram_type,
            feat_dim=config.collator.feat_dim,
            delta_delta=config.collator.delta_delta,
            stride_ms=config.collator.stride_ms,
            window_ms=config.collator.window_ms,
            n_fft=config.collator.n_fft,
            max_freq=config.collator.max_freq,
            target_sample_rate=config.collator.target_sample_rate,
            use_dB_normalization=config.collator.use_dB_normalization,
            target_dB=config.collator.target_dB,
            dither=config.collator.dither,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(
            self,
            aug_file,
            mean_std_filepath,
            vocab_filepath,
            spm_model_prefix,
            random_seed=0,
            unit_type="char",
            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
            feat_dim=0,  # 'mfcc', 'fbank'
            delta_delta=False,  # 'mfcc', 'fbank'
            stride_ms=10.0,  # ms
            window_ms=20.0,  # ms
            n_fft=None,  # fft points
            max_freq=None,  # None for samplerate/2
            target_sample_rate=16000,  # target sample rate
            use_dB_normalization=True,
            target_dB=-20,
            dither=1.0,
            keep_transcription_text=True):
        """SpeechCollator Collator

        Args:
            unit_type (str): token unit type, e.g. char, word, spm
            vocab_filepath (str): vocab file path.
            mean_std_filepath (str): mean and std file path, with suffix *.npy
            spm_model_prefix (str): spm model prefix, required if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
            window_ms (float, optional): window size in ms. Defaults to 20.0.
            n_fft (int, optional): fft points for rfft. Defaults to None.
            max_freq (int, optional): max cut freq. Defaults to None.
            target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000.
            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
            feat_dim (int, optional): audio feature dim, used by 'mfcc' or 'fbank'. Defaults to 0.
            delta_delta (bool, optional): audio feature with delta-delta, used by 'fbank' or 'mfcc'. Defaults to False.
            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
            target_dB (int, optional): target dB. Defaults to -20.
            random_seed (int, optional): for random generator. Defaults to 0.
            keep_transcription_text (bool, optional): if True (typically outside
                training), skip tokenization and keep the raw transcript string;
                if False, text is converted to token ids. Defaults to True.

        Do augmentations
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._normalizer = FeatureNormalizer(
            mean_std_filepath) if mean_std_filepath else None

        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate

        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
            spm_model_prefix=spm_model_prefix,
            specgram_type=specgram_type,
            feat_dim=feat_dim,
            delta_delta=delta_delta,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB,
            dither=dither)

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map of its TarInfo objects.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get subfile object from tar.

        It will return a subfile object from tar file
        and cache tar file info for the next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        # NOTE: TarLocalData is an immutable namedtuple whose fields are
        # always set in __init__, so these guards are vestigial; on newer
        # Pythons namedtuple instances also lack __dict__, so this branch
        # may itself raise.
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            tar_object, infos = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infos
            self._local_data.tar2object[tarpath] = tar_object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    @property
    def manifest(self):
        # NOTE: self._manifest is never assigned in __init__, so accessing
        # this property raises AttributeError in the original code.
        return self._manifest

    @property
    def vocab_size(self):
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        return self._speech_featurizer.vocab_list

    @property
    def vocab_dict(self):
        return self._speech_featurizer.vocab_dict

    @property
    def text_feature(self):
        return self._speech_featurizer.text_feature

    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size

    @property
    def stride_ms(self):
        return self._speech_featurizer.stride_ms

    def process_utterance(self, audio_file, translation):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :return: Tuple of audio feature tensor and data of translation part,
                 where translation part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), translation)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, translation)

        # audio augment
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, translation_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        return specgram, translation_part

    def __call__(self, batch):
        """batch examples

        Args:
            batch (List): each item is a (utt, audio, text) tuple,
                audio (np.ndarray): shape (T, D)
                text (List[int] or str): shape (U,)

        Returns:
            tuple(utts, audio, audio_lens, text, text_lens): batched data.
                audio : (B, Tmax, D)
                audio_lens: (B)
                text : (B, Umax)
                text_lens: (B)
        """
        audios = []
        audio_lens = []
        texts = []
        text_lens = []
        utts = []
        for utt, audio, text in batch:
            audio, text = self.process_utterance(audio, text)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # else text is a string, converted to unicode ords
            tokens = []
            if self._keep_transcription_text:
                assert isinstance(text, str), (type(text), text)
                tokens = [ord(t) for t in text]
            else:
                tokens = text  # token ids
            tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
                tokens, dtype=np.int64)
            texts.append(tokens)
            text_lens.append(tokens.shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, padded_texts, text_lens


class TripletSpeechCollator(SpeechCollator):
    def process_utterance(self, audio_file, translation, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of translation and
                 transcription parts, which could be token ids or text.
        :rtype: tuple of (2darray, list, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), translation)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, translation)

        # audio augment
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, translation_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        transcript_part = self._speech_featurizer._text_featurizer.featurize(
            transcript)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        return specgram, translation_part, transcript_part

    def __call__(self, batch):
        """batch examples

        Args:
            batch (List): each item is a (utt, audio, translation, transcription) tuple,
                audio (np.ndarray): shape (T, D)
                translation (List[int] or str): shape (U,)
                transcription (List[int] or str): shape (V,)

        Returns:
            tuple: batched data.
                audio : (B, Tmax, D)
                audio_lens: (B)
                translation_text : (B, Umax)
                translation_text_lens: (B)
                transcription_text : (B, Vmax)
                transcription_text_lens: (B)
        """
        audios = []
        audio_lens = []
        translation_text = []
        translation_text_lens = []
        transcription_text = []
        transcription_text_lens = []

        utts = []
        for utt, audio, translation, transcription in batch:
            audio, translation, transcription = self.process_utterance(
                audio, translation, transcription)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # else text is a string, converted to unicode ords
            tokens = [[], []]
            for idx, text in enumerate([translation, transcription]):
                if self._keep_transcription_text:
                    assert isinstance(text, str), (type(text), text)
                    tokens[idx] = [ord(t) for t in text]
                else:
                    tokens[idx] = text  # token ids
                tokens[idx] = tokens[idx] if isinstance(
                    tokens[idx], np.ndarray) else np.array(
                        tokens[idx], dtype=np.int64)
            translation_text.append(tokens[0])
            translation_text_lens.append(tokens[0].shape[0])
            transcription_text.append(tokens[1])
            transcription_text_lens.append(tokens[1].shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_translation = pad_sequence(
            translation_text, padding_value=IGNORE_ID).astype(np.int64)
        translation_lens = np.array(translation_text_lens).astype(np.int64)
        padded_transcription = pad_sequence(
            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, (
            padded_translation, padded_transcription), (translation_lens,
                                                        transcription_lens)


class KaldiPrePorocessedCollator(SpeechCollator):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                feat_dim=0,
                stride_ms=10.0,
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a SpeechCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            SpeechCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'vocab_filepath' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            feat_dim=config.collator.feat_dim,
            stride_ms=config.collator.stride_ms,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(self,
                 aug_file,
                 vocab_filepath,
                 spm_model_prefix,
                 random_seed=0,
                 unit_type="char",
                 feat_dim=0,
                 stride_ms=10.0,
                 keep_transcription_text=True):
        """KaldiPrePorocessedCollator

        Args:
            unit_type (str): token unit type, e.g. char, word, spm
            vocab_filepath (str): vocab file path.
            spm_model_prefix (str): spm model prefix, required if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            random_seed (int, optional): for random generator. Defaults to 0.
            keep_transcription_text (bool, optional): if True (typically outside
                training), skip tokenization and keep the raw transcript string;
                if False, text is converted to token ids. Defaults to True.

        Do augmentations
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text
        self._feat_dim = feat_dim
        self._stride_ms = stride_ms

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
                                               spm_model_prefix)

    def process_utterance(self, audio_file, translation):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of a kaldi-processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :return: Tuple of audio feature tensor and data of translation part,
                 where translation part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation
        else:
            text_ids = self._text_featurizer.featurize(translation)
            return specgram, text_ids


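# Hedged sketch of the expected input: `audio_file` here is a kaldi ark
# specifier (hypothetical path) as read by kaldiio.load_mat, e.g. an entry
# taken from feats.scp such as:
#
#     specgram = kaldiio.load_mat('data/feats.ark:42')  # -> (T, feat_dim)
#
# so this collator consumes precomputed features rather than raw wav files.

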
class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
    def process_utterance(self, audio_file, translation, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of a kaldi-processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of translation and
                 transcription parts, which could be token ids or text.
        :rtype: tuple of (2darray, (list, list))
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation, transcript
        else:
            translation_text_ids = self._text_featurizer.featurize(translation)
            transcript_text_ids = self._text_featurizer.featurize(transcript)
            return specgram, translation_text_ids, transcript_text_ids

def __call__(self, batch):
|
|
||||||
"""batch examples
|
|
||||||
|
|
||||||
Args:
|
|
||||||
batch ([List]): batch is (audio, text)
|
|
||||||
audio (np.ndarray) shape (T, D)
|
|
||||||
translation (List[int] or str): shape (U,)
|
|
||||||
transcription (List[int] or str): shape (V,)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple(audio, text, audio_lens, text_lens): batched data.
|
|
||||||
audio : (B, Tmax, D)
|
|
||||||
audio_lens: (B)
|
|
||||||
translation_text : (B, Umax)
|
|
||||||
translation_text_lens: (B)
|
|
||||||
transcription_text : (B, Vmax)
|
|
||||||
transcription_text_lens: (B)
|
|
||||||
"""
|
|
||||||
audios = []
|
|
||||||
audio_lens = []
|
|
||||||
translation_text = []
|
|
||||||
translation_text_lens = []
|
|
||||||
transcription_text = []
|
|
||||||
transcription_text_lens = []
|
|
||||||
|
|
||||||
utts = []
|
|
||||||
for utt, audio, translation, transcription in batch:
|
|
||||||
audio, translation, transcription = self.process_utterance(
|
|
||||||
audio, translation, transcription)
|
|
||||||
#utt
|
|
||||||
utts.append(utt)
|
|
||||||
# audio
|
|
||||||
audios.append(audio) # [T, D]
|
|
||||||
audio_lens.append(audio.shape[0])
|
|
||||||
# text
|
|
||||||
# for training, text is token ids
|
|
||||||
# else text is string, convert to unicode ord
|
|
||||||
tokens = [[], []]
|
|
||||||
for idx, text in enumerate([translation, transcription]):
|
|
||||||
if self._keep_transcription_text:
|
|
||||||
assert isinstance(text, str), (type(text), text)
|
|
||||||
tokens[idx] = [ord(t) for t in text]
|
|
||||||
else:
|
|
||||||
tokens[idx] = text # token ids
|
|
||||||
tokens[idx] = tokens[idx] if isinstance(
|
|
||||||
tokens[idx], np.ndarray) else np.array(
|
|
||||||
tokens[idx], dtype=np.int64)
|
|
||||||
translation_text.append(tokens[0])
|
|
||||||
translation_text_lens.append(tokens[0].shape[0])
|
|
||||||
transcription_text.append(tokens[1])
|
|
||||||
transcription_text_lens.append(tokens[1].shape[0])
|
|
||||||
|
|
||||||
padded_audios = pad_sequence(
|
|
||||||
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
|
|
||||||
audio_lens = np.array(audio_lens).astype(np.int64)
|
|
||||||
padded_translation = pad_sequence(
|
|
||||||
translation_text, padding_value=IGNORE_ID).astype(np.int64)
|
|
||||||
translation_lens = np.array(translation_text_lens).astype(np.int64)
|
|
||||||
padded_transcription = pad_sequence(
|
|
||||||
transcription_text, padding_value=IGNORE_ID).astype(np.int64)
|
|
||||||
transcription_lens = np.array(transcription_text_lens).astype(np.int64)
|
|
||||||
return utts, padded_audios, audio_lens, (
|
|
||||||
padded_translation, padded_transcription), (translation_lens,
|
|
||||||
transcription_lens)
|
|
||||||
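
# --- Illustrative sketch (not part of the original diff): how the collator's
# two text branches behave. With keep_transcription_text=True the collator
# falls back to unicode code points via ord(); otherwise it expects token ids.
# A minimal, self-contained numpy demonstration:
import numpy as np

text = "hello"
char_ids = np.array([ord(t) for t in text], dtype=np.int64)
print(char_ids)                            # [104 101 108 108 111]
print(''.join(chr(i) for i in char_ids))   # round-trips back to "hello"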
@ -1,81 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log

__all__ = ["CustomConverter"]

logger = Log(__name__).getlog()


class CustomConverter():
    """Custom batch converter.

    Args:
        subsampling_factor (int): The subsampling factor.
        dtype (np.dtype): Data type to convert.

    """

    def __init__(self, subsampling_factor=1, dtype=np.float32):
        """Construct a CustomConverter object."""
        self.subsampling_factor = subsampling_factor
        self.ignore_id = -1
        self.dtype = dtype

    def __call__(self, batch):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.

        Returns:
            tuple(np.ndarray, np.ndarray, np.ndarray)

        """
        # batch should be located in list
        assert len(batch) == 1
        (xs, ys), utts = batch[0]
        assert xs[0] is not None, "please check Reader and Augmentation impl."

        # perform subsampling
        if self.subsampling_factor > 1:
            xs = [x[::self.subsampling_factor, :] for x in xs]

        # get batch of lengths of input sequences
        ilens = np.array([x.shape[0] for x in xs])

        # perform padding and convert to tensor
        # currently only support real number
        if xs[0].dtype.kind == "c":
            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
        else:
            xs_pad = pad_list(xs, 0).astype(self.dtype)

        # NOTE: this is for multi-output (e.g., speech translation)
        ys_pad = pad_list(
            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
            self.ignore_id)

        olens = np.array(
            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
        return utts, xs_pad, ilens, ys_pad, olens
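
# --- Illustrative sketch (not part of the original diff): the effect of
# CustomConverter's frame subsampling, x[::factor, :], on a (T, D) feature.
import numpy as np

feats = np.arange(12, dtype=np.float32).reshape(6, 2)  # T=6 frames, D=2
sub = feats[::2, :]  # subsampling_factor=2 keeps every 2nd frame
print(sub.shape)     # (3, 2)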
@ -1,170 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import Dict
from typing import List
from typing import Text

import numpy as np
from paddle.io import DataLoader

from deepspeech.frontend.utility import read_manifest
from deepspeech.io.batchfy import make_batchset
from deepspeech.io.converter import CustomConverter
from deepspeech.io.dataset import TransformDataset
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.utils.log import Log

__all__ = ["BatchDataLoader"]

logger = Log(__name__).getlog()


def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
                            mode: Text="asr",
                            iaxis=0,
                            oaxis=0):
    if mode == 'asr':
        feat_dim = data_json[0]['input'][iaxis]['shape'][1]
        vocab_size = data_json[0]['output'][oaxis]['shape'][1]
    else:
        raise ValueError(f"{mode} mode is not supported!")
    return feat_dim, vocab_size


def batch_collate(x):
    """De-tuple.

    Args:
        x (List[Tuple]): [(utts, xs, ilens, ys, olens)]

    Returns:
        Tuple: (utts, xs, ilens, ys, olens)
    """
    return x[0]


class BatchDataLoader():
    def __init__(self,
                 json_file: str,
                 train_mode: bool,
                 sortagrad: bool=False,
                 batch_size: int=0,
                 maxlen_in: float=float('inf'),
                 maxlen_out: float=float('inf'),
                 minibatches: int=0,
                 mini_batch_size: int=1,
                 batch_count: str='auto',
                 batch_bins: int=0,
                 batch_frames_in: int=0,
                 batch_frames_out: int=0,
                 batch_frames_inout: int=0,
                 preprocess_conf=None,
                 n_iter_processes: int=1,
                 subsampling_factor: int=1,
                 num_encs: int=1):
        self.json_file = json_file
        self.train_mode = train_mode
        self.use_sortagrad = sortagrad == -1 or sortagrad > 0
        self.batch_size = batch_size
        self.maxlen_in = maxlen_in
        self.maxlen_out = maxlen_out
        self.batch_count = batch_count
        self.batch_bins = batch_bins
        self.batch_frames_in = batch_frames_in
        self.batch_frames_out = batch_frames_out
        self.batch_frames_inout = batch_frames_inout
        self.subsampling_factor = subsampling_factor
        self.num_encs = num_encs
        self.preprocess_conf = preprocess_conf
        self.n_iter_processes = n_iter_processes

        # read json data
        self.data_json = read_manifest(json_file)
        self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
            self.data_json, mode='asr')

        # make minibatch list (variable length)
        self.minibatches = make_batchset(
            self.data_json,
            batch_size,
            maxlen_in,
            maxlen_out,
            minibatches,  # for debug
            min_batch_size=mini_batch_size,
            shortest_first=self.use_sortagrad,
            count=batch_count,
            batch_bins=batch_bins,
            batch_frames_in=batch_frames_in,
            batch_frames_out=batch_frames_out,
            batch_frames_inout=batch_frames_inout,
            iaxis=0,
            oaxis=0, )

        # data reader
        self.reader = LoadInputsAndTargets(
            mode="asr",
            load_output=True,
            preprocess_conf=preprocess_conf,
            preprocess_args={"train": train_mode},  # switch preprocessing mode
        )

        # Setup a converter
        if num_encs == 1:
            self.converter = CustomConverter(
                subsampling_factor=subsampling_factor, dtype=np.float32)
        else:
            raise NotImplementedError("CustomConverterMulEnc is not implemented.")

        # hack to make the batch_size argument 1: the actual batch size is
        # encoded in each list element, and since the default collate function
        # converts numpy arrays to paddle tensors, we use an identity collate
        # function that simply returns the list
        self.dataset = TransformDataset(self.minibatches, self.converter,
                                        self.reader)

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_size=1,
            shuffle=not self.use_sortagrad if self.train_mode else False,
            collate_fn=batch_collate,
            num_workers=self.n_iter_processes, )

    def __repr__(self):
        echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
        echo += f"train_mode: {self.train_mode}, "
        echo += f"sortagrad: {self.use_sortagrad}, "
        echo += f"batch_size: {self.batch_size}, "
        echo += f"maxlen_in: {self.maxlen_in}, "
        echo += f"maxlen_out: {self.maxlen_out}, "
        echo += f"batch_count: {self.batch_count}, "
        echo += f"batch_bins: {self.batch_bins}, "
        echo += f"batch_frames_in: {self.batch_frames_in}, "
        echo += f"batch_frames_out: {self.batch_frames_out}, "
        echo += f"batch_frames_inout: {self.batch_frames_inout}, "
        echo += f"subsampling_factor: {self.subsampling_factor}, "
        echo += f"num_encs: {self.num_encs}, "
        echo += f"num_workers: {self.n_iter_processes}, "
        echo += f"file: {self.json_file}"
        return echo

    def __len__(self):
        return len(self.dataloader)

    def __iter__(self):
        return self.dataloader.__iter__()

    def __call__(self):
        return self.__iter__()
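
# --- Illustrative sketch (not part of the original diff): constructing the
# loader and iterating one epoch. The manifest path is hypothetical; each
# manifest entry must carry the 'input'/'output' shape metadata expected by
# make_batchset and LoadInputsAndTargets.
loader = BatchDataLoader(
    json_file="data/manifest.train",  # hypothetical path
    train_mode=True,
    sortagrad=True,
    batch_size=32,
    n_iter_processes=2)
for utts, xs_pad, ilens, ys_pad, olens in loader():
    pass  # feed the padded batch to the model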
@ -1,149 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

from paddle.io import Dataset
from yacs.config import CfgNode

from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log

__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]

logger = Log(__name__).getlog()


class ManifestDataset(Dataset):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                manifest="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
                min_output_len=0.0,
                max_output_input_ratio=float('inf'),
                min_output_input_ratio=0.0, ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a ManifestDataset object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            ManifestDataset: dataset object.
        """
        assert 'manifest' in config.data
        assert config.data.manifest

        dataset = cls(
            manifest_path=config.data.manifest,
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
            min_output_len=config.data.min_output_len,
            max_output_input_ratio=config.data.max_output_input_ratio,
            min_output_input_ratio=config.data.min_output_input_ratio, )
        return dataset

    def __init__(self,
                 manifest_path,
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
                 min_output_input_ratio=0.0):
        """Manifest Dataset

        Args:
            manifest_path (str): manifest json file path
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to float('inf').
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output/input seq length ratio.
                Defaults to float('inf').
            min_output_input_ratio (float, optional): minimum output/input seq length ratio.
                Defaults to 0.0.

        """
        super().__init__()

        # read manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_input_len=max_input_len,
            min_input_len=min_input_len,
            max_output_len=max_output_len,
            min_output_len=min_output_len,
            max_output_input_ratio=max_output_input_ratio,
            min_output_input_ratio=min_output_input_ratio)
        self._manifest.sort(key=lambda x: x["feat_shape"][0])

    def __len__(self):
        return len(self._manifest)

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        return instance["utt"], instance["feat"], instance["text"]


class TripletManifestDataset(ManifestDataset):
    """
    For Joint Training of Speech Translation and ASR.
    text: translation,
    text1: transcript.
    """

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        return instance["utt"], instance["feat"], instance["text"], instance[
            "text1"]


class TransformDataset(Dataset):
    """Transform Dataset.

    Args:
        data: list object from make_batchset
        converter: batch function
        reader: read data
    """

    def __init__(self, data, converter, reader):
        """Init function."""
        super().__init__()
        self.data = data
        self.converter = converter
        self.reader = reader

    def __len__(self):
        """Len function."""
        return len(self.data)

    def __getitem__(self, idx):
        """[] operator."""
        return self.converter([self.reader(self.data[idx], return_uttid=True)])
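
# --- Illustrative sketch (not part of the original diff): the shape of one
# manifest entry that ManifestDataset expects. The field values here are
# hypothetical; the keys ("utt", "feat", "feat_shape", "text") follow the
# dictionary accesses in the class above.
import json

line = json.dumps({
    "utt": "BAC009S0002W0122",      # hypothetical utterance id
    "feat": "data/feats.ark:42",    # hypothetical feature location
    "feat_shape": [498, 80],        # frames x feature dim; used for sorting
    "text": "hello world",
})
entry = json.loads(line)
print(entry["utt"], entry["feat_shape"][0])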
@ -1,410 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
from collections import OrderedDict

import h5py
import kaldiio
import numpy as np
import soundfile

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.utils.log import Log

__all__ = ["LoadInputsAndTargets"]

logger = Log(__name__).getlog()


class LoadInputsAndTargets():
    """Create a mini-batch from a list of dicts

    >>> batch = [('utt1',
    ...           dict(input=[dict(feat='some.ark:123',
    ...                            filetype='mat',
    ...                            name='input1',
    ...                            shape=[100, 80])],
    ...                output=[dict(tokenid='1 2 3 4',
    ...                             name='target1',
    ...                             shape=[4, 31])]))]
    >>> l = LoadInputsAndTargets()
    >>> feat, target = l(batch)

    :param: str mode: Specify the task mode, "asr" or "tts"
    :param: str preprocess_conf: The path of a json file for pre-processing
    :param: bool load_input: If False, not to load the input data
    :param: bool load_output: If False, not to load the output data
    :param: bool sort_in_input_length: Sort the mini-batch in descending order
        of the input length
    :param: dict preprocess_args: Set some optional arguments for preprocessing
    :param: bool keep_all_data_on_mem: If True, cache loaded arrays in memory
    """

    def __init__(
            self,
            mode="asr",
            preprocess_conf=None,
            load_input=True,
            load_output=True,
            sort_in_input_length=True,
            preprocess_args=None,
            keep_all_data_on_mem=False, ):
        self._loaders = {}

        if mode not in ["asr"]:
            raise ValueError("Only 'asr' mode is allowed: mode={}".format(mode))

        if preprocess_conf is not None:
            with open(preprocess_conf, 'r') as fin:
                self.preprocessing = AugmentationPipeline(fin.read())
            logger.warning(
                "[Experimental feature] Some preprocessing will be done "
                "for the mini-batch creation using {}".format(
                    self.preprocessing))
        else:
            # If conf doesn't exist, this function doesn't touch anything.
            self.preprocessing = None

        self.mode = mode
        self.load_output = load_output
        self.load_input = load_input
        self.sort_in_input_length = sort_in_input_length
        if preprocess_args is None:
            self.preprocess_args = {}
        else:
            assert isinstance(preprocess_args, dict), type(preprocess_args)
            self.preprocess_args = dict(preprocess_args)

        self.keep_all_data_on_mem = keep_all_data_on_mem

    def __call__(self, batch, return_uttid=False):
        """Function to load inputs and targets from list of dicts

        :param List[Tuple[str, dict]] batch: list of dict which is subset of
            loaded data.json
        :param bool return_uttid: return utterance ID information for visualization
        :return: list of input feature sequences
            [(T_1, D), (T_2, D), ..., (T_B, D)]
        :rtype: list of float ndarray
        :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
        :rtype: list of int ndarray

        """
        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        uttid_list = []  # List[str]

        for uttid, info in batch:
            uttid_list.append(uttid)

            if self.load_input:
                # Note(kamo): This for-loop is for multiple inputs
                for idx, inp in enumerate(info["input"]):
                    # {"input":
                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                    #    "filetype": "hdf5",
                    #    "name": "input1", ...}], ...}
                    x = self._get_from_loader(
                        filepath=inp["feat"],
                        filetype=inp.get("filetype", "mat"))
                    x_feats_dict.setdefault(inp["name"], []).append(x)

            if self.load_output:
                for idx, inp in enumerate(info["output"]):
                    if "tokenid" in inp:
                        # ======= Legacy format for output =======
                        # {"output": [{"tokenid": "1 2 3 4"}])
                        x = np.fromiter(
                            map(int, inp["tokenid"].split()), dtype=np.int64)
                    else:
                        # ======= New format =======
                        # {"output":
                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                        #    "filetype": "hdf5",
                        #    "name": "target1", ...}], ...}
                        x = self._get_from_loader(
                            filepath=inp["feat"],
                            filetype=inp.get("filetype", "mat"))

                    y_feats_dict.setdefault(inp["name"], []).append(x)

        if self.mode == "asr":
            return_batch, uttid_list = self._create_batch_asr(
                x_feats_dict, y_feats_dict, uttid_list)
        else:
            raise NotImplementedError(self.mode)

        if self.preprocessing is not None:
            # Apply pre-processing to all input features
            for x_name in return_batch.keys():
                if x_name.startswith("input"):
                    return_batch[x_name] = self.preprocessing(
                        return_batch[x_name], uttid_list,
                        **self.preprocess_args)

        if return_uttid:
            return tuple(return_batch.values()), uttid_list

        # Doesn't return the names now.
        return tuple(return_batch.values())

    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create an OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
            Give uttid_list to sort in the same order as the mini-batch
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # handle single-input and multi-input (parallel) asr mode
        xs = list(x_feats_dict.values())

        if self.load_output:
            ys = list(y_feats_dict.values())
            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))

            # get index of non-zero length samples
            nonzero_idx = list(
                filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
            for n in range(1, len(y_feats_dict)):
                nonzero_idx = list(
                    filter(lambda i: len(ys[n][i]) > 0, nonzero_idx))
        else:
            # Note(kamo): Be careful not to make nonzero_idx a generator
            nonzero_idx = list(range(len(xs[0])))

        if self.sort_in_input_length:
            # sort in input lengths based on the first input
            nonzero_sorted_idx = sorted(
                nonzero_idx, key=lambda i: -len(xs[0][i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        if len(nonzero_sorted_idx) != len(xs[0]):
            logger.warning(
                "Target sequences include empty tokenid (batch {} -> {}).".
                format(len(xs[0]), len(nonzero_sorted_idx)))

        # remove zero-length samples
        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        x_names = list(x_feats_dict.keys())
        if self.load_output:
            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
            y_names = list(y_feats_dict.keys())

            # Keeping x_name and y_name, e.g. input1, for future extension
            return_batch = OrderedDict([
                *[(x_name, x) for x_name, x in zip(x_names, xs)],
                *[(y_name, y) for y_name, y in zip(y_names, ys)],
            ])
        else:
            return_batch = OrderedDict(
                [(x_name, x) for x_name, x in zip(x_names, xs)])
        return return_batch, uttid_list

    def _get_from_loader(self, filepath, filetype):
        """Return ndarray

        So that file descriptors are opened only on the first reference,
        the loaders are cached in self._loaders.

        >>> ndarray = loader.get_from_loader(
        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')

        :param: str filepath:
        :param: str filetype:
        :return:
        :rtype: np.ndarray
        """
        if filetype == "hdf5":
            # e.g.
            # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #             "filetype": "hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = h5py.File(filepath, "r")
                self._loaders[filepath] = loader
            return loader[key][()]
        elif filetype == "sound.hdf5":
            # e.g.
            # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #             "filetype": "sound.hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = SoundHDF5File(filepath, "r", dtype="int16")
                self._loaders[filepath] = loader
            array, rate = loader[key]
            return array
        elif filetype == "sound":
            # e.g.
            # {"input": [{"feat": "some/path.wav",
            #             "filetype": "sound"},
            # Assume PCM16
            if not self.keep_all_data_on_mem:
                array, _ = soundfile.read(filepath, dtype="int16")
                return array
            if filepath not in self._loaders:
                array, _ = soundfile.read(filepath, dtype="int16")
                self._loaders[filepath] = array
            return self._loaders[filepath]
        elif filetype == "npz":
            # e.g.
            # {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
            #             "filetype": "npz",
            filepath, key = filepath.split(":", 1)

            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = np.load(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        elif filetype == "npy":
            # e.g.
            # {"input": [{"feat": "some/path.npy",
            #             "filetype": "npy"},
            if not self.keep_all_data_on_mem:
                return np.load(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = np.load(filepath)
            return self._loaders[filepath]
        elif filetype in ["mat", "vec"]:
            # e.g.
            # {"input": [{"feat": "some/path.ark:123",
            #             "filetype": "mat"}]},
            # In this case, "123" indicates the starting point of the matrix
            # load_mat can load both matrix and vector
            if not self.keep_all_data_on_mem:
                return kaldiio.load_mat(filepath)
            if filepath not in self._loaders:
                self._loaders[filepath] = kaldiio.load_mat(filepath)
            return self._loaders[filepath]
        elif filetype == "scp":
            # e.g.
            # {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
            #             "filetype": "scp",
            filepath, key = filepath.split(":", 1)
            loader = self._loaders.get(filepath)
            if loader is None:
                # To avoid disk access, create loader only for the first time
                loader = kaldiio.load_scp(filepath)
                self._loaders[filepath] = loader
            return loader[key]
        else:
            raise NotImplementedError(
                "Not supported: loader_type={}".format(filetype))


class SoundHDF5File():
    """Collecting sound files into an HDF5 file

    >>> f = SoundHDF5File('a.flac.h5', mode='a')
    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
    >>> f['id'] = (array, 16000)
    >>> array, rate = f['id']

    :param: str filepath:
    :param: str mode:
    :param: str format: The type used when saving wav. flac, nist, htk, etc.
    :param: str dtype:

    """

    def __init__(self,
                 filepath,
                 mode="r+",
                 format=None,
                 dtype="int16",
                 **kwargs):
        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
            if format.upper() not in soundfile.available_formats():
                # If not found, flac is selected
                format = "flac"

        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype)

    def create_dataset(self, name, shape=None, data=None, **kwds):
        f = io.BytesIO()
        array, rate = data
        soundfile.write(f, array, rate, format=self.format)
        self.file.create_dataset(
            name, shape=shape, data=np.void(f.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        data = self.file[key][()]
        f = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(f, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self):
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()
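
# --- Illustrative sketch (not part of the original diff): round-tripping an
# int16 waveform through SoundHDF5File, following its own docstring example.
# The file name is hypothetical; flac encoding is lossless for PCM16 data.
import numpy as np

with SoundHDF5File('a.flac.h5', mode='a') as f:   # hypothetical file name
    array = np.random.randint(0, 100, 100, dtype=np.int16)
    f['id'] = (array, 16000)            # store (samples, sample rate)
    decoded, rate = f['id']             # decode back via soundfile
    assert rate == 16000 and len(decoded) == 100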
@ -1,251 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
from paddle import distributed as dist
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler

from deepspeech.utils.log import Log

__all__ = [
    "SortagradDistributedBatchSampler",
    "SortagradBatchSampler",
]

logger = Log(__name__).getlog()


def _batch_shuffle(indices, batch_size, epoch, clipped=False):
    """Put similarly-sized instances into minibatches for better efficiency
    and make a batch-wise shuffle.

    1. Sort the audio clips by duration.
    2. Generate a random number `k`, k in [0, batch_size).
    3. Randomly shift `k` instances in order to create different batches
       for different epochs. Create minibatches.
    4. Shuffle the minibatches.

    :param indices: indices. List of int.
    :type indices: list
    :param batch_size: Batch size. This size is also used to generate
        a random number for batch shuffle.
    :type batch_size: int
    :param clipped: Whether to clip the heading (small shift) and trailing
        (incomplete batch) instances.
    :type clipped: bool
    :return: Batch shuffled manifest.
    :rtype: list
    """
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batch_indices)
    batch_indices = [item for batch in batch_indices for item in batch]
    assert clipped is False
    if not clipped:
        res_len = len(indices) - shift_len - len(batch_indices)
        # when res_len is 0, keep the whole list: len(List[-0:]) == len(List[:])
        if res_len != 0:
            batch_indices.extend(indices[-res_len:])
        batch_indices.extend(indices[0:shift_len])
        assert len(indices) == len(
            batch_indices
        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
    return batch_indices


class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad Sampler for multiple gpus.

        Args:
            dataset (paddle.io.Dataset):
            batch_size (int): batch size for one gpu
            num_replicas (int, optional): world size or number of gpus. Defaults to None.
            rank (int, optional): rank id. Defaults to None.
            shuffle (bool, optional): whether to shuffle. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort by length in the first epoch, then shuffle as usual. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(
                    f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
                )
            else:
                logger.info(
                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
                )
                if self._shuffle_method == "batch_shuffle":
                    # Shuffle with `batch_size * nranks`; otherwise batches with
                    # very different example lengths land on different ranks
                    # (e.g. rank0 max length 20, rank3 max length 1000), which
                    # causes unstable loss and NaN/Inf gradients.
                    indices = _batch_shuffle(
                        indices,
                        self.batch_size * self.nranks,
                        self.epoch,
                        clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        # slice `self.batch_size` examples by rank id
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
            assert last_batch_size % self.nranks == 0
            last_local_batch_size = last_batch_size // self.nranks

            for i in range(self.local_rank * self.batch_size,
                           len(indices) - last_batch_size,
                           self.batch_size * self.nranks):
                subsampled_indices.extend(indices[i:i + self.batch_size])

            indices = indices[len(indices) - last_batch_size:]
            subsampled_indices.extend(
                indices[self.local_rank * last_local_batch_size:(
                    self.local_rank + 1) * last_local_batch_size])
            return subsampled_indices

        if self.nranks > 1:
            indices = _get_indices_by_batch_size(indices)

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.debug(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size


class SortagradBatchSampler(BatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad Sampler for one gpu.

        Args:
            dataset (paddle.io.Dataset):
            batch_size (int): batch size for one gpu
            shuffle (bool, optional): whether to shuffle. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort by length in the first epoch, then shuffle as usual. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean value"

        self.drop_last = drop_last
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
        self.total_size = self.num_samples
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(f'dataset sortagrad! epoch {self.epoch}')
            else:
                logger.info(f'dataset shuffle! epoch {self.epoch}')
                if self._shuffle_method == "batch_shuffle":
                    indices = _batch_shuffle(
                        indices, self.batch_size, self.epoch, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.debug(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

        self.epoch += 1

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size
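
# --- Illustrative sketch (not part of the original diff): _batch_shuffle on a
# toy index list. The order changes batch-wise, but no index is lost or
# duplicated, as the function's own length assertion guarantees.
indices = list(range(10))
shuffled = _batch_shuffle(indices, batch_size=4, epoch=0, clipped=False)
assert sorted(shuffled) == indices  # a permutation of the input
print(shuffled)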
@ -1,87 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import numpy as np

from deepspeech.utils.log import Log

__all__ = ["pad_list", "pad_sequence"]

logger = Log(__name__).getlog()


def pad_list(sequences: List[np.ndarray],
             padding_value: float=0.0) -> np.ndarray:
    return pad_sequence(sequences, True, padding_value)


def pad_sequence(sequences: List[np.ndarray],
                 batch_first: bool=True,
                 padding_value: float=0.0) -> np.ndarray:
    r"""Pad a list of variable length Tensors with ``padding_value``

    ``pad_sequence`` stacks a list of Tensors along a new dimension,
    and pads them to equal length. For example, if the input is a list of
    sequences with size ``L x *``, the output is of size ``T x B x *`` if
    batch_first is False, and ``B x T x *`` otherwise.

    `B` is the batch size. It is equal to the number of elements in ``sequences``.
    `T` is the length of the longest sequence.
    `L` is the length of the sequence.
    `*` is any number of trailing dimensions, including none.

    Example:
        >>> a = np.ones([25, 300])
        >>> b = np.ones([22, 300])
        >>> c = np.ones([15, 300])
        >>> pad_sequence([a, b, c]).shape
        (3, 25, 300)

    Note:
        This function returns a np.ndarray of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.

    Args:
        sequences (list[np.ndarray]): list of variable length sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
            ``T x B x *`` otherwise
        padding_value (float, optional): value for padded elements. Default: 0.

    Returns:
        np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        np.ndarray of size ``B x T x *`` otherwise
    """

    # assuming trailing dimensions and type of all the Tensors
    # in sequences are the same and fetching those from sequences[0]
    max_size = sequences[0].shape
    trailing_dims = max_size[1:]
    max_len = max([s.shape[0] for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype)
    for i, tensor in enumerate(sequences):
        length = tensor.shape[0]
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor

    return out_tensor
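
# --- Illustrative sketch (not part of the original diff): padding two ragged
# sequences with a sentinel value, as the collators above do with IGNORE_ID.
import numpy as np

a = np.array([1, 2, 3], dtype=np.int64)
b = np.array([4, 5], dtype=np.int64)
padded = pad_sequence([a, b], batch_first=True, padding_value=-1)
print(padded)  # [[ 1  2  3]
               #  [ 4  5 -1]]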
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,165 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
from paddle import nn
|
|
||||||
from paddle.nn import functional as F
|
|
||||||
|
|
||||||
from deepspeech.modules.activation import brelu
|
|
||||||
from deepspeech.modules.mask import make_non_pad_mask
|
|
||||||
from deepspeech.utils.log import Log
|
|
||||||
|
|
||||||
logger = Log(__name__).getlog()
|
|
||||||
|
|
||||||
__all__ = ['ConvStack', "conv_output_size"]
|
|
||||||
|
|
||||||
|
|
||||||
def conv_output_size(I, F, P, S):
|
|
||||||
# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
|
|
||||||
# Output size after Conv:
|
|
||||||
# By noting I the length of the input volume size,
|
|
||||||
# F the length of the filter,
|
|
||||||
# P the amount of zero padding,
|
|
||||||
# S the stride,
|
|
||||||
# then the output size O of the feature map along that dimension is given by:
|
|
||||||
# O = (I - F + Pstart + Pend) // S + 1
|
|
||||||
# When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
|
|
||||||
# When Pstart == Pend == 0
|
|
||||||
# O = (I - F - S) // S
|
|
||||||
# https://iq.opengenus.org/output-size-of-convolution/
|
|
||||||
# Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
|
|
||||||
# Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
|
|
||||||
return (I - F + 2 * P - S) // S
|
|
||||||
|
|
||||||
|
|
||||||
class ConvBn(nn.Layer):
|
|
||||||
"""Convolution layer with batch normalization.
|
|
||||||
|
|
||||||
:param kernel_size: The x dimension of a filter kernel. Or input a tuple for
|
|
||||||
two image dimension.
|
|
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimensions.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimensions.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu
    :type act: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable

    """

    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):

        super().__init__()
        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')

        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        self.act = F.relu if act == 'relu' else brelu

    def forward(self, x, x_len):
        """
        x (Tensor): audio, shape [B, C, D, T]
        """
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)

        # output time length after striding over the time axis
        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1

        # reset the padding part to 0
        masks = make_non_pad_mask(x_len)  # [B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        # TODO(Hui Zhang): bool multiply is not supported, cast mask to x.dtype
        # masks = masks.type_as(x)
        masks = masks.astype(x.dtype)
        x = x.multiply(masks)

        return x, x_len


class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """

    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks

        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  # [D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')

        out_channel = 32
        convs = [
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ]
        self.conv_stack = nn.LayerList(convs)

        # conv output feat_dim
        output_height = (feat_size - 1) // 2 + 1
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height

    def forward(self, x, x_len):
        """
        x: shape [B, C, D, T]
        x_len: shape [B]
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        return x, x_len

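# --- Editor's sketch (not part of the original file): the helper below is
# hypothetical and just mirrors the output-length formula used in
# ConvBn.forward, with the conv_in hyper-parameters above on the time axis
# (kernel 11, stride 3, padding 5).
def conv_out_len(x_len, kernel=11, stride=3, padding=5):
    # standard convolution output-length formula
    return (x_len - kernel + 2 * padding) // stride + 1

assert conv_out_len(100) == 34  # a 100-frame utterance yields 34 frames
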
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,145 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict

import paddle
from paddle import nn
from paddle.nn import functional as F

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]


def brelu(x, t_min=0.0, t_max=24.0, name=None):
    # paddle.to_tensor is dygraph-only and can not work under JIT,
    # so build the clipping bounds with paddle.full instead.
    t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32')
    t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32')
    return x.maximum(t_min).minimum(t_max)


class GLU(nn.Layer):
    """Gated Linear Unit over a given axis (used by ConvGLUBlock below):
    splits the axis into halves a and b and returns a * sigmoid(b)."""

    def __init__(self, axis: int=-1):
        super().__init__()
        self.axis = axis

    def forward(self, xs):
        return F.glu(xs, axis=self.axis)


class LinearGLUBlock(nn.Layer):
    """A linear Gated Linear Units (GLU) block."""

    def __init__(self, idim: int):
        """GLU.
        Args:
            idim (int): input and output dimension
        """
        super().__init__()
        self.fc = nn.Linear(idim, idim * 2)

    def forward(self, xs):
        return F.glu(self.fc(xs), axis=-1)


class ConvGLUBlock(nn.Layer):
    def __init__(self, kernel_size, in_ch, out_ch, bottleneck_dim=0,
                 dropout=0.):
        """A convolutional Gated Linear Units (GLU) block.

        Args:
            kernel_size (int): kernel size
            in_ch (int): number of input channels
            out_ch (int): number of output channels
            bottleneck_dim (int): dimension of the bottleneck layers for
                computational efficiency. Defaults to 0 (no bottleneck).
            dropout (float): dropout probability. Defaults to 0.0.
        """

        super().__init__()

        self.conv_residual = None
        if in_ch != out_ch:
            self.conv_residual = nn.utils.weight_norm(
                nn.Conv2D(
                    in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)),
                name='weight',
                dim=0)
            self.dropout_residual = nn.Dropout(p=dropout)

        self.pad_left = nn.Pad2D((0, 0, kernel_size - 1, 0), value=0.)

        layers = OrderedDict()
        if bottleneck_dim == 0:
            layers['conv'] = nn.utils.weight_norm(
                nn.Conv2D(
                    in_channels=in_ch,
                    out_channels=out_ch * 2,
                    kernel_size=(kernel_size, 1)),
                name='weight',
                dim=0)
            # TODO(hirofumi0810): padding?
            layers['dropout'] = nn.Dropout(p=dropout)
            layers['glu'] = GLU(axis=1)

        elif bottleneck_dim > 0:
            layers['conv_in'] = nn.utils.weight_norm(
                nn.Conv2D(
                    in_channels=in_ch,
                    out_channels=bottleneck_dim,
                    kernel_size=(1, 1)),
                name='weight',
                dim=0)
            layers['dropout_in'] = nn.Dropout(p=dropout)
            layers['conv_bottleneck'] = nn.utils.weight_norm(
                nn.Conv2D(
                    in_channels=bottleneck_dim,
                    out_channels=bottleneck_dim,
                    kernel_size=(kernel_size, 1)),
                name='weight',
                dim=0)
            layers['dropout'] = nn.Dropout(p=dropout)
            layers['glu'] = GLU(axis=1)
            layers['conv_out'] = nn.utils.weight_norm(
                nn.Conv2D(
                    in_channels=bottleneck_dim,
                    out_channels=out_ch * 2,
                    kernel_size=(1, 1)),
                name='weight',
                dim=0)
            layers['dropout_out'] = nn.Dropout(p=dropout)

        self.layers = nn.Sequential(*layers.items())

    def forward(self, xs):
        """Forward pass.
        Args:
            xs (FloatTensor): `[B, in_ch, T, feat_dim]`
        Returns:
            out (FloatTensor): `[B, out_ch, T, feat_dim]`
        """
        residual = xs
        if self.conv_residual is not None:
            residual = self.dropout_residual(self.conv_residual(residual))
        xs = self.pad_left(xs)  # `[B, embed_dim, T+kernel-1, 1]`
        xs = self.layers(xs)  # `[B, out_ch * 2, T, 1]`
        xs = xs + residual
        return xs


def get_activation(act):
    """Return activation function."""
    # Lazy load to avoid unused import
    activation_funcs = {
        "hardtanh": paddle.nn.Hardtanh,
        "tanh": paddle.nn.Tanh,
        "relu": paddle.nn.ReLU,
        "selu": paddle.nn.SELU,
        "swish": paddle.nn.Swish,
        "gelu": paddle.nn.GELU,
        "brelu": brelu,
    }

    return activation_funcs[act]()

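# --- Editor's sketch (not part of the original file): brelu clips values into
# [t_min, t_max], while get_activation builds a layer by name.
import paddle

x = paddle.to_tensor([-3.0, 5.0, 30.0])
print(brelu(x).numpy())       # [ 0.  5. 24.]
relu = get_activation("relu")
print(relu(x).numpy())        # [ 0.  5. 30.]
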
@ -1,51 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['GlobalCMVN']


class GlobalCMVN(nn.Layer):
    def __init__(self,
                 mean: paddle.Tensor,
                 istd: paddle.Tensor,
                 norm_var: bool=True):
        """
        Args:
            mean (paddle.Tensor): mean stats
            istd (paddle.Tensor): inverse std, i.e. 1.0 / std
        """
        super().__init__()
        assert mean.shape == istd.shape
        self.norm_var = norm_var
        # The buffer can be accessed from this module using self.mean
        self.register_buffer("mean", mean)
        self.register_buffer("istd", istd)

    def forward(self, x: paddle.Tensor):
        """
        Args:
            x (paddle.Tensor): (batch, max_len, feat_dim)
        Returns:
            (paddle.Tensor): normalized feature
        """
        x = x - self.mean
        if self.norm_var:
            x = x * self.istd
        return x

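# --- Editor's sketch (not part of the original file): applying GlobalCMVN
# with made-up stats; real mean/istd come from training-set statistics.
import paddle

feat_dim = 4
mean = paddle.full([feat_dim], 2.0)
istd = paddle.full([feat_dim], 0.5)        # i.e. std == 2.0
cmvn = GlobalCMVN(mean, istd, norm_var=True)
feats = paddle.randn([8, 100, feat_dim])   # (batch, max_len, feat_dim)
normed = cmvn(feats)                       # computes (feats - 2.0) * 0.5
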
@ -1,370 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['CRF']


class CRF(nn.Layer):
    """
    Linear-chain Conditional Random Field (CRF).

    Args:
        nb_labels (int): number of labels in your tagset, including special symbols.
        bos_tag_id (int): integer representing the beginning of sentence symbol in
            your tagset.
        eos_tag_id (int): integer representing the end of sentence symbol in your tagset.
        pad_tag_id (int, optional): integer representing the pad symbol in your tagset.
            If None, the model will treat the PAD as a normal tag. Otherwise, the model
            will apply constraints for PAD transitions.
        batch_first (bool): Whether the first dimension represents the batch dimension.
    """

    def __init__(self,
                 nb_labels: int,
                 bos_tag_id: int,
                 eos_tag_id: int,
                 pad_tag_id: int=None,
                 batch_first: bool=True):
        super().__init__()

        self.nb_labels = nb_labels
        self.BOS_TAG_ID = bos_tag_id
        self.EOS_TAG_ID = eos_tag_id
        self.PAD_TAG_ID = pad_tag_id
        self.batch_first = batch_first

        # initialize transitions from a random uniform distribution between -0.1 and 0.1
        self.transitions = self.create_parameter(
            [self.nb_labels, self.nb_labels],
            default_initializer=nn.initializer.Uniform(-0.1, 0.1))
        self.init_weights()

    def init_weights(self):
        # enforce constraints (rows=from, columns=to) with a big negative number
        # so exp(-10000) will tend to zero

        # no transitions allowed to the beginning of sentence
        self.transitions[:, self.BOS_TAG_ID] = -10000.0
        # no transitions allowed from the end of sentence
        self.transitions[self.EOS_TAG_ID, :] = -10000.0

        if self.PAD_TAG_ID is not None:
            # no transitions from padding
            self.transitions[self.PAD_TAG_ID, :] = -10000.0
            # no transitions to padding
            self.transitions[:, self.PAD_TAG_ID] = -10000.0
            # except if the end of sentence is reached
            # or we are already in a pad position
            self.transitions[self.PAD_TAG_ID, self.EOS_TAG_ID] = 0.0
            self.transitions[self.PAD_TAG_ID, self.PAD_TAG_ID] = 0.0

    def forward(self,
                emissions: paddle.Tensor,
                tags: paddle.Tensor,
                mask: paddle.Tensor=None) -> paddle.Tensor:
        """Compute the negative log-likelihood. See `log_likelihood` method."""
        nll = -self.log_likelihood(emissions, tags, mask=mask)
        return nll

    def log_likelihood(self, emissions, tags, mask=None):
        """Compute the probability of a sequence of tags given a sequence of
        emission scores.

        Args:
            emissions (paddle.Tensor): Sequence of emissions for each label.
                Shape of (batch_size, seq_len, nb_labels) if batch_first is True,
                (seq_len, batch_size, nb_labels) otherwise.
            tags (paddle.LongTensor): Sequence of labels.
                Shape of (batch_size, seq_len) if batch_first is True,
                (seq_len, batch_size) otherwise.
            mask (paddle.FloatTensor, optional): Tensor representing valid positions.
                If None, all positions are considered valid.
                Shape of (batch_size, seq_len) if batch_first is True,
                (seq_len, batch_size) otherwise.

        Returns:
            paddle.Tensor: sum of the log-likelihoods for each sequence in the batch.
                Scalar, shape of ().
        """
        # fix tensor order by setting batch as the first dimension
        if not self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)

        if mask is None:
            mask = paddle.ones(emissions.shape[:2], dtype=paddle.float32)

        scores = self._compute_scores(emissions, tags, mask=mask)
        partition = self._compute_log_partition(emissions, mask=mask)
        return paddle.sum(scores - partition)

    def decode(self, emissions, mask=None):
        """Find the most probable sequence of labels given the emissions using
        the Viterbi algorithm.

        Args:
            emissions (paddle.Tensor): Sequence of emissions for each label.
                Shape (batch_size, seq_len, nb_labels) if batch_first is True,
                (seq_len, batch_size, nb_labels) otherwise.
            mask (paddle.FloatTensor, optional): Tensor representing valid positions.
                If None, all positions are considered valid.
                Shape (batch_size, seq_len) if batch_first is True,
                (seq_len, batch_size) otherwise.

        Returns:
            paddle.Tensor: the Viterbi score for each batch.
                Shape of (batch_size,)
            list of lists: the best Viterbi sequence of labels for each batch. [B, T]
        """
        # fix tensor order by setting batch as the first dimension
        if not self.batch_first:
            emissions = emissions.transpose(0, 1)

        if mask is None:
            mask = paddle.ones(emissions.shape[:2], dtype=paddle.float32)

        scores, sequences = self._viterbi_decode(emissions, mask)
        return scores, sequences

    def _compute_scores(self, emissions, tags, mask):
        """Compute the scores for a given batch of emissions with their tags.

        Args:
            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
            tags (paddle.LongTensor): (batch_size, seq_len)
            mask (paddle.FloatTensor): (batch_size, seq_len)

        Returns:
            paddle.Tensor: Scores for each batch.
                Shape of (batch_size,)
        """
        batch_size, seq_length = tags.shape
        scores = paddle.zeros([batch_size])

        # save first and last tags to be used later
        first_tags = tags[:, 0]
        last_valid_idx = mask.int().sum(1) - 1

        # TODO(Hui Zhang): fancy indexing is not supported, use gather_nd instead.
        # last_tags = tags.gather(last_valid_idx.unsqueeze(1), axis=1).squeeze()
        batch_idx = paddle.arange(batch_size, dtype=last_valid_idx.dtype)
        gather_last_valid_idx = paddle.stack(
            [batch_idx, last_valid_idx], axis=-1)
        last_tags = tags.gather_nd(gather_last_valid_idx)

        # add the transition from BOS to the first tags for each batch
        # t_scores = self.transitions[self.BOS_TAG_ID, first_tags]
        t_scores = self.transitions[self.BOS_TAG_ID].gather(first_tags)

        # add the [unary] emission scores for the first tags for each batch
        # for all batches, the first word, see the correspondent emissions
        # for the first tags (which is a list of ids):
        # emissions[:, 0, [tag_1, tag_2, ..., tag_nblabels]]
        # e_scores = emissions[:, 0].gather(1, first_tags.unsqueeze(1)).squeeze()
        gather_first_tags_idx = paddle.stack([batch_idx, first_tags], axis=-1)
        e_scores = emissions[:, 0].gather_nd(gather_first_tags_idx)

        # the score for a word is just the sum of both scores
        scores += e_scores + t_scores

        # now let's do this for each remaining word
        for i in range(1, seq_length):

            # we could iterate over batches, check if we reached a mask symbol
            # and stop the iteration, but vectorizing is faster due to gpu,
            # so instead we perform an element-wise multiplication
            is_valid = mask[:, i]

            previous_tags = tags[:, i - 1]
            current_tags = tags[:, i]

            # calculate emission and transition scores as we did before
            # e_scores = emissions[:, i].gather(1, current_tags.unsqueeze(1)).squeeze()
            gather_current_tags_idx = paddle.stack(
                [batch_idx, current_tags], axis=-1)
            e_scores = emissions[:, i].gather_nd(gather_current_tags_idx)
            # t_scores = self.transitions[previous_tags, current_tags]
            gather_transitions_idx = paddle.stack(
                [previous_tags, current_tags], axis=-1)
            t_scores = self.transitions.gather_nd(gather_transitions_idx)

            # apply the mask
            e_scores = e_scores * is_valid
            t_scores = t_scores * is_valid

            scores += e_scores + t_scores

        # add the transition from the last tag to the EOS tag for each batch
        # scores += self.transitions[last_tags, self.EOS_TAG_ID]
        scores += self.transitions.gather(last_tags)[:, self.EOS_TAG_ID]

        return scores

    def _compute_log_partition(self, emissions, mask):
        """Compute the partition function in log-space using the forward algorithm.

        Args:
            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
            mask (paddle.FloatTensor): (batch_size, seq_len)

        Returns:
            paddle.Tensor: the partition scores for each batch.
                Shape of (batch_size,)
        """
        batch_size, seq_length, nb_labels = emissions.shape

        # in the first iteration, BOS will have all the scores
        alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze(
            0) + emissions[:, 0]

        for i in range(1, seq_length):
            # (bs, nb_labels) -> (bs, 1, nb_labels)
            e_scores = emissions[:, i].unsqueeze(1)

            # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
            t_scores = self.transitions.unsqueeze(0)

            # (bs, nb_labels) -> (bs, nb_labels, 1)
            a_scores = alphas.unsqueeze(2)

            scores = e_scores + t_scores + a_scores
            new_alphas = paddle.logsumexp(scores, axis=1)

            # set alphas if the mask is valid, otherwise keep the current values
            is_valid = mask[:, i].unsqueeze(-1)
            alphas = is_valid * new_alphas + (1 - is_valid) * alphas

        # add the scores for the final transition
        last_transition = self.transitions[:, self.EOS_TAG_ID]
        end_scores = alphas + last_transition.unsqueeze(0)

        # return a *log* of sums of exps
        return paddle.logsumexp(end_scores, axis=1)

    def _viterbi_decode(self, emissions, mask):
        """Compute the Viterbi algorithm to find the most probable sequence of labels
        given a sequence of emissions.

        Args:
            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
            mask (paddle.FloatTensor): (batch_size, seq_len)

        Returns:
            paddle.Tensor: the Viterbi score for each batch.
                Shape of (batch_size,)
            list of lists of ints: the best Viterbi sequence of labels for each batch
        """
        batch_size, seq_length, nb_labels = emissions.shape

        # in the first iteration, BOS will have all the scores and then, the max
        alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze(
            0) + emissions[:, 0]

        backpointers = []

        for i in range(1, seq_length):
            # (bs, nb_labels) -> (bs, 1, nb_labels)
            e_scores = emissions[:, i].unsqueeze(1)

            # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
            t_scores = self.transitions.unsqueeze(0)

            # (bs, nb_labels) -> (bs, nb_labels, 1)
            a_scores = alphas.unsqueeze(2)

            # combine current scores with previous alphas
            scores = e_scores + t_scores + a_scores

            # so far this is exactly like the forward algorithm,
            # but now, instead of calculating the logsumexp,
            # we will find the highest score and the tag associated with it
            # max_scores, max_score_tags = paddle.max(scores, axis=1)
            max_scores = paddle.max(scores, axis=1)
            max_score_tags = paddle.argmax(scores, axis=1)

            # set alphas if the mask is valid, otherwise keep the current values
            is_valid = mask[:, i].unsqueeze(-1)
            alphas = is_valid * max_scores + (1 - is_valid) * alphas

            # add the max_score_tags to our list of backpointers
            # max_scores has shape (batch_size, nb_labels) so we transpose it to
            # be compatible with our previous loopy version of viterbi
            backpointers.append(max_score_tags.t())

        # add the scores for the final transition
        last_transition = self.transitions[:, self.EOS_TAG_ID]
        end_scores = alphas + last_transition.unsqueeze(0)

        # get the final most probable score and the final most probable tag
        # max_final_scores, max_final_tags = paddle.max(end_scores, axis=1)
        max_final_scores = paddle.max(end_scores, axis=1)
        max_final_tags = paddle.argmax(end_scores, axis=1)

        # find the best sequence of labels for each sample in the batch
        best_sequences = []
        emission_lengths = mask.int().sum(axis=1)
        for i in range(batch_size):

            # recover the original sentence length for the i-th sample in the batch
            sample_length = emission_lengths[i].item()

            # recover the max tag for the last timestep
            sample_final_tag = max_final_tags[i].item()

            # limit the backpointers until the last but one
            # since the last corresponds to the sample_final_tag
            sample_backpointers = backpointers[:sample_length - 1]

            # follow the backpointers to build the sequence of labels
            sample_path = self._find_best_path(i, sample_final_tag,
                                               sample_backpointers)

            # add this path to the list of best sequences
            best_sequences.append(sample_path)

        return max_final_scores, best_sequences

    def _find_best_path(self, sample_id, best_tag, backpointers):
        """Auxiliary function to find the best path sequence for a specific sample.

        Args:
            sample_id (int): sample index in the range [0, batch_size)
            best_tag (int): tag which maximizes the final score
            backpointers (list of lists of tensors): list of pointers with
                shape (seq_len_i-1, nb_labels, batch_size) where seq_len_i
                represents the length of the ith sample in the batch

        Returns:
            list of ints: a list of tag indexes representing the best path
        """
        # add the final best_tag to our best path
        best_path = [best_tag]

        # traverse the backpointers backwards
        for backpointers_t in reversed(backpointers):

            # recover the best_tag at this timestep
            best_tag = backpointers_t[best_tag][sample_id].item()

            # append to the beginning of the list so we don't need to reverse it later
            best_path.insert(0, best_tag)

        return best_path

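# --- Editor's sketch (not part of the original file): training loss and
# Viterbi decoding with the CRF above. The tag ids below are arbitrary choices.
import paddle

crf = CRF(nb_labels=5, bos_tag_id=0, eos_tag_id=1, pad_tag_id=2)
emissions = paddle.randn([2, 7, 5])               # (batch, seq_len, nb_labels)
tags = paddle.randint(3, 5, shape=[2, 7])         # gold tags, batch_first
mask = paddle.ones([2, 7], dtype=paddle.float32)
nll = crf(emissions, tags, mask=mask)             # negative log-likelihood
scores, paths = crf.decode(emissions, mask=mask)  # Viterbi scores and paths
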
@ -1,274 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F
from typeguard import check_argument_types

from deepspeech.modules.loss import CTCLoss
from deepspeech.utils import ctc_utils
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

try:
    from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch  # noqa: F401
    from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder  # noqa: F401
    from deepspeech.decoders.swig_wrapper import Scorer  # noqa: F401
except Exception as e:
    logger.info("ctcdecoder not installed!")

__all__ = ['CTCDecoder']


class CTCDecoder(nn.Layer):
    def __init__(self,
                 odim,
                 enc_n_units,
                 blank_id=0,
                 dropout_rate: float=0.0,
                 reduction: bool=True,
                 batch_average: bool=True):
        """CTC decoder

        Args:
            odim (int): text vocabulary size
            enc_n_units (int): encoder output dimension
            blank_id (int): blank label id. Defaults to 0.
            dropout_rate (float): dropout rate (0.0 ~ 1.0)
            reduction (bool): reduce the CTC loss into a scalar; True uses 'sum',
                False uses 'none'
            batch_average (bool): do batch dim wise average.
        """
        assert check_argument_types()
        super().__init__()

        self.blank_id = blank_id
        self.odim = odim
        self.dropout_rate = dropout_rate
        self.ctc_lo = nn.Linear(enc_n_units, self.odim)
        reduction_type = "sum" if reduction else "none"
        self.criterion = CTCLoss(
            blank=self.blank_id,
            reduction=reduction_type,
            batch_average=batch_average)

        # CTCDecoder LM Score handle
        self._ext_scorer = None

    def forward(self, hs_pad, hlens, ys_pad, ys_lens):
        """Calculate CTC loss.

        Args:
            hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D)
            hlens (Tensor): batch of lengths of hidden state sequences (B)
            ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax)
            ys_lens (Tensor): batch of lengths of character sequence (B)
        Returns:
            loss (Tensor): ctc loss value, scalar.
        """
        logits = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate))
        loss = self.criterion(logits, ys_pad, hlens, ys_lens)
        return loss

    def softmax(self, eouts: paddle.Tensor, temperature: float=1.0):
        """Get CTC probabilities.
        Args:
            eouts (FloatTensor): `[B, T, enc_units]`
        Returns:
            probs (FloatTensor): `[B, T, odim]`
        """
        self.probs = F.softmax(self.ctc_lo(eouts) / temperature, axis=2)
        return self.probs

    def log_softmax(self, hs_pad: paddle.Tensor,
                    temperature: float=1.0) -> paddle.Tensor:
        """log_softmax of frame activations
        Args:
            hs_pad (Tensor): 3d tensor (B, Tmax, eprojs)
        Returns:
            paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
        """
        return F.log_softmax(self.ctc_lo(hs_pad) / temperature, axis=2)

    def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
        """argmax of frame activations
        Args:
            hs_pad (paddle.Tensor): 3d tensor (B, Tmax, eprojs)
        Returns:
            paddle.Tensor: argmax applied 2d tensor (B, Tmax)
        """
        return paddle.argmax(self.ctc_lo(hs_pad), axis=2)

    def forced_align(self,
                     ctc_probs: paddle.Tensor,
                     y: paddle.Tensor,
                     blank_id=0) -> list:
        """ctc forced alignment.
        Args:
            ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D)
            y (paddle.Tensor): label id sequence tensor, 1d tensor (L)
            blank_id (int): blank symbol index
        Returns:
            paddle.Tensor: best alignment result, (T).
        """
        return ctc_utils.forced_align(ctc_probs, y, blank_id)

    def _decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrices, each consisting
                            of prob vectors for one speech utterance.
        :type probs_split: list
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of str
        """
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(
                probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
            results.append(output_transcription)
        return results

    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                         vocab_list):
        """Initialize the external scorer.
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: str|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        # init once
        if self._ext_scorer is not None:
            return

        if language_model_path != '':
            logger.info("begin to initialize the external scorer "
                        "for decoding")
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path, vocab_list)
            lm_char_based = self._ext_scorer.is_character_based()
            lm_max_order = self._ext_scorer.get_max_order()
            lm_dict_size = self._ext_scorer.get_dict_size()
            logger.info("language model: "
                        "is_character_based = %d," % lm_char_based +
                        " max_order = %d," % lm_max_order + " dict_size = %d" %
                        lm_dict_size)
            logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            logger.info("no language model provided, "
                        "decoding by pure beam search without scorer.")

    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                  beam_size, cutoff_prob, cutoff_top_n,
                                  vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrices, each consisting
                            of prob vectors for one speech utterance.
        :type probs_split: list
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                             characters with the highest probs in the vocabulary
                             will be used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of str
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)

        # beam search decode
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            blank_id=self.blank_id)

        results = [result[0][1] for result in beam_search_results]
        return results

    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                    decoding_method):

        if decoding_method == "ctc_beam_search":
            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                  vocab_list)

    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                     lang_model_path, beam_alpha, beam_beta, beam_size,
                     cutoff_prob, cutoff_top_n, num_processes):
        """ctc decoding with probs.

        Args:
            probs (Tensor): activation after softmax
            logits_lens (Tensor): audio output lens
            vocab_list (list): list of tokens in the vocabulary, for decoding
            decoding_method (str): "ctc_greedy" or "ctc_beam_search"
            lang_model_path (str): filepath for language model
            beam_alpha (float): parameter associated with language model
            beam_beta (float): parameter associated with word count
            beam_size (int): width for beam search
            cutoff_prob (float): cutoff probability in pruning
            cutoff_top_n (int): cutoff number in pruning
            num_processes (int): number of processes (CPU) for decoder

        Raises:
            ValueError: when decoding_method is not supported.

        Returns:
            List[str]: transcripts.
        """

        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
        if decoding_method == "ctc_greedy":
            result_transcripts = self._decode_batch_greedy(
                probs_split=probs_split, vocab_list=vocab_list)
        elif decoding_method == "ctc_beam_search":
            result_transcripts = self._decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=beam_alpha,
                beam_beta=beam_beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=num_processes)
        else:
            raise ValueError(
                f"Decoding method not supported: {decoding_method}")
        return result_transcripts

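# --- Editor's sketch (not part of the original file): greedy decoding with
# CTCDecoder; this needs the ctcdecoder extension noted in the try/except
# above, and the 30-token vocabulary below is made up.
import paddle

vocab_list = [chr(ord('a') + i) for i in range(26)] + [' ', "'", '-', '.']
decoder = CTCDecoder(odim=len(vocab_list), enc_n_units=256, blank_id=0)
eouts = paddle.randn([2, 50, 256])        # (B, T, enc_units)
probs = decoder.softmax(eouts)            # (B, T, odim)
texts = decoder.decode_probs(
    probs.numpy(), logits_lens=[50, 42], vocab_list=vocab_list,
    decoding_method="ctc_greedy", lang_model_path="", beam_alpha=1.9,
    beam_beta=0.3, beam_size=10, cutoff_prob=1.0, cutoff_top_n=40,
    num_processes=1)
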
@ -1,182 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder definition."""
from typing import List
from typing import Optional
from typing import Tuple

import paddle
from paddle import nn
from typeguard import check_argument_types

from deepspeech.modules.attention import MultiHeadedAttention
from deepspeech.modules.decoder_layer import DecoderLayer
from deepspeech.modules.embedding import PositionalEncoding
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.modules.mask import subsequent_mask
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ["TransformerDecoder"]


class TransformerDecoder(nn.Layer):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of hidden units of the position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type, `embed`
        use_output_layer: whether to use output layer
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        concat_after: whether to concat attention layer's input and output
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            vocab_size: int,
            encoder_output_size: int,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            self_attention_dropout_rate: float=0.0,
            src_attention_dropout_rate: float=0.0,
            input_layer: str="embed",
            use_output_layer: bool=True,
            normalize_before: bool=True,
            concat_after: bool=False, ):

        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(vocab_size, attention_dim),
                PositionalEncoding(attention_dim, positional_dropout_rate), )
        else:
            raise ValueError(f"only 'embed' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
        self.use_output_layer = use_output_layer
        self.output_layer = nn.Linear(attention_dim, vocab_size)

        self.decoders = nn.LayerList([
            DecoderLayer(
                size=attention_dim,
                self_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                               self_attention_dropout_rate),
                src_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                              src_attention_dropout_rate),
                feed_forward=PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate),
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after, ) for _ in range(num_blocks)
        ])

    def forward(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            ys_in_pad: paddle.Tensor,
            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
        Returns:
            (tuple): tuple containing:
                x: decoded token scores before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        tgt = ys_in_pad
        # tgt_mask: (B, 1, L)
        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
        # tgt_mask: (B, L, L)
        # TODO(Hui Zhang): & is not supported for bool tensors, use logical_and
        # tgt_mask = tgt_mask & m
        tgt_mask = tgt_mask.logical_and(m)

        x, _ = self.embed(tgt)
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)

        # TODO(Hui Zhang): reduce_sum does not support bool type
        # olens = tgt_mask.sum(1)
        olens = tgt_mask.astype(paddle.int).sum(1)
        return x, olens

    def forward_one_step(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            cache: Optional[List[paddle.Tensor]]=None,
    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out, maxlen_out)
                dtype=paddle.bool
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(
                x, tgt_mask, memory, memory_mask, cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = paddle.log_softmax(self.output_layer(y), axis=-1)
        return y, new_cache

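# --- Editor's sketch (not part of the original file): a teacher-forcing call
# into TransformerDecoder; shapes follow the forward() docstring above and the
# values are random.
import paddle

decoder = TransformerDecoder(vocab_size=100, encoder_output_size=256)
memory = paddle.randn([2, 30, 256])                  # encoder output
memory_mask = paddle.ones([2, 1, 30], dtype='bool')  # all frames valid
ys_in_pad = paddle.randint(0, 100, shape=[2, 12])    # <sos>-prefixed targets
ys_in_lens = paddle.to_tensor([12, 9])
logits, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
# logits: (2, 12, 100) token scores before softmax, per the docstring
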
@ -1,151 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional
from typing import Tuple

import paddle
from paddle import nn

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ["DecoderLayer"]


class DecoderLayer(nn.Layer):
    """Single decoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (nn.Layer): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (nn.Layer): Source-attention module instance, attending over
            the encoder output.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (nn.Layer): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
        concat_after (bool): Whether to concat attention layer's input
            and output.
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            size: int,
            self_attn: nn.Layer,
            src_attn: nn.Layer,
            feed_forward: nn.Layer,
            dropout_rate: float,
            normalize_before: bool=True,
            concat_after: bool=False, ):
        """Construct a DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm3 = nn.LayerNorm(size, epsilon=1e-12)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)

    def forward(
            self,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            cache: Optional[paddle.Tensor]=None
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute decoded features.
        Args:
            tgt (paddle.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (paddle.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (paddle.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (paddle.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (paddle.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).
        Returns:
            paddle.Tensor: Output tensor (#batch, maxlen_out, size).
            paddle.Tensor: Mask for output tensor (#batch, maxlen_out).
            paddle.Tensor: Encoded memory (#batch, maxlen_in, size).
            paddle.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame's query, keeping dim: max_time_out -> 1
            assert cache.shape == [
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            # TODO(Hui Zhang): slice does not support bool type
            # tgt_q_mask = tgt_mask[:, -1:, :]
            tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast(
                paddle.bool)

        if self.concat_after:
            tgt_concat = paddle.cat(
                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1)
            x = residual + self.concat_linear1(tgt_concat)
        else:
            x = residual + self.dropout(
                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask))
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        if self.concat_after:
            x_concat = paddle.cat(
                (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1)
            x = residual + self.concat_linear2(x_concat)
        else:
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask))
        if not self.normalize_before:
            x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = paddle.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask

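# --- Editor's sketch (not part of the original file): wiring one DecoderLayer
# from the building blocks it expects, using the module paths imported by
# decoder.py above.
from deepspeech.modules.attention import MultiHeadedAttention
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward

size, heads = 256, 4
layer = DecoderLayer(
    size=size,
    self_attn=MultiHeadedAttention(heads, size, 0.0),
    src_attn=MultiHeadedAttention(heads, size, 0.0),
    feed_forward=PositionwiseFeedForward(size, 2048, 0.1),
    dropout_rate=0.1,
    normalize_before=True)  # pre-norm: norm -> sublayer -> dropout -> residual
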
@ -1,453 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Encoder definition."""
|
|
||||||
from typing import List
|
|
||||||
from typing import Optional
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
from paddle import nn
|
|
||||||
from typeguard import check_argument_types
|
|
||||||
|
|
||||||
from deepspeech.modules.activation import get_activation
|
|
||||||
from deepspeech.modules.attention import MultiHeadedAttention
|
|
||||||
from deepspeech.modules.attention import RelPositionMultiHeadedAttention
|
|
||||||
from deepspeech.modules.conformer_convolution import ConvolutionModule
|
|
||||||
from deepspeech.modules.embedding import PositionalEncoding
|
|
||||||
from deepspeech.modules.embedding import RelPositionalEncoding
|
|
||||||
from deepspeech.modules.encoder_layer import ConformerEncoderLayer
|
|
||||||
from deepspeech.modules.encoder_layer import TransformerEncoderLayer
|
|
||||||
from deepspeech.modules.mask import add_optional_chunk_mask
|
|
||||||
from deepspeech.modules.mask import make_non_pad_mask
|
|
||||||
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
|
|
||||||
from deepspeech.modules.subsampling import Conv2dSubsampling4
|
|
||||||
from deepspeech.modules.subsampling import Conv2dSubsampling6
|
|
||||||
from deepspeech.modules.subsampling import Conv2dSubsampling8
|
|
||||||
from deepspeech.modules.subsampling import LinearNoSubsampling
|
|
||||||
from deepspeech.utils.log import Log
|
|
||||||
|
|
||||||
logger = Log(__name__).getlog()

__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"]


class BaseEncoder(nn.Layer):
    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: paddle.nn.Layer=None,
            use_dynamic_left_chunk: bool=False, ):
        """
        Args:
            input_size (int): input dim, d_feature
            output_size (int): dimension of attention, d_model
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of encoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                optional [abs_pos, scaled_abs_pos, rel_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            concat_after (bool): whether to concat attention layer's input
                and output.
                True: x -> x + linear(concat(x, att(x)))
                False: x -> x + att(x)
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether to use dynamic chunk size for
                training or not. You can only use a fixed chunk
                (chunk_size > 0) or a dynamic chunk size
                (use_dynamic_chunk = True).
            global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
            use_dynamic_left_chunk (bool): whether to use dynamic left chunk in
                dynamic chunk training
        """
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size

        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            pos_enc_class = RelPositionalEncoding
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        if input_layer == "linear":
            subsampling_class = LinearNoSubsampling
        elif input_layer == "conv2d":
            subsampling_class = Conv2dSubsampling4
        elif input_layer == "conv2d6":
            subsampling_class = Conv2dSubsampling6
        elif input_layer == "conv2d8":
            subsampling_class = Conv2dSubsampling8
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        self.global_cmvn = global_cmvn
        self.embed = subsampling_class(
            idim=input_size,
            odim=output_size,
            dropout_rate=dropout_rate,
            pos_enc_class=pos_enc_class(
                d_model=output_size, dropout_rate=positional_dropout_rate), )

        self.normalize_before = normalize_before
        self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk

    def output_size(self) -> int:
        return self._output_size

    def forward(
            self,
            xs: paddle.Tensor,
            xs_lens: paddle.Tensor,
            decoding_chunk_size: int=0,
            num_decoding_left_chunks: int=-1,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Embed positions in tensor.
        Args:
            xs: padded input tensor (B, L, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor and output mask
        """
        masks = make_non_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)

        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
        masks = masks.astype(paddle.bool)
        #TODO(Hui Zhang): mask_pad = ~masks
        mask_pad = masks.logical_not()
        chunk_masks = add_optional_chunk_mask(
            xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
            decoding_chunk_size, self.static_chunk_size,
            num_decoding_left_chunks)
        for layer in self.encoders:
            xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks

    def forward_chunk(
            self,
            xs: paddle.Tensor,
            offset: int,
            required_cache_size: int,
            subsampling_cache: Optional[paddle.Tensor]=None,
            elayers_output_cache: Optional[List[paddle.Tensor]]=None,
            conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
    ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
            paddle.Tensor]]:
        """ Forward just one chunk
        Args:
            xs (paddle.Tensor): chunk input, [B=1, T, D]
            offset (int): current offset in encoder output time stamp
            required_cache_size (int): cache size required for next chunk
                computation
                >=0: actual cache size
                <0: means all history cache is required
            subsampling_cache (Optional[paddle.Tensor]): subsampling cache
            elayers_output_cache (Optional[List[paddle.Tensor]]):
                transformer/conformer encoder layers output cache
            conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
                cnn cache
        Returns:
            paddle.Tensor: output of current input xs
            paddle.Tensor: subsampling cache required for next chunk computation
            List[paddle.Tensor]: encoder layers output cache required for next
                chunk computation
            List[paddle.Tensor]: conformer cnn cache
        """
        assert xs.size(0) == 1  # batch size must be one
        # tmp_masks is just for interface compatibility
        # TODO(Hui Zhang): stride_slice not support bool tensor
        # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32)
        tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]

        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)

        xs, pos_emb, _ = self.embed(
            xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)

        if subsampling_cache is not None:
            cache_size = subsampling_cache.size(1)  #T
            xs = paddle.cat((subsampling_cache, xs), dim=1)
        else:
            cache_size = 0

        # only used when using `RelPositionMultiHeadedAttention`
        pos_emb = self.embed.position_encoding(
            offset=offset - cache_size, size=xs.size(1))

        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
            next_cache_start = xs.size(1)
        else:
            next_cache_start = xs.size(1) - required_cache_size
        r_subsampling_cache = xs[:, next_cache_start:, :]

        # Real mask for transformer/conformer layers
        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
        masks = masks.unsqueeze(1)  #[B=1, L'=1, T]
        r_elayers_output_cache = []
        r_conformer_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            attn_cache = None if elayers_output_cache is None else elayers_output_cache[
                i]
            cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
                i]
            xs, _, new_cnn_cache = layer(
                xs,
                masks,
                pos_emb,
                output_cache=attn_cache,
                cnn_cache=cnn_cache)
            r_elayers_output_cache.append(xs[:, next_cache_start:, :])
            r_conformer_cnn_cache.append(new_cnn_cache)
        if self.normalize_before:
            xs = self.after_norm(xs)

        return (xs[:, cache_size:, :], r_subsampling_cache,
                r_elayers_output_cache, r_conformer_cnn_cache)

    def forward_chunk_by_chunk(
            self,
            xs: paddle.Tensor,
            decoding_chunk_size: int,
            num_decoding_left_chunks: int=-1,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """ Forward input chunk by chunk with chunk_size like a streaming
            fashion.
        Here we should pay special attention to the computation cache in the
        streaming style forward chunk by chunk. Three things should be taken
        into account for computation in the current network:
            1. transformer/conformer encoder layers output cache
            2. convolution in conformer
            3. convolution in subsampling
        However, we don't implement subsampling cache because:
            1. We can make the subsampling module output the right result by
               overlapping input instead of caching left context, even though
               it wastes some computation; subsampling only takes a very
               small fraction of the computation in the whole model.
            2. Typically, there are several convolution layers with
               subsampling in the subsampling module, so it is tricky and
               complicated to cache across convolution layers with different
               subsampling rates.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling; we would need to rewrite it to make it
               work with cache, which is not preferred.
        Args:
            xs (paddle.Tensor): (1, max_len, dim)
            decoding_chunk_size (int): decoding chunk size.
            num_decoding_left_chunks (int): decoding with num left chunks.
        """
        assert decoding_chunk_size > 0
        # The model is trained by static or dynamic chunk
        assert self.static_chunk_size > 0 or self.use_dynamic_chunk

        # feature stride and window for `subsampling` module
        subsampling = self.embed.subsampling_rate
        context = self.embed.right_context + 1  # Add current frame
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context

        num_frames = xs.size(1)
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
        subsampling_cache: Optional[paddle.Tensor] = None
        elayers_output_cache: Optional[List[paddle.Tensor]] = None
        conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
        outputs = []
        offset = 0
        # Feed forward overlap input step by step
        for cur in range(0, num_frames - context + 1, stride):
            end = min(cur + decoding_window, num_frames)
            chunk_xs = xs[:, cur:end, :]
            (y, subsampling_cache, elayers_output_cache,
             conformer_cnn_cache) = self.forward_chunk(
                 chunk_xs, offset, required_cache_size, subsampling_cache,
                 elayers_output_cache, conformer_cnn_cache)
            outputs.append(y)
            offset += y.size(1)
        ys = paddle.cat(outputs, 1)
        # fake mask, just for jit script and compatibility with `forward` api
        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
        masks = masks.unsqueeze(1)
        return ys, masks
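
    # A streaming sketch (illustrative; the concrete numbers assume a conv2d
    # subsampling of rate 4 with right_context 6): with decoding_chunk_size=16,
    # each step consumes stride = 4 * 16 = 64 new frames through a window of
    # (16 - 1) * 4 + 6 + 1 = 67 frames, so consecutive windows overlap by
    # context - subsampling frames instead of caching inside `embed`.
    # >>> ys, masks = encoder.forward_chunk_by_chunk(feats, 16)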


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: nn.Layer=None,
            use_dynamic_left_chunk: bool=False, ):
        """ Construct TransformerEncoder
        See Encoder for the meaning of each parameter.
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
                         pos_enc_layer_type, normalize_before, concat_after,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk)
        self.encoders = nn.LayerList([
            TransformerEncoderLayer(
                size=output_size,
                self_attn=MultiHeadedAttention(attention_heads, output_size,
                                               attention_dropout_rate),
                feed_forward=PositionwiseFeedForward(output_size, linear_units,
                                                     dropout_rate),
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after) for _ in range(num_blocks)
        ])


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="rel_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: nn.Layer=None,
            use_dynamic_left_chunk: bool=False,
            positionwise_conv_kernel_size: int=1,
            macaron_style: bool=True,
            selfattention_layer_type: str="rel_selfattn",
            activation_type: str="swish",
            use_cnn_module: bool=True,
            cnn_module_kernel: int=15,
            causal: bool=False,
            cnn_module_norm: str="batch_norm", ):
        """Construct ConformerEncoder
        Args:
            input_size to use_dynamic_chunk, see BaseEncoder
            positionwise_conv_kernel_size (int): Kernel size of positionwise
                conv1d layer.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now; it is kept only for
                configuration compatibility.
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
                         pos_enc_layer_type, normalize_before, concat_after,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk)
        activation = get_activation(activation_type)

        # self-attention module definition
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (attention_heads, output_size,
                                       attention_dropout_rate)
        # feed-forward module definition
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate,
                                   activation)
        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        self.encoders = nn.LayerList([
            ConformerEncoderLayer(
                size=output_size,
                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
                feed_forward=positionwise_layer(*positionwise_layer_args),
                feed_forward_macaron=positionwise_layer(
                    *positionwise_layer_args) if macaron_style else None,
                conv_module=convolution_layer(*convolution_layer_args)
                if use_cnn_module else None,
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after) for _ in range(num_blocks)
        ])
@ -1,144 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
from paddle.nn import functional as F

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['CTCLoss', "LabelSmoothingLoss"]


class CTCLoss(nn.Layer):
    def __init__(self, blank=0, reduction='sum', batch_average=False):
        super().__init__()
        # last token id as blank id
        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
        self.batch_average = batch_average

    def forward(self, logits, ys_pad, hlens, ys_lens):
        """Compute CTC loss.

        Args:
            logits ([paddle.Tensor]): [B, Tmax, D]
            ys_pad ([paddle.Tensor]): [B, Tmax]
            hlens ([paddle.Tensor]): [B]
            ys_lens ([paddle.Tensor]): [B]

        Returns:
            [paddle.Tensor]: scalar. If reduction is 'none', then (N), where
                N = batch size.
        """
        B = paddle.shape(logits)[0]
        # warp-ctc needs logits and does softmax on them itself
        # warp-ctc needs activations with shape [T, B, V + 1]
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
        # (TODO: Hui Zhang) ctc loss does not support int64 labels
        ys_pad = ys_pad.astype(paddle.int32)
        loss = self.loss(
            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
        if self.batch_average:
            # Batch-size average
            loss = loss / B
        return loss
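
    # Shape sketch (illustrative values): logits (B, Tmax, D) from the model,
    # ys_pad (B, Tmax) padded label ids, hlens/ys_lens the true lengths.
    # >>> ctc = CTCLoss(blank=0, reduction='sum', batch_average=True)
    # >>> loss = ctc(logits, ys_pad, hlens, ys_lens)  # summed, then / B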


class LabelSmoothingLoss(nn.Layer):
    """Label-smoothing loss.
    In a standard CE loss, the label's data distribution is:
        [0,1,2] ->
        [
            [1.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
            [0.0, 0.0, 1.0],
        ]
    In the label-smoothed CE loss, some probability mass is taken from the
    true label's probability (1.0) and distributed among the other labels.
    e.g.
    smoothing=0.1
        [0,1,2] ->
        [
            [0.9, 0.05, 0.05],
            [0.05, 0.9, 0.05],
            [0.05, 0.05, 0.9],
        ]

    """

    def __init__(self,
                 size: int,
                 padding_idx: int,
                 smoothing: float,
                 normalize_length: bool=False):
        """Label-smoothing loss.

        Args:
            size (int): the number of classes
            padding_idx (int): padding class id which will be ignored for loss
            smoothing (float): smoothing rate (0.0 means the conventional CE)
            normalize_length (bool):
                True, normalize loss by sequence length;
                False, normalize loss by batch size.
                Defaults to False.
        """
        super().__init__()
        self.size = size
        self.padding_idx = padding_idx
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.normalize_length = normalize_length
        self.criterion = nn.KLDivLoss(reduction="none")

    def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:
        """Compute loss between x and target.
        The model output and data label tensors are flattened to
        (batch*seqlen, class) shape and a mask is applied to the
        padding part which should not be calculated for loss.

        Args:
            x (paddle.Tensor): prediction (batch, seqlen, class)
            target (paddle.Tensor):
                target signal masked with self.padding_idx (batch, seqlen)
        Returns:
            loss (paddle.Tensor) : The KL loss, scalar float value
        """
        B, T, D = paddle.shape(x)
        assert D == self.size
        x = x.reshape((-1, self.size))
        target = target.reshape([-1])

        # build true_dist with full_like instead of under torch.no_grad(),
        # since no_grad() can not be exported by JIT
        true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
        ignore = target == self.padding_idx  # (B,)

        # target = target * (1 - ignore)  # avoid -1 index
        target = target.masked_fill(ignore, 0)  # avoid -1 index
        # true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        target_mask = F.one_hot(target, self.size)
        true_dist *= (1 - target_mask)
        true_dist += target_mask * self.confidence

        kl = self.criterion(F.log_softmax(x, axis=1), true_dist)

        #TODO(Hui Zhang): sum not support bool type
        #total = len(target) - int(ignore.sum())
        total = len(target) - int(ignore.type_as(target).sum())
        denom = total if self.normalize_length else B
        #numer = (kl * (1 - ignore)).sum()
        numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()
        return numer / denom
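
    # Worked example (illustrative): with size=3, padding_idx=-1 and
    # smoothing=0.1, target id 1 becomes the soft row [0.05, 0.9, 0.05]:
    # full_like fills 0.1 / (3 - 1) = 0.05 everywhere, then the one-hot
    # position is overwritten with confidence 1 - 0.1 = 0.9, matching the
    # docstring table above.
    # >>> loss_fn = LabelSmoothingLoss(size=3, padding_idx=-1, smoothing=0.1)
    # >>> loss = loss_fn(logits, target)  # logits: (B, T, 3), target: (B, T)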
@ -1,260 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = [
    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
    "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
    "mask_finished_preds"
]


def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
    """Make mask tensor containing indices of padded part.
    See description of make_non_pad_mask.
    Args:
        lengths (paddle.Tensor): Batch of lengths (B,).
    Returns:
        paddle.Tensor: Mask tensor containing indices of padded part.
    Examples:
        >>> lengths = [5, 3, 2]
        >>> make_pad_mask(lengths)
        masks = [[0, 0, 0, 0 ,0],
                 [0, 0, 0, 1, 1],
                 [0, 0, 1, 1, 1]]
    """
    # (TODO: Hui Zhang): jit not support Tensor.dim() and Tensor.ndim
    # assert lengths.dim() == 1
    batch_size = int(lengths.shape[0])
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
    seq_range_expand = seq_range.unsqueeze(0).expand([batch_size, max_len])
    seq_length_expand = lengths.unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand
    return mask


def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
    """Make mask tensor containing indices of non-padded part.
    The sequences in a batch may have different lengths. To enable
    batch computing, padding is needed to make all sequences the same
    size. To prevent the padding part from passing values into
    context-dependent blocks such as attention or convolution, the
    padding part is masked.
    This pad_mask is used in both encoder and decoder.
    1 for non-padded part and 0 for padded part.
    Args:
        lengths (paddle.Tensor): Batch of lengths (B,).
    Returns:
        paddle.Tensor: mask tensor containing indices of non-padded part.
    Examples:
        >>> lengths = [5, 3, 2]
        >>> make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1 ,1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
    """
    #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~
    return make_pad_mask(lengths).logical_not()


def subsequent_mask(size: int) -> paddle.Tensor:
    """Create mask for subsequent steps (size, size).
    This mask is used only in the decoder, which works in an auto-regressive
    mode. This means the current step could only do attention with its left
    steps.
    In the encoder, full attention is used when streaming is not necessary
    and the sequence is not long. In this case, no attention mask is needed.
    When streaming is needed, chunk-based attention is used in the encoder.
    See subsequent_chunk_mask for the chunk-based attention mask.
    Args:
        size (int): size of mask
    Returns:
        paddle.Tensor: mask, [size, size]
    Examples:
        >>> subsequent_mask(3)
        [[1, 0, 0],
         [1, 1, 0],
         [1, 1, 1]]
    """
    ret = paddle.ones([size, size], dtype=paddle.bool)
    #TODO(Hui Zhang): tril not support bool
    #return paddle.tril(ret)
    ret = ret.astype(paddle.float)
    ret = paddle.tril(ret)
    ret = ret.astype(paddle.bool)
    return ret


def subsequent_chunk_mask(
        size: int,
        chunk_size: int,
        num_left_chunks: int=-1, ) -> paddle.Tensor:
    """Create mask for subsequent steps (size, size) with chunk size,
       this is for streaming encoder
    Args:
        size (int): size of mask
        chunk_size (int): size of chunk
        num_left_chunks (int): number of left chunks
            <0: use full chunk
            >=0: use num_left_chunks
    Returns:
        paddle.Tensor: mask, [size, size]
    Examples:
        >>> subsequent_chunk_mask(4, 2)
        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]
    """
    ret = paddle.zeros([size, size], dtype=paddle.bool)
    for i in range(size):
        if num_left_chunks < 0:
            start = 0
        else:
            start = max(0, (i // chunk_size - num_left_chunks) * chunk_size)
        ending = min(size, (i // chunk_size + 1) * chunk_size)
        ret[i, start:ending] = True
    return ret
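
# Left-context sketch (illustrative): with a bounded history of one chunk,
# >>> subsequent_chunk_mask(6, 2, num_left_chunks=1)
# row i=4 covers columns 2..5 only, since start = (4 // 2 - 1) * 2 = 2 and
# ending = (4 // 2 + 1) * 2 = 6; older frames are masked out for streaming.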


def add_optional_chunk_mask(xs: paddle.Tensor,
                            masks: paddle.Tensor,
                            use_dynamic_chunk: bool,
                            use_dynamic_left_chunk: bool,
                            decoding_chunk_size: int,
                            static_chunk_size: int,
                            num_decoding_left_chunks: int):
    """ Apply optional mask for encoder.
    Args:
        xs (paddle.Tensor): padded input, (B, L, D), L for max length
        masks (paddle.Tensor): mask for xs, (B, 1, L)
        use_dynamic_chunk (bool): whether to use dynamic chunk or not
        use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
            training.
        decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
        static_chunk_size (int): chunk size for static chunk training/decoding,
            used when it is greater than 0; if use_dynamic_chunk is true,
            this parameter will be ignored.
        num_decoding_left_chunks (int): number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
            >=0: use num_decoding_left_chunks
            <0: use all left chunks
    Returns:
        paddle.Tensor: chunk mask of the input xs.
    """
    # Whether to use chunk mask or not
    if use_dynamic_chunk:
        max_len = xs.shape[1]
        if decoding_chunk_size < 0:
            chunk_size = max_len
            num_left_chunks = -1
        elif decoding_chunk_size > 0:
            chunk_size = decoding_chunk_size
            num_left_chunks = num_decoding_left_chunks
        else:
            # chunk size is either [1, 25] or full context(max_len).
            # Since we use 4 times subsampling and allow up to 1s(100 frames)
            # delay, the maximum frame is 100 / 4 = 25.
            chunk_size = int(paddle.randint(1, max_len, (1, )))
            num_left_chunks = -1
            if chunk_size > max_len // 2:
                chunk_size = max_len
            else:
                chunk_size = chunk_size % 25 + 1
                if use_dynamic_left_chunk:
                    max_left_chunks = (max_len - 1) // chunk_size
                    num_left_chunks = int(
                        paddle.randint(0, max_left_chunks, (1, )))
        chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
        # chunk_masks = masks & chunk_masks  # (B, L, L)
        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
    elif static_chunk_size > 0:
        num_left_chunks = num_decoding_left_chunks
        chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
        # chunk_masks = masks & chunk_masks  # (B, L, L)
        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
    else:
        chunk_masks = masks
    return chunk_masks
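
# Training-time sampling sketch (illustrative): with decoding_chunk_size == 0
# and max_len = 80, a draw of chunk_size = 53 exceeds 80 // 2 and becomes full
# context (chunk_size = 80), while a draw of 30 becomes 30 % 25 + 1 = 6, so a
# dynamic-chunk model sees both streaming-sized and full-context masks.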


def mask_finished_scores(score: paddle.Tensor,
                         flag: paddle.Tensor) -> paddle.Tensor:
    """
    If a sequence is finished, we only allow one alive branch. This function
    aims to give one branch a zero score and the rest -inf score.
    Args:
        score (paddle.Tensor): A real value array with shape
            (batch_size * beam_size, beam_size).
        flag (paddle.Tensor): A bool array with shape
            (batch_size * beam_size, 1).
    Returns:
        paddle.Tensor: (batch_size * beam_size, beam_size).
    Examples:
        flag: tensor([[ True],
                      [False]])
        score: tensor([[-0.3666, -0.6664,  0.6019],
                       [-1.1490, -0.2948,  0.7460]])
        unfinished: tensor([[False,  True,  True],
                            [False, False, False]])
        finished: tensor([[ True, False, False],
                          [False, False, False]])
        return: tensor([[ 0.0000,    -inf,    -inf],
                        [-1.1490, -0.2948,  0.7460]])
    """
    beam_size = score.shape[-1]
    zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)
    if beam_size > 1:
        unfinished = paddle.concat(
            (zero_mask, flag.tile([1, beam_size - 1])), axis=1)
        finished = paddle.concat(
            (flag, zero_mask.tile([1, beam_size - 1])), axis=1)
    else:
        unfinished = zero_mask
        finished = flag

    # infs = paddle.ones_like(score) * -float('inf')
    # score = paddle.where(unfinished, infs, score)
    # score = paddle.where(finished, paddle.zeros_like(score), score)
    score.masked_fill_(unfinished, -float('inf'))
    score.masked_fill_(finished, 0)
    return score


def mask_finished_preds(pred: paddle.Tensor, flag: paddle.Tensor,
                        eos: int) -> paddle.Tensor:
    """
    If a sequence is finished, all of its branches should be <eos>.
    Args:
        pred (paddle.Tensor): An int array with shape
            (batch_size * beam_size, beam_size).
        flag (paddle.Tensor): A bool array with shape
            (batch_size * beam_size, 1).
    Returns:
        paddle.Tensor: (batch_size * beam_size, beam_size).
    """
    beam_size = pred.shape[-1]
    # use tile, mirroring mask_finished_scores above; paddle has no
    # torch-style Tensor.repeat
    finished = flag.tile([1, beam_size])
    return pred.masked_fill_(finished, eos)
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,54 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module provides functions to calculate the BLEU score at different
levels, e.g. bleu for word level, char_bleu for char level.
"""
import sacrebleu

__all__ = ['bleu', 'char_bleu']


def bleu(hypothesis, reference):
    """Calculate BLEU. BLEU compares reference text and
    hypothesis text at word level using sacrebleu.

    :param reference: The reference sentences.
    :type reference: list[list[str]]
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list[str]
    :raises ValueError: If the reference length is zero.
    """

    return sacrebleu.corpus_bleu(hypothesis, reference)


def char_bleu(hypothesis, reference):
    """Calculate BLEU. BLEU compares reference text and
    hypothesis text at char level using sacrebleu.

    :param reference: The reference sentences.
    :type reference: list[list[str]]
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list[str]
    :raises ValueError: If the reference number is zero.
    """
    hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
    reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
                 for ref in reference]

    return sacrebleu.corpus_bleu(hypothesis, reference)
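
# Preprocessing sketch (illustrative): char_bleu first strips spaces, then
# rejoins every character with a single space, so sacrebleu's word tokenizer
# sees characters as tokens, e.g. "今天 天气" -> "今 天 天 气".
# >>> char_bleu(["今天天气"], [["今天 天气"]])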
@ -1,298 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import json
import os
import re
from pathlib import Path
from typing import Mapping  # needed by the best_records annotation below
from typing import Text
from typing import Union

import paddle
from paddle import distributed as dist
from paddle.optimizer import Optimizer

from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ["Checkpoint"]


class Checkpoint():
    def __init__(self, kbest_n: int=5, latest_n: int=1):
        self.best_records: Mapping[Path, float] = {}
        self.latest_records = []
        self.kbest_n = kbest_n
        self.latest_n = latest_n
        self._save_all = (kbest_n == -1)

    def add_checkpoint(self,
                       checkpoint_dir,
                       tag_or_iteration: Union[int, Text],
                       model: paddle.nn.Layer,
                       optimizer: Optimizer=None,
                       infos: dict=None,
                       metric_type="val_loss"):
        """Save checkpoint in best_n and latest_n.

        Args:
            checkpoint_dir (str): the directory where checkpoint is saved.
            tag_or_iteration (int or str): the latest iteration(step or epoch) number or tag.
            model (Layer): model to be checkpointed.
            optimizer (Optimizer, optional): optimizer to be checkpointed.
            infos (dict or None): any info you want to save.
            metric_type (str, optional): metric type. Defaults to "val_loss".
        """
        # guard against infos=None, which has no keys to look up
        if infos is None or metric_type not in infos:
            self._save_parameters(checkpoint_dir, tag_or_iteration, model,
                                  optimizer, infos)
            return

        #save best
        if self._should_save_best(infos[metric_type]):
            self._save_best_checkpoint_and_update(
                infos[metric_type], checkpoint_dir, tag_or_iteration, model,
                optimizer, infos)
        #save latest
        self._save_latest_checkpoint_and_update(
            checkpoint_dir, tag_or_iteration, model, optimizer, infos)

        if isinstance(tag_or_iteration, int):
            self._save_checkpoint_record(checkpoint_dir, tag_or_iteration)

    def load_parameters(self,
                        model,
                        optimizer=None,
                        checkpoint_dir=None,
                        checkpoint_path=None,
                        record_file="checkpoint_latest"):
        """Load a model checkpoint from disk.
        Args:
            model (Layer): model to load parameters.
            optimizer (Optimizer, optional): optimizer to load states if needed.
                Defaults to None.
            checkpoint_dir (str, optional): the directory where checkpoint is saved.
            checkpoint_path (str, optional): if specified, load the checkpoint
                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
                be ignored. Defaults to None.
            record_file (str): "checkpoint_latest" or "checkpoint_best".
        Returns:
            configs (dict): epoch or step, lr and other saved meta info.
        """
        configs = {}

        if checkpoint_path is not None:
            pass
        elif checkpoint_dir is not None and record_file is not None:
            # load checkpoint from record file
            checkpoint_record = os.path.join(checkpoint_dir, record_file)
            iteration = self._load_checkpoint_idx(checkpoint_record)
            if iteration == -1:
                return configs
            checkpoint_path = os.path.join(checkpoint_dir,
                                           "{}".format(iteration))
        else:
            raise ValueError(
                "At least one of 'checkpoint_path' or 'checkpoint_dir' should be specified!"
            )

        rank = dist.get_rank()

        params_path = checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        model.set_state_dict(model_dict)
        logger.info("Rank {}: loaded model from {}".format(rank, params_path))

        optimizer_path = checkpoint_path + ".pdopt"
        if optimizer and os.path.isfile(optimizer_path):
            optimizer_dict = paddle.load(optimizer_path)
            optimizer.set_state_dict(optimizer_dict)
            logger.info("Rank {}: loaded optimizer state from {}".format(
                rank, optimizer_path))

        info_path = re.sub('.pdparams$', '.json', params_path)
        if os.path.exists(info_path):
            with open(info_path, 'r') as fin:
                configs = json.load(fin)
        return configs

    def load_latest_parameters(self,
                               model,
                               optimizer=None,
                               checkpoint_dir=None,
                               checkpoint_path=None):
        """Load the latest model checkpoint from disk.
        Args:
            model (Layer): model to load parameters.
            optimizer (Optimizer, optional): optimizer to load states if needed.
                Defaults to None.
            checkpoint_dir (str, optional): the directory where checkpoint is saved.
            checkpoint_path (str, optional): if specified, load the checkpoint
                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
                be ignored. Defaults to None.
        Returns:
            configs (dict): epoch or step, lr and other saved meta info.
        """
        return self.load_parameters(model, optimizer, checkpoint_dir,
                                    checkpoint_path, "checkpoint_latest")

    def load_best_parameters(self,
                             model,
                             optimizer=None,
                             checkpoint_dir=None,
                             checkpoint_path=None):
        """Load the best model checkpoint from disk.
        Args:
            model (Layer): model to load parameters.
            optimizer (Optimizer, optional): optimizer to load states if needed.
                Defaults to None.
            checkpoint_dir (str, optional): the directory where checkpoint is saved.
            checkpoint_path (str, optional): if specified, load the checkpoint
                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
                be ignored. Defaults to None.
        Returns:
            configs (dict): epoch or step, lr and other saved meta info.
        """
        return self.load_parameters(model, optimizer, checkpoint_dir,
                                    checkpoint_path, "checkpoint_best")

    def _should_save_best(self, metric: float) -> bool:
        if not self._best_full():
            return True

        # already full
        worst_record_path = max(self.best_records, key=self.best_records.get)
        # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0]
        worst_metric = self.best_records[worst_record_path]
        return metric < worst_metric

    def _best_full(self):
        return (not self._save_all) and len(self.best_records) == self.kbest_n

    def _latest_full(self):
        return len(self.latest_records) == self.latest_n

    def _save_best_checkpoint_and_update(self, metric, checkpoint_dir,
                                         tag_or_iteration, model, optimizer,
                                         infos):
        # remove the worst
        if self._best_full():
            worst_record_path = max(self.best_records,
                                    key=self.best_records.get)
            self.best_records.pop(worst_record_path)
            if (worst_record_path not in self.latest_records):
                logger.info(
                    "remove the worst checkpoint: {}".format(worst_record_path))
                self._del_checkpoint(checkpoint_dir, worst_record_path)

        # add the new one
        self._save_parameters(checkpoint_dir, tag_or_iteration, model,
                              optimizer, infos)
        self.best_records[tag_or_iteration] = metric

    def _save_latest_checkpoint_and_update(
            self, checkpoint_dir, tag_or_iteration, model, optimizer, infos):
        # remove the old
        if self._latest_full():
            to_del_fn = self.latest_records.pop(0)
            if (to_del_fn not in self.best_records.keys()):
                logger.info(
                    "remove the latest checkpoint: {}".format(to_del_fn))
                self._del_checkpoint(checkpoint_dir, to_del_fn)
        self.latest_records.append(tag_or_iteration)

        self._save_parameters(checkpoint_dir, tag_or_iteration, model,
                              optimizer, infos)

    def _del_checkpoint(self, checkpoint_dir, tag_or_iteration):
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "{}".format(tag_or_iteration))
        for filename in glob.glob(checkpoint_path + ".*"):
            os.remove(filename)
            logger.info("delete file: {}".format(filename))

    def _load_checkpoint_idx(self, checkpoint_record: str) -> int:
        """Get the iteration number corresponding to the latest saved checkpoint.
        Args:
            checkpoint_record (str): path of the checkpoint record file.
        Returns:
            int: the latest iteration number. -1 for no checkpoint to load.
        """
        if not os.path.isfile(checkpoint_record):
            return -1

        # Fetch the latest checkpoint index.
        with open(checkpoint_record, "rt") as handle:
            latest_checkpoint = handle.readlines()[-1].strip()
            iteration = int(latest_checkpoint.split(":")[-1])
        return iteration

    def _save_checkpoint_record(self, checkpoint_dir: str, iteration: int):
        """Save the best and latest checkpoint records.
        Args:
            checkpoint_dir (str): the directory where checkpoint is saved.
            iteration (int): the latest iteration number.
        Returns:
            None
        """
        checkpoint_record_latest = os.path.join(checkpoint_dir,
                                                "checkpoint_latest")
        checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best")

        with open(checkpoint_record_best, "w") as handle:
            for i in self.best_records.keys():
                handle.write("model_checkpoint_path:{}\n".format(i))
        with open(checkpoint_record_latest, "w") as handle:
            for i in self.latest_records:
                handle.write("model_checkpoint_path:{}\n".format(i))

    @mp_tools.rank_zero_only
    def _save_parameters(self,
                         checkpoint_dir: str,
                         tag_or_iteration: Union[int, str],
                         model: paddle.nn.Layer,
                         optimizer: Optimizer=None,
                         infos: dict=None):
        """Checkpoint the latest trained model parameters.
        Args:
            checkpoint_dir (str): the directory where checkpoint is saved.
            tag_or_iteration (int or str): the latest iteration(step or epoch) number.
            model (Layer): model to be checkpointed.
            optimizer (Optimizer, optional): optimizer to be checkpointed.
                Defaults to None.
            infos (dict or None): any info you want to save.
        Returns:
            None
        """
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "{}".format(tag_or_iteration))

        model_dict = model.state_dict()
        params_path = checkpoint_path + ".pdparams"
        paddle.save(model_dict, params_path)
        logger.info("Saved model to {}".format(params_path))

        if optimizer:
            opt_dict = optimizer.state_dict()
            optimizer_path = checkpoint_path + ".pdopt"
            paddle.save(opt_dict, optimizer_path)
            logger.info("Saved optimizer state to {}".format(optimizer_path))

        info_path = re.sub('.pdparams$', '.json', params_path)
        infos = {} if infos is None else infos
        with open(info_path, 'w') as fout:
            data = json.dumps(infos)
            fout.write(data)
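
# Typical driver usage (illustrative sketch; variable names are assumptions):
# >>> ckpt = Checkpoint(kbest_n=5, latest_n=1)
# >>> ckpt.add_checkpoint(ckpt_dir, epoch, model, optimizer,
# ...                     infos={'val_loss': float(valid_loss)})
# The kbest_n records with the smallest val_loss survive in the
# `checkpoint_best` record file; `checkpoint_latest` tracks the newest
# latest_n records.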
@ -1,134 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import numpy as np
import paddle

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ["forced_align", "remove_duplicates_and_blank", "insert_blank"]


def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]:
    """ctc alignment to ctc label ids.

    "abaa-acee-" -> "abaace"

    Args:
        hyp (List[int]): hypotheses ids, (L)
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[int]: ids with repeats collapsed and blank ids removed.
    """
    new_hyp: List[int] = []
    cur = 0
    while cur < len(hyp):
        # add non-blank into new_hyp
        if hyp[cur] != blank_id:
            new_hyp.append(hyp[cur])
        # skip repeat label
        prev = cur
        while cur < len(hyp) and hyp[cur] == hyp[prev]:
            cur += 1
    return new_hyp
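
# Walk-through (illustrative), encoding the docstring's "abaa-acee-" with
# a=1, b=2, c=3, e=4 and '-' = blank_id = 0:
# >>> remove_duplicates_and_blank([1, 2, 1, 1, 0, 1, 3, 4, 4, 0])
# [1, 2, 1, 1, 3, 4]
# Runs collapse and blanks drop, while the blank keeps the a's of "aa-a"
# from merging into one.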


def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray:
    """Insert blank token between every two label tokens.

    "abcdefg" -> "-a-b-c-d-e-f-g-"

    Args:
        label ([np.ndarray]): label ids, List[int], (L).
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        [np.ndarray]: (2L+1).
    """
    label = np.expand_dims(label, 1)  #[L, 1]
    blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
    label = np.concatenate([blanks, label], axis=1)  #[L, 2]
    label = label.reshape(-1)  #[2L], -l-l-l
    label = np.append(label, label[0])  #[2L + 1], -l-l-l-
    return label


def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                 blank_id=0) -> List[int]:
    """ctc forced alignment.

    https://distill.pub/2017/ctc/

    Args:
        ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D)
        y (paddle.Tensor): label id sequence tensor, 1d tensor (L)
        blank_id (int): blank symbol index
    Returns:
        List[int]: best alignment result, (T).
    """
    y_insert_blank = insert_blank(y, blank_id)  #(2L+1)

    log_alpha = paddle.zeros(
        (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
    log_alpha = log_alpha - float('inf')  # log of zero
    # TODO(Hui Zhang): zeros not support paddle.int16
    state_path = (paddle.zeros(
        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
                  )  # state path, Tuple((T, 2L+1))

    # init start state
    # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
    log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])]  # State-b, Sb
    log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])]  # State-nb, Snb

    for t in range(1, ctc_probs.size(0)):  # T
        for s in range(len(y_insert_blank)):  # 2L+1
            if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
                    s] == y_insert_blank[s - 2]:
                candidates = paddle.to_tensor(
                    [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]])
                prev_state = [s, s - 1]
            else:
                candidates = paddle.to_tensor([
                    log_alpha[t - 1, s],
                    log_alpha[t - 1, s - 1],
                    log_alpha[t - 1, s - 2],
                ])
                prev_state = [s, s - 1, s - 2]
            # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int(
                y_insert_blank[s])]
            state_path[t, s] = prev_state[paddle.argmax(candidates)]

    # TODO(Hui Zhang): zeros not support paddle.int16
    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)

    candidates = paddle.to_tensor([
        log_alpha[-1, len(y_insert_blank) - 1],  # Sb
        log_alpha[-1, len(y_insert_blank) - 2]  # Snb
    ])
    prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
    state_seq[-1] = prev_state[paddle.argmax(candidates)]
    for t in range(ctc_probs.size(0) - 2, -1, -1):
        state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]

    output_alignment = []
    for t in range(0, ctc_probs.size(0)):
        output_alignment.append(y_insert_blank[state_seq[t, 0]])

    return output_alignment
@ -1,67 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import inspect
from typing import Any
from typing import Dict
from typing import List
from typing import Text

from deepspeech.utils.log import Log
from deepspeech.utils.tensor_utils import has_tensor

logger = Log(__name__).getlog()

__all__ = ["dynamic_import", "instance_class"]


def dynamic_import(import_path, alias=dict()):
    """Dynamically import a module and class.

    :param str import_path: syntax 'module_name:class_name'
        e.g., 'deepspeech.models.u2:U2Model'
    :param dict alias: shortcut for registered class
    :return: imported class
    """
    if import_path not in alias and ":" not in import_path:
        raise ValueError("import_path should be one of {} or "
                         'include ":", e.g. "deepspeech.models.u2:U2Model" : '
                         "{}".format(set(alias), import_path))
    if ":" not in import_path:
        import_path = alias[import_path]

    module_name, objname = import_path.split(":")
    m = importlib.import_module(module_name)
    return getattr(m, objname)
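
# Usage sketch (illustrative, reusing the docstring's example path):
# >>> model_class = dynamic_import('deepspeech.models.u2:U2Model')
# >>> model = instance_class(model_class, configs)  # see instance_class below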


def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]):
    # keep only the keys in `valid_keys` whose values are not None
    new_args = {
        key: val
        for key, val in args.items() if (key in valid_keys and val is not None)
    }
    return new_args


def filter_out_tenosr(args: Dict[Text, Any]):
    return {key: val for key, val in args.items() if not has_tensor(val)}


def instance_class(module_class, args: Dict[Text, Any]):
    valid_keys = inspect.signature(module_class).parameters.keys()
    new_args = filter_valid_args(args, valid_keys)
    logger.info(
        f"Instance: {module_class.__name__} {filter_out_tenosr(new_args)}.")
    return module_class(**new_args)
|
|
||||||
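
# Usage sketch for the helpers above. `argparse:ArgumentParser` is used only
# to keep the example self-contained; real call sites pass model paths such
# as "deepspeech.models.u2:U2Model".
cls = dynamic_import("argparse:ArgumentParser")
parser = instance_class(cls, {"prog": "demo", "epilog": None, "extra": 1})
# `epilog` is dropped (value is None) and `extra` is dropped (not a
# constructor parameter), so this builds ArgumentParser(prog="demo").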
@@ -1,206 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module provides functions to calculate error rates at different levels.
e.g. wer for word-level, cer for char-level.
"""
import numpy as np

__all__ = ['word_errors', 'char_errors', 'wer', 'cer']


def _levenshtein_distance(ref, hyp):
    """Levenshtein distance is a string metric for measuring the difference
    between two sequences. Informally, the levenshtein distance is defined as
    the minimum number of single-character edits (substitutions, insertions or
    deletions) required to change one word into the other. We can naturally
    extend the edits to word level when calculating the levenshtein distance
    of two sentences.
    """
    m = len(ref)
    n = len(hyp)

    # special cases
    if ref == hyp:
        return 0
    if m == 0:
        return n
    if n == 0:
        return m

    if m < n:
        ref, hyp = hyp, ref
        m, n = n, m

    # use O(min(m, n)) space
    distance = np.zeros((2, n + 1), dtype=np.int32)

    # initialize distance matrix
    for j in range(n + 1):
        distance[0][j] = j

    # calculate levenshtein distance
    for i in range(1, m + 1):
        prev_row_idx = (i - 1) % 2
        cur_row_idx = i % 2
        distance[cur_row_idx][0] = i
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
            else:
                s_num = distance[prev_row_idx][j - 1] + 1
                i_num = distance[cur_row_idx][j - 1] + 1
                d_num = distance[prev_row_idx][j] + 1
                distance[cur_row_idx][j] = min(s_num, i_num, d_num)

    return distance[m % 2][n]
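
# Worked example of the two-row DP above: turning "kitten" into "sitting"
# takes 3 edits (substitute k->s, substitute e->i, insert g).
print(_levenshtein_distance("kitten", "sitting"))  # 3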


def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence at word level.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: Whether to ignore case.
    :type ignore_case: bool
    :param delimiter: Delimiter of input sentences.
    :type delimiter: char
    :return: Levenshtein distance and word number of reference sentence.
    :rtype: list
    """
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    ref_words = list(filter(None, reference.split(delimiter)))
    hyp_words = list(filter(None, hypothesis.split(delimiter)))

    edit_distance = _levenshtein_distance(ref_words, hyp_words)
    return float(edit_distance), len(ref_words)


def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence at char level.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: Whether to ignore case.
    :type ignore_case: bool
    :param remove_space: Whether to remove internal space characters.
    :type remove_space: bool
    :return: Levenshtein distance and length of reference sentence.
    :rtype: list
    """
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    join_char = ' '
    if remove_space:
        join_char = ''

    reference = join_char.join(list(filter(None, reference.split(' '))))
    hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))

    edit_distance = _levenshtein_distance(reference, hypothesis)
    return float(edit_distance), len(reference)


def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Calculate word error rate (WER). WER compares reference text and
    hypothesis text at word level. WER is defined as:

    .. math::
        WER = (Sw + Dw + Iw) / Nw

    where

    .. code-block:: text

        Sw is the number of words substituted,
        Dw is the number of words deleted,
        Iw is the number of words inserted,
        Nw is the number of words in the reference

    We can use levenshtein distance to calculate WER. Note that empty items
    will be removed when splitting sentences by delimiter.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: Whether to ignore case.
    :type ignore_case: bool
    :param delimiter: Delimiter of input sentences.
    :type delimiter: char
    :return: Word error rate.
    :rtype: float
    :raises ValueError: If word number of reference is zero.
    """
    edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
                                         delimiter)

    if ref_len == 0:
        raise ValueError("Reference's word number should be greater than 0.")

    wer = float(edit_distance) / ref_len
    return wer


def cer(reference, hypothesis, ignore_case=False, remove_space=False):
    """Calculate character error rate (CER). CER compares reference text and
    hypothesis text at char level. CER is defined as:

    .. math::
        CER = (Sc + Dc + Ic) / Nc

    where

    .. code-block:: text

        Sc is the number of characters substituted,
        Dc is the number of characters deleted,
        Ic is the number of characters inserted,
        Nc is the number of characters in the reference

    We can use levenshtein distance to calculate CER. Chinese input should be
    encoded to unicode. Note that leading and trailing space characters will
    be truncated and multiple consecutive space characters in a sentence will
    be replaced by one space character.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: Whether to ignore case.
    :type ignore_case: bool
    :param remove_space: Whether to remove internal space characters.
    :type remove_space: bool
    :return: Character error rate.
    :rtype: float
    :raises ValueError: If the reference length is zero.
    """
    edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
                                         remove_space)

    if ref_len == 0:
        raise ValueError("Length of reference should be greater than 0.")

    cer = float(edit_distance) / ref_len
    return cer
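
# Quick usage sketch for the metrics above:
print(wer("the quick brown fox", "the quick brown fax"))  # 0.25 (1 sub / 4 words)
print(cer("abcd", "abcf"))                                # 0.25 (1 sub / 4 chars)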
@@ -1,88 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import nn

__all__ = [
    "summary", "gradient_norm", "freeze", "unfreeze", "print_grads",
    "print_params"
]


def summary(layer: nn.Layer, print_func=print):
    if print_func is None:
        return
    num_params = num_elements = 0
    for name, param in layer.state_dict().items():
        if print_func:
            print_func(
                "{} | {} | {}".format(name, param.shape, np.prod(param.shape)))
        num_elements += np.prod(param.shape)
        num_params += 1
    if print_func:
        num_elements = num_elements / 1024**2
        print_func(
            f"Total parameters: {num_params}, {num_elements:.2f}M elements.")


def print_grads(model, print_func=print):
    if print_func is None:
        return
    for n, p in model.named_parameters():
        msg = f"param grad: {n}: shape: {p.shape} grad: {p.grad}"
        print_func(msg)


def print_params(model, print_func=print):
    if print_func is None:
        return
    total = 0.0
    num_params = 0.0
    for n, p in model.named_parameters():
        msg = f"{n} | {p.shape} | {np.prod(p.shape)} | {not p.stop_gradient}"
        total += np.prod(p.shape)
        num_params += 1
        if print_func:
            print_func(msg)
    if print_func:
        total = total / 1024**2
        print_func(f"Total parameters: {num_params}, {total:.2f}M elements.")


def gradient_norm(layer: nn.Layer):
    grad_norm_dict = {}
    for name, param in layer.state_dict().items():
        if param.trainable:
            grad = param.gradient()  # returns a numpy.ndarray
            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
    return grad_norm_dict


def recursively_remove_weight_norm(layer: nn.Layer):
    for layer in layer.sublayers():
        try:
            nn.utils.remove_weight_norm(layer)
        except ValueError as e:
            # there is no weight norm hook in this layer
            pass


def freeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = False


def unfreeze(layer: nn.Layer):
    for param in layer.parameters():
        param.trainable = True
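
# Usage sketch for the helpers above (assumes paddle is installed):
from paddle import nn

net = nn.Linear(4, 2)
summary(net)  # prints "weight | [4, 2] | 8", "bias | [2] | 2", then a total
freeze(net)   # afterwards every parameter has trainable == False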
@@ -1,182 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
import logging
import os
import socket
import sys

from paddle import inference

FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'

logging.basicConfig(
    level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)


def find_log_dir(log_dir=None):
    """Returns the most suitable directory to put log files into.
    Args:
        log_dir: str|None, if specified, the logfile(s) will be created in that
            directory. Otherwise if the --log_dir command-line flag is provided,
            the logfile will be created in that directory. Otherwise the logfile
            will be created in a standard location.
    Raises:
        FileNotFoundError: raised when it cannot find a log directory.
    """
    # Get a list of possible log dirs (will try to use them in order).
    if log_dir:
        # log_dir was explicitly specified as an arg, so use it and it alone.
        dirs = [log_dir]
    else:
        dirs = ['/tmp/', './']

    # Find the first usable log dir.
    for d in dirs:
        if os.path.isdir(d) and os.access(d, os.W_OK):
            return d
    raise FileNotFoundError(
        "Can't find a writable directory for logs, tried %s" % dirs)


def find_log_dir_and_names(program_name=None, log_dir=None):
    """Computes the directory and filename prefix for log file.
    Args:
        program_name: str|None, the filename part of the path to the program
            that is running, without its extension. e.g: if your program is
            called 'usr/bin/foobar.py' this method should probably be called
            with program_name='foobar'. However, this is just a convention;
            you can pass in any string you want, and it will be used as part
            of the log filename. In python standard logging mode, py_ is
            prepended to the default value when the program_name argument is
            omitted.
        log_dir: str|None, the desired log directory.
    Returns:
        (log_dir, file_prefix, symlink_prefix)
    Raises:
        FileNotFoundError: raised in Python 3 when it cannot find a log directory.
        OSError: raised in Python 2 when it cannot find a log directory.
    """
    if not program_name:
        # Strip the extension (foobar.par becomes foobar, and
        # fubar.py becomes fubar). We do this so that the log
        # file names are similar to C++ log file names.
        program_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]

        # Prepend py_ to files so that python code gets a unique file, and
        # so that C++ libraries do not try to write to the same log files as us.
        program_name = 'py_%s' % program_name

    actual_log_dir = find_log_dir(log_dir=log_dir)

    try:
        username = getpass.getuser()
    except KeyError:
        # This can happen, e.g. when running under docker w/o passwd file.
        if hasattr(os, 'getuid'):
            # Windows doesn't have os.getuid
            username = str(os.getuid())
        else:
            username = 'unknown'
    hostname = socket.gethostname()
    file_prefix = '%s.%s.%s.log' % (program_name, hostname, username)

    return actual_log_dir, file_prefix, program_name


class Log():

    log_name = None

    def __init__(self, logger=None):
        self.logger = logging.getLogger(logger)
        self.logger.setLevel(logging.DEBUG)

        file_dir = os.getcwd() + '/log'
        if not os.path.exists(file_dir):
            os.mkdir(file_dir)
        self.log_dir = file_dir

        actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names(
            program_name=None, log_dir=self.log_dir)

        basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
        filename = os.path.join(actual_log_dir, basename)
        if Log.log_name is None:
            Log.log_name = filename

        # Create a symlink to the log file with a canonical name.
        symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
        try:
            if os.path.islink(symlink):
                os.unlink(symlink)
            os.symlink(os.path.basename(Log.log_name), symlink)
        except EnvironmentError:
            # If it fails, we're sad but it's no error. Commonly, this
            # fails because the symlink was created by another user so
            # we can't modify it.
            pass

        if not self.logger.hasHandlers():
            formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
            fh = logging.FileHandler(Log.log_name)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

        # Stop propagation, otherwise ancestor loggers may print the
        # same record multiple times.
        self.logger.propagate = False

    def getlog(self):
        return self.logger


class Autolog:
    def __init__(self,
                 batch_size,
                 model_name="DeepSpeech",
                 model_precision="fp32"):
        import auto_log
        pid = os.getpid()
        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
            gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
            infer_config = inference.Config()
            infer_config.enable_use_gpu(100, gpu_id)
        else:
            gpu_id = None
            infer_config = inference.Config()
        autolog = auto_log.AutoLogger(
            model_name=model_name,
            model_precision=model_precision,
            batch_size=batch_size,
            data_shape="dynamic",
            save_path="./output/auto_log.lpg",
            inference_config=infer_config,
            pids=pid,
            process_name=None,
            gpu_ids=gpu_id,
            time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
            warmup=0)
        self.autolog = autolog

    def getlog(self):
        return self.autolog
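
# Usage sketch for the Log wrapper above: INFO and above go to the console,
# DEBUG and above to ./log/py_<prog>.<host>.<user>.log.DEBUG.<pid>.
logger = Log(__name__).getlog()
logger.info("model loaded")
logger.debug("file-only detail")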
@@ -1,30 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import wraps

from paddle import distributed as dist

__all__ = ["rank_zero_only"]


def rank_zero_only(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        rank = dist.get_rank()
        if rank != 0:
            return
        result = func(*args, **kwargs)
        return result

    return wrapper
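
# Usage sketch: in a multi-process paddle.distributed run, only the rank-0
# worker executes the decorated function; other ranks return None.
@rank_zero_only
def report(msg):
    print(msg)

report("printed once, by rank 0")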
@@ -1,112 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import socket
import socketserver
import struct
import time
import wave
from time import gmtime
from time import strftime

from deepspeech.frontend.utility import read_manifest

__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]


def socket_send(server_ip: str, server_port: int, data: bytes):
    # Connect to server and send data
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((server_ip, server_port))
    sent = data
    sock.sendall(struct.pack('>i', len(sent)) + sent)
    print('Speech[length=%d] Sent.' % len(sent))
    # Receive data from the server and shut down
    received = sock.recv(1024)
    print("Recognition Results: {}".format(received.decode('utf8')))
    sock.close()


def warm_up_test(audio_process_handler,
                 manifest_path,
                 num_test_cases,
                 random_seed=0):
    """Warming-up test."""
    manifest = read_manifest(manifest_path)
    rng = random.Random(random_seed)
    samples = rng.sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
        print("Warm-up Test Case %d: %s" % (idx, sample['feat']))
        start_time = time.time()
        transcript = audio_process_handler(sample['feat'])
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))


class AsrTCPServer(socketserver.TCPServer):
    """The ASR TCP Server."""

    def __init__(self,
                 server_address,
                 RequestHandlerClass,
                 speech_save_dir,
                 audio_process_handler,
                 bind_and_activate=True):
        self.speech_save_dir = speech_save_dir
        self.audio_process_handler = audio_process_handler
        socketserver.TCPServer.__init__(
            self, server_address, RequestHandlerClass, bind_and_activate=True)


class AsrRequestHandler(socketserver.BaseRequestHandler):
    """The ASR request handler."""

    def handle(self):
        # receive data through TCP socket
        chunk = self.request.recv(1024)
        target_len = struct.unpack('>i', chunk[:4])[0]
        data = chunk[4:]
        while len(data) < target_len:
            chunk = self.request.recv(1024)
            data += chunk
        # write to file
        filename = self._write_to_file(data)

        print("Received utterance[length=%d] from %s, saved to %s." %
              (len(data), self.client_address[0], filename))
        start_time = time.time()
        transcript = self.server.audio_process_handler(filename)
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
        self.request.sendall(transcript.encode('utf-8'))

    def _write_to_file(self, data):
        # prepare save dir and filename
        if not os.path.exists(self.server.speech_save_dir):
            os.mkdir(self.server.speech_save_dir)
        timestamp = strftime("%Y%m%d%H%M%S", gmtime())
        out_filename = os.path.join(
            self.server.speech_save_dir,
            timestamp + "_" + self.client_address[0] + ".wav")
        # write to wav file
        file = wave.open(out_filename, 'wb')
        file.setnchannels(1)
        file.setsampwidth(2)
        file.setframerate(16000)
        file.writeframes(data)
        file.close()
        return out_filename
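
# Client-side usage sketch for socket_send. It assumes an AsrTCPServer is
# already listening on the address below and that "demo.wav" is a local
# 16 kHz mono PCM file (both hypothetical):
with open("demo.wav", "rb") as f:
    socket_send("127.0.0.1", 8086, f.read())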
@@ -1,180 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for Transformer."""
from typing import List
from typing import Tuple

import paddle

from deepspeech.utils.log import Log

__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]

logger = Log(__name__).getlog()


def has_tensor(val):
    if isinstance(val, (list, tuple)):
        for item in val:
            if has_tensor(item):
                return True
    elif isinstance(val, dict):
        for k, v in val.items():
            if has_tensor(v):
                return True
    else:
        return paddle.is_tensor(val)


def pad_sequence(sequences: List[paddle.Tensor],
                 batch_first: bool=False,
                 padding_value: float=0.0) -> paddle.Tensor:
    r"""Pad a list of variable length Tensors with ``padding_value``

    ``pad_sequence`` stacks a list of Tensors along a new dimension,
    and pads them to equal length. For example, if the input is a list of
    sequences with size ``L x *``, the output is of size ``T x B x *`` if
    batch_first is False, and ``B x T x *`` otherwise.

    `B` is batch size. It is equal to the number of elements in ``sequences``.
    `T` is length of the longest sequence.
    `L` is length of the sequence.
    `*` is any number of trailing dimensions, including none.

    Example:
        >>> from paddle.nn.utils.rnn import pad_sequence
        >>> a = paddle.ones(25, 300)
        >>> b = paddle.ones(22, 300)
        >>> c = paddle.ones(15, 300)
        >>> pad_sequence([a, b, c]).size()
        paddle.Tensor([25, 3, 300])

    Note:
        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.

    Args:
        sequences (list[Tensor]): list of variable length sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
            ``T x B x *`` otherwise
        padding_value (float, optional): value for padded elements. Default: 0.

    Returns:
        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        Tensor of size ``B x T x *`` otherwise
    """

    # assuming trailing dimensions and type of all the Tensors
    # in sequences are same and fetching those from sequences[0]
    max_size = sequences[0].size()
    # TODO(Hui Zhang): slice does not support `end == start`
    # trailing_dims = max_size[1:]
    trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
    max_len = max([s.size(0) for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    out_tensor = sequences[0].new_full(out_dims, padding_value)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor

    return out_tensor


def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
    """Add <sos> and <eos> labels.
    Args:
        ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax)
        sos (int): index of <sos>
        eos (int): index of <eos>
        ignore_id (int): index of padding
    Returns:
        ys_in (paddle.Tensor) : (B, Lmax + 1)
        ys_out (paddle.Tensor) : (B, Lmax + 1)
    Examples:
        >>> sos_id = 10
        >>> eos_id = 11
        >>> ignore_id = -1
        >>> ys_pad
        tensor([[ 1,  2,  3,  4,  5],
                [ 4,  5,  6, -1, -1],
                [ 7,  8,  9, -1, -1]], dtype=paddle.int32)
        >>> ys_in, ys_out = add_sos_eos(ys_pad, sos_id, eos_id, ignore_id)
        >>> ys_in
        tensor([[10,  1,  2,  3,  4,  5],
                [10,  4,  5,  6, 11, 11],
                [10,  7,  8,  9, 11, 11]])
        >>> ys_out
        tensor([[ 1,  2,  3,  4,  5, 11],
                [ 4,  5,  6, 11, -1, -1],
                [ 7,  8,  9, 11, -1, -1]])
    """
    # TODO(Hui Zhang): use the commented code below once supported
    #_sos = paddle.to_tensor(
    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #_eos = paddle.to_tensor(
    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
    B = ys_pad.size(0)
    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
    ys_in = paddle.cat([_sos, ys_pad], dim=1)
    mask_pad = (ys_in == ignore_id)
    ys_in = ys_in.masked_fill(mask_pad, eos)

    ys_out = paddle.cat([ys_pad, _eos], dim=1)
    ys_out = ys_out.masked_fill(mask_pad, eos)
    mask_eos = (ys_out == ignore_id)
    ys_out = ys_out.masked_fill(mask_eos, eos)
    ys_out = ys_out.masked_fill(mask_pad, ignore_id)
    return ys_in, ys_out


def th_accuracy(pad_outputs: paddle.Tensor,
                pad_targets: paddle.Tensor,
                ignore_label: int) -> float:
    """Calculate accuracy.
    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax, D).
        ignore_label (int): Ignore label id.
    Returns:
        float: Accuracy value (0.0 - 1.0).
    """
    pad_pred = pad_outputs.view(
        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2)
    mask = pad_targets != ignore_label
    # TODO(Hui Zhang): sum does not support bool type
    # numerator = paddle.sum(
    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = (
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = paddle.sum(numerator.type_as(pad_targets))
    # TODO(Hui Zhang): sum does not support bool type
    # denominator = paddle.sum(mask)
    denominator = paddle.sum(mask.type_as(pad_targets))
    return float(numerator) / float(denominator)
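
# Numeric sketch of th_accuracy (hypothetical shapes; relies on the
# Tensor.size/.view helpers this repo patches onto paddle):
# B=1, Lmax=3, D=4; the last target position is padding (-1).
import numpy as np
pad_outputs = paddle.to_tensor(np.eye(4, dtype="float32")[[0, 1, 2]])  # (3, 4)
pad_targets = paddle.to_tensor([[0, 1, -1]])                           # (1, 3)
print(th_accuracy(pad_outputs, pad_targets, ignore_label=-1))  # 1.0 (2/2)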
@@ -1,127 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from typing import List
from typing import Text

import textgrid


def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:
    """Segment ctc alignment ids by continuous blank and repeated label.

    Args:
        alignment (List[int]): ctc alignment id sequence.
            e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[List[int]]: token align, segmented alignment id sequence.
            e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]
    """
    # convert the alignment to a Praat-friendly segmentation (Praat is a
    # tool for "doing phonetics by computer" that helps analyze alignments)
    align_segs = []
    # get frame-level duration for each token
    start = 0
    end = 0
    while end < len(alignment):
        while end < len(alignment) and alignment[end] == blank_id:  # blank
            end += 1
        if end == len(alignment):
            align_segs[-1].extend(alignment[start:])
            break
        end += 1
        while end < len(alignment) and alignment[end - 1] == alignment[
                end]:  # repeated label
            end += 1
        align_segs.append(alignment[start:end])
        start = end
    return align_segs


def align_to_tierformat(align_segs: List[List[int]],
                        subsample: int,
                        token_dict: Dict[int, Text],
                        blank_id=0) -> List[Text]:
    """Generate textgrid.Interval format from alignment segmentations.

    Args:
        align_segs (List[List[int]]): segmented ctc alignment ids.
        subsample (int): encoder subsample rate, i.e. frames per alignment
            step (features use 25ms frame_length, 10ms hop_length).
        token_dict (Dict[int, Text]): int -> str map.

    Returns:
        List[Text]: list of textgrid.Interval text, str(start, end, text).
    """
    hop_length = 10  # ms
    second_ms = 1000  # ms
    frame_per_second = second_ms / hop_length  # 25ms frame_length, 10ms hop_length
    second_per_frame = 1.0 / frame_per_second

    begin = 0
    duration = 0
    tierformat = []

    for idx, tokens in enumerate(align_segs):
        token_len = len(tokens)
        token = tokens[-1]
        # time duration in seconds
        duration = token_len * subsample * second_per_frame
        if idx < len(align_segs) - 1:
            print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")
            tierformat.append(
                f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")
        else:
            for i in tokens:
                if i != blank_id:
                    token = i
                    break
            print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")
            tierformat.append(
                f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")
        begin = begin + duration

    return tierformat


def generate_textgrid(maxtime: float,
                      intervals: List[Text],
                      output: Text,
                      name: Text='ali') -> None:
    """Create alignment textgrid file.

    Args:
        maxtime (float): audio duration.
        intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item.
        output (Text): textgrid filepath.
        name (Text, optional): tier or layer name. Defaults to 'ali'.
    """
    # Download Praat: https://www.fon.hum.uva.nl/praat/
    avg_interval = maxtime / (len(intervals) + 1)
    print(f"average second/token: {avg_interval}")
    margin = 0.0001

    tg = textgrid.TextGrid(maxTime=maxtime)
    tier = textgrid.IntervalTier(name=name, maxTime=maxtime)

    for dur in intervals:
        s, e, text = dur.split()
        tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text)

    tg.append(tier)

    tg.write(output)
    print("Successfully generated textgrid {}.".format(output))
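
# Quick check of segment_alignment on the docstring example:
print(segment_alignment([0, 0, 0, 1, 1, 1, 2, 0, 0, 3]))
# -> [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]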
@@ -1,110 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common utility functions."""
import distutils.util
import math
import os
import random
from typing import List

import numpy as np
import paddle

__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"]


def seed_all(seed: int=210329):
    np.random.seed(seed)
    random.seed(seed)
    paddle.seed(seed)


def print_arguments(args, info=None):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    filename = ""
    if info:
        filename = info["__file__"]
        filename = os.path.basename(filename)
    print(f"----------- {filename} Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("-----------------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_arguments("name", str, "John", "User name.", parser)
        args = parser.parse_args()
    """
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


def log_add(args: List[int]) -> float:
    """Stable log add.

    Args:
        args (List[int]): log scores

    Returns:
        float: sum of log scores
    """
    if all(a == -float('inf') for a in args):
        return -float('inf')
    a_max = max(args)
    lsp = math.log(sum(math.exp(a - a_max) for a in args))
    return a_max + lsp


def get_subsample(config):
    """Subsample rate from config.

    Args:
        config (yacs.config.CfgNode): yaml config

    Returns:
        int: subsample rate.
    """
    input_layer = config["model"]["encoder_conf"]["input_layer"]
    assert input_layer in ["conv2d", "conv2d6", "conv2d8"]
    if input_layer == "conv2d":
        return 4
    elif input_layer == "conv2d6":
        return 6
    elif input_layer == "conv2d8":
        return 8
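
# Numeric check of log_add: log(exp(log 0.25) + exp(log 0.75)) = log(1.0) = 0.
print(log_add([math.log(0.25), math.log(0.75)]))  # ~0.0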
@@ -1,42 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 data/aishell/aishell.py \
--manifest_prefix='data/aishell/manifest' \
--target_dir='../dataset/aishell'

if [ $? -ne 0 ]; then
    echo "Prepare Aishell failed. Terminated."
    exit 1
fi


# build vocabulary
python3 tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/aishell/vocab.txt' \
--manifest_paths 'data/aishell/manifest.train' 'data/aishell/manifest.dev'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
python3 tools/compute_mean_std.py \
--manifest_path='data/aishell/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/aishell/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "Aishell data preparation done."
exit 0
@@ -1,55 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_ch.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# download well-trained model
cd models/aishell > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# infer
CUDA_VISIBLE_DEVICES=0 \
python3 -u infer2x.py \
--num_samples=10 \
--beam_size=300 \
--feat_dim=161 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=1024 \
--alpha=2.6 \
--beta=5.0 \
--cutoff_prob=0.99 \
--cutoff_top_n=40 \
--use_gru=True \
--use_gpu=False \
--share_rnn_weights=False \
--infer_manifest='data/aishell/manifest.test' \
--mean_std_path='models/aishell/mean_std.npz' \
--vocab_path='models/aishell/vocab.txt' \
--model_path='models/aishell/aishell_v1.8.pdparams' \
--lang_model_path='models/lm/zh_giga.no_cna_cmn.prune01244.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='cer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in inference!"
    exit 1
fi


exit 0
@@ -1,54 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_ch.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null

# download well-trained model
cd models/aishell > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# evaluate model
CUDA_VISIBLE_DEVICES=1 \
python3 -u test2x.py \
--batch_size=64 \
--beam_size=300 \
--feat_dim=161 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=1024 \
--alpha=2.6 \
--beta=5.0 \
--cutoff_prob=0.99 \
--cutoff_top_n=40 \
--use_gru=True \
--use_gpu=True \
--share_rnn_weights=False \
--test_manifest='data/aishell/manifest.test' \
--mean_std_path='models/aishell/mean_std.npz' \
--vocab_path='models/aishell/vocab.txt' \
--model_path='models/aishell/aishell_v1.8.pdparams' \
--lang_model_path='models/lm/zh_giga.no_cna_cmn.prune01244.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='cer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi


exit 0
@@ -1,45 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--target_dir='../dataset/librispeech' \
--full_download='True'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train


# build vocabulary
python3 tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/librispeech/vocab.txt' \
--manifest_paths='data/librispeech/manifest.train'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
python3 tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
exit 0
@@ -1,55 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# download well-trained model
cd models/baidu_en8k > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# infer
CUDA_VISIBLE_DEVICES=0 \
python3 -u infer2x.py \
--num_samples=10 \
--beam_size=500 \
--feat_dim=161 \
--num_proc_bsearch=5 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=1024 \
--alpha=1.4 \
--beta=0.35 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=True \
--use_gpu=False \
--share_rnn_weights=False \
--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/baidu_en8k/mean_std.npz' \
--vocab_path='models/baidu_en8k/vocab.txt' \
--model_path='models/baidu_en8k/baidu_en8k_v1.8.pdparams' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in inference!"
    exit 1
fi


exit 0
@@ -1,55 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# download well-trained model
cd models/baidu_en8k > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# evaluate model
CUDA_VISIBLE_DEVICES=0 \
python3 -u test2x.py \
--batch_size=32 \
--beam_size=500 \
--feat_dim=161 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=1024 \
--alpha=1.4 \
--beta=0.35 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=True \
--use_gpu=False \
--share_rnn_weights=False \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/baidu_en8k/mean_std.npz' \
--vocab_path='models/baidu_en8k/vocab.txt' \
--model_path='models/baidu_en8k/baidu_en8k_v1.8.pdparams' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
@@ -1,45 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--target_dir='../dataset/librispeech' \
--full_download='True'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train


# build vocabulary
python3 tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/librispeech/vocab.txt' \
--manifest_paths='data/librispeech/manifest.train'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
python3 tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
exit 0
@@ -1,55 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# download well-trained model
cd models/librispeech > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# infer
CUDA_VISIBLE_DEVICES=0 \
python3 -u infer2x.py \
--num_samples=10 \
--beam_size=500 \
--feat_dim=161 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.5 \
--beta=0.3 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/librispeech_v1.8.pdparams' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in inference!"
    exit 1
fi


exit 0
@@ -1,55 +0,0 @@
#! /usr/bin/env bash

cd ../.. > /dev/null

# download language model
cd models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# download well-trained model
cd models/librispeech > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
    exit 1
fi
cd - > /dev/null


# evaluate model
CUDA_VISIBLE_DEVICES=0 \
python3 -u test2x.py \
--batch_size=32 \
--beam_size=500 \
--feat_dim=161 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.5 \
--beta=0.3 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/librispeech_v1.8.pdparams' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi


exit 0
@ -1,163 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inferer for DeepSpeech2 model."""
import argparse
import functools

import numpy as np
import paddle
import paddle.fluid as fluid
from data_utils.data import DataGenerator
from model_utils.model_check import check_cuda
from model_utils.model_check import check_version

from deepspeech.models.ds2 import DeepSpeech2Model as DS2
from utils.error_rate import cer
from utils.error_rate import wer
from utils.utility import add_arguments
from utils.utility import print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 10, "# of samples to infer.")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('feat_dim', int, 161, "Feature dim.")
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 2.5, "Coef of LM for beam search.")
add_arg('beta', float, 0.3, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights', bool, True,
        "Share input-hidden weights across bi-directional RNNs. Not for GRU.")
add_arg('infer_manifest', str,
        'data/librispeech/manifest.dev-clean',
        "Filepath of manifest to infer.")
add_arg('mean_std_path', str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
add_arg('lang_model_path', str,
        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('model_path', str,
        './checkpoints/libri/step_final',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('decoding_method', str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type', str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type', str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()


def infer():
    """Inference for DeepSpeech2."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
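
    # Note: num_samples is reused as the reader's batch size; the loop below
    # still iterates over every batch in the manifest.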

    # decoders only accept string encoded in utf-8
    vocab_list = [chars for chars in data_generator.vocab_list]
    for i, char in enumerate(vocab_list):
        if char == '':
            vocab_list[i] = " "
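    # An empty vocab entry is presumably the space token lost when the vocab
    # file was read line-by-line; restore it as a literal " ".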

    model = DS2(
        feat_size=args.feat_dim,
        dict_size=len(vocab_list),
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        blank_id=len(vocab_list) - 1)
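    # Convention here: the CTC blank label is the last entry of the
    # vocabulary, hence blank_id = len(vocab_list) - 1.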
    params_path = args.model_path
    model_dict = paddle.load(params_path)
    model.set_state_dict(model_dict)
    model.eval()
    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    print("start inference ...")
    for infer_data in batch_reader():
        audio, target_transcripts, audio_len, mask = infer_data
        audio = np.transpose(audio, (0, 2, 1))
        audio_len = audio_len.reshape(-1)
        audio = paddle.to_tensor(audio)
        audio_len = paddle.to_tensor(audio_len)

        result_transcripts = model.decode(
            audio=audio,
            audio_len=audio_len,
            lang_model_path=args.lang_model_path,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)
        for target, result in zip(target_transcripts, result_transcripts):
            print("\nTarget Transcription: %s\nOutput Transcription: %s" %
                  (target, result))
            print("Current error rate [%s] = %f" %
                  (args.error_rate_type, error_rate_func(target, result)))

    print("finish inference")


def main():
    print_arguments(args)
    infer()


if __name__ == '__main__':
    main()
@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,49 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

import paddle.fluid as fluid


def check_cuda(
        use_cuda,
        err="\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
    """
    Log error and exit when set use_gpu=true in paddlepaddle
    cpu version.
    """
    try:
        if use_cuda is True and fluid.is_compiled_with_cuda() is False:
            print(err)
            sys.exit(1)
    except Exception:
        # Never let the capability check itself crash the caller.
        pass


def check_version():
    """
    Log error and exit when the installed version of paddlepaddle is
    not satisfied.
    """
    err = "PaddlePaddle version 1.6 or higher is required, " \
          "or a suitable develop version is satisfied as well. \n" \
          "Please make sure the version is good with your code."

    try:
        fluid.require_version('1.6.0')
    except Exception:
        print(err)
        sys.exit(1)
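
# Typical usage (mirroring infer2x.py and test2x.py): run both checks before
# constructing the model.
#
#     check_cuda(args.use_gpu)
#     check_version()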
@ -1,169 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
import argparse
import functools

import numpy as np
import paddle
import paddle.fluid as fluid
from data_utils.data import DataGenerator
from model_utils.model_check import check_cuda
from model_utils.model_check import check_version

from deepspeech.models.ds2 import DeepSpeech2Model as DS2
from utils.error_rate import char_errors
from utils.error_rate import word_errors
from utils.utility import add_arguments
from utils.utility import print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 128, "Minibatch size.")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('feat_dim', int, 161, "Feature dim.")
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 2.5, "Coef of LM for beam search.")
add_arg('beta', float, 0.3, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights', bool, True, "Share input-hidden weights across "
        "bi-directional RNNs. Not for GRU.")
add_arg('test_manifest', str,
        'data/librispeech/manifest.test-clean',
        "Filepath of manifest to evaluate.")
add_arg('mean_std_path', str,
        'data/librispeech/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
        'data/librispeech/vocab.txt',
        "Filepath of vocabulary.")
add_arg('model_path', str,
        './checkpoints/libri/step_final',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path', str,
        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('decoding_method', str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type', str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
add_arg('specgram_type', str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
# yapf: enable
args = parser.parse_args()


def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars for chars in data_generator.vocab_list]
    for i, char in enumerate(vocab_list):
        if char == '':
            vocab_list[i] = " "

    model = DS2(
        feat_size=args.feat_dim,
        dict_size=len(vocab_list),
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        blank_id=len(vocab_list) - 1)

    params_path = args.model_path
    model_dict = paddle.load(params_path)
    model.set_state_dict(model_dict)
    model.eval()
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
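    # Corpus-level metric: accumulate errors and reference lengths over all
    # utterances; the reported rate is errors_sum / len_refs.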

    print("start evaluation ...")
    for infer_data in batch_reader():
        audio, target_transcripts, audio_len, mask = infer_data
        audio = np.transpose(audio, (0, 2, 1))
        audio_len = audio_len.reshape(-1)
        audio = paddle.to_tensor(audio)
        audio_len = paddle.to_tensor(audio_len)
        result_transcripts = model.decode(
            audio=audio,
            audio_len=audio_len,
            lang_model_path=args.lang_model_path,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)
        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, errors_sum / len_refs))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs))

    print("finish evaluation")


def main():
    print_arguments(args)
    evaluate()


if __name__ == '__main__':
    main()