In [1]:
import math
import random
import tarfile
import logging
import numpy as np
from collections import namedtuple
from functools import partial

import paddle
from paddle.io import Dataset
from paddle.io import DataLoader
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist

from data_utils.utility import read_manifest
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer


from data_utils.dataset import (
    DeepSpeech2Dataset,
    DeepSpeech2DistributedBatchSampler,
    DeepSpeech2BatchSampler,
    SpeechCollator,
)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def convert_to_list(value, n, name, dtype=np.int):
  from numpy.dual import register_func
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  long_ = _make_signed(np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ulong = _make_unsigned(np.long)


In [20]:
def create_dataloader(manifest_path,
                      vocab_filepath,
                      mean_std_filepath,
                      augmentation_config='{}',
                      max_duration=float('inf'),
                      min_duration=0.0,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None,
                      specgram_type='linear',
                      use_dB_normalization=True,
                      random_seed=0,
                      keep_transcription_text=False,
                      is_training=False,
                      batch_size=1,
                      num_workers=0,
                      sortagrad=False,
                      shuffle_method=None,
                      dist=False):
    """Build a ``DataLoader`` over a ``DeepSpeech2Dataset``.

    Every argument from ``manifest_path`` through ``keep_transcription_text``
    is forwarded unchanged to ``DeepSpeech2Dataset``.

    Args:
        is_training: Passed to the batch sampler as ``shuffle``, ``drop_last``
            and ``sortagrad``.
        batch_size: Batch size handed to the batch sampler.
        num_workers: Forwarded to ``DataLoader``.
        sortagrad: NOTE(review): accepted but never forwarded; the samplers
            receive ``sortagrad=is_training`` instead. Left as-is so existing
            callers keep their current behaviour -- confirm the intent.
        shuffle_method: Forwarded to the batch sampler.
        dist: When truthy a ``DeepSpeech2DistributedBatchSampler`` is used,
            otherwise a ``DeepSpeech2BatchSampler``. Inside this function the
            name shadows the module-level ``dist`` alias.

    Returns:
        A ``DataLoader`` whose collate function yields the tuple
        ``(padded_audios, texts, audio_lens, text_lens)``.
    """
    dataset = DeepSpeech2Dataset(
        manifest_path,
        vocab_filepath,
        mean_std_filepath,
        augmentation_config=augmentation_config,
        max_duration=max_duration,
        min_duration=min_duration,
        stride_ms=stride_ms,
        window_ms=window_ms,
        max_freq=max_freq,
        specgram_type=specgram_type,
        use_dB_normalization=use_dB_normalization,
        random_seed=random_seed,
        keep_transcription_text=keep_transcription_text)

    if dist:
        batch_sampler = DeepSpeech2DistributedBatchSampler(
            dataset,
            batch_size,
            num_replicas=None,
            rank=None,
            shuffle=is_training,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)
    else:
        batch_sampler = DeepSpeech2BatchSampler(
            dataset,
            shuffle=is_training,
            batch_size=batch_size,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)

    def padding_batch(batch, padding_to=-1, flatten=False):
        """Zero-pad every (audio, text) pair of a batch to a common shape.

        If ``padding_to`` is -1, the maximum second-axis length found in the
        batch is used as the target audio length. Otherwise ``padding_to`` is
        the target length (second axis only) and must not be smaller than any
        instance in the batch.

        If ``flatten`` is True, each padded audio feature is flattened to a
        1-D array.

        A text that is a ``str`` is stored as its Unicode code points; any
        other text is stored as-is (token ids).

        Returns:
            ``(padded_audios, texts, audio_lens, text_lens)`` as numpy arrays
            of dtype float32, int32, int64 and int64 respectively.

        Raises:
            ValueError: if ``padding_to`` is set and is smaller than the
                longest audio in the batch.
        """
        # Target shapes for this batch.
        max_length = max(audio.shape[1] for audio, _ in batch)
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
        max_text_length = max(len(text) for _, text in batch)

        padded_audios, audio_lens = [], []
        texts, text_lens = [], []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            audio_lens.append(audio.shape[1])

            # Choose the encoding from what the dataset actually produced
            # rather than from the training flag: a raw transcript cannot be
            # written into a numeric array, and ord() fails on integer ids.
            if isinstance(text, str):
                tokens = [ord(ch) for ch in text]
            else:
                tokens = text
            padded_text = np.zeros([max_text_length])
            padded_text[:len(tokens)] = tokens
            texts.append(padded_text)
            text_lens.append(len(tokens))

        padded_audios = np.array(padded_audios).astype('float32')
        audio_lens = np.array(audio_lens).astype('int64')
        texts = np.array(texts).astype('int32')
        text_lens = np.array(text_lens).astype('int64')
        return padded_audios, texts, audio_lens, text_lens

    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=padding_batch,
        num_workers=num_workers)
    return loader

In [21]:
import sys
import argparse
import functools
from utils.utility import add_arguments, print_arguments
# Inference configuration, declared through the project's argparse helper so
# the notebook mirrors the command-line script's options.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# One entry per option: (name, type, default, help, extra keyword arguments).
# Registration order is kept because it determines the order of vars(args).
# yapf: disable
_ARG_SPECS = [
    ('num_samples',       int,   5,     "# of samples to infer.", {}),
    ('beam_size',         int,   500,   "Beam search width.", {}),
    ('num_proc_bsearch',  int,   8,     "# of CPUs for beam search.", {}),
    ('num_conv_layers',   int,   2,     "# of convolution layers.", {}),
    ('num_rnn_layers',    int,   3,     "# of recurrent layers.", {}),
    ('rnn_layer_size',    int,   2048,  "# of recurrent cells per layer.", {}),
    ('alpha',             float, 2.5,   "Coef of LM for beam search.", {}),
    ('beta',              float, 0.3,   "Coef of WC for beam search.", {}),
    ('cutoff_prob',       float, 1.0,   "Cutoff probability for pruning.", {}),
    ('cutoff_top_n',      int,   40,    "Cutoff number for pruning.", {}),
    ('use_gru',           bool,  False, "Use GRUs instead of simple RNNs.", {}),
    ('use_gpu',           bool,  True,  "Use GPU or not.", {}),
    ('share_rnn_weights', bool,  True,
     "Share input-hidden weights across bi-directional RNNs. Not for GRU.", {}),
    ('infer_manifest',    str,   'examples/aishell/data/manifest.dev',
     "Filepath of manifest to infer.", {}),
    ('mean_std_path',     str,   'examples/aishell/data/mean_std.npz',
     "Filepath of normalizer's mean & std.", {}),
    ('vocab_path',        str,   'examples/aishell/data/vocab.txt',
     "Filepath of vocabulary.", {}),
    ('lang_model_path',   str,   'models/lm/common_crawl_00.prune01111.trie.klm',
     "Filepath for language model.", {}),
    ('model_path',        str,   'examples/aishell/checkpoints/step_final',
     "If None, the training starts from scratch, "
     "otherwise, it resumes from the pre-trained model.", {}),
    ('decoding_method',   str,   'ctc_beam_search',
     "Decoding method. Options: ctc_beam_search, ctc_greedy",
     {'choices': ['ctc_beam_search', 'ctc_greedy']}),
    ('error_rate_type',   str,   'wer',
     "Error rate type for evaluation.",
     {'choices': ['wer', 'cer']}),
    ('specgram_type',     str,   'linear',
     "Audio feature type. Options: linear, mfcc.",
     {'choices': ['linear', 'mfcc']}),
]
# yapf: enable

for _name, _type, _default, _help, _extra in _ARG_SPECS:
    add_arg(_name, _type, _default, _help, **_extra)

# Parse an explicit empty argv so the kernel's own command line is ignored
# and every option takes its default.
args = parser.parse_args([])
print(vars(args))

{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}


In [22]:
# Keyword arguments for the inference data loader, gathered in one place so
# they can be inspected or tweaked before the loader is built.
infer_loader_kwargs = {
    # Input files come from the parsed arguments.
    "manifest_path": args.infer_manifest,
    "vocab_filepath": args.vocab_path,
    "mean_std_filepath": args.mean_std_path,
    # No augmentation for inference.
    "augmentation_config": '{}',
    # Utterances longer than 27 seconds are excluded.
    "max_duration": 27.0,
    "min_duration": 0.0,
    "stride_ms": 10.0,
    "window_ms": 20.0,
    "max_freq": None,
    "specgram_type": args.specgram_type,
    "use_dB_normalization": True,
    "random_seed": 0,
    # Keep raw transcripts instead of token ids.
    "keep_transcription_text": True,
    "is_training": False,
    "batch_size": args.num_samples,
    "sortagrad": True,
    "shuffle_method": None,
    "dist": False,
}

batch_reader = create_dataloader(**infer_loader_kwargs)

In [30]:
# Inspect the first batch only.
# The loader's collate function returns
# (padded_audios, texts, audio_lens, text_lens), so the batch must be unpacked
# in that order; swapping the middle two would bind `text` to the 1-D audio
# lengths and break the slicing below.
for audio, text, audio_len, text_len in batch_reader():
    print('test', text)
    # Transcripts were stored as Unicode code points by the collate function,
    # so chr() on the unpadded prefix recovers the original characters.
    print("test raw", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))
    print("test raw", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))
    print('audio len', audio_len)
    print('test len', text_len)
    print('audio', audio)
    break

test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,
       [[22823, 26102, 20195, 37324, 0    , 0    ],
        [22238, 26469, 23601, 22909, 0    , 0    ],
        [20108, 26376, 22235, 26085, 0    , 0    ],
        [36824, 35201, 20445, 25345, 32654, 24863],
        [29042, 27748, 21463, 23456, 0    , 0    ]])
test raw 大时代里
test raw 煲汤受宠
audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,
       [163, 167, 180, 186, 186])
test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
       [4, 4, 4, 6, 4])
audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,
       [[[ 1.11669052,  0.79015088,  0.93658292, ...,  0.        ,  0.        ,  0.        ],
         [ 0.83549136,  0.72643483,  0.83578080, ...,  0.        ,  0.        ,  0.        ],
         [-0.89155018, -0.18894747, -0.53357804, ...,  0.        ,  0.        ,  0.        ],
         ...,
         [ 0.333867