In [1]:
%cd ..
%pwd

/home/ssd5/zhanghui/DeepSpeech2.x


'/home/ssd5/zhanghui/DeepSpeech2.x'

In [6]:
import argparse
import functools

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments

# Command-line style options for the mean/std statistics computation. They are
# parsed from an empty argv below, so inside the notebook every option takes
# its default value.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples',      int,    -1,    "# of samples to use for statistics.")
add_arg('specgram_type',    str,
        'fbank',
        "Audio feature type. Options: linear, mfcc, fbank.",
        choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim',    int, 80, "Audio feature dim.")
add_arg('delta_delta',    bool,
        False,
        "Audio feature with delta delta.")
add_arg('stride_ms',    float, 10.0,  "stride length in ms.")
add_arg('window_ms',    float, 25.0,  "window length in ms.")
add_arg('sample_rate',    int, 16000,  "target sample rate.")
add_arg('manifest_path',    str,
        'examples/aishell/s1/data/manifest.train.raw',
        "Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('num_workers',    int, 16,
        'num of subprocess workers for processing')
# NOTE(review): the default manifest is under examples/aishell while this
# default output lives under data/librispeech -- confirm that is intended.
add_arg('output_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of write mean and stddev to (.npz).")
# yapf: enable
# An explicit empty list keeps argparse away from the notebook kernel's argv.
args = parser.parse_args([])
print(args)

Namespace(delta_delta=False, feat_dim=80, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=16, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='fbank', stride_ms=10.0, window_ms=25.0)


In [7]:
import random

import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import Dataset

from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest

class CollateFunc(object):
    ''' Collate callable that reduces a batch of feature matrices to
    partial statistics.

    Calling an instance with a batch returns the tuple
    ``(frame_count, feature_sum, feature_square_sum)``: both sums are taken
    along axis 1 of every matrix and added up over the batch, and
    ``frame_count`` is the total of ``shape[1]`` over the batch. An empty
    batch yields ``(0, None, None)``.
    '''

    def __init__(self):
        ''' Nothing to configure. '''

    def __call__(self, batch):
        frame_count = 0
        feature_sum = None
        feature_square_sum = None

        for matrix in batch:
            # First-order statistic: per-row total of this matrix.
            row_totals = np.sum(matrix, axis=1)
            if feature_sum is not None:
                feature_sum += row_totals
            else:
                feature_sum = row_totals

            # Second-order statistic: per-row total of the squared values.
            row_square_totals = np.sum(np.square(matrix), axis=1)
            if feature_square_sum is not None:
                feature_square_sum += row_square_totals
            else:
                feature_square_sum = row_square_totals

            # Axis 1 is the one being reduced, so its length is what is counted.
            frame_count += matrix.shape[1]

        return frame_count, feature_sum, feature_square_sum


class AudioDataset(Dataset):
    ''' Dataset that loads one manifest entry and returns its features.

    Args:
        manifest_path: Path handed to ``read_manifest``.
        feature_func: Callable applied to the ``AudioSegment`` loaded for an
            entry; whatever it returns is what ``__getitem__`` returns.
        num_samples: Number of manifest entries to draw; ``-1`` keeps the
            whole manifest.
        rng: Object providing ``sample(population, k)`` (for example a
            ``random.Random``). When omitted, ``random.Random(random_seed)``
            is used so that sub-sampling no longer fails on ``None``.
        random_seed: Seed for the fallback generator; ignored when ``rng``
            is supplied.
    '''
    def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None,
                 random_seed=0):
        self.feature_func = feature_func
        # Previously a missing rng stayed None and `num_samples != -1`
        # crashed with AttributeError on `None.sample`.
        self._rng = rng if rng is not None else random.Random(random_seed)
        manifest = read_manifest(manifest_path)
        if num_samples == -1:
            sampled_manifest = manifest
        else:
            sampled_manifest = self._rng.sample(manifest, num_samples)
        self.items = sampled_manifest

    def __len__(self):
        ''' Number of (possibly sub-sampled) manifest entries. '''
        return len(self.items)

    def __getitem__(self, idx):
        ''' Load the audio referenced by entry ``idx`` and featurize it. '''
        # The 'feat' field of a manifest entry is passed to
        # AudioSegment.from_file as the audio location.
        key = self.items[idx]['feat']
        audioseg = AudioSegment.from_file(key)
        feat = self.feature_func(audioseg)  #(D, T)
        return feat

In [5]:

# '{}' is presumably an empty JSON config, i.e. no augmentors -- confirm
# against AugmentationPipeline.
augmentation_pipeline = AugmentationPipeline('{}')

# Feature extractor: the first group of settings comes from `args`, the
# second group is fixed for this notebook.
audio_featurizer = AudioFeaturizer(
    # taken from the parsed options
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    window_ms=args.window_ms,
    stride_ms=args.stride_ms,
    target_sample_rate=args.sample_rate,
    # fixed values
    n_fft=None,
    max_freq=None,
    use_dB_normalization=True,
    target_dB=-20)

def augment_and_featurize(audio_segment):
    """Run the augmentation pipeline on a segment, then featurize it.

    The value returned by ``transform_audio`` is discarded; the same
    ``audio_segment`` object is then passed to
    ``audio_featurizer.featurize`` and that result is returned.
    """
    augmentation_pipeline.transform_audio(audio_segment)
    features = audio_featurizer.featurize(audio_segment)
    return features


# Each batch is reduced to (frame count, feature sum, squared-feature sum) by
# the collate function; the loop below adds those partial results together.
collate_func = CollateFunc()

dataset = AudioDataset(
    args.manifest_path,
    augment_and_featurize, 
    args.num_samples)

batch_size = 20
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_func)

with paddle.no_grad():
    all_mean_stat = None
    all_var_stat = None
    all_number = 0
    wav_number = 0
    for i, batch in enumerate(data_loader()):
        number, mean_stat, var_stat = batch
        if i == 0:
            all_mean_stat = mean_stat
            all_var_stat = var_stat
        else:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
        all_number += number
        # The final batch can hold fewer than `batch_size` items, so cap the
        # running count at the dataset size instead of adding `batch_size`
        # unconditionally.
        wav_number = min((i + 1) * batch_size, len(dataset))

        if wav_number % 1000 == 0:
            print('process {} wavs,{} frames'.format(wav_number,
                                                           all_number))

# Without any batch the accumulators are still None; fail with a clear message
# rather than an AttributeError on `None.tolist()`.
if all_mean_stat is None:
    raise ValueError(
        'no batches were produced from manifest: {}'.format(args.manifest_path))

cmvn_info = {
    'mean_stat': all_mean_stat.tolist(),
    'var_stat': all_var_stat.tolist(),
    'frame_num': all_number
}

process 1000 wavs,450739 frames
process 2000 wavs,887447 frames
process 3000 wavs,1354148 frames
process 4000 wavs,1816494 frames
process 5000 wavs,2359211 frames
process 6000 wavs,2828455 frames
process 7000 wavs,3276186 frames
process 8000 wavs,3692234 frames
process 9000 wavs,4139360 frames
process 10000 wavs,4591528 frames
process 11000 wavs,5020114 frames
process 12000 wavs,5459523 frames
process 13000 wavs,5899534 frames
process 14000 wavs,6323242 frames
process 15000 wavs,6736597 frames
process 16000 wavs,7207686 frames
process 17000 wavs,7637800 frames
process 18000 wavs,8093004 frames
process 19000 wavs,8529518 frames
process 20000 wavs,8906022 frames
process 21000 wavs,9352652 frames
process 22000 wavs,9807495 frames
process 23000 wavs,10247938 frames
process 24000 wavs,10700011 frames
process 25000 wavs,11126134 frames
process 26000 wavs,11558061 frames
process 27000 wavs,12010359 frames
process 28000 wavs,12470938 frames
process 29000 wavs,12916013 frames
process 30000 wavs

In [9]:
# Show the accumulated statistics dict ('mean_stat', 'var_stat', 'frame_num').
print(cmvn_info)

{'mean_stat': [-813852467.7953382, -769025957.9140725, -809499593.411409, -774700574.014532, -750961217.5896736, -760564397.2864963, -805662399.3771614, -843490965.4231446, -850242081.9416809, -857678651.504435, -879067453.9826999, -908602072.3856701, -936850957.7187386, -957242686.489041, -968425442.0916103, -972687545.5953809, -980383731.7683417, -991533337.6343704, -1001966818.1164789, -1010334169.7486078, -1016855066.9099333, -1022176245.7021623, -1025700476.4788507, -1030678878.3195274, -1037075963.124199, -1042705719.0195516, -1047422212.6492896, -1049003537.271861, -1050314833.7453628, -1050772191.0204058, -1050010034.9948177, -1050436065.1336465, -1053327181.7978873, -1058710548.2036785, -1065950852.4966162, -1071709705.0060445, -1077682778.259181, -1083371045.272074, -1089708906.2657735, -1096312217.7865202, -1101089858.8364556, -1104965332.4332569, -1107791702.5223634, -1109431075.2374773, -1110066333.0280604, -1110382732.0722318, -1110480306.3793216, -1110203297.7110727, -11

In [7]:
import random

import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import Dataset

from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest

# https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object):
    ''' Collate callable that loads, featurizes and reduces a batch of
    manifest entries.

    For every entry, the audio named by its ``'feat'`` field is opened with
    ``AudioSegment.from_file`` and passed to ``feature_func``. The resulting
    matrices are reduced to ``(frame_count, feature_sum,
    feature_square_sum)``: both sums run along axis 1 and are added up over
    the batch, and ``frame_count`` totals ``shape[1]``. An empty batch yields
    ``(0, None, None)``.
    '''

    def __init__(self, feature_func):
        self.feature_func = feature_func

    def __call__(self, batch):
        frame_count = 0
        feature_sum = None
        feature_square_sum = None

        for entry in batch:
            segment = AudioSegment.from_file(entry['feat'])
            matrix = self.feature_func(segment)  #(D, T)

            # First-order statistic: per-row total of this matrix.
            row_totals = np.sum(matrix, axis=1)
            if feature_sum is not None:
                feature_sum += row_totals
            else:
                feature_sum = row_totals

            # Second-order statistic: per-row total of the squared values.
            row_square_totals = np.sum(np.square(matrix), axis=1)
            if feature_square_sum is not None:
                feature_square_sum += row_square_totals
            else:
                feature_square_sum = row_square_totals

            frame_count += matrix.shape[1]

        return frame_count, feature_sum, feature_square_sum


class AudioDataset(Dataset):
    ''' Dataset over raw manifest entries.

    Items are returned exactly as stored; no audio is loaded here.

    Args:
        manifest_path: Path handed to ``read_manifest``.
        num_samples: Number of entries to draw without replacement; ``-1``
            keeps the whole manifest.
        rng: Generator providing ``choice``; when falsy, a
            ``np.random.RandomState(random_seed)`` is created.
        random_seed: Seed for the fallback generator.
    '''

    def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
        if not rng:
            rng = np.random.RandomState(random_seed)
        self._rng = rng

        entries = read_manifest(manifest_path)
        if num_samples != -1:
            entries = self._rng.choice(entries, num_samples, replace=False)
        self.items = entries

    def __len__(self):
        ''' Number of retained manifest entries. '''
        return len(self.items)

    def __getitem__(self, idx):
        ''' Return the manifest entry stored at position ``idx``. '''
        entry = self.items[idx]
        return entry
    
    
# '{}' is presumably an empty JSON config, i.e. no augmentors -- confirm
# against AugmentationPipeline.
augmentation_pipeline = AugmentationPipeline('{}')

# Feature extractor: the first group of settings comes from `args`, the
# second group is fixed for this notebook.
audio_featurizer = AudioFeaturizer(
    # taken from the parsed options
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    window_ms=args.window_ms,
    stride_ms=args.stride_ms,
    target_sample_rate=args.sample_rate,
    # fixed values
    n_fft=None,
    max_freq=None,
    use_dB_normalization=True,
    target_dB=-20)

def augment_and_featurize(audio_segment):
    """Apply the augmentation pipeline to a segment and return its features.

    ``transform_audio`` is called for its effect only (its return value is
    ignored); the result of ``audio_featurizer.featurize`` on the same
    segment object is returned.
    """
    augmentation_pipeline.transform_audio(audio_segment)
    extracted = audio_featurizer.featurize(audio_segment)
    return extracted


# Feature extraction happens inside the collate function here; the dataset
# only hands out manifest entries. Each batch is reduced to
# (frame count, feature sum, squared-feature sum) and summed in the loop below.
collate_func = CollateFunc(augment_and_featurize)

dataset = AudioDataset(
    args.manifest_path,
    args.num_samples)

batch_size = 20
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_func)

with paddle.no_grad():
    all_mean_stat = None
    all_var_stat = None
    all_number = 0
    wav_number = 0
    for i, batch in enumerate(data_loader):
        number, mean_stat, var_stat = batch
        if i == 0:
            all_mean_stat = mean_stat
            all_var_stat = var_stat
        else:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
        all_number += number
        # The final batch can hold fewer than `batch_size` items, so cap the
        # running count at the dataset size instead of adding `batch_size`
        # unconditionally.
        wav_number = min((i + 1) * batch_size, len(dataset))

        if wav_number % 1000 == 0:
            print('process {} wavs,{} frames'.format(wav_number,
                                                           all_number))

# Without any batch the accumulators are still None; fail with a clear message
# rather than an AttributeError on `None.tolist()`.
if all_mean_stat is None:
    raise ValueError(
        'no batches were produced from manifest: {}'.format(args.manifest_path))

cmvn_info = {
    'mean_stat': all_mean_stat.tolist(),
    'var_stat': all_var_stat.tolist(),
    'frame_num': all_number
}
print(cmvn_info)

<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 1000 wavs,450240 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 2000 wavs,886411 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 3000 wavs,1352580 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 4000 wavs,1814397 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 5000 wavs,2356587 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 6000 wavs,2825310 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 7000 wavs,3272506 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 8000 wavs,3688045 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 9000 wavs,4134669 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 10000 wavs,4586357 frames
<class 'int'> <class 'paddle.Va

<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 85000 wavs,37943596 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 86000 wavs,38371620 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 87000 wavs,38844874 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 88000 wavs,39292686 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 89000 wavs,39746715 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 90000 wavs,40241800 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 91000 wavs,40672817 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 92000 wavs,41131773 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 93000 wavs,41612001 frames
<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>
process 94000 wavs,42084822 frames
<class 'in