|
|
|
@ -15,10 +15,69 @@
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import paddle
|
|
|
|
|
from paddle.io import DataLoader
|
|
|
|
|
from paddle.io import Dataset
|
|
|
|
|
|
|
|
|
|
from deepspeech.frontend.audio import AudioSegment
|
|
|
|
|
from deepspeech.frontend.utility import load_cmvn
|
|
|
|
|
from deepspeech.frontend.utility import read_manifest
|
|
|
|
|
from deepspeech.utils.log import Log
|
|
|
|
|
|
|
|
|
|
__all__ = ["FeatureNormalizer"]
|
|
|
|
|
|
|
|
|
|
logger = Log(__name__).getlog()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CollateFunc(object):
|
|
|
|
|
''' Collate function for AudioDataset
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
def __call__(self, batch):
|
|
|
|
|
mean_stat = None
|
|
|
|
|
var_stat = None
|
|
|
|
|
number = 0
|
|
|
|
|
for feat in batch:
|
|
|
|
|
sums = np.sum(feat, axis=1)
|
|
|
|
|
if mean_stat is None:
|
|
|
|
|
mean_stat = sums
|
|
|
|
|
else:
|
|
|
|
|
mean_stat += sums
|
|
|
|
|
|
|
|
|
|
square_sums = np.sum(np.square(feat), axis=1)
|
|
|
|
|
if var_stat is None:
|
|
|
|
|
var_stat = square_sums
|
|
|
|
|
else:
|
|
|
|
|
var_stat += square_sums
|
|
|
|
|
|
|
|
|
|
number += feat.shape[1]
|
|
|
|
|
return paddle.to_tensor(number), paddle.to_tensor(
|
|
|
|
|
mean_stat), paddle.to_tensor(var_stat)
|
|
|
|
|
#return number, mean_stat, var_stat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AudioDataset(Dataset):
|
|
|
|
|
def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):
|
|
|
|
|
self.feature_func = feature_func
|
|
|
|
|
self._rng = rng
|
|
|
|
|
manifest = read_manifest(manifest_path)
|
|
|
|
|
if num_samples == -1:
|
|
|
|
|
sampled_manifest = manifest
|
|
|
|
|
else:
|
|
|
|
|
sampled_manifest = self._rng.sample(manifest, num_samples)
|
|
|
|
|
self.items = sampled_manifest
|
|
|
|
|
|
|
|
|
|
def __len__(self):
|
|
|
|
|
return len(self.items)
|
|
|
|
|
|
|
|
|
|
def __getitem__(self, idx):
|
|
|
|
|
key = self.items[idx]['feat']
|
|
|
|
|
audioseg = AudioSegment.from_file(key)
|
|
|
|
|
feat = self.feature_func(audioseg) #(D, T)
|
|
|
|
|
return feat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FeatureNormalizer(object):
|
|
|
|
@ -49,13 +108,15 @@ class FeatureNormalizer(object):
|
|
|
|
|
manifest_path=None,
|
|
|
|
|
featurize_func=None,
|
|
|
|
|
num_samples=500,
|
|
|
|
|
num_workers=0,
|
|
|
|
|
random_seed=0):
|
|
|
|
|
if not mean_std_filepath:
|
|
|
|
|
if not (manifest_path and featurize_func):
|
|
|
|
|
raise ValueError("If mean_std_filepath is None, meanifest_path "
|
|
|
|
|
"and featurize_func should not be None.")
|
|
|
|
|
self._rng = random.Random(random_seed)
|
|
|
|
|
self._compute_mean_std(manifest_path, featurize_func, num_samples)
|
|
|
|
|
self._compute_mean_std(manifest_path, featurize_func, num_samples,
|
|
|
|
|
num_workers)
|
|
|
|
|
else:
|
|
|
|
|
self._read_mean_std_from_file(mean_std_filepath)
|
|
|
|
|
|
|
|
|
@ -71,37 +132,79 @@ class FeatureNormalizer(object):
|
|
|
|
|
"""
|
|
|
|
|
return (features - self._mean) * self._istd
|
|
|
|
|
|
|
|
|
|
def _read_mean_std_from_file(self, filepath, eps=1e-20):
|
|
|
|
|
"""Load mean and std from file."""
|
|
|
|
|
mean, istd = load_cmvn(filepath, filetype='json')
|
|
|
|
|
self._mean = mean
|
|
|
|
|
self._istd = istd
|
|
|
|
|
|
|
|
|
|
def write_to_file(self, filepath):
|
|
|
|
|
"""Write the mean and stddev to the file.
|
|
|
|
|
|
|
|
|
|
:param filepath: File to write mean and stddev.
|
|
|
|
|
:type filepath: str
|
|
|
|
|
"""
|
|
|
|
|
np.savez(filepath, mean=self._mean, istd=self._istd)
|
|
|
|
|
|
|
|
|
|
def _read_mean_std_from_file(self, filepath, eps=1e-20):
|
|
|
|
|
"""Load mean and std from file."""
|
|
|
|
|
mean, istd = load_cmvn(filepath, filetype='npz')
|
|
|
|
|
self._mean = mean.T
|
|
|
|
|
self._istd = istd.T
|
|
|
|
|
with open(filepath, 'w') as fout:
|
|
|
|
|
fout.write(json.dumps(self.cmvn_info))
|
|
|
|
|
|
|
|
|
|
def _compute_mean_std(self,
|
|
|
|
|
manifest_path,
|
|
|
|
|
featurize_func,
|
|
|
|
|
num_samples,
|
|
|
|
|
num_workers,
|
|
|
|
|
eps=1e-20):
|
|
|
|
|
"""Compute mean and std from randomly sampled instances."""
|
|
|
|
|
manifest = read_manifest(manifest_path)
|
|
|
|
|
if num_samples == -1:
|
|
|
|
|
sampled_manifest = manifest
|
|
|
|
|
else:
|
|
|
|
|
sampled_manifest = self._rng.sample(manifest, num_samples)
|
|
|
|
|
features = []
|
|
|
|
|
for instance in sampled_manifest:
|
|
|
|
|
features.append(
|
|
|
|
|
featurize_func(AudioSegment.from_file(instance["feat"])))
|
|
|
|
|
features = np.hstack(features) #(D, T)
|
|
|
|
|
self._mean = np.mean(features, axis=1) #(D,)
|
|
|
|
|
std = np.std(features, axis=1) #(D,)
|
|
|
|
|
std = np.clip(std, eps, None)
|
|
|
|
|
self._istd = 1.0 / std
|
|
|
|
|
# manifest = read_manifest(manifest_path)
|
|
|
|
|
# if num_samples == -1:
|
|
|
|
|
# sampled_manifest = manifest
|
|
|
|
|
# else:
|
|
|
|
|
# sampled_manifest = self._rng.sample(manifest, num_samples)
|
|
|
|
|
# features = []
|
|
|
|
|
# for instance in sampled_manifest:
|
|
|
|
|
# features.append(
|
|
|
|
|
# featurize_func(AudioSegment.from_file(instance["feat"])))
|
|
|
|
|
# features = np.hstack(features) #(D, T)
|
|
|
|
|
# self._mean = np.mean(features, axis=1) #(D,)
|
|
|
|
|
# std = np.std(features, axis=1) #(D,)
|
|
|
|
|
# std = np.clip(std, eps, None)
|
|
|
|
|
# self._istd = 1.0 / std
|
|
|
|
|
|
|
|
|
|
collate_func = CollateFunc()
|
|
|
|
|
|
|
|
|
|
dataset = AudioDataset(manifest_path, featurize_func, num_samples)
|
|
|
|
|
|
|
|
|
|
batch_size = 20
|
|
|
|
|
data_loader = DataLoader(
|
|
|
|
|
dataset,
|
|
|
|
|
batch_size=batch_size,
|
|
|
|
|
shuffle=False,
|
|
|
|
|
num_workers=num_workers,
|
|
|
|
|
collate_fn=collate_func)
|
|
|
|
|
|
|
|
|
|
with paddle.no_grad():
|
|
|
|
|
all_mean_stat = None
|
|
|
|
|
all_var_stat = None
|
|
|
|
|
all_number = 0
|
|
|
|
|
wav_number = 0
|
|
|
|
|
for batch in data_loader():
|
|
|
|
|
number, mean_stat, var_stat = batch
|
|
|
|
|
if all_mean_stat is None:
|
|
|
|
|
all_mean_stat = mean_stat
|
|
|
|
|
all_var_stat = var_stat
|
|
|
|
|
else:
|
|
|
|
|
all_mean_stat += mean_stat
|
|
|
|
|
all_var_stat += var_stat
|
|
|
|
|
all_number += number
|
|
|
|
|
wav_number += batch_size
|
|
|
|
|
|
|
|
|
|
if wav_number % 1000 == 0:
|
|
|
|
|
logger.info('process {} wavs,{} frames'.format(
|
|
|
|
|
wav_number, int(all_number)))
|
|
|
|
|
|
|
|
|
|
self.cmvn_info = {
|
|
|
|
|
'mean_stat': list(all_mean_stat.tolist()),
|
|
|
|
|
'var_stat': list(all_var_stat.tolist()),
|
|
|
|
|
'frame_num': int(all_number),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return self.cmvn_info
|
|
|
|
|