parent 1ae41eac90
commit 4da2852982
@@ -0,0 +1,9 @@
ThreadPool/
build/
dist/
kenlm/
openfst-1.6.3/
openfst-1.6.3.tar.gz
swig_decoders.egg-info/
decoders_wrap.cxx
swig_decoders.py
@@ -1,584 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import random
import tarfile
import logging
import numpy as np
from collections import namedtuple
from functools import partial

import paddle
from paddle.io import Dataset
from paddle.io import DataLoader
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist

from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.normalizer import FeatureNormalizer

logger = logging.getLogger(__name__)

__all__ = [
    "DeepSpeech2Dataset",
    "DeepSpeech2DistributedBatchSampler",
    "DeepSpeech2BatchSampler",
    "SpeechCollator",
]


class DeepSpeech2Dataset(Dataset):
    def __init__(self,
                 manifest_path,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 target_dB=-20,
                 random_seed=0,
                 keep_transcription_text=False):
        super().__init__()

        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        # for caching tar file info
        self._local_data = namedtuple('local_data', ['tar2info', 'tar2object'])
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}

        # read manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_duration=self._max_duration,
            min_duration=self._min_duration)
        self._manifest.sort(key=lambda x: x["duration"])

    @property
    def manifest(self):
        return self._manifest

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary as a list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map from member names to tarinfo entries.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get a subfile object from a tar archive.

        It returns a subfile object from the tar file
        and caches the tar file info for the next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            object, infoes = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infoes
            self._local_data.tar2object[tarpath] = object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of the transcription
                 part, where the transcription part can be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of an ndarray of audio spectrogram and a list of
        token indices for the transcript.
        """

        def reader():
            for instance in manifest:
                inst = self.process_utterance(instance["audio_filepath"],
                                              instance["text"])
                yield inst

        return reader

    def __len__(self):
        return len(self._manifest)

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        return self.process_utterance(instance["audio_filepath"],
                                      instance["text"])


class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: Indices of instances.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch-shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert clipped == False
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, the slice returns the whole list:
            # len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(
                    f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
                )
            else:
                logger.info(
                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
                )
                if self._shuffle_method == "batch_shuffle":
                    indices = self._batch_shuffle(
                        indices, self.batch_size, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        # subsample
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
            assert last_batch_size % self.nranks == 0
            last_local_batch_size = last_batch_size // self.nranks

            for i in range(self.local_rank * self.batch_size,
                           len(indices) - last_batch_size,
                           self.batch_size * self.nranks):
                subsampled_indices.extend(indices[i:i + self.batch_size])

            indices = indices[len(indices) - last_batch_size:]
            subsampled_indices.extend(
                indices[self.local_rank * last_local_batch_size:(
                    self.local_rank + 1) * last_local_batch_size])
            return subsampled_indices

        if self.nranks > 1:
            indices = _get_indices_by_batch_size(indices)

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.info(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size


class DeepSpeech2BatchSampler(BatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean value"

        self.drop_last = drop_last
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
        self.total_size = self.num_samples
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: Indices of instances.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch-shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        # must shift at least by one
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert clipped == False
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, the slice returns the whole list:
            # len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(f'dataset sortagrad! epoch {self.epoch}')
            else:
                logger.info(f'dataset shuffle! epoch {self.epoch}')
                if self._shuffle_method == "batch_shuffle":
                    indices = self._batch_shuffle(
                        indices, self.batch_size, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.info(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

        self.epoch += 1

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size


class SpeechCollator():
    def __init__(self, padding_to=-1, is_training=True):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).
        """
        self._padding_to = padding_to
        self._is_training = is_training

    def __call__(self, batch):
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, _ in batch])
        if self._padding_to != -1:
            if self._padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = self._padding_to
        max_text_length = max([len(text) for _, text in batch])
        # padding
        padded_audios = []
        audio_lens = []
        texts, text_lens = [], []
        for audio, text in batch:
            # audio
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            padded_audios.append(padded_audio)
            audio_lens.append(audio.shape[1])
            # text
            padded_text = np.zeros([max_text_length])
            if self._is_training:
                padded_text[:len(text)] = text  # token ids
            else:
                padded_text[:len(text)] = [ord(t) for t in text]  # string
            texts.append(padded_text)
            text_lens.append(len(text))

        padded_audios = np.array(padded_audios).astype('float32')
        audio_lens = np.array(audio_lens).astype('int64')
        texts = np.array(texts).astype('int32')
        text_lens = np.array(text_lens).astype('int64')
        return padded_audios, texts, audio_lens, text_lens


def create_dataloader(manifest_path,
                      vocab_filepath,
                      mean_std_filepath,
                      augmentation_config='{}',
                      max_duration=float('inf'),
                      min_duration=0.0,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None,
                      specgram_type='linear',
                      use_dB_normalization=True,
                      random_seed=0,
                      keep_transcription_text=False,
                      is_training=False,
                      batch_size=1,
                      num_workers=0,
                      sortagrad=False,
                      shuffle_method=None,
                      dist=False):

    dataset = DeepSpeech2Dataset(
        manifest_path,
        vocab_filepath,
        mean_std_filepath,
        augmentation_config=augmentation_config,
        max_duration=max_duration,
        min_duration=min_duration,
        stride_ms=stride_ms,
        window_ms=window_ms,
        max_freq=max_freq,
        specgram_type=specgram_type,
        use_dB_normalization=use_dB_normalization,
        random_seed=random_seed,
        keep_transcription_text=keep_transcription_text)

    if dist:
        batch_sampler = DeepSpeech2DistributedBatchSampler(
            dataset,
            batch_size,
            num_replicas=None,
            rank=None,
            shuffle=is_training,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)
    else:
        batch_sampler = DeepSpeech2BatchSampler(
            dataset,
            shuffle=is_training,
            batch_size=batch_size,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)

    def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flattened to a 1-D array.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
        max_text_length = max([len(text) for audio, text in batch])
        # padding
        padded_audios = []
        audio_lens = []
        texts, text_lens = [], []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            audio_lens.append(audio.shape[1])

            padded_text = np.zeros([max_text_length])
            if is_training:
                padded_text[:len(text)] = text  # token ids
            else:
                padded_text[:len(text)] = [ord(t) for t in text]  # string
            texts.append(padded_text)
            text_lens.append(len(text))

        padded_audios = np.array(padded_audios).astype('float32')
        audio_lens = np.array(audio_lens).astype('int64')
        texts = np.array(texts).astype('int32')
        text_lens = np.array(text_lens).astype('int64')
        return padded_audios, texts, audio_lens, text_lens

    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=partial(padding_batch, is_training=is_training),
        num_workers=num_workers)
    return loader
@@ -0,0 +1,128 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from functools import partial

from paddle.io import DataLoader

from deepspeech.io.collator import SpeechCollator
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.dataset import ManifestDataset


def create_dataloader(manifest_path,
                      vocab_filepath,
                      mean_std_filepath,
                      augmentation_config='{}',
                      max_duration=float('inf'),
                      min_duration=0.0,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None,
                      specgram_type='linear',
                      use_dB_normalization=True,
                      random_seed=0,
                      keep_transcription_text=False,
                      is_training=False,
                      batch_size=1,
                      num_workers=0,
                      sortagrad=False,
                      shuffle_method=None,
                      dist=False):

    dataset = ManifestDataset(
        manifest_path,
        vocab_filepath,
        mean_std_filepath,
        augmentation_config=augmentation_config,
        max_duration=max_duration,
        min_duration=min_duration,
        stride_ms=stride_ms,
        window_ms=window_ms,
        max_freq=max_freq,
        specgram_type=specgram_type,
        use_dB_normalization=use_dB_normalization,
        random_seed=random_seed,
        keep_transcription_text=keep_transcription_text)

    if dist:
        batch_sampler = SortagradDistributedBatchSampler(
            dataset,
            batch_size,
            num_replicas=None,
            rank=None,
            shuffle=is_training,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)
    else:
        batch_sampler = SortagradBatchSampler(
            dataset,
            shuffle=is_training,
            batch_size=batch_size,
            drop_last=is_training,
            sortagrad=is_training,
            shuffle_method=shuffle_method)

    def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flattened to a 1-D array.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
        max_text_length = max([len(text) for audio, text in batch])
        # padding
        padded_audios = []
        audio_lens = []
        texts, text_lens = [], []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            audio_lens.append(audio.shape[1])

            padded_text = np.zeros([max_text_length])
            if is_training:
                padded_text[:len(text)] = text  # token ids
            else:
                padded_text[:len(text)] = [ord(t) for t in text]  # string
            texts.append(padded_text)
            text_lens.append(len(text))

        padded_audios = np.array(padded_audios).astype('float32')
        audio_lens = np.array(audio_lens).astype('int64')
        texts = np.array(texts).astype('int32')
        text_lens = np.array(text_lens).astype('int64')
        return padded_audios, texts, audio_lens, text_lens

    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=partial(padding_batch, is_training=is_training),
        num_workers=num_workers)
    return loader
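For orientation, here is a minimal usage sketch of the new create_dataloader helper. This is not part of the commit; every path and hyperparameter below is a hypothetical placeholder, and it assumes the usual manifest, vocab, and mean-std files have already been prepared.

# Hypothetical paths; any files produced by the data-preparation scripts work.
train_loader = create_dataloader(
    manifest_path='data/manifest.train',
    vocab_filepath='data/vocab.txt',
    mean_std_filepath='data/mean_std.npz',
    batch_size=32,
    num_workers=4,
    is_training=True,
    sortagrad=True,
    shuffle_method='batch_shuffle',
    dist=False)

# Each batch follows the padding_batch contract defined above.
for padded_audios, texts, audio_lens, text_lens in train_loader:
    break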
@@ -0,0 +1,72 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import numpy as np
from collections import namedtuple

logger = logging.getLogger(__name__)

__all__ = [
    "SpeechCollator",
]


class SpeechCollator():
    def __init__(self, padding_to=-1, is_training=True):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).
        """
        self._padding_to = padding_to
        self._is_training = is_training

    def __call__(self, batch):
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, _ in batch])
        if self._padding_to != -1:
            if self._padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = self._padding_to
        max_text_length = max([len(text) for _, text in batch])
        # padding
        padded_audios = []
        audio_lens = []
        texts, text_lens = [], []
        for audio, text in batch:
            # audio
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            padded_audios.append(padded_audio)
            audio_lens.append(audio.shape[1])
            # text
            padded_text = np.zeros([max_text_length])
            if self._is_training:
                padded_text[:len(text)] = text  # token ids
            else:
                padded_text[:len(text)] = [ord(t) for t in text]  # string
            texts.append(padded_text)
            text_lens.append(len(text))

        padded_audios = np.array(padded_audios).astype('float32')
        audio_lens = np.array(audio_lens).astype('int64')
        texts = np.array(texts).astype('int32')
        text_lens = np.array(text_lens).astype('int64')
        return padded_audios, texts, audio_lens, text_lens
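A small sanity-check sketch of the new collator on a toy batch (not part of the commit); the feature dimension of 161 matches the linear-spectrogram default used elsewhere, and all other values are made up.

import numpy as np

# Two toy utterances: (feature_dim, time) spectrograms paired with token ids.
batch = [
    (np.random.randn(161, 50), [2, 7, 5]),
    (np.random.randn(161, 80), [4, 1]),
]
collate = SpeechCollator(padding_to=-1, is_training=True)
padded_audios, texts, audio_lens, text_lens = collate(batch)
assert padded_audios.shape == (2, 161, 80)  # padded to the longest clip
assert texts.shape == (2, 3)                # padded to the longest text
assert audio_lens.tolist() == [50, 80]
assert text_lens.tolist() == [3, 2]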
@@ -0,0 +1,186 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import random
import tarfile
import logging
import numpy as np
from collections import namedtuple
from functools import partial

from paddle.io import Dataset

from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.normalizer import FeatureNormalizer

logger = logging.getLogger(__name__)

__all__ = [
    "ManifestDataset",
]


class ManifestDataset(Dataset):
    def __init__(self,
                 manifest_path,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 target_dB=-20,
                 random_seed=0,
                 keep_transcription_text=False):
        super().__init__()

        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        # for caching tar file info
        self._local_data = namedtuple('local_data', ['tar2info', 'tar2object'])
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}

        # read manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_duration=self._max_duration,
            min_duration=self._min_duration)
        self._manifest.sort(key=lambda x: x["duration"])

    @property
    def manifest(self):
        return self._manifest

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary as a list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map from member names to tarinfo entries.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get a subfile object from a tar archive.

        It returns a subfile object from the tar file
        and caches the tar file info for the next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            object, infoes = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infoes
            self._local_data.tar2object[tarpath] = object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of the transcription
                 part, where the transcription part can be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of an ndarray of audio spectrogram and a list of
        token indices for the transcript.
        """

        def reader():
            for instance in manifest:
                inst = self.process_utterance(instance["audio_filepath"],
                                              instance["text"])
                yield inst

        return reader

    def __len__(self):
        return len(self._manifest)

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        return self.process_utterance(instance["audio_filepath"],
                                      instance["text"])
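The dataset touches only three manifest fields in this file: audio_filepath and text in process_utterance, plus duration as the sort key. For illustration, two plausible JSON-lines manifest entries (all values hypothetical); the second shows the tar:<tarpath>#<filename> form parsed by _subfile_from_tar for audio stored inside a tar archive.

{"audio_filepath": "data/audio/utt_0001.wav", "duration": 5.86, "text": "some transcription"}
{"audio_filepath": "tar:data/audio.tar#utt_0002.wav", "duration": 3.21, "text": "another transcription"}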
@@ -0,0 +1,266 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import random
import tarfile
import logging
import numpy as np
from collections import namedtuple
from functools import partial

import paddle
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist

logger = logging.getLogger(__name__)

__all__ = [
    "SortagradDistributedBatchSampler",
    "SortagradBatchSampler",
]


class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: Indices of instances.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch-shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert clipped == False
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, the slice returns the whole list:
            # len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(
                    f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
                )
            else:
                logger.info(
                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
                )
                if self._shuffle_method == "batch_shuffle":
                    indices = self._batch_shuffle(
                        indices, self.batch_size, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        # subsample
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
            assert last_batch_size % self.nranks == 0
            last_local_batch_size = last_batch_size // self.nranks

            for i in range(self.local_rank * self.batch_size,
                           len(indices) - last_batch_size,
                           self.batch_size * self.nranks):
                subsampled_indices.extend(indices[i:i + self.batch_size])

            indices = indices[len(indices) - last_batch_size:]
            subsampled_indices.extend(
                indices[self.local_rank * last_local_batch_size:(
                    self.local_rank + 1) * last_local_batch_size])
            return subsampled_indices

        if self.nranks > 1:
            indices = _get_indices_by_batch_size(indices)

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.info(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size


class SortagradBatchSampler(BatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean value"

        self.drop_last = drop_last
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
        self.total_size = self.num_samples
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: Indices of instances.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch-shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        # must shift at least by one
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert clipped == False
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, the slice returns the whole list:
            # len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # sort (by duration) or batch-wise shuffle the manifest
        if self.shuffle:
            if self.epoch == 0 and self._sortagrad:
                logger.info(f'dataset sortagrad! epoch {self.epoch}')
            else:
                logger.info(f'dataset shuffle! epoch {self.epoch}')
                if self._shuffle_method == "batch_shuffle":
                    indices = self._batch_shuffle(
                        indices, self.batch_size, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        assert len(indices) == self.num_samples
        _sample_iter = iter(indices)

        batch_indices = []
        for idx in _sample_iter:
            batch_indices.append(idx)
            if len(batch_indices) == self.batch_size:
                logger.info(
                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
                yield batch_indices
                batch_indices = []
        if not self.drop_last and len(batch_indices) > 0:
            yield batch_indices

        self.epoch += 1

    def __len__(self):
        num_samples = self.num_samples
        num_samples += int(not self.drop_last) * (self.batch_size - 1)
        return num_samples // self.batch_size
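The shift-and-chunk trick in _batch_shuffle is easiest to see on a toy list. Below is a standalone sketch (not part of the commit) that mirrors the sampler's logic; the epoch and batch size are arbitrary. Note that np.random.RandomState.randint excludes its upper bound, so the shift k is drawn from [0, batch_size - 1).

import numpy as np

indices = list(range(10))      # already duration-sorted by the dataset
batch_size, epoch = 3, 0
rng = np.random.RandomState(epoch)

shift_len = rng.randint(0, batch_size - 1)                      # k in [0, batch_size - 1)
batches = list(zip(*[iter(indices[shift_len:])] * batch_size))  # chunk after the shift
rng.shuffle(batches)                                            # shuffle whole batches
shuffled = [i for b in batches for i in b]
# re-attach the clipped tail and head so no index is lost
res_len = len(indices) - shift_len - len(shuffled)
if res_len != 0:
    shuffled.extend(indices[-res_len:])
shuffled.extend(indices[0:shift_len])
assert sorted(shuffled) == indices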
|
@ -0,0 +1,304 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import collections
|
||||
import numpy as np
|
||||
import logging
|
||||
from typing import Optional
|
||||
from yacs.config import CfgNode
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
from paddle.nn import initializer as I
|
||||
|
||||
from deepspeech.modules.conv import ConvStack
|
||||
from deepspeech.modules.conv import RNNStack
|
||||
from deepspeech.modules.mask import sequence_mask
|
||||
from deepspeech.modules.activation import brelu
|
||||
from deepspeech.utils import checkpoint
|
||||
from deepspeech.decoders.swig_wrapper import Scorer
|
||||
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
|
||||
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = ['DeepSpeech2Model']
|
||||
|
||||
|
||||
class DeepSpeech2Model(nn.Layer):
|
||||
"""The DeepSpeech2 network structure.
|
||||
|
||||
:param audio_data: Audio spectrogram data layer.
|
||||
:type audio_data: Variable
|
||||
:param text_data: Transcription text data layer.
|
||||
:type text_data: Variable
|
||||
:param audio_len: Valid sequence length data layer.
|
||||
:type audio_len: Variable
|
||||
:param masks: Masks data layer to reset padding.
|
||||
:type masks: Variable
|
||||
:param dict_size: Dictionary size for tokenized transcription.
|
||||
:type dict_size: int
|
||||
:param num_conv_layers: Number of stacking convolution layers.
|
||||
:type num_conv_layers: int
|
||||
:param num_rnn_layers: Number of stacking RNN layers.
|
||||
:type num_rnn_layers: int
|
||||
:param rnn_size: RNN layer size (dimension of RNN cells).
|
||||
:type rnn_size: int
|
||||
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
||||
:type use_gru: bool
|
||||
:param share_rnn_weights: Whether to share input-hidden weights between
|
||||
forward and backward direction RNNs.
|
||||
It is only available when use_gru=False.
|
||||
:type share_weights: bool
|
||||
:return: A tuple of an output unnormalized log probability layer (
|
||||
before softmax) and a ctc cost layer.
|
||||
:rtype: tuple of LayerOutput
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
|
||||
default = CfgNode(
|
||||
dict(
|
||||
num_conv_layers=2, #Number of stacking convolution layers.
|
||||
num_rnn_layers=3, #Number of stacking RNN layers.
|
||||
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
|
||||
use_gru=True, #Use gru if set True. Use simple rnn if set False.
|
||||
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
|
||||
))
|
||||
if config is not None:
|
||||
config.model.merge_from_other_cfg(default)
|
||||
return default
|
||||
|
||||
def __init__(self,
|
||||
feat_size,
|
||||
dict_size,
|
||||
num_conv_layers=2,
|
||||
num_rnn_layers=3,
|
||||
rnn_size=1024,
|
||||
use_gru=False,
|
||||
share_rnn_weights=True):
|
||||
super().__init__()
|
||||
self.feat_size = feat_size # 161 for linear
|
||||
self.dict_size = dict_size
|
||||
|
||||
self.conv = ConvStack(feat_size, num_conv_layers)
|
||||
|
||||
i_size = self.conv.output_height # H after conv stack
|
||||
self.rnn = RNNStack(
|
||||
i_size=i_size,
|
||||
h_size=rnn_size,
|
||||
num_stacks=num_rnn_layers,
|
||||
use_gru=use_gru,
|
||||
share_rnn_weights=share_rnn_weights)
|
||||
self.fc = nn.Linear(rnn_size * 2, dict_size + 1)
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._ext_scorer = None
|
||||
|
||||
def infer(self, audio, audio_len):
|
||||
# [B, D, T] -> [B, C=1, D, T]
|
||||
audio = audio.unsqueeze(1)
|
||||
|
||||
# convolution group
|
||||
x, audio_len = self.conv(audio, audio_len)
|
||||
#print('conv out', x.shape)
|
||||
|
||||
# convert data from convolution feature map to sequence of vectors
|
||||
B, C, D, T = paddle.shape(x)
|
||||
x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
|
||||
x = x.reshape([B, T, C * D]) #[B, T, C*D]
|
||||
#print('rnn input', x.shape)
|
||||
|
||||
# remove padding part
|
||||
x, audio_len = self.rnn(x, audio_len) #[B, T, D]
|
||||
#print('rnn output', x.shape)
|
||||
|
||||
logits = self.fc(x) #[B, T, V + 1]
|
||||
|
||||
#ctcdecoder need probs, not log_probs
|
||||
probs = F.softmax(logits)
|
||||
|
||||
return logits, probs, audio_len
|
||||
|
||||
def forward(self, audio, text, audio_len, text_len):
|
||||
"""
|
||||
audio: shape [B, D, T]
|
||||
text: shape [B, T]
|
||||
audio_len: shape [B]
|
||||
text_len: shape [B]
|
||||
"""
|
||||
return self.infer(audio, audio_len)
|
||||
|
||||
@paddle.no_grad()
|
||||
def predict(self, audio, audio_len):
|
||||
""" Model infer """
|
||||
return self.infer(audio, audio_len)
|
||||
|
||||
def _decode_batch_greedy(self, probs_split, vocab_list):
|
||||
"""Decode by best path for a batch of probs matrix input.
|
||||
:param probs_split: List of 2-D probability matrix, and each consists
|
||||
of prob vectors for one speech utterancce.
|
||||
:param probs_split: List of matrix
|
||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||
:type vocab_list: list
|
||||
:return: List of transcription texts.
|
||||
:rtype: List of str
|
||||
"""
|
||||
results = []
|
||||
for i, probs in enumerate(probs_split):
|
||||
output_transcription = ctc_greedy_decoder(
|
||||
probs_seq=probs, vocabulary=vocab_list)
|
||||
results.append(output_transcription)
|
||||
return results
|
||||
|
||||
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
|
||||
vocab_list):
|
||||
"""Initialize the external scorer.
|
||||
:param beam_alpha: Parameter associated with language model.
|
||||
:type beam_alpha: float
|
||||
:param beam_beta: Parameter associated with word count.
|
||||
:type beam_beta: float
|
||||
:param language_model_path: Filepath for language model. If it is
|
||||
empty, the external scorer will be set to
|
||||
None, and the decoding method will be pure
|
||||
beam search without scorer.
|
||||
:type language_model_path: str|None
|
||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||
:type vocab_list: list
|
||||
"""
|
||||
# init once
|
||||
if self._ext_scorer != None:
|
||||
return
|
||||
|
||||
if language_model_path != '':
|
||||
self.logger.info("begin to initialize the external scorer "
|
||||
"for decoding")
|
||||
self._ext_scorer = Scorer(beam_alpha, beam_beta,
|
||||
language_model_path, vocab_list)
|
||||
lm_char_based = self._ext_scorer.is_character_based()
|
||||
lm_max_order = self._ext_scorer.get_max_order()
|
||||
lm_dict_size = self._ext_scorer.get_dict_size()
|
||||
self.logger.info("language model: "
|
||||
"is_character_based = %d," % lm_char_based +
|
||||
" max_order = %d," % lm_max_order +
|
||||
" dict_size = %d" % lm_dict_size)
|
||||
self.logger.info("end initializing scorer")
|
||||
else:
|
||||
self._ext_scorer = None
|
||||
self.logger.info("no language model provided, "
|
||||
"decoding by pure beam search without scorer.")
|
||||
|
||||
def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
|
||||
beam_size, cutoff_prob, cutoff_top_n,
|
||||
vocab_list, num_processes):
|
||||
"""Decode by beam search for a batch of probs matrix input.
|
||||
:param probs_split: List of 2-D probability matrix, and each consists
|
||||
of prob vectors for one speech utterancce.
|
||||
:param probs_split: List of matrix
|
||||
:param beam_alpha: Parameter associated with language model.
|
||||
:type beam_alpha: float
|
||||
:param beam_beta: Parameter associated with word count.
|
||||
:type beam_beta: float
|
||||
:param beam_size: Width for Beam search.
|
||||
:type beam_size: int
|
||||
:param cutoff_prob: Cutoff probability in pruning,
|
||||
default 1.0, no pruning.
|
||||
:type cutoff_prob: float
|
||||
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
|
||||
characters with highest probs in vocabulary will be
|
||||
used in beam search, default 40.
|
||||
:type cutoff_top_n: int
|
||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||
:type vocab_list: list
|
||||
:param num_processes: Number of processes (CPU) for decoder.
|
||||
:type num_processes: int
|
||||
:return: List of transcription texts.
|
||||
:rtype: List of str
|
||||
"""
|
||||
if self._ext_scorer != None:
|
||||
self._ext_scorer.reset_params(beam_alpha, beam_beta)
|
||||
|
||||
# beam search decode
|
||||
num_processes = min(num_processes, len(probs_split))
|
||||
beam_search_results = ctc_beam_search_decoder_batch(
|
||||
probs_split=probs_split,
|
||||
vocabulary=vocab_list,
|
||||
beam_size=beam_size,
|
||||
num_processes=num_processes,
|
||||
ext_scoring_func=self._ext_scorer,
|
||||
cutoff_prob=cutoff_prob,
|
||||
cutoff_top_n=cutoff_top_n)
|
||||
|
||||
results = [result[0][1] for result in beam_search_results]
|
||||
return results
|
||||
|
||||
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
|
||||
decoding_method):
|
||||
if decoding_method == "ctc_beam_search":
|
||||
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
|
||||
vocab_list)
|
||||
|
||||
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
|
||||
lang_model_path, beam_alpha, beam_beta, beam_size,
|
||||
cutoff_prob, cutoff_top_n, num_processes):
|
||||
""" probs: activation after softmax
|
||||
logits_len: audio output lens
|
||||
"""
|
||||
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
|
||||
if decoding_method == "ctc_greedy":
|
||||
result_transcripts = self._decode_batch_greedy(
|
||||
probs_split=probs_split, vocab_list=vocab_list)
|
||||
elif decoding_method == "ctc_beam_search":
|
||||
result_transcripts = self._decode_batch_beam_search(
|
||||
probs_split=probs_split,
|
||||
beam_alpha=beam_alpha,
|
||||
beam_beta=beam_beta,
|
||||
beam_size=beam_size,
|
||||
cutoff_prob=cutoff_prob,
|
||||
cutoff_top_n=cutoff_top_n,
|
||||
vocab_list=vocab_list,
|
||||
num_processes=num_processes)
|
||||
else:
|
||||
raise ValueError(f"Not support: {decoding_method}")
|
||||
return result_transcripts
|
||||
|
||||
    @paddle.no_grad()
    def decode(self, audio, audio_len, vocab_list, decoding_method,
               lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
               cutoff_top_n, num_processes):
        _, probs, logits_lens = self.predict(audio, audio_len)
        return self.decode_probs(probs.numpy(), logits_lens, vocab_list,
                                 decoding_method, lang_model_path, beam_alpha,
                                 beam_beta, beam_size, cutoff_prob,
                                 cutoff_top_n, num_processes)

    def from_pretrained(self, checkpoint_path):
        """Load parameters from a pretrained model checkpoint.

        Parameters
        ----------
        checkpoint_path: Path or str
            The path of the pretrained model checkpoint, without extension name.

        Returns
        -------
        Model
            The model built from the pretrained checkpoint.
        """
        checkpoint.load_parameters(self, checkpoint_path=checkpoint_path)
        return self
@@ -1,754 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import collections
import numpy as np
import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.utils import checkpoint
from deepspeech.decoders.swig_wrapper import Scorer
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch

logger = logging.getLogger(__name__)

__all__ = ['DeepSpeech2', 'DeepSpeech2Loss']


def brelu(x, t_min=0.0, t_max=24.0, name=None):
    t_min = paddle.to_tensor(t_min)
    t_max = paddle.to_tensor(t_max)
    return x.maximum(t_min).minimum(t_max)

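# Example (a hedged sketch; the tensor values are illustrative):
#   x = paddle.to_tensor([-1.0, 5.0, 30.0])
#   brelu(x)  # -> [0.0, 5.0, 24.0], activations clipped into [t_min, t_max]

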
def sequence_mask(x_len, max_len=None, dtype='float32'):
    max_len = max_len or x_len.max()
    x_len = paddle.unsqueeze(x_len, -1)
    row_vector = paddle.arange(max_len)
    #mask = row_vector < x_len
    mask = row_vector > x_len  # a bug: it goes wrong when broadcasting
    mask = paddle.cast(mask, dtype)
    return mask


class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.

    :param kernel_size: The x dimension of a filter kernel. Or input a tuple
                        for two image dimensions.
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimensions.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimensions.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu.
    :type act: string
    :param masks: Masks data layer to reset padding.
    :type masks: Variable
    :param name: Name of the layer.
    :type name: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable
    """

    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')

        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        self.act = F.relu if act == 'relu' else brelu

    def forward(self, x, x_len):
        """
        x(Tensor): audio, shape [B, C, D, T]
        """
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)

        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1

        # reset padding part to 0
        masks = sequence_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        x = x.multiply(masks)

        return x, x_len


class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """

    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks

        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  #[D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')

        out_channel = 32
        self.conv_stack = nn.LayerList([
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ])

        # conv output feat_dim
        output_height = (feat_size - 1) // 2 + 1
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height

    def forward(self, x, x_len):
        """
        x: shape [B, C, D, T]
        x_len: shape [B]
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        return x, x_len


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:

    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        # self.bias_ih = self.create_parameter(
        #     (hidden_size, ),
        #     bias_ih_attr,
        #     is_bias=True,
        #     default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh, relu or brelu, "
                "but got {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCellShare(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:

    .. math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        # self.bias_ih = self.create_parameter(
        #     (3 * hidden_size, ),
        #     bias_ih_attr,
        #     is_bias=True,
        #     default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh
        #self._activation = F.relu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class BiRNNWithBN(nn.Layer):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer parameters.
    :type name: string
    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, share_weights):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, act):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
        super().__init__()
        self.rnn_stacks = nn.LayerList()
        for i in range(num_stacks):
            if use_gru:
                #default:GRU using tanh
                self.rnn_stacks.append(
                    BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
            else:
                self.rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

    def forward(self, x, x_len):
        """
        x: shape [B, T, D]
        x_len: shape [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = sequence_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            x = x.multiply(masks)
        return x, x_len


class DeepSpeech2(nn.Layer):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: Variable
    :param text_data: Transcription text data layer.
    :type text_data: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param masks: Masks data layer to reset padding.
    :type masks: Variable
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
        super().__init__()
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size

        self.conv = ConvStack(feat_size, num_conv_layers)

        i_size = self.conv.output_height  # H after conv stack
        self.rnn = RNNStack(
            i_size=i_size,
            h_size=rnn_size,
            num_stacks=num_rnn_layers,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        self.fc = nn.Linear(rnn_size * 2, dict_size + 1)

        self.logger = logging.getLogger(__name__)
        self._ext_scorer = None

    def infer(self, audio, audio_len):
        # [B, D, T] -> [B, C=1, D, T]
        audio = audio.unsqueeze(1)

        # convolution group
        x, audio_len = self.conv(audio, audio_len)
        #print('conv out', x.shape)

        # convert data from convolution feature map to sequence of vectors
        B, C, D, T = paddle.shape(x)
        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
        x = x.reshape([B, T, C * D])  #[B, T, C*D]
        #print('rnn input', x.shape)

        # remove padding part
        x, audio_len = self.rnn(x, audio_len)  #[B, T, D]
        #print('rnn output', x.shape)

        logits = self.fc(x)  #[B, T, V + 1]

        #ctcdecoder needs probs, not log_probs
        probs = F.softmax(logits)

        return logits, probs, audio_len

    def forward(self, audio, text, audio_len, text_len):
        """
        audio: shape [B, D, T]
        text: shape [B, T]
        audio_len: shape [B]
        text_len: shape [B]
        """
        return self.infer(audio, audio_len)

    @paddle.no_grad()
    def predict(self, audio, audio_len):
        """ Model infer """
        return self.infer(audio, audio_len)

    def _decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrices, each consisting
                            of prob vectors for one speech utterance.
        :type probs_split: list
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: list of str
        """
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(
                probs_seq=probs, vocabulary=vocab_list)
            results.append(output_transcription)
        return results

    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                         vocab_list):
        """Initialize the external scorer.

        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: str|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        # init once
        if self._ext_scorer is not None:
            return

        if language_model_path != '':
            self.logger.info("begin to initialize the external scorer "
                             "for decoding")
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path, vocab_list)
            lm_char_based = self._ext_scorer.is_character_based()
            lm_max_order = self._ext_scorer.get_max_order()
            lm_dict_size = self._ext_scorer.get_dict_size()
            self.logger.info("language model: "
                             "is_character_based = %d," % lm_char_based +
                             " max_order = %d," % lm_max_order +
                             " dict_size = %d" % lm_dict_size)
            self.logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            self.logger.info("no language model provided, "
                             "decoding by pure beam search without scorer.")

    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                  beam_size, cutoff_prob, cutoff_top_n,
                                  vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrices, each consisting
                            of prob vectors for one speech utterance.
        :type probs_split: list
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning;
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning; only the top cutoff_top_n
                             characters with the highest probs in the vocabulary
                             will be used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for the decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: list of str
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)

        # beam search decode
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)

        results = [result[0][1] for result in beam_search_results]
        return results

    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                    decoding_method):
        if decoding_method == "ctc_beam_search":
            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                  vocab_list)

    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                     lang_model_path, beam_alpha, beam_beta, beam_size,
                     cutoff_prob, cutoff_top_n, num_processes):
        """probs: activations after softmax
        logits_lens: valid output length of each utterance
        """
        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
        if decoding_method == "ctc_greedy":
            result_transcripts = self._decode_batch_greedy(
                probs_split=probs_split, vocab_list=vocab_list)
        elif decoding_method == "ctc_beam_search":
            result_transcripts = self._decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=beam_alpha,
                beam_beta=beam_beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=num_processes)
        else:
            raise ValueError(
                f"Decoding method not supported: {decoding_method}")
        return result_transcripts

    @paddle.no_grad()
    def decode(self, audio, audio_len, vocab_list, decoding_method,
               lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
               cutoff_top_n, num_processes):
        _, probs, logits_lens = self.predict(audio, audio_len)
        return self.decode_probs(probs.numpy(), logits_lens, vocab_list,
                                 decoding_method, lang_model_path, beam_alpha,
                                 beam_beta, beam_size, cutoff_prob,
                                 cutoff_top_n, num_processes)

    def from_pretrained(self, checkpoint_path):
        """Load parameters from a pretrained model checkpoint.

        Parameters
        ----------
        checkpoint_path: Path or str
            The path of the pretrained model checkpoint, without extension name.

        Returns
        -------
        Model
            The model built from the pretrained checkpoint.
        """
        checkpoint.load_parameters(self, checkpoint_path=checkpoint_path)
        return self


def ctc_loss(logits,
             labels,
             input_lengths,
             label_lengths,
             blank=0,
             reduction='mean',
             norm_by_times=True):
    #logger.info("my ctc loss with norm by times")
    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
                                           input_lengths, label_lengths)

    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
    logger.info(f"warpctc loss: {loss_out}/{loss_out.shape}")
    assert reduction in ['mean', 'sum', 'none']
    if reduction == 'mean':
        loss_out = paddle.mean(loss_out / label_lengths)
    elif reduction == 'sum':
        loss_out = paddle.sum(loss_out)
    logger.info(f"ctc loss: {loss_out}")
    return loss_out


F.ctc_loss = ctc_loss


class DeepSpeech2Loss(nn.Layer):
    def __init__(self, vocab_size):
        super().__init__()
        # last token id as blank id
        self.loss = nn.CTCLoss(blank=vocab_size, reduction='sum')

    def forward(self, logits, text, logits_len, text_len):
        # warp-ctc does softmax on activations
        # warp-ctc needs activations with shape [T, B, V + 1]
        logits = logits.transpose([1, 0, 2])

        ctc_loss = self.loss(logits, text, logits_len, text_len)
        return ctc_loss
@@ -0,0 +1,147 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.activation import brelu

logger = logging.getLogger(__name__)

__all__ = ['ConvStack']


class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.

    :param kernel_size: The x dimension of a filter kernel. Or input a tuple
                        for two image dimensions.
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimensions.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimensions.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu.
    :type act: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable
    """

    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        super().__init__()
        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')

        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        self.act = F.relu if act == 'relu' else brelu

    def forward(self, x, x_len):
        """
        x(Tensor): audio, shape [B, C, D, T]
        """
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)

        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1

        # reset padding part to 0
        masks = sequence_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        x = x.multiply(masks)

        return x, x_len


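# Worked example of the time-length formula above (illustrative numbers, not
# from the source): with kernel_size=(41, 11), stride=(2, 3) and
# padding=(20, 5), an input of T_in = 100 frames gives
#   T_out = (100 - 11 + 2 * 5) // 3 + 1 = 34
# frames, after which sequence_mask zeroes out the now-invalid padded tail.

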
class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """

    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks

        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  #[D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')

        out_channel = 32
        self.conv_stack = nn.LayerList([
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ])

        # conv output feat_dim
        output_height = (feat_size - 1) // 2 + 1
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height

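    # Worked example (illustrative): for linear spectrogram features with
    # feat_size = 161 and num_stacks = 2, the first conv halves the height to
    # (161 - 1) // 2 + 1 = 81 and the second to (81 - 1) // 2 + 1 = 41, so
    # output_height = 32 * 41 = 1312 features per frame feed the RNN stack.
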
    def forward(self, x, x_len):
        """
        x: shape [B, C, D, T]
        x_len: shape [B]
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        return x, x_len
@@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

logger = logging.getLogger(__name__)

__all__ = ['sequence_mask']


def sequence_mask(x_len, max_len=None, dtype='float32'):
    max_len = max_len or x_len.max()
    x_len = paddle.unsqueeze(x_len, -1)
    row_vector = paddle.arange(max_len)
    #mask = row_vector < x_len
    mask = row_vector > x_len  # a bug: it goes wrong when broadcasting
    mask = paddle.cast(mask, dtype)
    return mask
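

# Intended behavior (a sketch; note the flagged bug above inverts it): with
# x_len = [2, 3] and max_len = 4, `row_vector < x_len` would yield
#   [[1., 1., 0., 0.],
#    [1., 1., 1., 0.]]
# i.e. ones over valid frames and zeros over padding, which callers multiply
# by to zero out the padded positions.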
@@ -0,0 +1,309 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.activation import brelu

logger = logging.getLogger(__name__)

__all__ = ['RNNStack']


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:

    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh, relu or brelu, "
                "but got {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:

    .. math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh
        #self._activation = F.relu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


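# Design note (editorial sketch, inferred from the code): unlike a standard
# GRU cell, this cell carries no weight_ih; callers precompute the input
# projection (3 * hidden_size wide) outside the cell, e.g. the fw_fc/fw_bn
# pair in BiGRUWithBN below, so sequence-wise batch norm is applied to it once:
#   cell = GRUCell(input_size=3 * 256, hidden_size=256)
#   h, _ = cell(paddle.randn([4, 3 * 256]))  # h: [4, 256]

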
class BiRNNWithBN(nn.Layer):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer parameters.
    :type name: string
    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, share_weights):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, act):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
        super().__init__()
        self.rnn_stacks = nn.LayerList()
        for i in range(num_stacks):
            if use_gru:
                #default:GRU using tanh
                self.rnn_stacks.append(
                    BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
            else:
                self.rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

    def forward(self, x, x_len):
        """
        x: shape [B, T, D]
        x_len: shape [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = sequence_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            x = x.multiply(masks)
        return x, x_len
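

# Shape sketch (illustrative numbers): with i_size=1312, h_size=1024 and
# num_stacks=3, the first bidirectional layer maps [B, T, 1312] to
# [B, T, 2048] and every later layer maps [B, T, 2048] to [B, T, 2048],
# since each layer concatenates forward and backward states (h_size * 2).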
@@ -0,0 +1,73 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import layers
from paddle.fluid import core

logger = logging.getLogger(__name__)


class MyClipGradByGlobalNorm(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            logger.info(
                f"Grad Before Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(merge_grad))))}"
            )
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        logger.info(f"Grad Global Norm: {float(global_norm_var)}!!!!")
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            logger.info(
                f"Grad After Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(new_grad))))}"
            )
            params_and_grads.append((p, new_grad))

        return params_and_grads
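

# Worked example of the scaling above (illustrative numbers): with
# clip_norm = 5.0 and a computed global norm of 10.0,
#   clip_var = 5.0 / max(10.0, 5.0) = 0.5
# so every gradient is halved; when the global norm is already <= clip_norm
# the factor is 1.0 and gradients pass through unchanged.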
@@ -0,0 +1,65 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

logger = logging.getLogger(__name__)

__all__ = ['CTCLoss']


def ctc_loss(logits,
             labels,
             input_lengths,
             label_lengths,
             blank=0,
             reduction='mean',
             norm_by_times=True):
    #logger.info("my ctc loss with norm by times")
    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
                                           input_lengths, label_lengths)

    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
    logger.info(f"warpctc loss: {loss_out}/{loss_out.shape}")
    assert reduction in ['mean', 'sum', 'none']
    if reduction == 'mean':
        loss_out = paddle.mean(loss_out / label_lengths)
    elif reduction == 'sum':
        loss_out = paddle.sum(loss_out)
    logger.info(f"ctc loss: {loss_out}")
    return loss_out


F.ctc_loss = ctc_loss

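# Note (editorial, hedged): the assignment above swaps the module-level
# paddle.nn.functional.ctc_loss for the warpctc-backed, norm_by_times variant
# defined here, presumably so that nn.CTCLoss below picks it up when it
# dispatches to the functional API.
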
class CTCLoss(nn.Layer):
    def __init__(self, blank_id):
        super().__init__()
        # the blank token id; callers typically use the last token id
        self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')

    def forward(self, logits, text, logits_len, text_len):
        # warp-ctc does softmax on activations
        # warp-ctc needs activations with shape [T, B, V + 1]
        logits = logits.transpose([1, 0, 2])

        ctc_loss = self.loss(logits, text, logits_len, text_len)
        return ctc_loss
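
# Usage sketch (hypothetical shapes, not from the source): with a vocabulary
# of size V and the blank mapped to the last id,
#   criterion = CTCLoss(blank_id=V)
#   loss = criterion(logits, text, logits_len, text_len)
# where logits is [B, T, V + 1] before the internal transpose to [T, B, V + 1].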