parent
1ae41eac90
commit
4da2852982
@ -0,0 +1,9 @@
|
|||||||
|
ThreadPool/
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
kenlm/
|
||||||
|
openfst-1.6.3/
|
||||||
|
openfst-1.6.3.tar.gz
|
||||||
|
swig_decoders.egg-info/
|
||||||
|
decoders_wrap.cxx
|
||||||
|
swig_decoders.py
|
@ -1,584 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import math
|
|
||||||
import random
|
|
||||||
import tarfile
|
|
||||||
import logging
|
|
||||||
import numpy as np
|
|
||||||
from collections import namedtuple
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
from paddle.io import Dataset
|
|
||||||
from paddle.io import DataLoader
|
|
||||||
from paddle.io import BatchSampler
|
|
||||||
from paddle.io import DistributedBatchSampler
|
|
||||||
from paddle import distributed as dist
|
|
||||||
|
|
||||||
from deepspeech.frontend.utility import read_manifest
|
|
||||||
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
|
|
||||||
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
|
|
||||||
from deepspeech.frontend.speech import SpeechSegment
|
|
||||||
from deepspeech.frontend.normalizer import FeatureNormalizer
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"DeepSpeech2Dataset",
|
|
||||||
"DeepSpeech2DistributedBatchSampler",
|
|
||||||
"DeepSpeech2BatchSampler",
|
|
||||||
"SpeechCollator",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2Dataset(Dataset):
|
|
||||||
def __init__(self,
|
|
||||||
manifest_path,
|
|
||||||
vocab_filepath,
|
|
||||||
mean_std_filepath,
|
|
||||||
augmentation_config='{}',
|
|
||||||
max_duration=float('inf'),
|
|
||||||
min_duration=0.0,
|
|
||||||
stride_ms=10.0,
|
|
||||||
window_ms=20.0,
|
|
||||||
n_fft=None,
|
|
||||||
max_freq=None,
|
|
||||||
target_sample_rate=16000,
|
|
||||||
specgram_type='linear',
|
|
||||||
use_dB_normalization=True,
|
|
||||||
target_dB=-20,
|
|
||||||
random_seed=0,
|
|
||||||
keep_transcription_text=False):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
self._max_duration = max_duration
|
|
||||||
self._min_duration = min_duration
|
|
||||||
self._normalizer = FeatureNormalizer(mean_std_filepath)
|
|
||||||
self._augmentation_pipeline = AugmentationPipeline(
|
|
||||||
augmentation_config=augmentation_config, random_seed=random_seed)
|
|
||||||
self._speech_featurizer = SpeechFeaturizer(
|
|
||||||
vocab_filepath=vocab_filepath,
|
|
||||||
specgram_type=specgram_type,
|
|
||||||
stride_ms=stride_ms,
|
|
||||||
window_ms=window_ms,
|
|
||||||
n_fft=n_fft,
|
|
||||||
max_freq=max_freq,
|
|
||||||
target_sample_rate=target_sample_rate,
|
|
||||||
use_dB_normalization=use_dB_normalization,
|
|
||||||
target_dB=target_dB)
|
|
||||||
self._rng = random.Random(random_seed)
|
|
||||||
self._keep_transcription_text = keep_transcription_text
|
|
||||||
# for caching tar files info
|
|
||||||
self._local_data = namedtuple('local_data', ['tar2info', 'tar2object'])
|
|
||||||
self._local_data.tar2info = {}
|
|
||||||
self._local_data.tar2object = {}
|
|
||||||
|
|
||||||
# read manifest
|
|
||||||
self._manifest = read_manifest(
|
|
||||||
manifest_path=manifest_path,
|
|
||||||
max_duration=self._max_duration,
|
|
||||||
min_duration=self._min_duration)
|
|
||||||
self._manifest.sort(key=lambda x: x["duration"])
|
|
||||||
|
|
||||||
@property
|
|
||||||
def manifest(self):
|
|
||||||
return self._manifest
|
|
||||||
|
|
||||||
@property
|
|
||||||
def vocab_size(self):
|
|
||||||
"""Return the vocabulary size.
|
|
||||||
|
|
||||||
:return: Vocabulary size.
|
|
||||||
:rtype: int
|
|
||||||
"""
|
|
||||||
return self._speech_featurizer.vocab_size
|
|
||||||
|
|
||||||
@property
|
|
||||||
def vocab_list(self):
|
|
||||||
"""Return the vocabulary in list.
|
|
||||||
|
|
||||||
:return: Vocabulary in list.
|
|
||||||
:rtype: list
|
|
||||||
"""
|
|
||||||
return self._speech_featurizer.vocab_list
|
|
||||||
|
|
||||||
@property
|
|
||||||
def feature_size(self):
|
|
||||||
return self._speech_featurizer.feature_size
|
|
||||||
|
|
||||||
def _parse_tar(self, file):
|
|
||||||
"""Parse a tar file to get a tarfile object
|
|
||||||
and a map containing tarinfoes
|
|
||||||
"""
|
|
||||||
result = {}
|
|
||||||
f = tarfile.open(file)
|
|
||||||
for tarinfo in f.getmembers():
|
|
||||||
result[tarinfo.name] = tarinfo
|
|
||||||
return f, result
|
|
||||||
|
|
||||||
def _subfile_from_tar(self, file):
|
|
||||||
"""Get subfile object from tar.
|
|
||||||
|
|
||||||
It will return a subfile object from tar file
|
|
||||||
and cached tar file info for next reading request.
|
|
||||||
"""
|
|
||||||
tarpath, filename = file.split(':', 1)[1].split('#', 1)
|
|
||||||
if 'tar2info' not in self._local_data.__dict__:
|
|
||||||
self._local_data.tar2info = {}
|
|
||||||
if 'tar2object' not in self._local_data.__dict__:
|
|
||||||
self._local_data.tar2object = {}
|
|
||||||
if tarpath not in self._local_data.tar2info:
|
|
||||||
object, infoes = self._parse_tar(tarpath)
|
|
||||||
self._local_data.tar2info[tarpath] = infoes
|
|
||||||
self._local_data.tar2object[tarpath] = object
|
|
||||||
return self._local_data.tar2object[tarpath].extractfile(
|
|
||||||
self._local_data.tar2info[tarpath][filename])
|
|
||||||
|
|
||||||
def process_utterance(self, audio_file, transcript):
|
|
||||||
"""Load, augment, featurize and normalize for speech data.
|
|
||||||
|
|
||||||
:param audio_file: Filepath or file object of audio file.
|
|
||||||
:type audio_file: str | file
|
|
||||||
:param transcript: Transcription text.
|
|
||||||
:type transcript: str
|
|
||||||
:return: Tuple of audio feature tensor and data of transcription part,
|
|
||||||
where transcription part could be token ids or text.
|
|
||||||
:rtype: tuple of (2darray, list)
|
|
||||||
"""
|
|
||||||
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
|
|
||||||
speech_segment = SpeechSegment.from_file(
|
|
||||||
self._subfile_from_tar(audio_file), transcript)
|
|
||||||
else:
|
|
||||||
speech_segment = SpeechSegment.from_file(audio_file, transcript)
|
|
||||||
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
||||||
specgram, transcript_part = self._speech_featurizer.featurize(
|
|
||||||
speech_segment, self._keep_transcription_text)
|
|
||||||
specgram = self._normalizer.apply(specgram)
|
|
||||||
return specgram, transcript_part
|
|
||||||
|
|
||||||
def _instance_reader_creator(self, manifest):
|
|
||||||
"""
|
|
||||||
Instance reader creator. Create a callable function to produce
|
|
||||||
instances of data.
|
|
||||||
|
|
||||||
Instance: a tuple of ndarray of audio spectrogram and a list of
|
|
||||||
token indices for transcript.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def reader():
|
|
||||||
for instance in manifest:
|
|
||||||
inst = self.process_utterance(instance["audio_filepath"],
|
|
||||||
instance["text"])
|
|
||||||
yield inst
|
|
||||||
|
|
||||||
return reader
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self._manifest)
|
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
|
||||||
instance = self._manifest[idx]
|
|
||||||
return self.process_utterance(instance["audio_filepath"],
|
|
||||||
instance["text"])
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2DistributedBatchSampler(DistributedBatchSampler):
|
|
||||||
def __init__(self,
|
|
||||||
dataset,
|
|
||||||
batch_size,
|
|
||||||
num_replicas=None,
|
|
||||||
rank=None,
|
|
||||||
shuffle=False,
|
|
||||||
drop_last=False,
|
|
||||||
sortagrad=False,
|
|
||||||
shuffle_method="batch_shuffle"):
|
|
||||||
super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
|
|
||||||
drop_last)
|
|
||||||
self._sortagrad = sortagrad
|
|
||||||
self._shuffle_method = shuffle_method
|
|
||||||
|
|
||||||
def _batch_shuffle(self, indices, batch_size, clipped=False):
|
|
||||||
"""Put similarly-sized instances into minibatches for better efficiency
|
|
||||||
and make a batch-wise shuffle.
|
|
||||||
|
|
||||||
1. Sort the audio clips by duration.
|
|
||||||
2. Generate a random number `k`, k in [0, batch_size).
|
|
||||||
3. Randomly shift `k` instances in order to create different batches
|
|
||||||
for different epochs. Create minibatches.
|
|
||||||
4. Shuffle the minibatches.
|
|
||||||
|
|
||||||
:param indices: indexes. List of int.
|
|
||||||
:type indices: list
|
|
||||||
:param batch_size: Batch size. This size is also used for generate
|
|
||||||
a random number for batch shuffle.
|
|
||||||
:type batch_size: int
|
|
||||||
:param clipped: Whether to clip the heading (small shift) and trailing
|
|
||||||
(incomplete batch) instances.
|
|
||||||
:type clipped: bool
|
|
||||||
:return: Batch shuffled mainifest.
|
|
||||||
:rtype: list
|
|
||||||
"""
|
|
||||||
rng = np.random.RandomState(self.epoch)
|
|
||||||
shift_len = rng.randint(0, batch_size - 1)
|
|
||||||
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
|
|
||||||
rng.shuffle(batch_indices)
|
|
||||||
batch_indices = [item for batch in batch_indices for item in batch]
|
|
||||||
assert (clipped == False)
|
|
||||||
if not clipped:
|
|
||||||
res_len = len(indices) - shift_len - len(batch_indices)
|
|
||||||
# when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
|
|
||||||
if res_len != 0:
|
|
||||||
batch_indices.extend(indices[-res_len:])
|
|
||||||
batch_indices.extend(indices[0:shift_len])
|
|
||||||
assert len(indices) == len(
|
|
||||||
batch_indices
|
|
||||||
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
|
|
||||||
return batch_indices
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
num_samples = len(self.dataset)
|
|
||||||
indices = np.arange(num_samples).tolist()
|
|
||||||
indices += indices[:(self.total_size - len(indices))]
|
|
||||||
assert len(indices) == self.total_size
|
|
||||||
|
|
||||||
# sort (by duration) or batch-wise shuffle the manifest
|
|
||||||
if self.shuffle:
|
|
||||||
if self.epoch == 0 and self._sortagrad:
|
|
||||||
logger.info(
|
|
||||||
f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.info(
|
|
||||||
f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
|
|
||||||
)
|
|
||||||
if self._shuffle_method == "batch_shuffle":
|
|
||||||
indices = self._batch_shuffle(
|
|
||||||
indices, self.batch_size, clipped=False)
|
|
||||||
elif self._shuffle_method == "instance_shuffle":
|
|
||||||
np.random.RandomState(self.epoch).shuffle(indices)
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown shuffle method %s." %
|
|
||||||
self._shuffle_method)
|
|
||||||
assert len(
|
|
||||||
indices
|
|
||||||
) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
|
|
||||||
|
|
||||||
# subsample
|
|
||||||
def _get_indices_by_batch_size(indices):
|
|
||||||
subsampled_indices = []
|
|
||||||
last_batch_size = self.total_size % (self.batch_size * self.nranks)
|
|
||||||
assert last_batch_size % self.nranks == 0
|
|
||||||
last_local_batch_size = last_batch_size // self.nranks
|
|
||||||
|
|
||||||
for i in range(self.local_rank * self.batch_size,
|
|
||||||
len(indices) - last_batch_size,
|
|
||||||
self.batch_size * self.nranks):
|
|
||||||
subsampled_indices.extend(indices[i:i + self.batch_size])
|
|
||||||
|
|
||||||
indices = indices[len(indices) - last_batch_size:]
|
|
||||||
subsampled_indices.extend(
|
|
||||||
indices[self.local_rank * last_local_batch_size:(
|
|
||||||
self.local_rank + 1) * last_local_batch_size])
|
|
||||||
return subsampled_indices
|
|
||||||
|
|
||||||
if self.nranks > 1:
|
|
||||||
indices = _get_indices_by_batch_size(indices)
|
|
||||||
|
|
||||||
assert len(indices) == self.num_samples
|
|
||||||
_sample_iter = iter(indices)
|
|
||||||
|
|
||||||
batch_indices = []
|
|
||||||
for idx in _sample_iter:
|
|
||||||
batch_indices.append(idx)
|
|
||||||
if len(batch_indices) == self.batch_size:
|
|
||||||
logger.info(
|
|
||||||
f"rank: {dist.get_rank()} batch index: {batch_indices} ")
|
|
||||||
yield batch_indices
|
|
||||||
batch_indices = []
|
|
||||||
if not self.drop_last and len(batch_indices) > 0:
|
|
||||||
yield batch_indices
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
num_samples = self.num_samples
|
|
||||||
num_samples += int(not self.drop_last) * (self.batch_size - 1)
|
|
||||||
return num_samples // self.batch_size
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2BatchSampler(BatchSampler):
|
|
||||||
def __init__(self,
|
|
||||||
dataset,
|
|
||||||
batch_size,
|
|
||||||
shuffle=False,
|
|
||||||
drop_last=False,
|
|
||||||
sortagrad=False,
|
|
||||||
shuffle_method="batch_shuffle"):
|
|
||||||
self.dataset = dataset
|
|
||||||
|
|
||||||
assert isinstance(batch_size, int) and batch_size > 0, \
|
|
||||||
"batch_size should be a positive integer"
|
|
||||||
self.batch_size = batch_size
|
|
||||||
assert isinstance(shuffle, bool), \
|
|
||||||
"shuffle should be a boolean value"
|
|
||||||
self.shuffle = shuffle
|
|
||||||
assert isinstance(drop_last, bool), \
|
|
||||||
"drop_last should be a boolean number"
|
|
||||||
|
|
||||||
self.drop_last = drop_last
|
|
||||||
self.epoch = 0
|
|
||||||
self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
|
|
||||||
self.total_size = self.num_samples
|
|
||||||
self._sortagrad = sortagrad
|
|
||||||
self._shuffle_method = shuffle_method
|
|
||||||
|
|
||||||
def _batch_shuffle(self, indices, batch_size, clipped=False):
|
|
||||||
"""Put similarly-sized instances into minibatches for better efficiency
|
|
||||||
and make a batch-wise shuffle.
|
|
||||||
|
|
||||||
1. Sort the audio clips by duration.
|
|
||||||
2. Generate a random number `k`, k in [0, batch_size).
|
|
||||||
3. Randomly shift `k` instances in order to create different batches
|
|
||||||
for different epochs. Create minibatches.
|
|
||||||
4. Shuffle the minibatches.
|
|
||||||
|
|
||||||
:param indices: indexes. List of int.
|
|
||||||
:type indices: list
|
|
||||||
:param batch_size: Batch size. This size is also used for generate
|
|
||||||
a random number for batch shuffle.
|
|
||||||
:type batch_size: int
|
|
||||||
:param clipped: Whether to clip the heading (small shift) and trailing
|
|
||||||
(incomplete batch) instances.
|
|
||||||
:type clipped: bool
|
|
||||||
:return: Batch shuffled mainifest.
|
|
||||||
:rtype: list
|
|
||||||
"""
|
|
||||||
rng = np.random.RandomState(self.epoch)
|
|
||||||
# must shift at leat by one
|
|
||||||
shift_len = rng.randint(0, batch_size - 1)
|
|
||||||
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
|
|
||||||
rng.shuffle(batch_indices)
|
|
||||||
batch_indices = [item for batch in batch_indices for item in batch]
|
|
||||||
assert (clipped == False)
|
|
||||||
if not clipped:
|
|
||||||
res_len = len(indices) - shift_len - len(batch_indices)
|
|
||||||
# when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
|
|
||||||
if res_len != 0:
|
|
||||||
batch_indices.extend(indices[-res_len:])
|
|
||||||
batch_indices.extend(indices[0:shift_len])
|
|
||||||
assert len(indices) == len(
|
|
||||||
batch_indices
|
|
||||||
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
|
|
||||||
return batch_indices
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
num_samples = len(self.dataset)
|
|
||||||
indices = np.arange(num_samples).tolist()
|
|
||||||
indices += indices[:(self.total_size - len(indices))]
|
|
||||||
assert len(indices) == self.total_size
|
|
||||||
|
|
||||||
# sort (by duration) or batch-wise shuffle the manifest
|
|
||||||
if self.shuffle:
|
|
||||||
if self.epoch == 0 and self._sortagrad:
|
|
||||||
logger.info(f'dataset sortagrad! epoch {self.epoch}')
|
|
||||||
else:
|
|
||||||
logger.info(f'dataset shuffle! epoch {self.epoch}')
|
|
||||||
if self._shuffle_method == "batch_shuffle":
|
|
||||||
indices = self._batch_shuffle(
|
|
||||||
indices, self.batch_size, clipped=False)
|
|
||||||
elif self._shuffle_method == "instance_shuffle":
|
|
||||||
np.random.RandomState(self.epoch).shuffle(indices)
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown shuffle method %s." %
|
|
||||||
self._shuffle_method)
|
|
||||||
assert len(
|
|
||||||
indices
|
|
||||||
) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
|
|
||||||
|
|
||||||
assert len(indices) == self.num_samples
|
|
||||||
_sample_iter = iter(indices)
|
|
||||||
|
|
||||||
batch_indices = []
|
|
||||||
for idx in _sample_iter:
|
|
||||||
batch_indices.append(idx)
|
|
||||||
if len(batch_indices) == self.batch_size:
|
|
||||||
logger.info(
|
|
||||||
f"rank: {dist.get_rank()} batch index: {batch_indices} ")
|
|
||||||
yield batch_indices
|
|
||||||
batch_indices = []
|
|
||||||
if not self.drop_last and len(batch_indices) > 0:
|
|
||||||
yield batch_indices
|
|
||||||
|
|
||||||
self.epoch += 1
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
num_samples = self.num_samples
|
|
||||||
num_samples += int(not self.drop_last) * (self.batch_size - 1)
|
|
||||||
return num_samples // self.batch_size
|
|
||||||
|
|
||||||
|
|
||||||
class SpeechCollator():
|
|
||||||
def __init__(self, padding_to=-1, is_training=True):
|
|
||||||
"""
|
|
||||||
Padding audio features with zeros to make them have the same shape (or
|
|
||||||
a user-defined shape) within one bach.
|
|
||||||
|
|
||||||
If ``padding_to`` is -1, the maximun shape in the batch will be used
|
|
||||||
as the target shape for padding. Otherwise, `padding_to` will be the
|
|
||||||
target shape (only refers to the second axis).
|
|
||||||
"""
|
|
||||||
self._padding_to = padding_to
|
|
||||||
self._is_training = is_training
|
|
||||||
|
|
||||||
def __call__(self, batch):
|
|
||||||
new_batch = []
|
|
||||||
# get target shape
|
|
||||||
max_length = max([audio.shape[1] for audio, _ in batch])
|
|
||||||
if self._padding_to != -1:
|
|
||||||
if self._padding_to < max_length:
|
|
||||||
raise ValueError("If padding_to is not -1, it should be larger "
|
|
||||||
"than any instance's shape in the batch")
|
|
||||||
max_length = self._padding_to
|
|
||||||
max_text_length = max([len(text) for _, text in batch])
|
|
||||||
# padding
|
|
||||||
padded_audios = []
|
|
||||||
audio_lens = []
|
|
||||||
texts, text_lens = [], []
|
|
||||||
for audio, text in batch:
|
|
||||||
# audio
|
|
||||||
padded_audio = np.zeros([audio.shape[0], max_length])
|
|
||||||
padded_audio[:, :audio.shape[1]] = audio
|
|
||||||
padded_audios.append(padded_audio)
|
|
||||||
audio_lens.append(audio.shape[1])
|
|
||||||
# text
|
|
||||||
padded_text = np.zeros([max_text_length])
|
|
||||||
if self._is_training:
|
|
||||||
padded_text[:len(text)] = text #ids
|
|
||||||
else:
|
|
||||||
padded_text[:len(text)] = [ord(t) for t in text] # string
|
|
||||||
texts.append(padded_text)
|
|
||||||
text_lens.append(len(text))
|
|
||||||
|
|
||||||
padded_audios = np.array(padded_audios).astype('float32')
|
|
||||||
audio_lens = np.array(audio_lens).astype('int64')
|
|
||||||
texts = np.array(texts).astype('int32')
|
|
||||||
text_lens = np.array(text_lens).astype('int64')
|
|
||||||
return padded_audios, texts, audio_lens, text_lens
|
|
||||||
|
|
||||||
|
|
||||||
def create_dataloader(manifest_path,
|
|
||||||
vocab_filepath,
|
|
||||||
mean_std_filepath,
|
|
||||||
augmentation_config='{}',
|
|
||||||
max_duration=float('inf'),
|
|
||||||
min_duration=0.0,
|
|
||||||
stride_ms=10.0,
|
|
||||||
window_ms=20.0,
|
|
||||||
max_freq=None,
|
|
||||||
specgram_type='linear',
|
|
||||||
use_dB_normalization=True,
|
|
||||||
random_seed=0,
|
|
||||||
keep_transcription_text=False,
|
|
||||||
is_training=False,
|
|
||||||
batch_size=1,
|
|
||||||
num_workers=0,
|
|
||||||
sortagrad=False,
|
|
||||||
shuffle_method=None,
|
|
||||||
dist=False):
|
|
||||||
|
|
||||||
dataset = DeepSpeech2Dataset(
|
|
||||||
manifest_path,
|
|
||||||
vocab_filepath,
|
|
||||||
mean_std_filepath,
|
|
||||||
augmentation_config=augmentation_config,
|
|
||||||
max_duration=max_duration,
|
|
||||||
min_duration=min_duration,
|
|
||||||
stride_ms=stride_ms,
|
|
||||||
window_ms=window_ms,
|
|
||||||
max_freq=max_freq,
|
|
||||||
specgram_type=specgram_type,
|
|
||||||
use_dB_normalization=use_dB_normalization,
|
|
||||||
random_seed=random_seed,
|
|
||||||
keep_transcription_text=keep_transcription_text)
|
|
||||||
|
|
||||||
if dist:
|
|
||||||
batch_sampler = DeepSpeech2DistributedBatchSampler(
|
|
||||||
dataset,
|
|
||||||
batch_size,
|
|
||||||
num_replicas=None,
|
|
||||||
rank=None,
|
|
||||||
shuffle=is_training,
|
|
||||||
drop_last=is_training,
|
|
||||||
sortagrad=is_training,
|
|
||||||
shuffle_method=shuffle_method)
|
|
||||||
else:
|
|
||||||
batch_sampler = DeepSpeech2BatchSampler(
|
|
||||||
dataset,
|
|
||||||
shuffle=is_training,
|
|
||||||
batch_size=batch_size,
|
|
||||||
drop_last=is_training,
|
|
||||||
sortagrad=is_training,
|
|
||||||
shuffle_method=shuffle_method)
|
|
||||||
|
|
||||||
def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
|
|
||||||
"""
|
|
||||||
Padding audio features with zeros to make them have the same shape (or
|
|
||||||
a user-defined shape) within one bach.
|
|
||||||
|
|
||||||
If ``padding_to`` is -1, the maximun shape in the batch will be used
|
|
||||||
as the target shape for padding. Otherwise, `padding_to` will be the
|
|
||||||
target shape (only refers to the second axis).
|
|
||||||
|
|
||||||
If `flatten` is True, features will be flatten to 1darray.
|
|
||||||
"""
|
|
||||||
new_batch = []
|
|
||||||
# get target shape
|
|
||||||
max_length = max([audio.shape[1] for audio, text in batch])
|
|
||||||
if padding_to != -1:
|
|
||||||
if padding_to < max_length:
|
|
||||||
raise ValueError("If padding_to is not -1, it should be larger "
|
|
||||||
"than any instance's shape in the batch")
|
|
||||||
max_length = padding_to
|
|
||||||
max_text_length = max([len(text) for audio, text in batch])
|
|
||||||
# padding
|
|
||||||
padded_audios = []
|
|
||||||
audio_lens = []
|
|
||||||
texts, text_lens = [], []
|
|
||||||
for audio, text in batch:
|
|
||||||
padded_audio = np.zeros([audio.shape[0], max_length])
|
|
||||||
padded_audio[:, :audio.shape[1]] = audio
|
|
||||||
if flatten:
|
|
||||||
padded_audio = padded_audio.flatten()
|
|
||||||
padded_audios.append(padded_audio)
|
|
||||||
audio_lens.append(audio.shape[1])
|
|
||||||
|
|
||||||
padded_text = np.zeros([max_text_length])
|
|
||||||
if is_training:
|
|
||||||
padded_text[:len(text)] = text #ids
|
|
||||||
else:
|
|
||||||
padded_text[:len(text)] = [ord(t) for t in text] # string
|
|
||||||
texts.append(padded_text)
|
|
||||||
text_lens.append(len(text))
|
|
||||||
|
|
||||||
padded_audios = np.array(padded_audios).astype('float32')
|
|
||||||
audio_lens = np.array(audio_lens).astype('int64')
|
|
||||||
texts = np.array(texts).astype('int32')
|
|
||||||
text_lens = np.array(text_lens).astype('int64')
|
|
||||||
return padded_audios, texts, audio_lens, text_lens
|
|
||||||
|
|
||||||
loader = DataLoader(
|
|
||||||
dataset,
|
|
||||||
batch_sampler=batch_sampler,
|
|
||||||
collate_fn=partial(padding_batch, is_training=is_training),
|
|
||||||
num_workers=num_workers)
|
|
||||||
return loader
|
|
@ -0,0 +1,128 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from paddle.io import DataLoader
|
||||||
|
|
||||||
|
from deepspeech.io.collator import SpeechCollator
|
||||||
|
from deepspeech.io.sampler import SortagradDistributedBatchSampler
|
||||||
|
from deepspeech.io.sampler import SortagradBatchSampler
|
||||||
|
from deepspeech.io.dataset import ManifestDataset
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataloader(manifest_path,
|
||||||
|
vocab_filepath,
|
||||||
|
mean_std_filepath,
|
||||||
|
augmentation_config='{}',
|
||||||
|
max_duration=float('inf'),
|
||||||
|
min_duration=0.0,
|
||||||
|
stride_ms=10.0,
|
||||||
|
window_ms=20.0,
|
||||||
|
max_freq=None,
|
||||||
|
specgram_type='linear',
|
||||||
|
use_dB_normalization=True,
|
||||||
|
random_seed=0,
|
||||||
|
keep_transcription_text=False,
|
||||||
|
is_training=False,
|
||||||
|
batch_size=1,
|
||||||
|
num_workers=0,
|
||||||
|
sortagrad=False,
|
||||||
|
shuffle_method=None,
|
||||||
|
dist=False):
|
||||||
|
|
||||||
|
dataset = ManifestDataset(
|
||||||
|
manifest_path,
|
||||||
|
vocab_filepath,
|
||||||
|
mean_std_filepath,
|
||||||
|
augmentation_config=augmentation_config,
|
||||||
|
max_duration=max_duration,
|
||||||
|
min_duration=min_duration,
|
||||||
|
stride_ms=stride_ms,
|
||||||
|
window_ms=window_ms,
|
||||||
|
max_freq=max_freq,
|
||||||
|
specgram_type=specgram_type,
|
||||||
|
use_dB_normalization=use_dB_normalization,
|
||||||
|
random_seed=random_seed,
|
||||||
|
keep_transcription_text=keep_transcription_text)
|
||||||
|
|
||||||
|
if dist:
|
||||||
|
batch_sampler = SortagradDistributedBatchSampler(
|
||||||
|
dataset,
|
||||||
|
batch_size,
|
||||||
|
num_replicas=None,
|
||||||
|
rank=None,
|
||||||
|
shuffle=is_training,
|
||||||
|
drop_last=is_training,
|
||||||
|
sortagrad=is_training,
|
||||||
|
shuffle_method=shuffle_method)
|
||||||
|
else:
|
||||||
|
batch_sampler = SortagradBatchSampler(
|
||||||
|
dataset,
|
||||||
|
shuffle=is_training,
|
||||||
|
batch_size=batch_size,
|
||||||
|
drop_last=is_training,
|
||||||
|
sortagrad=is_training,
|
||||||
|
shuffle_method=shuffle_method)
|
||||||
|
|
||||||
|
def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):
|
||||||
|
"""
|
||||||
|
Padding audio features with zeros to make them have the same shape (or
|
||||||
|
a user-defined shape) within one bach.
|
||||||
|
|
||||||
|
If ``padding_to`` is -1, the maximun shape in the batch will be used
|
||||||
|
as the target shape for padding. Otherwise, `padding_to` will be the
|
||||||
|
target shape (only refers to the second axis).
|
||||||
|
|
||||||
|
If `flatten` is True, features will be flatten to 1darray.
|
||||||
|
"""
|
||||||
|
new_batch = []
|
||||||
|
# get target shape
|
||||||
|
max_length = max([audio.shape[1] for audio, text in batch])
|
||||||
|
if padding_to != -1:
|
||||||
|
if padding_to < max_length:
|
||||||
|
raise ValueError("If padding_to is not -1, it should be larger "
|
||||||
|
"than any instance's shape in the batch")
|
||||||
|
max_length = padding_to
|
||||||
|
max_text_length = max([len(text) for audio, text in batch])
|
||||||
|
# padding
|
||||||
|
padded_audios = []
|
||||||
|
audio_lens = []
|
||||||
|
texts, text_lens = [], []
|
||||||
|
for audio, text in batch:
|
||||||
|
padded_audio = np.zeros([audio.shape[0], max_length])
|
||||||
|
padded_audio[:, :audio.shape[1]] = audio
|
||||||
|
if flatten:
|
||||||
|
padded_audio = padded_audio.flatten()
|
||||||
|
padded_audios.append(padded_audio)
|
||||||
|
audio_lens.append(audio.shape[1])
|
||||||
|
|
||||||
|
padded_text = np.zeros([max_text_length])
|
||||||
|
if is_training:
|
||||||
|
padded_text[:len(text)] = text #ids
|
||||||
|
else:
|
||||||
|
padded_text[:len(text)] = [ord(t) for t in text] # string
|
||||||
|
texts.append(padded_text)
|
||||||
|
text_lens.append(len(text))
|
||||||
|
|
||||||
|
padded_audios = np.array(padded_audios).astype('float32')
|
||||||
|
audio_lens = np.array(audio_lens).astype('int64')
|
||||||
|
texts = np.array(texts).astype('int32')
|
||||||
|
text_lens = np.array(text_lens).astype('int64')
|
||||||
|
return padded_audios, texts, audio_lens, text_lens
|
||||||
|
|
||||||
|
loader = DataLoader(
|
||||||
|
dataset,
|
||||||
|
batch_sampler=batch_sampler,
|
||||||
|
collate_fn=partial(padding_batch, is_training=is_training),
|
||||||
|
num_workers=num_workers)
|
||||||
|
return loader
|
@ -0,0 +1,72 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import numpy as np
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"SpeechCollator",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechCollator():
|
||||||
|
def __init__(self, padding_to=-1, is_training=True):
|
||||||
|
"""
|
||||||
|
Padding audio features with zeros to make them have the same shape (or
|
||||||
|
a user-defined shape) within one bach.
|
||||||
|
|
||||||
|
If ``padding_to`` is -1, the maximun shape in the batch will be used
|
||||||
|
as the target shape for padding. Otherwise, `padding_to` will be the
|
||||||
|
target shape (only refers to the second axis).
|
||||||
|
"""
|
||||||
|
self._padding_to = padding_to
|
||||||
|
self._is_training = is_training
|
||||||
|
|
||||||
|
def __call__(self, batch):
|
||||||
|
new_batch = []
|
||||||
|
# get target shape
|
||||||
|
max_length = max([audio.shape[1] for audio, _ in batch])
|
||||||
|
if self._padding_to != -1:
|
||||||
|
if self._padding_to < max_length:
|
||||||
|
raise ValueError("If padding_to is not -1, it should be larger "
|
||||||
|
"than any instance's shape in the batch")
|
||||||
|
max_length = self._padding_to
|
||||||
|
max_text_length = max([len(text) for _, text in batch])
|
||||||
|
# padding
|
||||||
|
padded_audios = []
|
||||||
|
audio_lens = []
|
||||||
|
texts, text_lens = [], []
|
||||||
|
for audio, text in batch:
|
||||||
|
# audio
|
||||||
|
padded_audio = np.zeros([audio.shape[0], max_length])
|
||||||
|
padded_audio[:, :audio.shape[1]] = audio
|
||||||
|
padded_audios.append(padded_audio)
|
||||||
|
audio_lens.append(audio.shape[1])
|
||||||
|
# text
|
||||||
|
padded_text = np.zeros([max_text_length])
|
||||||
|
if self._is_training:
|
||||||
|
padded_text[:len(text)] = text #ids
|
||||||
|
else:
|
||||||
|
padded_text[:len(text)] = [ord(t) for t in text] # string
|
||||||
|
texts.append(padded_text)
|
||||||
|
text_lens.append(len(text))
|
||||||
|
|
||||||
|
padded_audios = np.array(padded_audios).astype('float32')
|
||||||
|
audio_lens = np.array(audio_lens).astype('int64')
|
||||||
|
texts = np.array(texts).astype('int32')
|
||||||
|
text_lens = np.array(text_lens).astype('int64')
|
||||||
|
return padded_audios, texts, audio_lens, text_lens
|
@ -0,0 +1,186 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import math
|
||||||
|
import random
|
||||||
|
import tarfile
|
||||||
|
import logging
|
||||||
|
import numpy as np
|
||||||
|
from collections import namedtuple
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
|
||||||
|
from deepspeech.frontend.utility import read_manifest
|
||||||
|
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
|
||||||
|
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
|
||||||
|
from deepspeech.frontend.speech import SpeechSegment
|
||||||
|
from deepspeech.frontend.normalizer import FeatureNormalizer
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ManifestDataset",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ManifestDataset(Dataset):
|
||||||
|
def __init__(self,
|
||||||
|
manifest_path,
|
||||||
|
vocab_filepath,
|
||||||
|
mean_std_filepath,
|
||||||
|
augmentation_config='{}',
|
||||||
|
max_duration=float('inf'),
|
||||||
|
min_duration=0.0,
|
||||||
|
stride_ms=10.0,
|
||||||
|
window_ms=20.0,
|
||||||
|
n_fft=None,
|
||||||
|
max_freq=None,
|
||||||
|
target_sample_rate=16000,
|
||||||
|
specgram_type='linear',
|
||||||
|
use_dB_normalization=True,
|
||||||
|
target_dB=-20,
|
||||||
|
random_seed=0,
|
||||||
|
keep_transcription_text=False):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self._max_duration = max_duration
|
||||||
|
self._min_duration = min_duration
|
||||||
|
self._normalizer = FeatureNormalizer(mean_std_filepath)
|
||||||
|
self._augmentation_pipeline = AugmentationPipeline(
|
||||||
|
augmentation_config=augmentation_config, random_seed=random_seed)
|
||||||
|
self._speech_featurizer = SpeechFeaturizer(
|
||||||
|
vocab_filepath=vocab_filepath,
|
||||||
|
specgram_type=specgram_type,
|
||||||
|
stride_ms=stride_ms,
|
||||||
|
window_ms=window_ms,
|
||||||
|
n_fft=n_fft,
|
||||||
|
max_freq=max_freq,
|
||||||
|
target_sample_rate=target_sample_rate,
|
||||||
|
use_dB_normalization=use_dB_normalization,
|
||||||
|
target_dB=target_dB)
|
||||||
|
self._rng = random.Random(random_seed)
|
||||||
|
self._keep_transcription_text = keep_transcription_text
|
||||||
|
# for caching tar files info
|
||||||
|
self._local_data = namedtuple('local_data', ['tar2info', 'tar2object'])
|
||||||
|
self._local_data.tar2info = {}
|
||||||
|
self._local_data.tar2object = {}
|
||||||
|
|
||||||
|
# read manifest
|
||||||
|
self._manifest = read_manifest(
|
||||||
|
manifest_path=manifest_path,
|
||||||
|
max_duration=self._max_duration,
|
||||||
|
min_duration=self._min_duration)
|
||||||
|
self._manifest.sort(key=lambda x: x["duration"])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def manifest(self):
|
||||||
|
return self._manifest
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
"""Return the vocabulary size.
|
||||||
|
|
||||||
|
:return: Vocabulary size.
|
||||||
|
:rtype: int
|
||||||
|
"""
|
||||||
|
return self._speech_featurizer.vocab_size
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_list(self):
|
||||||
|
"""Return the vocabulary in list.
|
||||||
|
|
||||||
|
:return: Vocabulary in list.
|
||||||
|
:rtype: list
|
||||||
|
"""
|
||||||
|
return self._speech_featurizer.vocab_list
|
||||||
|
|
||||||
|
@property
|
||||||
|
def feature_size(self):
|
||||||
|
return self._speech_featurizer.feature_size
|
||||||
|
|
||||||
|
def _parse_tar(self, file):
|
||||||
|
"""Parse a tar file to get a tarfile object
|
||||||
|
and a map containing tarinfoes
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
|
f = tarfile.open(file)
|
||||||
|
for tarinfo in f.getmembers():
|
||||||
|
result[tarinfo.name] = tarinfo
|
||||||
|
return f, result
|
||||||
|
|
||||||
|
def _subfile_from_tar(self, file):
|
||||||
|
"""Get subfile object from tar.
|
||||||
|
|
||||||
|
It will return a subfile object from tar file
|
||||||
|
and cached tar file info for next reading request.
|
||||||
|
"""
|
||||||
|
tarpath, filename = file.split(':', 1)[1].split('#', 1)
|
||||||
|
if 'tar2info' not in self._local_data.__dict__:
|
||||||
|
self._local_data.tar2info = {}
|
||||||
|
if 'tar2object' not in self._local_data.__dict__:
|
||||||
|
self._local_data.tar2object = {}
|
||||||
|
if tarpath not in self._local_data.tar2info:
|
||||||
|
object, infoes = self._parse_tar(tarpath)
|
||||||
|
self._local_data.tar2info[tarpath] = infoes
|
||||||
|
self._local_data.tar2object[tarpath] = object
|
||||||
|
return self._local_data.tar2object[tarpath].extractfile(
|
||||||
|
self._local_data.tar2info[tarpath][filename])
|
||||||
|
|
||||||
|
def process_utterance(self, audio_file, transcript):
|
||||||
|
"""Load, augment, featurize and normalize for speech data.
|
||||||
|
|
||||||
|
:param audio_file: Filepath or file object of audio file.
|
||||||
|
:type audio_file: str | file
|
||||||
|
:param transcript: Transcription text.
|
||||||
|
:type transcript: str
|
||||||
|
:return: Tuple of audio feature tensor and data of transcription part,
|
||||||
|
where transcription part could be token ids or text.
|
||||||
|
:rtype: tuple of (2darray, list)
|
||||||
|
"""
|
||||||
|
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
|
||||||
|
speech_segment = SpeechSegment.from_file(
|
||||||
|
self._subfile_from_tar(audio_file), transcript)
|
||||||
|
else:
|
||||||
|
speech_segment = SpeechSegment.from_file(audio_file, transcript)
|
||||||
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
||||||
|
specgram, transcript_part = self._speech_featurizer.featurize(
|
||||||
|
speech_segment, self._keep_transcription_text)
|
||||||
|
specgram = self._normalizer.apply(specgram)
|
||||||
|
return specgram, transcript_part
|
||||||
|
|
||||||
|
def _instance_reader_creator(self, manifest):
|
||||||
|
"""
|
||||||
|
Instance reader creator. Create a callable function to produce
|
||||||
|
instances of data.
|
||||||
|
|
||||||
|
Instance: a tuple of ndarray of audio spectrogram and a list of
|
||||||
|
token indices for transcript.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def reader():
|
||||||
|
for instance in manifest:
|
||||||
|
inst = self.process_utterance(instance["audio_filepath"],
|
||||||
|
instance["text"])
|
||||||
|
yield inst
|
||||||
|
|
||||||
|
return reader
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._manifest)
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
instance = self._manifest[idx]
|
||||||
|
return self.process_utterance(instance["audio_filepath"],
|
||||||
|
instance["text"])
|
@ -0,0 +1,266 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import math
|
||||||
|
import random
|
||||||
|
import tarfile
|
||||||
|
import logging
|
||||||
|
import numpy as np
|
||||||
|
from collections import namedtuple
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle.io import BatchSampler
|
||||||
|
from paddle.io import DistributedBatchSampler
|
||||||
|
from paddle import distributed as dist
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"SortagradDistributedBatchSampler",
|
||||||
|
"SortagradBatchSampler",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class SortagradDistributedBatchSampler(DistributedBatchSampler):
|
||||||
|
def __init__(self,
|
||||||
|
dataset,
|
||||||
|
batch_size,
|
||||||
|
num_replicas=None,
|
||||||
|
rank=None,
|
||||||
|
shuffle=False,
|
||||||
|
drop_last=False,
|
||||||
|
sortagrad=False,
|
||||||
|
shuffle_method="batch_shuffle"):
|
||||||
|
super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
|
||||||
|
drop_last)
|
||||||
|
self._sortagrad = sortagrad
|
||||||
|
self._shuffle_method = shuffle_method
|
||||||
|
|
||||||
|
def _batch_shuffle(self, indices, batch_size, clipped=False):
|
||||||
|
"""Put similarly-sized instances into minibatches for better efficiency
|
||||||
|
and make a batch-wise shuffle.
|
||||||
|
|
||||||
|
1. Sort the audio clips by duration.
|
||||||
|
2. Generate a random number `k`, k in [0, batch_size).
|
||||||
|
3. Randomly shift `k` instances in order to create different batches
|
||||||
|
for different epochs. Create minibatches.
|
||||||
|
4. Shuffle the minibatches.
|
||||||
|
|
||||||
|
:param indices: indexes. List of int.
|
||||||
|
:type indices: list
|
||||||
|
:param batch_size: Batch size. This size is also used for generate
|
||||||
|
a random number for batch shuffle.
|
||||||
|
:type batch_size: int
|
||||||
|
:param clipped: Whether to clip the heading (small shift) and trailing
|
||||||
|
(incomplete batch) instances.
|
||||||
|
:type clipped: bool
|
||||||
|
:return: Batch shuffled mainifest.
|
||||||
|
:rtype: list
|
||||||
|
"""
|
||||||
|
rng = np.random.RandomState(self.epoch)
|
||||||
|
shift_len = rng.randint(0, batch_size - 1)
|
||||||
|
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
|
||||||
|
rng.shuffle(batch_indices)
|
||||||
|
batch_indices = [item for batch in batch_indices for item in batch]
|
||||||
|
assert (clipped == False)
|
||||||
|
if not clipped:
|
||||||
|
res_len = len(indices) - shift_len - len(batch_indices)
|
||||||
|
# when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
|
||||||
|
if res_len != 0:
|
||||||
|
batch_indices.extend(indices[-res_len:])
|
||||||
|
batch_indices.extend(indices[0:shift_len])
|
||||||
|
assert len(indices) == len(
|
||||||
|
batch_indices
|
||||||
|
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
|
||||||
|
return batch_indices
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
num_samples = len(self.dataset)
|
||||||
|
indices = np.arange(num_samples).tolist()
|
||||||
|
indices += indices[:(self.total_size - len(indices))]
|
||||||
|
assert len(indices) == self.total_size
|
||||||
|
|
||||||
|
# sort (by duration) or batch-wise shuffle the manifest
|
||||||
|
if self.shuffle:
|
||||||
|
if self.epoch == 0 and self._sortagrad:
|
||||||
|
logger.info(
|
||||||
|
f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
|
||||||
|
)
|
||||||
|
if self._shuffle_method == "batch_shuffle":
|
||||||
|
indices = self._batch_shuffle(
|
||||||
|
indices, self.batch_size, clipped=False)
|
||||||
|
elif self._shuffle_method == "instance_shuffle":
|
||||||
|
np.random.RandomState(self.epoch).shuffle(indices)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown shuffle method %s." %
|
||||||
|
self._shuffle_method)
|
||||||
|
assert len(
|
||||||
|
indices
|
||||||
|
) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
|
||||||
|
|
||||||
|
# subsample
|
||||||
|
def _get_indices_by_batch_size(indices):
|
||||||
|
subsampled_indices = []
|
||||||
|
last_batch_size = self.total_size % (self.batch_size * self.nranks)
|
||||||
|
assert last_batch_size % self.nranks == 0
|
||||||
|
last_local_batch_size = last_batch_size // self.nranks
|
||||||
|
|
||||||
|
for i in range(self.local_rank * self.batch_size,
|
||||||
|
len(indices) - last_batch_size,
|
||||||
|
self.batch_size * self.nranks):
|
||||||
|
subsampled_indices.extend(indices[i:i + self.batch_size])
|
||||||
|
|
||||||
|
indices = indices[len(indices) - last_batch_size:]
|
||||||
|
subsampled_indices.extend(
|
||||||
|
indices[self.local_rank * last_local_batch_size:(
|
||||||
|
self.local_rank + 1) * last_local_batch_size])
|
||||||
|
return subsampled_indices
|
||||||
|
|
||||||
|
if self.nranks > 1:
|
||||||
|
indices = _get_indices_by_batch_size(indices)
|
||||||
|
|
||||||
|
assert len(indices) == self.num_samples
|
||||||
|
_sample_iter = iter(indices)
|
||||||
|
|
||||||
|
batch_indices = []
|
||||||
|
for idx in _sample_iter:
|
||||||
|
batch_indices.append(idx)
|
||||||
|
if len(batch_indices) == self.batch_size:
|
||||||
|
logger.info(
|
||||||
|
f"rank: {dist.get_rank()} batch index: {batch_indices} ")
|
||||||
|
yield batch_indices
|
||||||
|
batch_indices = []
|
||||||
|
if not self.drop_last and len(batch_indices) > 0:
|
||||||
|
yield batch_indices
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
num_samples = self.num_samples
|
||||||
|
num_samples += int(not self.drop_last) * (self.batch_size - 1)
|
||||||
|
return num_samples // self.batch_size
|
||||||
|
|
||||||
|
|
||||||
|
class SortagradBatchSampler(BatchSampler):
|
||||||
|
def __init__(self,
|
||||||
|
dataset,
|
||||||
|
batch_size,
|
||||||
|
shuffle=False,
|
||||||
|
drop_last=False,
|
||||||
|
sortagrad=False,
|
||||||
|
shuffle_method="batch_shuffle"):
|
||||||
|
self.dataset = dataset
|
||||||
|
|
||||||
|
assert isinstance(batch_size, int) and batch_size > 0, \
|
||||||
|
"batch_size should be a positive integer"
|
||||||
|
self.batch_size = batch_size
|
||||||
|
assert isinstance(shuffle, bool), \
|
||||||
|
"shuffle should be a boolean value"
|
||||||
|
self.shuffle = shuffle
|
||||||
|
assert isinstance(drop_last, bool), \
|
||||||
|
"drop_last should be a boolean number"
|
||||||
|
|
||||||
|
self.drop_last = drop_last
|
||||||
|
self.epoch = 0
|
||||||
|
self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
|
||||||
|
self.total_size = self.num_samples
|
||||||
|
self._sortagrad = sortagrad
|
||||||
|
self._shuffle_method = shuffle_method
|
||||||
|
|
||||||
|
def _batch_shuffle(self, indices, batch_size, clipped=False):
|
||||||
|
"""Put similarly-sized instances into minibatches for better efficiency
|
||||||
|
and make a batch-wise shuffle.
|
||||||
|
|
||||||
|
1. Sort the audio clips by duration.
|
||||||
|
2. Generate a random number `k`, k in [0, batch_size).
|
||||||
|
3. Randomly shift `k` instances in order to create different batches
|
||||||
|
for different epochs. Create minibatches.
|
||||||
|
4. Shuffle the minibatches.
|
||||||
|
|
||||||
|
:param indices: indexes. List of int.
|
||||||
|
:type indices: list
|
||||||
|
:param batch_size: Batch size. This size is also used for generate
|
||||||
|
a random number for batch shuffle.
|
||||||
|
:type batch_size: int
|
||||||
|
:param clipped: Whether to clip the heading (small shift) and trailing
|
||||||
|
(incomplete batch) instances.
|
||||||
|
:type clipped: bool
|
||||||
|
:return: Batch shuffled mainifest.
|
||||||
|
:rtype: list
|
||||||
|
"""
|
||||||
|
rng = np.random.RandomState(self.epoch)
|
||||||
|
# must shift at leat by one
|
||||||
|
shift_len = rng.randint(0, batch_size - 1)
|
||||||
|
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
|
||||||
|
rng.shuffle(batch_indices)
|
||||||
|
batch_indices = [item for batch in batch_indices for item in batch]
|
||||||
|
assert (clipped == False)
|
||||||
|
if not clipped:
|
||||||
|
res_len = len(indices) - shift_len - len(batch_indices)
|
||||||
|
# when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
|
||||||
|
if res_len != 0:
|
||||||
|
batch_indices.extend(indices[-res_len:])
|
||||||
|
batch_indices.extend(indices[0:shift_len])
|
||||||
|
assert len(indices) == len(
|
||||||
|
batch_indices
|
||||||
|
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
|
||||||
|
return batch_indices
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
num_samples = len(self.dataset)
|
||||||
|
indices = np.arange(num_samples).tolist()
|
||||||
|
indices += indices[:(self.total_size - len(indices))]
|
||||||
|
assert len(indices) == self.total_size
|
||||||
|
|
||||||
|
# sort (by duration) or batch-wise shuffle the manifest
|
||||||
|
if self.shuffle:
|
||||||
|
if self.epoch == 0 and self._sortagrad:
|
||||||
|
logger.info(f'dataset sortagrad! epoch {self.epoch}')
|
||||||
|
else:
|
||||||
|
logger.info(f'dataset shuffle! epoch {self.epoch}')
|
||||||
|
if self._shuffle_method == "batch_shuffle":
|
||||||
|
indices = self._batch_shuffle(
|
||||||
|
indices, self.batch_size, clipped=False)
|
||||||
|
elif self._shuffle_method == "instance_shuffle":
|
||||||
|
np.random.RandomState(self.epoch).shuffle(indices)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown shuffle method %s." %
|
||||||
|
self._shuffle_method)
|
||||||
|
assert len(
|
||||||
|
indices
|
||||||
|
) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
|
||||||
|
|
||||||
|
assert len(indices) == self.num_samples
|
||||||
|
_sample_iter = iter(indices)
|
||||||
|
|
||||||
|
batch_indices = []
|
||||||
|
for idx in _sample_iter:
|
||||||
|
batch_indices.append(idx)
|
||||||
|
if len(batch_indices) == self.batch_size:
|
||||||
|
logger.info(
|
||||||
|
f"rank: {dist.get_rank()} batch index: {batch_indices} ")
|
||||||
|
yield batch_indices
|
||||||
|
batch_indices = []
|
||||||
|
if not self.drop_last and len(batch_indices) > 0:
|
||||||
|
yield batch_indices
|
||||||
|
|
||||||
|
self.epoch += 1
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
num_samples = self.num_samples
|
||||||
|
num_samples += int(not self.drop_last) * (self.batch_size - 1)
|
||||||
|
return num_samples // self.batch_size
|
@ -0,0 +1,304 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import math
|
||||||
|
import collections
|
||||||
|
import numpy as np
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import functional as F
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
from deepspeech.modules.conv import ConvStack
|
||||||
|
from deepspeech.modules.conv import RNNStack
|
||||||
|
from deepspeech.modules.mask import sequence_mask
|
||||||
|
from deepspeech.modules.activation import brelu
|
||||||
|
from deepspeech.utils import checkpoint
|
||||||
|
from deepspeech.decoders.swig_wrapper import Scorer
|
||||||
|
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
|
||||||
|
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = ['DeepSpeech2Model']
|
||||||
|
|
||||||
|
|
||||||
|
class DeepSpeech2Model(nn.Layer):
|
||||||
|
"""The DeepSpeech2 network structure.
|
||||||
|
|
||||||
|
:param audio_data: Audio spectrogram data layer.
|
||||||
|
:type audio_data: Variable
|
||||||
|
:param text_data: Transcription text data layer.
|
||||||
|
:type text_data: Variable
|
||||||
|
:param audio_len: Valid sequence length data layer.
|
||||||
|
:type audio_len: Variable
|
||||||
|
:param masks: Masks data layer to reset padding.
|
||||||
|
:type masks: Variable
|
||||||
|
:param dict_size: Dictionary size for tokenized transcription.
|
||||||
|
:type dict_size: int
|
||||||
|
:param num_conv_layers: Number of stacking convolution layers.
|
||||||
|
:type num_conv_layers: int
|
||||||
|
:param num_rnn_layers: Number of stacking RNN layers.
|
||||||
|
:type num_rnn_layers: int
|
||||||
|
:param rnn_size: RNN layer size (dimension of RNN cells).
|
||||||
|
:type rnn_size: int
|
||||||
|
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
||||||
|
:type use_gru: bool
|
||||||
|
:param share_rnn_weights: Whether to share input-hidden weights between
|
||||||
|
forward and backward direction RNNs.
|
||||||
|
It is only available when use_gru=False.
|
||||||
|
:type share_weights: bool
|
||||||
|
:return: A tuple of an output unnormalized log probability layer (
|
||||||
|
before softmax) and a ctc cost layer.
|
||||||
|
:rtype: tuple of LayerOutput
|
||||||
|
"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
|
||||||
|
default = CfgNode(
|
||||||
|
dict(
|
||||||
|
num_conv_layers=2, #Number of stacking convolution layers.
|
||||||
|
num_rnn_layers=3, #Number of stacking RNN layers.
|
||||||
|
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
|
||||||
|
use_gru=True, #Use gru if set True. Use simple rnn if set False.
|
||||||
|
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
|
||||||
|
))
|
||||||
|
if config is not None:
|
||||||
|
config.model.merge_from_other_cfg(default)
|
||||||
|
return default
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
feat_size,
|
||||||
|
dict_size,
|
||||||
|
num_conv_layers=2,
|
||||||
|
num_rnn_layers=3,
|
||||||
|
rnn_size=1024,
|
||||||
|
use_gru=False,
|
||||||
|
share_rnn_weights=True):
|
||||||
|
super().__init__()
|
||||||
|
self.feat_size = feat_size # 161 for linear
|
||||||
|
self.dict_size = dict_size
|
||||||
|
|
||||||
|
self.conv = ConvStack(feat_size, num_conv_layers)
|
||||||
|
|
||||||
|
i_size = self.conv.output_height # H after conv stack
|
||||||
|
self.rnn = RNNStack(
|
||||||
|
i_size=i_size,
|
||||||
|
h_size=rnn_size,
|
||||||
|
num_stacks=num_rnn_layers,
|
||||||
|
use_gru=use_gru,
|
||||||
|
share_rnn_weights=share_rnn_weights)
|
||||||
|
self.fc = nn.Linear(rnn_size * 2, dict_size + 1)
|
||||||
|
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
self._ext_scorer = None
|
||||||
|
|
||||||
|
def infer(self, audio, audio_len):
|
||||||
|
# [B, D, T] -> [B, C=1, D, T]
|
||||||
|
audio = audio.unsqueeze(1)
|
||||||
|
|
||||||
|
# convolution group
|
||||||
|
x, audio_len = self.conv(audio, audio_len)
|
||||||
|
#print('conv out', x.shape)
|
||||||
|
|
||||||
|
# convert data from convolution feature map to sequence of vectors
|
||||||
|
B, C, D, T = paddle.shape(x)
|
||||||
|
x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
|
||||||
|
x = x.reshape([B, T, C * D]) #[B, T, C*D]
|
||||||
|
#print('rnn input', x.shape)
|
||||||
|
|
||||||
|
# remove padding part
|
||||||
|
x, audio_len = self.rnn(x, audio_len) #[B, T, D]
|
||||||
|
#print('rnn output', x.shape)
|
||||||
|
|
||||||
|
logits = self.fc(x) #[B, T, V + 1]
|
||||||
|
|
||||||
|
#ctcdecoder need probs, not log_probs
|
||||||
|
probs = F.softmax(logits)
|
||||||
|
|
||||||
|
return logits, probs, audio_len
|
||||||
|
|
||||||
|
def forward(self, audio, text, audio_len, text_len):
|
||||||
|
"""
|
||||||
|
audio: shape [B, D, T]
|
||||||
|
text: shape [B, T]
|
||||||
|
audio_len: shape [B]
|
||||||
|
text_len: shape [B]
|
||||||
|
"""
|
||||||
|
return self.infer(audio, audio_len)
|
||||||
|
|
||||||
|
@paddle.no_grad()
|
||||||
|
def predict(self, audio, audio_len):
|
||||||
|
""" Model infer """
|
||||||
|
return self.infer(audio, audio_len)
|
||||||
|
|
||||||
|
def _decode_batch_greedy(self, probs_split, vocab_list):
|
||||||
|
"""Decode by best path for a batch of probs matrix input.
|
||||||
|
:param probs_split: List of 2-D probability matrix, and each consists
|
||||||
|
of prob vectors for one speech utterancce.
|
||||||
|
:param probs_split: List of matrix
|
||||||
|
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||||
|
:type vocab_list: list
|
||||||
|
:return: List of transcription texts.
|
||||||
|
:rtype: List of str
|
||||||
|
"""
|
||||||
|
results = []
|
||||||
|
for i, probs in enumerate(probs_split):
|
||||||
|
output_transcription = ctc_greedy_decoder(
|
||||||
|
probs_seq=probs, vocabulary=vocab_list)
|
||||||
|
results.append(output_transcription)
|
||||||
|
return results
|
||||||
|
|
||||||
|
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
|
||||||
|
vocab_list):
|
||||||
|
"""Initialize the external scorer.
|
||||||
|
:param beam_alpha: Parameter associated with language model.
|
||||||
|
:type beam_alpha: float
|
||||||
|
:param beam_beta: Parameter associated with word count.
|
||||||
|
:type beam_beta: float
|
||||||
|
:param language_model_path: Filepath for language model. If it is
|
||||||
|
empty, the external scorer will be set to
|
||||||
|
None, and the decoding method will be pure
|
||||||
|
beam search without scorer.
|
||||||
|
:type language_model_path: str|None
|
||||||
|
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||||
|
:type vocab_list: list
|
||||||
|
"""
|
||||||
|
# init once
|
||||||
|
if self._ext_scorer != None:
|
||||||
|
return
|
||||||
|
|
||||||
|
if language_model_path != '':
|
||||||
|
self.logger.info("begin to initialize the external scorer "
|
||||||
|
"for decoding")
|
||||||
|
self._ext_scorer = Scorer(beam_alpha, beam_beta,
|
||||||
|
language_model_path, vocab_list)
|
||||||
|
lm_char_based = self._ext_scorer.is_character_based()
|
||||||
|
lm_max_order = self._ext_scorer.get_max_order()
|
||||||
|
lm_dict_size = self._ext_scorer.get_dict_size()
|
||||||
|
self.logger.info("language model: "
|
||||||
|
"is_character_based = %d," % lm_char_based +
|
||||||
|
" max_order = %d," % lm_max_order +
|
||||||
|
" dict_size = %d" % lm_dict_size)
|
||||||
|
self.logger.info("end initializing scorer")
|
||||||
|
else:
|
||||||
|
self._ext_scorer = None
|
||||||
|
self.logger.info("no language model provided, "
|
||||||
|
"decoding by pure beam search without scorer.")
|
||||||
|
|
||||||
|
def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
|
||||||
|
beam_size, cutoff_prob, cutoff_top_n,
|
||||||
|
vocab_list, num_processes):
|
||||||
|
"""Decode by beam search for a batch of probs matrix input.
|
||||||
|
:param probs_split: List of 2-D probability matrix, and each consists
|
||||||
|
of prob vectors for one speech utterancce.
|
||||||
|
:param probs_split: List of matrix
|
||||||
|
:param beam_alpha: Parameter associated with language model.
|
||||||
|
:type beam_alpha: float
|
||||||
|
:param beam_beta: Parameter associated with word count.
|
||||||
|
:type beam_beta: float
|
||||||
|
:param beam_size: Width for Beam search.
|
||||||
|
:type beam_size: int
|
||||||
|
:param cutoff_prob: Cutoff probability in pruning,
|
||||||
|
default 1.0, no pruning.
|
||||||
|
:type cutoff_prob: float
|
||||||
|
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
|
||||||
|
characters with highest probs in vocabulary will be
|
||||||
|
used in beam search, default 40.
|
||||||
|
:type cutoff_top_n: int
|
||||||
|
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
||||||
|
:type vocab_list: list
|
||||||
|
:param num_processes: Number of processes (CPU) for decoder.
|
||||||
|
:type num_processes: int
|
||||||
|
:return: List of transcription texts.
|
||||||
|
:rtype: List of str
|
||||||
|
"""
|
||||||
|
if self._ext_scorer != None:
|
||||||
|
self._ext_scorer.reset_params(beam_alpha, beam_beta)
|
||||||
|
|
||||||
|
# beam search decode
|
||||||
|
num_processes = min(num_processes, len(probs_split))
|
||||||
|
beam_search_results = ctc_beam_search_decoder_batch(
|
||||||
|
probs_split=probs_split,
|
||||||
|
vocabulary=vocab_list,
|
||||||
|
beam_size=beam_size,
|
||||||
|
num_processes=num_processes,
|
||||||
|
ext_scoring_func=self._ext_scorer,
|
||||||
|
cutoff_prob=cutoff_prob,
|
||||||
|
cutoff_top_n=cutoff_top_n)
|
||||||
|
|
||||||
|
results = [result[0][1] for result in beam_search_results]
|
||||||
|
return results
|
||||||
|
|
||||||
|
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
|
||||||
|
decoding_method):
|
||||||
|
if decoding_method == "ctc_beam_search":
|
||||||
|
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
|
||||||
|
vocab_list)
|
||||||
|
|
||||||
|
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
|
||||||
|
lang_model_path, beam_alpha, beam_beta, beam_size,
|
||||||
|
cutoff_prob, cutoff_top_n, num_processes):
|
||||||
|
""" probs: activation after softmax
|
||||||
|
logits_len: audio output lens
|
||||||
|
"""
|
||||||
|
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
|
||||||
|
if decoding_method == "ctc_greedy":
|
||||||
|
result_transcripts = self._decode_batch_greedy(
|
||||||
|
probs_split=probs_split, vocab_list=vocab_list)
|
||||||
|
elif decoding_method == "ctc_beam_search":
|
||||||
|
result_transcripts = self._decode_batch_beam_search(
|
||||||
|
probs_split=probs_split,
|
||||||
|
beam_alpha=beam_alpha,
|
||||||
|
beam_beta=beam_beta,
|
||||||
|
beam_size=beam_size,
|
||||||
|
cutoff_prob=cutoff_prob,
|
||||||
|
cutoff_top_n=cutoff_top_n,
|
||||||
|
vocab_list=vocab_list,
|
||||||
|
num_processes=num_processes)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Not support: {decoding_method}")
|
||||||
|
return result_transcripts
|
||||||
|
|
||||||
|
@paddle.no_grad()
|
||||||
|
def decode(self, audio, audio_len, vocab_list, decoding_method,
|
||||||
|
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
|
||||||
|
cutoff_top_n, num_processes):
|
||||||
|
_, probs, logits_lens = self.predict(audio, audio_len)
|
||||||
|
return self.decode_probs(probs.numpy(), logits_lens, vocab_list,
|
||||||
|
decoding_method, lang_model_path, beam_alpha,
|
||||||
|
beam_beta, beam_size, cutoff_prob,
|
||||||
|
cutoff_top_n, num_processes)
|
||||||
|
|
||||||
|
def from_pretrained(self, checkpoint_path):
|
||||||
|
"""Build a model from a pretrained model.
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
model: nn.Layer
|
||||||
|
Asr Model.
|
||||||
|
|
||||||
|
checkpoint_path: Path or str
|
||||||
|
The path of pretrained model checkpoint, without extension name.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Model
|
||||||
|
The model build from pretrined result.
|
||||||
|
"""
|
||||||
|
checkpoint.load_parameters(self, checkpoint_path=checkpoint_path)
|
||||||
|
return
|
@ -1,754 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import math
|
|
||||||
import collections
|
|
||||||
import numpy as np
|
|
||||||
import logging
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
from paddle import nn
|
|
||||||
from paddle.nn import functional as F
|
|
||||||
from paddle.nn import initializer as I
|
|
||||||
|
|
||||||
from deepspeech.utils import checkpoint
|
|
||||||
from deepspeech.decoders.swig_wrapper import Scorer
|
|
||||||
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
|
|
||||||
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
__all__ = ['DeepSpeech2', 'DeepSpeech2Loss']
|
|
||||||
|
|
||||||
|
|
||||||
def brelu(x, t_min=0.0, t_max=24.0, name=None):
|
|
||||||
t_min = paddle.to_tensor(t_min)
|
|
||||||
t_max = paddle.to_tensor(t_max)
|
|
||||||
return x.maximum(t_min).minimum(t_max)
|
|
||||||
|
|
||||||
|
|
||||||
def sequence_mask(x_len, max_len=None, dtype='float32'):
|
|
||||||
max_len = max_len or x_len.max()
|
|
||||||
x_len = paddle.unsqueeze(x_len, -1)
|
|
||||||
row_vector = paddle.arange(max_len)
|
|
||||||
#mask = row_vector < x_len
|
|
||||||
mask = row_vector > x_len # a bug, broadcast 的时候出错了
|
|
||||||
mask = paddle.cast(mask, dtype)
|
|
||||||
return mask
|
|
||||||
|
|
||||||
|
|
||||||
class ConvBn(nn.Layer):
|
|
||||||
"""Convolution layer with batch normalization.
|
|
||||||
|
|
||||||
:param kernel_size: The x dimension of a filter kernel. Or input a tuple for
|
|
||||||
two image dimension.
|
|
||||||
:type kernel_size: int|tuple|list
|
|
||||||
:param num_channels_in: Number of input channels.
|
|
||||||
:type num_channels_in: int
|
|
||||||
:param num_channels_out: Number of output channels.
|
|
||||||
:type num_channels_out: int
|
|
||||||
:param stride: The x dimension of the stride. Or input a tuple for two
|
|
||||||
image dimension.
|
|
||||||
:type stride: int|tuple|list
|
|
||||||
:param padding: The x dimension of the padding. Or input a tuple for two
|
|
||||||
image dimension.
|
|
||||||
:type padding: int|tuple|list
|
|
||||||
:param act: Activation type, relu|brelu
|
|
||||||
:type act: string
|
|
||||||
:param masks: Masks data layer to reset padding.
|
|
||||||
:type masks: Variable
|
|
||||||
:param name: Name of the layer.
|
|
||||||
:param name: string
|
|
||||||
:return: Batch norm layer after convolution layer.
|
|
||||||
:rtype: Variable
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
|
|
||||||
padding, act):
|
|
||||||
|
|
||||||
super().__init__()
|
|
||||||
self.kernel_size = kernel_size
|
|
||||||
self.stride = stride
|
|
||||||
self.padding = padding
|
|
||||||
|
|
||||||
self.conv = nn.Conv2D(
|
|
||||||
num_channels_in,
|
|
||||||
num_channels_out,
|
|
||||||
kernel_size=kernel_size,
|
|
||||||
stride=stride,
|
|
||||||
padding=padding,
|
|
||||||
weight_attr=None,
|
|
||||||
bias_attr=False,
|
|
||||||
data_format='NCHW')
|
|
||||||
|
|
||||||
self.bn = nn.BatchNorm2D(
|
|
||||||
num_channels_out,
|
|
||||||
weight_attr=None,
|
|
||||||
bias_attr=None,
|
|
||||||
data_format='NCHW')
|
|
||||||
self.act = F.relu if act == 'relu' else brelu
|
|
||||||
|
|
||||||
def forward(self, x, x_len):
|
|
||||||
"""
|
|
||||||
x(Tensor): audio, shape [B, C, D, T]
|
|
||||||
"""
|
|
||||||
x = self.conv(x)
|
|
||||||
x = self.bn(x)
|
|
||||||
x = self.act(x)
|
|
||||||
|
|
||||||
x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
|
|
||||||
) // self.stride[1] + 1
|
|
||||||
|
|
||||||
# reset padding part to 0
|
|
||||||
masks = sequence_mask(x_len) #[B, T]
|
|
||||||
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
|
|
||||||
x = x.multiply(masks)
|
|
||||||
|
|
||||||
return x, x_len
|
|
||||||
|
|
||||||
|
|
||||||
class ConvStack(nn.Layer):
|
|
||||||
"""Convolution group with stacked convolution layers.
|
|
||||||
|
|
||||||
:param feat_size: audio feature dim.
|
|
||||||
:type feat_size: int
|
|
||||||
:param num_stacks: Number of stacked convolution layers.
|
|
||||||
:type num_stacks: int
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, feat_size, num_stacks):
|
|
||||||
super().__init__()
|
|
||||||
self.feat_size = feat_size # D
|
|
||||||
self.num_stacks = num_stacks
|
|
||||||
|
|
||||||
self.conv_in = ConvBn(
|
|
||||||
num_channels_in=1,
|
|
||||||
num_channels_out=32,
|
|
||||||
kernel_size=(41, 11), #[D, T]
|
|
||||||
stride=(2, 3),
|
|
||||||
padding=(20, 5),
|
|
||||||
act='brelu')
|
|
||||||
|
|
||||||
out_channel = 32
|
|
||||||
self.conv_stack = nn.LayerList([
|
|
||||||
ConvBn(
|
|
||||||
num_channels_in=32,
|
|
||||||
num_channels_out=out_channel,
|
|
||||||
kernel_size=(21, 11),
|
|
||||||
stride=(2, 1),
|
|
||||||
padding=(10, 5),
|
|
||||||
act='brelu') for i in range(num_stacks - 1)
|
|
||||||
])
|
|
||||||
|
|
||||||
# conv output feat_dim
|
|
||||||
output_height = (feat_size - 1) // 2 + 1
|
|
||||||
for i in range(self.num_stacks - 1):
|
|
||||||
output_height = (output_height - 1) // 2 + 1
|
|
||||||
self.output_height = out_channel * output_height
|
|
||||||
|
|
||||||
def forward(self, x, x_len):
|
|
||||||
"""
|
|
||||||
x: shape [B, C, D, T]
|
|
||||||
x_len : shape [B]
|
|
||||||
"""
|
|
||||||
x, x_len = self.conv_in(x, x_len)
|
|
||||||
for i, conv in enumerate(self.conv_stack):
|
|
||||||
x, x_len = conv(x, x_len)
|
|
||||||
return x, x_len
|
|
||||||
|
|
||||||
|
|
||||||
class RNNCell(nn.RNNCellBase):
|
|
||||||
r"""
|
|
||||||
Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
|
|
||||||
computes the outputs and updates states.
|
|
||||||
The formula used is as follows:
|
|
||||||
.. math::
|
|
||||||
h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
|
|
||||||
y_{t} & = h_{t}
|
|
||||||
|
|
||||||
where :math:`act` is for :attr:`activation`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
hidden_size,
|
|
||||||
activation="tanh",
|
|
||||||
weight_ih_attr=None,
|
|
||||||
weight_hh_attr=None,
|
|
||||||
bias_ih_attr=None,
|
|
||||||
bias_hh_attr=None,
|
|
||||||
name=None):
|
|
||||||
super().__init__()
|
|
||||||
std = 1.0 / math.sqrt(hidden_size)
|
|
||||||
self.weight_hh = self.create_parameter(
|
|
||||||
(hidden_size, hidden_size),
|
|
||||||
weight_hh_attr,
|
|
||||||
default_initializer=I.Uniform(-std, std))
|
|
||||||
# self.bias_ih = self.create_parameter(
|
|
||||||
# (hidden_size, ),
|
|
||||||
# bias_ih_attr,
|
|
||||||
# is_bias=True,
|
|
||||||
# default_initializer=I.Uniform(-std, std))
|
|
||||||
self.bias_ih = None
|
|
||||||
self.bias_hh = self.create_parameter(
|
|
||||||
(hidden_size, ),
|
|
||||||
bias_hh_attr,
|
|
||||||
is_bias=True,
|
|
||||||
default_initializer=I.Uniform(-std, std))
|
|
||||||
|
|
||||||
self.hidden_size = hidden_size
|
|
||||||
if activation not in ["tanh", "relu", "brelu"]:
|
|
||||||
raise ValueError(
|
|
||||||
"activation for SimpleRNNCell should be tanh or relu, "
|
|
||||||
"but get {}".format(activation))
|
|
||||||
self.activation = activation
|
|
||||||
self._activation_fn = paddle.tanh \
|
|
||||||
if activation == "tanh" \
|
|
||||||
else F.relu
|
|
||||||
if activation == 'brelu':
|
|
||||||
self._activation_fn = brelu
|
|
||||||
|
|
||||||
def forward(self, inputs, states=None):
|
|
||||||
if states is None:
|
|
||||||
states = self.get_initial_states(inputs, self.state_shape)
|
|
||||||
pre_h = states
|
|
||||||
i2h = inputs
|
|
||||||
if self.bias_ih is not None:
|
|
||||||
i2h += self.bias_ih
|
|
||||||
h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
|
|
||||||
if self.bias_hh is not None:
|
|
||||||
h2h += self.bias_hh
|
|
||||||
h = self._activation_fn(i2h + h2h)
|
|
||||||
return h, h
|
|
||||||
|
|
||||||
@property
|
|
||||||
def state_shape(self):
|
|
||||||
return (self.hidden_size, )
|
|
||||||
|
|
||||||
|
|
||||||
class GRUCellShare(nn.RNNCellBase):
|
|
||||||
r"""
|
|
||||||
Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
|
|
||||||
it computes the outputs and updates states.
|
|
||||||
The formula for GRU used is as follows:
|
|
||||||
.. math::
|
|
||||||
r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
|
|
||||||
z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
|
|
||||||
\widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
|
|
||||||
h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
|
|
||||||
y_{t} & = h_{t}
|
|
||||||
|
|
||||||
where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise
|
|
||||||
multiplication operator.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
input_size,
|
|
||||||
hidden_size,
|
|
||||||
weight_ih_attr=None,
|
|
||||||
weight_hh_attr=None,
|
|
||||||
bias_ih_attr=None,
|
|
||||||
bias_hh_attr=None,
|
|
||||||
name=None):
|
|
||||||
super().__init__()
|
|
||||||
std = 1.0 / math.sqrt(hidden_size)
|
|
||||||
self.weight_hh = self.create_parameter(
|
|
||||||
(3 * hidden_size, hidden_size),
|
|
||||||
weight_hh_attr,
|
|
||||||
default_initializer=I.Uniform(-std, std))
|
|
||||||
# self.bias_ih = self.create_parameter(
|
|
||||||
# (3 * hidden_size, ),
|
|
||||||
# bias_ih_attr,
|
|
||||||
# is_bias=True,
|
|
||||||
# default_initializer=I.Uniform(-std, std))
|
|
||||||
self.bias_ih = None
|
|
||||||
self.bias_hh = self.create_parameter(
|
|
||||||
(3 * hidden_size, ),
|
|
||||||
bias_hh_attr,
|
|
||||||
is_bias=True,
|
|
||||||
default_initializer=I.Uniform(-std, std))
|
|
||||||
|
|
||||||
self.hidden_size = hidden_size
|
|
||||||
self.input_size = input_size
|
|
||||||
self._gate_activation = F.sigmoid
|
|
||||||
self._activation = paddle.tanh
|
|
||||||
#self._activation = F.relu
|
|
||||||
|
|
||||||
def forward(self, inputs, states=None):
|
|
||||||
if states is None:
|
|
||||||
states = self.get_initial_states(inputs, self.state_shape)
|
|
||||||
|
|
||||||
pre_hidden = states
|
|
||||||
x_gates = inputs
|
|
||||||
if self.bias_ih is not None:
|
|
||||||
x_gates = x_gates + self.bias_ih
|
|
||||||
h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
|
|
||||||
if self.bias_hh is not None:
|
|
||||||
h_gates = h_gates + self.bias_hh
|
|
||||||
|
|
||||||
x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
|
|
||||||
h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
|
|
||||||
|
|
||||||
r = self._gate_activation(x_r + h_r)
|
|
||||||
z = self._gate_activation(x_z + h_z)
|
|
||||||
c = self._activation(x_c + r * h_c) # apply reset gate after mm
|
|
||||||
h = (pre_hidden - c) * z + c
|
|
||||||
# https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
|
|
||||||
|
|
||||||
return h, h
|
|
||||||
|
|
||||||
@property
|
|
||||||
def state_shape(self):
|
|
||||||
r"""
|
|
||||||
The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
|
|
||||||
size would be automatically inserted into shape). The shape corresponds
|
|
||||||
to the shape of :math:`h_{t-1}`.
|
|
||||||
"""
|
|
||||||
return (self.hidden_size, )
|
|
||||||
|
|
||||||
|
|
||||||
class BiRNNWithBN(nn.Layer):
|
|
||||||
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
|
|
||||||
The batch normalization is only performed on input-state weights.
|
|
||||||
|
|
||||||
:param name: Name of the layer parameters.
|
|
||||||
:type name: string
|
|
||||||
:param size: Dimension of RNN cells.
|
|
||||||
:type size: int
|
|
||||||
:param share_weights: Whether to share input-hidden weights between
|
|
||||||
forward and backward directional RNNs.
|
|
||||||
:type share_weights: bool
|
|
||||||
:return: Bidirectional simple rnn layer.
|
|
||||||
:rtype: Variable
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, i_size, h_size, share_weights):
|
|
||||||
super().__init__()
|
|
||||||
self.share_weights = share_weights
|
|
||||||
if self.share_weights:
|
|
||||||
#input-hidden weights shared between bi-directional rnn.
|
|
||||||
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
|
||||||
# batch norm is only performed on input-state projection
|
|
||||||
self.fw_bn = nn.BatchNorm1D(
|
|
||||||
h_size, bias_attr=None, data_format='NLC')
|
|
||||||
self.bw_fc = self.fw_fc
|
|
||||||
self.bw_bn = self.fw_bn
|
|
||||||
else:
|
|
||||||
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
|
||||||
self.fw_bn = nn.BatchNorm1D(
|
|
||||||
h_size, bias_attr=None, data_format='NLC')
|
|
||||||
self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
|
||||||
self.bw_bn = nn.BatchNorm1D(
|
|
||||||
h_size, bias_attr=None, data_format='NLC')
|
|
||||||
|
|
||||||
self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
|
|
||||||
self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
|
|
||||||
self.fw_rnn = nn.RNN(
|
|
||||||
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
|
|
||||||
self.bw_rnn = nn.RNN(
|
|
||||||
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
|
|
||||||
|
|
||||||
def forward(self, x, x_len):
|
|
||||||
# x, shape [B, T, D]
|
|
||||||
fw_x = self.fw_bn(self.fw_fc(x))
|
|
||||||
bw_x = self.bw_bn(self.bw_fc(x))
|
|
||||||
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
|
|
||||||
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
|
|
||||||
x = paddle.concat([fw_x, bw_x], axis=-1)
|
|
||||||
return x, x_len
|
|
||||||
|
|
||||||
|
|
||||||
class BiGRUWithBN(nn.Layer):
|
|
||||||
"""Bidirectonal gru layer with sequence-wise batch normalization.
|
|
||||||
The batch normalization is only performed on input-state weights.
|
|
||||||
|
|
||||||
:param name: Name of the layer.
|
|
||||||
:type name: string
|
|
||||||
:param input: Input layer.
|
|
||||||
:type input: Variable
|
|
||||||
:param size: Dimension of GRU cells.
|
|
||||||
:type size: int
|
|
||||||
:param act: Activation type.
|
|
||||||
:type act: string
|
|
||||||
:return: Bidirectional GRU layer.
|
|
||||||
:rtype: Variable
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, i_size, h_size, act):
|
|
||||||
super().__init__()
|
|
||||||
hidden_size = h_size * 3
|
|
||||||
|
|
||||||
self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
|
|
||||||
self.fw_bn = nn.BatchNorm1D(
|
|
||||||
hidden_size, bias_attr=None, data_format='NLC')
|
|
||||||
self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
|
|
||||||
self.bw_bn = nn.BatchNorm1D(
|
|
||||||
hidden_size, bias_attr=None, data_format='NLC')
|
|
||||||
|
|
||||||
self.fw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
|
|
||||||
self.bw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
|
|
||||||
self.fw_rnn = nn.RNN(
|
|
||||||
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
|
|
||||||
self.bw_rnn = nn.RNN(
|
|
||||||
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
|
|
||||||
|
|
||||||
def forward(self, x, x_len):
|
|
||||||
# x, shape [B, T, D]
|
|
||||||
fw_x = self.fw_bn(self.fw_fc(x))
|
|
||||||
bw_x = self.bw_bn(self.bw_fc(x))
|
|
||||||
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
|
|
||||||
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
|
|
||||||
x = paddle.concat([fw_x, bw_x], axis=-1)
|
|
||||||
return x, x_len
|
|
||||||
|
|
||||||
|
|
||||||
class RNNStack(nn.Layer):
|
|
||||||
"""RNN group with stacked bidirectional simple RNN or GRU layers.
|
|
||||||
|
|
||||||
:param input: Input layer.
|
|
||||||
:type input: Variable
|
|
||||||
:param size: Dimension of RNN cells in each layer.
|
|
||||||
:type size: int
|
|
||||||
:param num_stacks: Number of stacked rnn layers.
|
|
||||||
:type num_stacks: int
|
|
||||||
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
|
||||||
:type use_gru: bool
|
|
||||||
:param share_rnn_weights: Whether to share input-hidden weights between
|
|
||||||
forward and backward directional RNNs.
|
|
||||||
It is only available when use_gru=False.
|
|
||||||
:type share_weights: bool
|
|
||||||
:return: Output layer of the RNN group.
|
|
||||||
:rtype: Variable
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
|
|
||||||
super().__init__()
|
|
||||||
self.rnn_stacks = nn.LayerList()
|
|
||||||
for i in range(num_stacks):
|
|
||||||
if use_gru:
|
|
||||||
#default:GRU using tanh
|
|
||||||
self.rnn_stacks.append(
|
|
||||||
BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
|
|
||||||
else:
|
|
||||||
self.rnn_stacks.append(
|
|
||||||
BiRNNWithBN(
|
|
||||||
i_size=i_size,
|
|
||||||
h_size=h_size,
|
|
||||||
share_weights=share_rnn_weights))
|
|
||||||
i_size = h_size * 2
|
|
||||||
|
|
||||||
def forward(self, x, x_len):
|
|
||||||
"""
|
|
||||||
x: shape [B, T, D]
|
|
||||||
x_len: shpae [B]
|
|
||||||
"""
|
|
||||||
for i, rnn in enumerate(self.rnn_stacks):
|
|
||||||
x, x_len = rnn(x, x_len)
|
|
||||||
masks = sequence_mask(x_len) #[B, T]
|
|
||||||
masks = masks.unsqueeze(-1) # [B, T, 1]
|
|
||||||
x = x.multiply(masks)
|
|
||||||
return x, x_len
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2(nn.Layer):
|
|
||||||
"""The DeepSpeech2 network structure.
|
|
||||||
|
|
||||||
:param audio_data: Audio spectrogram data layer.
|
|
||||||
:type audio_data: Variable
|
|
||||||
:param text_data: Transcription text data layer.
|
|
||||||
:type text_data: Variable
|
|
||||||
:param audio_len: Valid sequence length data layer.
|
|
||||||
:type audio_len: Variable
|
|
||||||
:param masks: Masks data layer to reset padding.
|
|
||||||
:type masks: Variable
|
|
||||||
:param dict_size: Dictionary size for tokenized transcription.
|
|
||||||
:type dict_size: int
|
|
||||||
:param num_conv_layers: Number of stacking convolution layers.
|
|
||||||
:type num_conv_layers: int
|
|
||||||
:param num_rnn_layers: Number of stacking RNN layers.
|
|
||||||
:type num_rnn_layers: int
|
|
||||||
:param rnn_size: RNN layer size (dimension of RNN cells).
|
|
||||||
:type rnn_size: int
|
|
||||||
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
|
||||||
:type use_gru: bool
|
|
||||||
:param share_rnn_weights: Whether to share input-hidden weights between
|
|
||||||
forward and backward direction RNNs.
|
|
||||||
It is only available when use_gru=False.
|
|
||||||
:type share_weights: bool
|
|
||||||
:return: A tuple of an output unnormalized log probability layer (
|
|
||||||
before softmax) and a ctc cost layer.
|
|
||||||
:rtype: tuple of LayerOutput
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self,
|
|
||||||
feat_size,
|
|
||||||
dict_size,
|
|
||||||
num_conv_layers=2,
|
|
||||||
num_rnn_layers=3,
|
|
||||||
rnn_size=1024,
|
|
||||||
use_gru=False,
|
|
||||||
share_rnn_weights=True):
|
|
||||||
super().__init__()
|
|
||||||
self.feat_size = feat_size # 161 for linear
|
|
||||||
self.dict_size = dict_size
|
|
||||||
|
|
||||||
self.conv = ConvStack(feat_size, num_conv_layers)
|
|
||||||
|
|
||||||
i_size = self.conv.output_height # H after conv stack
|
|
||||||
self.rnn = RNNStack(
|
|
||||||
i_size=i_size,
|
|
||||||
h_size=rnn_size,
|
|
||||||
num_stacks=num_rnn_layers,
|
|
||||||
use_gru=use_gru,
|
|
||||||
share_rnn_weights=share_rnn_weights)
|
|
||||||
self.fc = nn.Linear(rnn_size * 2, dict_size + 1)
|
|
||||||
|
|
||||||
self.logger = logging.getLogger(__name__)
|
|
||||||
self._ext_scorer = None
|
|
||||||
|
|
||||||
def infer(self, audio, audio_len):
|
|
||||||
# [B, D, T] -> [B, C=1, D, T]
|
|
||||||
audio = audio.unsqueeze(1)
|
|
||||||
|
|
||||||
# convolution group
|
|
||||||
x, audio_len = self.conv(audio, audio_len)
|
|
||||||
#print('conv out', x.shape)
|
|
||||||
|
|
||||||
# convert data from convolution feature map to sequence of vectors
|
|
||||||
B, C, D, T = paddle.shape(x)
|
|
||||||
x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
|
|
||||||
x = x.reshape([B, T, C * D]) #[B, T, C*D]
|
|
||||||
#print('rnn input', x.shape)
|
|
||||||
|
|
||||||
# remove padding part
|
|
||||||
x, audio_len = self.rnn(x, audio_len) #[B, T, D]
|
|
||||||
#print('rnn output', x.shape)
|
|
||||||
|
|
||||||
logits = self.fc(x) #[B, T, V + 1]
|
|
||||||
|
|
||||||
#ctcdecoder need probs, not log_probs
|
|
||||||
probs = F.softmax(logits)
|
|
||||||
|
|
||||||
return logits, probs, audio_len
|
|
||||||
|
|
||||||
def forward(self, audio, text, audio_len, text_len):
|
|
||||||
"""
|
|
||||||
audio: shape [B, D, T]
|
|
||||||
text: shape [B, T]
|
|
||||||
audio_len: shape [B]
|
|
||||||
text_len: shape [B]
|
|
||||||
"""
|
|
||||||
return self.infer(audio, audio_len)
|
|
||||||
|
|
||||||
@paddle.no_grad()
|
|
||||||
def predict(self, audio, audio_len):
|
|
||||||
""" Model infer """
|
|
||||||
return self.infer(audio, audio_len)
|
|
||||||
|
|
||||||
def _decode_batch_greedy(self, probs_split, vocab_list):
|
|
||||||
"""Decode by best path for a batch of probs matrix input.
|
|
||||||
:param probs_split: List of 2-D probability matrix, and each consists
|
|
||||||
of prob vectors for one speech utterancce.
|
|
||||||
:param probs_split: List of matrix
|
|
||||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
|
||||||
:type vocab_list: list
|
|
||||||
:return: List of transcription texts.
|
|
||||||
:rtype: List of str
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for i, probs in enumerate(probs_split):
|
|
||||||
output_transcription = ctc_greedy_decoder(
|
|
||||||
probs_seq=probs, vocabulary=vocab_list)
|
|
||||||
results.append(output_transcription)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
|
|
||||||
vocab_list):
|
|
||||||
"""Initialize the external scorer.
|
|
||||||
:param beam_alpha: Parameter associated with language model.
|
|
||||||
:type beam_alpha: float
|
|
||||||
:param beam_beta: Parameter associated with word count.
|
|
||||||
:type beam_beta: float
|
|
||||||
:param language_model_path: Filepath for language model. If it is
|
|
||||||
empty, the external scorer will be set to
|
|
||||||
None, and the decoding method will be pure
|
|
||||||
beam search without scorer.
|
|
||||||
:type language_model_path: str|None
|
|
||||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
|
||||||
:type vocab_list: list
|
|
||||||
"""
|
|
||||||
# init once
|
|
||||||
if self._ext_scorer != None:
|
|
||||||
return
|
|
||||||
|
|
||||||
if language_model_path != '':
|
|
||||||
self.logger.info("begin to initialize the external scorer "
|
|
||||||
"for decoding")
|
|
||||||
self._ext_scorer = Scorer(beam_alpha, beam_beta,
|
|
||||||
language_model_path, vocab_list)
|
|
||||||
lm_char_based = self._ext_scorer.is_character_based()
|
|
||||||
lm_max_order = self._ext_scorer.get_max_order()
|
|
||||||
lm_dict_size = self._ext_scorer.get_dict_size()
|
|
||||||
self.logger.info("language model: "
|
|
||||||
"is_character_based = %d," % lm_char_based +
|
|
||||||
" max_order = %d," % lm_max_order +
|
|
||||||
" dict_size = %d" % lm_dict_size)
|
|
||||||
self.logger.info("end initializing scorer")
|
|
||||||
else:
|
|
||||||
self._ext_scorer = None
|
|
||||||
self.logger.info("no language model provided, "
|
|
||||||
"decoding by pure beam search without scorer.")
|
|
||||||
|
|
||||||
def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
|
|
||||||
beam_size, cutoff_prob, cutoff_top_n,
|
|
||||||
vocab_list, num_processes):
|
|
||||||
"""Decode by beam search for a batch of probs matrix input.
|
|
||||||
:param probs_split: List of 2-D probability matrix, and each consists
|
|
||||||
of prob vectors for one speech utterancce.
|
|
||||||
:param probs_split: List of matrix
|
|
||||||
:param beam_alpha: Parameter associated with language model.
|
|
||||||
:type beam_alpha: float
|
|
||||||
:param beam_beta: Parameter associated with word count.
|
|
||||||
:type beam_beta: float
|
|
||||||
:param beam_size: Width for Beam search.
|
|
||||||
:type beam_size: int
|
|
||||||
:param cutoff_prob: Cutoff probability in pruning,
|
|
||||||
default 1.0, no pruning.
|
|
||||||
:type cutoff_prob: float
|
|
||||||
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
|
|
||||||
characters with highest probs in vocabulary will be
|
|
||||||
used in beam search, default 40.
|
|
||||||
:type cutoff_top_n: int
|
|
||||||
:param vocab_list: List of tokens in the vocabulary, for decoding.
|
|
||||||
:type vocab_list: list
|
|
||||||
:param num_processes: Number of processes (CPU) for decoder.
|
|
||||||
:type num_processes: int
|
|
||||||
:return: List of transcription texts.
|
|
||||||
:rtype: List of str
|
|
||||||
"""
|
|
||||||
if self._ext_scorer != None:
|
|
||||||
self._ext_scorer.reset_params(beam_alpha, beam_beta)
|
|
||||||
|
|
||||||
# beam search decode
|
|
||||||
num_processes = min(num_processes, len(probs_split))
|
|
||||||
beam_search_results = ctc_beam_search_decoder_batch(
|
|
||||||
probs_split=probs_split,
|
|
||||||
vocabulary=vocab_list,
|
|
||||||
beam_size=beam_size,
|
|
||||||
num_processes=num_processes,
|
|
||||||
ext_scoring_func=self._ext_scorer,
|
|
||||||
cutoff_prob=cutoff_prob,
|
|
||||||
cutoff_top_n=cutoff_top_n)
|
|
||||||
|
|
||||||
results = [result[0][1] for result in beam_search_results]
|
|
||||||
return results
|
|
||||||
|
|
||||||
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
|
|
||||||
decoding_method):
|
|
||||||
if decoding_method == "ctc_beam_search":
|
|
||||||
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
|
|
||||||
vocab_list)
|
|
||||||
|
|
||||||
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
|
|
||||||
lang_model_path, beam_alpha, beam_beta, beam_size,
|
|
||||||
cutoff_prob, cutoff_top_n, num_processes):
|
|
||||||
""" probs: activation after softmax
|
|
||||||
logits_len: audio output lens
|
|
||||||
"""
|
|
||||||
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
|
|
||||||
if decoding_method == "ctc_greedy":
|
|
||||||
result_transcripts = self._decode_batch_greedy(
|
|
||||||
probs_split=probs_split, vocab_list=vocab_list)
|
|
||||||
elif decoding_method == "ctc_beam_search":
|
|
||||||
result_transcripts = self._decode_batch_beam_search(
|
|
||||||
probs_split=probs_split,
|
|
||||||
beam_alpha=beam_alpha,
|
|
||||||
beam_beta=beam_beta,
|
|
||||||
beam_size=beam_size,
|
|
||||||
cutoff_prob=cutoff_prob,
|
|
||||||
cutoff_top_n=cutoff_top_n,
|
|
||||||
vocab_list=vocab_list,
|
|
||||||
num_processes=num_processes)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Not support: {decoding_method}")
|
|
||||||
return result_transcripts
|
|
||||||
|
|
||||||
@paddle.no_grad()
|
|
||||||
def decode(self, audio, audio_len, vocab_list, decoding_method,
|
|
||||||
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
|
|
||||||
cutoff_top_n, num_processes):
|
|
||||||
_, probs, logits_lens = self.predict(audio, audio_len)
|
|
||||||
return self.decode_probs(probs.numpy(), logits_lens, vocab_list,
|
|
||||||
decoding_method, lang_model_path, beam_alpha,
|
|
||||||
beam_beta, beam_size, cutoff_prob,
|
|
||||||
cutoff_top_n, num_processes)
|
|
||||||
|
|
||||||
def from_pretrained(self, checkpoint_path):
|
|
||||||
"""Build a model from a pretrained model.
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
model: nn.Layer
|
|
||||||
Asr Model.
|
|
||||||
|
|
||||||
checkpoint_path: Path or str
|
|
||||||
The path of pretrained model checkpoint, without extension name.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
Model
|
|
||||||
The model build from pretrined result.
|
|
||||||
"""
|
|
||||||
checkpoint.load_parameters(self, checkpoint_path=checkpoint_path)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
def ctc_loss(logits,
|
|
||||||
labels,
|
|
||||||
input_lengths,
|
|
||||||
label_lengths,
|
|
||||||
blank=0,
|
|
||||||
reduction='mean',
|
|
||||||
norm_by_times=True):
|
|
||||||
#logger.info("my ctc loss with norm by times")
|
|
||||||
## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
|
|
||||||
loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
|
|
||||||
input_lengths, label_lengths)
|
|
||||||
|
|
||||||
loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
|
|
||||||
logger.info(f"warpctc loss: {loss_out}/{loss_out.shape} ")
|
|
||||||
assert reduction in ['mean', 'sum', 'none']
|
|
||||||
if reduction == 'mean':
|
|
||||||
loss_out = paddle.mean(loss_out / label_lengths)
|
|
||||||
elif reduction == 'sum':
|
|
||||||
loss_out = paddle.sum(loss_out)
|
|
||||||
logger.info(f"ctc loss: {loss_out}")
|
|
||||||
return loss_out
|
|
||||||
|
|
||||||
|
|
||||||
F.ctc_loss = ctc_loss
|
|
||||||
|
|
||||||
|
|
||||||
class DeepSpeech2Loss(nn.Layer):
|
|
||||||
def __init__(self, vocab_size):
|
|
||||||
super().__init__()
|
|
||||||
# last token id as blank id
|
|
||||||
self.loss = nn.CTCLoss(blank=vocab_size, reduction='sum')
|
|
||||||
|
|
||||||
def forward(self, logits, text, logits_len, text_len):
|
|
||||||
# warp-ctc do softmax on activations
|
|
||||||
# warp-ctc need activation with shape [T, B, V + 1]
|
|
||||||
logits = logits.transpose([1, 0, 2])
|
|
||||||
|
|
||||||
ctc_loss = self.loss(logits, text, logits_len, text_len)
|
|
||||||
return ctc_loss
|
|
@ -0,0 +1,147 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import functional as F
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
from deepspeech.modules.mask import sequence_mask
|
||||||
|
from deepspeech.modules.activation import brelu
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = ['ConvStack']
|
||||||
|
|
||||||
|
|
||||||
|
class ConvBn(nn.Layer):
|
||||||
|
"""Convolution layer with batch normalization.
|
||||||
|
|
||||||
|
:param kernel_size: The x dimension of a filter kernel. Or input a tuple for
|
||||||
|
two image dimension.
|
||||||
|
:type kernel_size: int|tuple|list
|
||||||
|
:param num_channels_in: Number of input channels.
|
||||||
|
:type num_channels_in: int
|
||||||
|
:param num_channels_out: Number of output channels.
|
||||||
|
:type num_channels_out: int
|
||||||
|
:param stride: The x dimension of the stride. Or input a tuple for two
|
||||||
|
image dimension.
|
||||||
|
:type stride: int|tuple|list
|
||||||
|
:param padding: The x dimension of the padding. Or input a tuple for two
|
||||||
|
image dimension.
|
||||||
|
:type padding: int|tuple|list
|
||||||
|
:param act: Activation type, relu|brelu
|
||||||
|
:type act: string
|
||||||
|
:return: Batch norm layer after convolution layer.
|
||||||
|
:rtype: Variable
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
|
||||||
|
padding, act):
|
||||||
|
|
||||||
|
super().__init__()
|
||||||
|
assert len(kernel_size) == 2
|
||||||
|
assert len(stride) == 2
|
||||||
|
assert len(padding) == 2
|
||||||
|
self.kernel_size = kernel_size
|
||||||
|
self.stride = stride
|
||||||
|
self.padding = padding
|
||||||
|
|
||||||
|
self.conv = nn.Conv2D(
|
||||||
|
num_channels_in,
|
||||||
|
num_channels_out,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
weight_attr=None,
|
||||||
|
bias_attr=False,
|
||||||
|
data_format='NCHW')
|
||||||
|
|
||||||
|
self.bn = nn.BatchNorm2D(
|
||||||
|
num_channels_out,
|
||||||
|
weight_attr=None,
|
||||||
|
bias_attr=None,
|
||||||
|
data_format='NCHW')
|
||||||
|
self.act = F.relu if act == 'relu' else brelu
|
||||||
|
|
||||||
|
def forward(self, x, x_len):
|
||||||
|
"""
|
||||||
|
x(Tensor): audio, shape [B, C, D, T]
|
||||||
|
"""
|
||||||
|
x = self.conv(x)
|
||||||
|
x = self.bn(x)
|
||||||
|
x = self.act(x)
|
||||||
|
|
||||||
|
x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
|
||||||
|
) // self.stride[1] + 1
|
||||||
|
|
||||||
|
# reset padding part to 0
|
||||||
|
masks = sequence_mask(x_len) #[B, T]
|
||||||
|
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
|
||||||
|
x = x.multiply(masks)
|
||||||
|
|
||||||
|
return x, x_len
|
||||||
|
|
||||||
|
|
||||||
|
class ConvStack(nn.Layer):
|
||||||
|
"""Convolution group with stacked convolution layers.
|
||||||
|
|
||||||
|
:param feat_size: audio feature dim.
|
||||||
|
:type feat_size: int
|
||||||
|
:param num_stacks: Number of stacked convolution layers.
|
||||||
|
:type num_stacks: int
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, feat_size, num_stacks):
|
||||||
|
super().__init__()
|
||||||
|
self.feat_size = feat_size # D
|
||||||
|
self.num_stacks = num_stacks
|
||||||
|
|
||||||
|
self.conv_in = ConvBn(
|
||||||
|
num_channels_in=1,
|
||||||
|
num_channels_out=32,
|
||||||
|
kernel_size=(41, 11), #[D, T]
|
||||||
|
stride=(2, 3),
|
||||||
|
padding=(20, 5),
|
||||||
|
act='brelu')
|
||||||
|
|
||||||
|
out_channel = 32
|
||||||
|
self.conv_stack = nn.LayerList([
|
||||||
|
ConvBn(
|
||||||
|
num_channels_in=32,
|
||||||
|
num_channels_out=out_channel,
|
||||||
|
kernel_size=(21, 11),
|
||||||
|
stride=(2, 1),
|
||||||
|
padding=(10, 5),
|
||||||
|
act='brelu') for i in range(num_stacks - 1)
|
||||||
|
])
|
||||||
|
|
||||||
|
# conv output feat_dim
|
||||||
|
output_height = (feat_size - 1) // 2 + 1
|
||||||
|
for i in range(self.num_stacks - 1):
|
||||||
|
output_height = (output_height - 1) // 2 + 1
|
||||||
|
self.output_height = out_channel * output_height
|
||||||
|
|
||||||
|
def forward(self, x, x_len):
|
||||||
|
"""
|
||||||
|
x: shape [B, C, D, T]
|
||||||
|
x_len : shape [B]
|
||||||
|
"""
|
||||||
|
x, x_len = self.conv_in(x, x_len)
|
||||||
|
for i, conv in enumerate(self.conv_stack):
|
||||||
|
x, x_len = conv(x, x_len)
|
||||||
|
return x, x_len
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import functional as F
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = ['sequence_mask']
|
||||||
|
|
||||||
|
|
||||||
|
def sequence_mask(x_len, max_len=None, dtype='float32'):
|
||||||
|
max_len = max_len or x_len.max()
|
||||||
|
x_len = paddle.unsqueeze(x_len, -1)
|
||||||
|
row_vector = paddle.arange(max_len)
|
||||||
|
#mask = row_vector < x_len
|
||||||
|
mask = row_vector > x_len # a bug, broadcast 的时候出错了
|
||||||
|
mask = paddle.cast(mask, dtype)
|
||||||
|
return mask
|
@ -0,0 +1,309 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import functional as F
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
from deepspeech.modules.mask import sequence_mask
|
||||||
|
from deepspeech.modules.activation import brelu
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = ['RNNStack']
|
||||||
|
|
||||||
|
|
||||||
|
class RNNCell(nn.RNNCellBase):
|
||||||
|
r"""
|
||||||
|
Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
|
||||||
|
computes the outputs and updates states.
|
||||||
|
The formula used is as follows:
|
||||||
|
.. math::
|
||||||
|
h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
|
||||||
|
y_{t} & = h_{t}
|
||||||
|
|
||||||
|
where :math:`act` is for :attr:`activation`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
hidden_size,
|
||||||
|
activation="tanh",
|
||||||
|
weight_ih_attr=None,
|
||||||
|
weight_hh_attr=None,
|
||||||
|
bias_ih_attr=None,
|
||||||
|
bias_hh_attr=None,
|
||||||
|
name=None):
|
||||||
|
super().__init__()
|
||||||
|
std = 1.0 / math.sqrt(hidden_size)
|
||||||
|
self.weight_hh = self.create_parameter(
|
||||||
|
(hidden_size, hidden_size),
|
||||||
|
weight_hh_attr,
|
||||||
|
default_initializer=I.Uniform(-std, std))
|
||||||
|
self.bias_ih = None
|
||||||
|
self.bias_hh = self.create_parameter(
|
||||||
|
(hidden_size, ),
|
||||||
|
bias_hh_attr,
|
||||||
|
is_bias=True,
|
||||||
|
default_initializer=I.Uniform(-std, std))
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
if activation not in ["tanh", "relu", "brelu"]:
|
||||||
|
raise ValueError(
|
||||||
|
"activation for SimpleRNNCell should be tanh or relu, "
|
||||||
|
"but get {}".format(activation))
|
||||||
|
self.activation = activation
|
||||||
|
self._activation_fn = paddle.tanh \
|
||||||
|
if activation == "tanh" \
|
||||||
|
else F.relu
|
||||||
|
if activation == 'brelu':
|
||||||
|
self._activation_fn = brelu
|
||||||
|
|
||||||
|
def forward(self, inputs, states=None):
|
||||||
|
if states is None:
|
||||||
|
states = self.get_initial_states(inputs, self.state_shape)
|
||||||
|
pre_h = states
|
||||||
|
i2h = inputs
|
||||||
|
if self.bias_ih is not None:
|
||||||
|
i2h += self.bias_ih
|
||||||
|
h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
|
||||||
|
if self.bias_hh is not None:
|
||||||
|
h2h += self.bias_hh
|
||||||
|
h = self._activation_fn(i2h + h2h)
|
||||||
|
return h, h
|
||||||
|
|
||||||
|
@property
|
||||||
|
def state_shape(self):
|
||||||
|
return (self.hidden_size, )
|
||||||
|
|
||||||
|
|
||||||
|
class GRUCell(nn.RNNCellBase):
|
||||||
|
r"""
|
||||||
|
Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
|
||||||
|
it computes the outputs and updates states.
|
||||||
|
The formula for GRU used is as follows:
|
||||||
|
.. math::
|
||||||
|
r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
|
||||||
|
z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
|
||||||
|
\widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
|
||||||
|
h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
|
||||||
|
y_{t} & = h_{t}
|
||||||
|
|
||||||
|
where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise
|
||||||
|
multiplication operator.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
input_size,
|
||||||
|
hidden_size,
|
||||||
|
weight_ih_attr=None,
|
||||||
|
weight_hh_attr=None,
|
||||||
|
bias_ih_attr=None,
|
||||||
|
bias_hh_attr=None,
|
||||||
|
name=None):
|
||||||
|
super().__init__()
|
||||||
|
std = 1.0 / math.sqrt(hidden_size)
|
||||||
|
self.weight_hh = self.create_parameter(
|
||||||
|
(3 * hidden_size, hidden_size),
|
||||||
|
weight_hh_attr,
|
||||||
|
default_initializer=I.Uniform(-std, std))
|
||||||
|
self.bias_ih = None
|
||||||
|
self.bias_hh = self.create_parameter(
|
||||||
|
(3 * hidden_size, ),
|
||||||
|
bias_hh_attr,
|
||||||
|
is_bias=True,
|
||||||
|
default_initializer=I.Uniform(-std, std))
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.input_size = input_size
|
||||||
|
self._gate_activation = F.sigmoid
|
||||||
|
self._activation = paddle.tanh
|
||||||
|
#self._activation = F.relu
|
||||||
|
|
||||||
|
def forward(self, inputs, states=None):
|
||||||
|
if states is None:
|
||||||
|
states = self.get_initial_states(inputs, self.state_shape)
|
||||||
|
|
||||||
|
pre_hidden = states
|
||||||
|
x_gates = inputs
|
||||||
|
if self.bias_ih is not None:
|
||||||
|
x_gates = x_gates + self.bias_ih
|
||||||
|
h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
|
||||||
|
if self.bias_hh is not None:
|
||||||
|
h_gates = h_gates + self.bias_hh
|
||||||
|
|
||||||
|
x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
|
||||||
|
h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
|
||||||
|
|
||||||
|
r = self._gate_activation(x_r + h_r)
|
||||||
|
z = self._gate_activation(x_z + h_z)
|
||||||
|
c = self._activation(x_c + r * h_c) # apply reset gate after mm
|
||||||
|
h = (pre_hidden - c) * z + c
|
||||||
|
# https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
|
||||||
|
|
||||||
|
return h, h
|
||||||
|
|
||||||
|
@property
|
||||||
|
def state_shape(self):
|
||||||
|
r"""
|
||||||
|
The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
|
||||||
|
size would be automatically inserted into shape). The shape corresponds
|
||||||
|
to the shape of :math:`h_{t-1}`.
|
||||||
|
"""
|
||||||
|
return (self.hidden_size, )
|
||||||
|
|
||||||
|
|
||||||
|
class BiRNNWithBN(nn.Layer):
|
||||||
|
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
|
||||||
|
The batch normalization is only performed on input-state weights.
|
||||||
|
|
||||||
|
:param name: Name of the layer parameters.
|
||||||
|
:type name: string
|
||||||
|
:param size: Dimension of RNN cells.
|
||||||
|
:type size: int
|
||||||
|
:param share_weights: Whether to share input-hidden weights between
|
||||||
|
forward and backward directional RNNs.
|
||||||
|
:type share_weights: bool
|
||||||
|
:return: Bidirectional simple rnn layer.
|
||||||
|
:rtype: Variable
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, i_size, h_size, share_weights):
|
||||||
|
super().__init__()
|
||||||
|
self.share_weights = share_weights
|
||||||
|
if self.share_weights:
|
||||||
|
#input-hidden weights shared between bi-directional rnn.
|
||||||
|
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
||||||
|
# batch norm is only performed on input-state projection
|
||||||
|
self.fw_bn = nn.BatchNorm1D(
|
||||||
|
h_size, bias_attr=None, data_format='NLC')
|
||||||
|
self.bw_fc = self.fw_fc
|
||||||
|
self.bw_bn = self.fw_bn
|
||||||
|
else:
|
||||||
|
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
||||||
|
self.fw_bn = nn.BatchNorm1D(
|
||||||
|
h_size, bias_attr=None, data_format='NLC')
|
||||||
|
self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
|
||||||
|
self.bw_bn = nn.BatchNorm1D(
|
||||||
|
h_size, bias_attr=None, data_format='NLC')
|
||||||
|
|
||||||
|
self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
|
||||||
|
self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
|
||||||
|
self.fw_rnn = nn.RNN(
|
||||||
|
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
|
||||||
|
self.bw_rnn = nn.RNN(
|
||||||
|
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
|
||||||
|
|
||||||
|
def forward(self, x, x_len):
|
||||||
|
# x, shape [B, T, D]
|
||||||
|
fw_x = self.fw_bn(self.fw_fc(x))
|
||||||
|
bw_x = self.bw_bn(self.bw_fc(x))
|
||||||
|
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
|
||||||
|
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
|
||||||
|
x = paddle.concat([fw_x, bw_x], axis=-1)
|
||||||
|
return x, x_len
|
||||||
|
|
||||||
|
|
||||||
|
class BiGRUWithBN(nn.Layer):
|
||||||
|
"""Bidirectonal gru layer with sequence-wise batch normalization.
|
||||||
|
The batch normalization is only performed on input-state weights.
|
||||||
|
|
||||||
|
:param name: Name of the layer.
|
||||||
|
:type name: string
|
||||||
|
:param input: Input layer.
|
||||||
|
:type input: Variable
|
||||||
|
:param size: Dimension of GRU cells.
|
||||||
|
:type size: int
|
||||||
|
:param act: Activation type.
|
||||||
|
:type act: string
|
||||||
|
:return: Bidirectional GRU layer.
|
||||||
|
:rtype: Variable
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, i_size, h_size, act):
|
||||||
|
super().__init__()
|
||||||
|
hidden_size = h_size * 3
|
||||||
|
|
||||||
|
self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
|
||||||
|
self.fw_bn = nn.BatchNorm1D(
|
||||||
|
hidden_size, bias_attr=None, data_format='NLC')
|
||||||
|
self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
|
||||||
|
self.bw_bn = nn.BatchNorm1D(
|
||||||
|
hidden_size, bias_attr=None, data_format='NLC')
|
||||||
|
|
||||||
|
self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
|
||||||
|
self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
|
||||||
|
self.fw_rnn = nn.RNN(
|
||||||
|
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
|
||||||
|
self.bw_rnn = nn.RNN(
|
||||||
|
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
|
||||||
|
|
||||||
|
def forward(self, x, x_len):
|
||||||
|
# x, shape [B, T, D]
|
||||||
|
fw_x = self.fw_bn(self.fw_fc(x))
|
||||||
|
bw_x = self.bw_bn(self.bw_fc(x))
|
||||||
|
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
|
||||||
|
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
|
||||||
|
x = paddle.concat([fw_x, bw_x], axis=-1)
|
||||||
|
return x, x_len
|
||||||
|
|
||||||
|
|
||||||
|
class RNNStack(nn.Layer):
|
||||||
|
"""RNN group with stacked bidirectional simple RNN or GRU layers.
|
||||||
|
|
||||||
|
:param input: Input layer.
|
||||||
|
:type input: Variable
|
||||||
|
:param size: Dimension of RNN cells in each layer.
|
||||||
|
:type size: int
|
||||||
|
:param num_stacks: Number of stacked rnn layers.
|
||||||
|
:type num_stacks: int
|
||||||
|
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
||||||
|
:type use_gru: bool
|
||||||
|
:param share_rnn_weights: Whether to share input-hidden weights between
|
||||||
|
forward and backward directional RNNs.
|
||||||
|
It is only available when use_gru=False.
|
||||||
|
:type share_weights: bool
|
||||||
|
:return: Output layer of the RNN group.
|
||||||
|
:rtype: Variable
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
|
||||||
|
super().__init__()
|
||||||
|
self.rnn_stacks = nn.LayerList()
|
||||||
|
for i in range(num_stacks):
|
||||||
|
if use_gru:
|
||||||
|
#default:GRU using tanh
|
||||||
|
self.rnn_stacks.append(
|
||||||
|
BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
|
||||||
|
else:
|
||||||
|
self.rnn_stacks.append(
|
||||||
|
BiRNNWithBN(
|
||||||
|
i_size=i_size,
|
||||||
|
h_size=h_size,
|
||||||
|
share_weights=share_rnn_weights))
|
||||||
|
i_size = h_size * 2
|
||||||
|
|
||||||
|
def forward(self, x, x_len):
|
||||||
|
"""
|
||||||
|
x: shape [B, T, D]
|
||||||
|
x_len: shpae [B]
|
||||||
|
"""
|
||||||
|
for i, rnn in enumerate(self.rnn_stacks):
|
||||||
|
x, x_len = rnn(x, x_len)
|
||||||
|
masks = sequence_mask(x_len) #[B, T]
|
||||||
|
masks = masks.unsqueeze(-1) # [B, T, 1]
|
||||||
|
x = x.multiply(masks)
|
||||||
|
return x, x_len
|
@ -0,0 +1,73 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from paddle.fluid.dygraph import base as imperative_base
|
||||||
|
from paddle.fluid import layers
|
||||||
|
from paddle.fluid import core
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MyClipGradByGlobalNorm(paddle.nn.ClipGradByGlobalNorm):
|
||||||
|
def __init__(self, clip_norm):
|
||||||
|
super().__init__(clip_norm)
|
||||||
|
|
||||||
|
@imperative_base.no_grad
|
||||||
|
def _dygraph_clip(self, params_grads):
|
||||||
|
params_and_grads = []
|
||||||
|
sum_square_list = []
|
||||||
|
for p, g in params_grads:
|
||||||
|
if g is None:
|
||||||
|
continue
|
||||||
|
if getattr(p, 'need_clip', True) is False:
|
||||||
|
continue
|
||||||
|
merge_grad = g
|
||||||
|
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
|
||||||
|
merge_grad = layers.merge_selected_rows(g)
|
||||||
|
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
|
||||||
|
square = layers.square(merge_grad)
|
||||||
|
sum_square = layers.reduce_sum(square)
|
||||||
|
logger.info(
|
||||||
|
f"Grad Before Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(merge_grad))) ) }"
|
||||||
|
)
|
||||||
|
sum_square_list.append(sum_square)
|
||||||
|
|
||||||
|
# all parameters have been filterd out
|
||||||
|
if len(sum_square_list) == 0:
|
||||||
|
return params_grads
|
||||||
|
|
||||||
|
global_norm_var = layers.concat(sum_square_list)
|
||||||
|
global_norm_var = layers.reduce_sum(global_norm_var)
|
||||||
|
global_norm_var = layers.sqrt(global_norm_var)
|
||||||
|
logger.info(f"Grad Global Norm: {float(global_norm_var)}!!!!")
|
||||||
|
max_global_norm = layers.fill_constant(
|
||||||
|
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
|
||||||
|
clip_var = layers.elementwise_div(
|
||||||
|
x=max_global_norm,
|
||||||
|
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
|
||||||
|
for p, g in params_grads:
|
||||||
|
if g is None:
|
||||||
|
continue
|
||||||
|
if getattr(p, 'need_clip', True) is False:
|
||||||
|
params_and_grads.append((p, g))
|
||||||
|
continue
|
||||||
|
new_grad = layers.elementwise_mul(x=g, y=clip_var)
|
||||||
|
logger.info(
|
||||||
|
f"Grad After Clip: {p.name}: {float(layers.sqrt(layers.reduce_sum(layers.square(merge_grad))) ) }"
|
||||||
|
)
|
||||||
|
params_and_grads.append((p, new_grad))
|
||||||
|
|
||||||
|
return params_and_grads
|
@ -0,0 +1,65 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import functional as F
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
__all__ = ['CTCLoss']
|
||||||
|
|
||||||
|
|
||||||
|
def ctc_loss(logits,
|
||||||
|
labels,
|
||||||
|
input_lengths,
|
||||||
|
label_lengths,
|
||||||
|
blank=0,
|
||||||
|
reduction='mean',
|
||||||
|
norm_by_times=True):
|
||||||
|
#logger.info("my ctc loss with norm by times")
|
||||||
|
## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
|
||||||
|
loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
|
||||||
|
input_lengths, label_lengths)
|
||||||
|
|
||||||
|
loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
|
||||||
|
logger.info(f"warpctc loss: {loss_out}/{loss_out.shape} ")
|
||||||
|
assert reduction in ['mean', 'sum', 'none']
|
||||||
|
if reduction == 'mean':
|
||||||
|
loss_out = paddle.mean(loss_out / label_lengths)
|
||||||
|
elif reduction == 'sum':
|
||||||
|
loss_out = paddle.sum(loss_out)
|
||||||
|
logger.info(f"ctc loss: {loss_out}")
|
||||||
|
return loss_out
|
||||||
|
|
||||||
|
|
||||||
|
F.ctc_loss = ctc_loss
|
||||||
|
|
||||||
|
|
||||||
|
class CTCLoss(nn.Layer):
|
||||||
|
def __init__(self, blank_id):
|
||||||
|
super().__init__()
|
||||||
|
# last token id as blank id
|
||||||
|
self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')
|
||||||
|
|
||||||
|
def forward(self, logits, text, logits_len, text_len):
|
||||||
|
# warp-ctc do softmax on activations
|
||||||
|
# warp-ctc need activation with shape [T, B, V + 1]
|
||||||
|
logits = logits.transpose([1, 0, 2])
|
||||||
|
|
||||||
|
ctc_loss = self.loss(logits, text, logits_len, text_len)
|
||||||
|
return ctc_loss
|
Loading…
Reference in new issue