|
|
@ -174,7 +174,6 @@ class SpeechCollator():
|
|
|
|
self._stride_ms = stride_ms
|
|
|
|
self._stride_ms = stride_ms
|
|
|
|
self._target_sample_rate = target_sample_rate
|
|
|
|
self._target_sample_rate = target_sample_rate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self._speech_featurizer = SpeechFeaturizer(
|
|
|
|
self._speech_featurizer = SpeechFeaturizer(
|
|
|
|
unit_type=unit_type,
|
|
|
|
unit_type=unit_type,
|
|
|
|
vocab_filepath=vocab_filepath,
|
|
|
|
vocab_filepath=vocab_filepath,
|
|
|
@ -231,7 +230,8 @@ class SpeechCollator():
|
|
|
|
self._augmentation_pipeline.randomize_parameters_audio_transform()
|
|
|
|
self._augmentation_pipeline.randomize_parameters_audio_transform()
|
|
|
|
|
|
|
|
|
|
|
|
def randomize_feature_parameters(self, n_frames, n_bins):
|
|
|
|
def randomize_feature_parameters(self, n_frames, n_bins):
|
|
|
|
self._augmentation_pipeline.randomize_parameters_feature_transform(n_frames, n_bins)
|
|
|
|
self._augmentation_pipeline.randomize_parameters_feature_transform(
|
|
|
|
|
|
|
|
n_frames, n_bins)
|
|
|
|
|
|
|
|
|
|
|
|
def process_feature_and_transform(self, audio_file, transcript):
|
|
|
|
def process_feature_and_transform(self, audio_file, transcript):
|
|
|
|
"""Load, augment, featurize and normalize for speech data.
|
|
|
|
"""Load, augment, featurize and normalize for speech data.
|
|
|
@ -261,7 +261,6 @@ class SpeechCollator():
|
|
|
|
|
|
|
|
|
|
|
|
return specgram, transcript_part
|
|
|
|
return specgram, transcript_part
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def process_utterance(self, audio_file, transcript, single=True):
|
|
|
|
# def process_utterance(self, audio_file, transcript, single=True):
|
|
|
|
# """Load, augment, featurize and normalize for speech data.
|
|
|
|
# """Load, augment, featurize and normalize for speech data.
|
|
|
|
|
|
|
|
|
|
|
@ -282,7 +281,6 @@ class SpeechCollator():
|
|
|
|
# # audio augment
|
|
|
|
# # audio augment
|
|
|
|
# self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
# self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # Spectrum transform
|
|
|
|
# # Spectrum transform
|
|
|
|
# specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
# specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
# speech_segment, self._keep_transcription_text)
|
|
|
|
# speech_segment, self._keep_transcription_text)
|
|
|
@ -357,7 +355,9 @@ class SpeechCollator():
|
|
|
|
for i in range(len(padded_audios)):
|
|
|
|
for i in range(len(padded_audios)):
|
|
|
|
if not self._randomize_each_batch:
|
|
|
|
if not self._randomize_each_batch:
|
|
|
|
self.randomize_feature_parameters(audio_lens[i], n_bins)
|
|
|
|
self.randomize_feature_parameters(audio_lens[i], n_bins)
|
|
|
|
padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
|
|
|
|
padded_audios[
|
|
|
|
|
|
|
|
i] = self._augmentation_pipeline.apply_feature_transform(
|
|
|
|
|
|
|
|
padded_audios[i])
|
|
|
|
|
|
|
|
|
|
|
|
return utts, padded_audios, audio_lens, padded_texts, text_lens
|
|
|
|
return utts, padded_audios, audio_lens, padded_texts, text_lens
|
|
|
|
|
|
|
|
|
|
|
|