|
|
@ -97,22 +97,22 @@ class DataGenerator(object):
|
|
|
|
self._local_data.tar2info = {}
|
|
|
|
self._local_data.tar2info = {}
|
|
|
|
self._local_data.tar2object = {}
|
|
|
|
self._local_data.tar2object = {}
|
|
|
|
|
|
|
|
|
|
|
|
def process_utterance(self, filename, transcript):
|
|
|
|
def process_utterance(self, audio_file, transcript):
|
|
|
|
"""Load, augment, featurize and normalize for speech data.
|
|
|
|
"""Load, augment, featurize and normalize for speech data.
|
|
|
|
|
|
|
|
|
|
|
|
:param filename: Audio filepath
|
|
|
|
:param audio_file: Filepath or file object of audio file.
|
|
|
|
:type filename: basestring | file
|
|
|
|
:type audio_file: basestring | file
|
|
|
|
:param transcript: Transcription text.
|
|
|
|
:param transcript: Transcription text.
|
|
|
|
:type transcript: basestring
|
|
|
|
:type transcript: basestring
|
|
|
|
:return: Tuple of audio feature tensor and data of transcription part,
|
|
|
|
:return: Tuple of audio feature tensor and data of transcription part,
|
|
|
|
where transcription part could be token ids or text.
|
|
|
|
where transcription part could be token ids or text.
|
|
|
|
:rtype: tuple of (2darray, list)
|
|
|
|
:rtype: tuple of (2darray, list)
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
if filename.startswith('tar:'):
|
|
|
|
if isinstance(audio_file, basestring) and audio_file.startswith('tar:'):
|
|
|
|
speech_segment = SpeechSegment.from_file(
|
|
|
|
speech_segment = SpeechSegment.from_file(
|
|
|
|
self._subfile_from_tar(filename), transcript)
|
|
|
|
self._subfile_from_tar(audio_file), transcript)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
speech_segment = SpeechSegment.from_file(filename, transcript)
|
|
|
|
speech_segment = SpeechSegment.from_file(audio_file, transcript)
|
|
|
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
speech_segment, self._keep_transcription_text)
|
|
|
|
speech_segment, self._keep_transcription_text)
|
|
|
|