|
|
@ -203,34 +203,22 @@ class SpeechCollator():
|
|
|
|
where transcription part could be token ids or text.
|
|
|
|
where transcription part could be token ids or text.
|
|
|
|
:rtype: tuple of (2darray, list)
|
|
|
|
:rtype: tuple of (2darray, list)
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
|
|
|
|
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
|
|
|
|
speech_segment = SpeechSegment.from_file(
|
|
|
|
speech_segment = SpeechSegment.from_file(
|
|
|
|
self._subfile_from_tar(audio_file), transcript)
|
|
|
|
self._subfile_from_tar(audio_file), transcript)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
speech_segment = SpeechSegment.from_file(audio_file, transcript)
|
|
|
|
speech_segment = SpeechSegment.from_file(audio_file, transcript)
|
|
|
|
load_wav_time = time.time() - start_time
|
|
|
|
|
|
|
|
#logger.debug(f"load wav time: {load_wav_time}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# audio augment
|
|
|
|
# audio augment
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
|
|
audio_aug_time = time.time() - start_time
|
|
|
|
|
|
|
|
#logger.debug(f"audio augmentation time: {audio_aug_time}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
specgram, transcript_part = self._speech_featurizer.featurize(
|
|
|
|
speech_segment, self._keep_transcription_text)
|
|
|
|
speech_segment, self._keep_transcription_text)
|
|
|
|
if self._normalizer:
|
|
|
|
if self._normalizer:
|
|
|
|
specgram = self._normalizer.apply(specgram)
|
|
|
|
specgram = self._normalizer.apply(specgram)
|
|
|
|
feature_time = time.time() - start_time
|
|
|
|
|
|
|
|
#logger.debug(f"audio & test feature time: {feature_time}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# specgram augment
|
|
|
|
# specgram augment
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
specgram = self._augmentation_pipeline.transform_feature(specgram)
|
|
|
|
specgram = self._augmentation_pipeline.transform_feature(specgram)
|
|
|
|
feature_aug_time = time.time() - start_time
|
|
|
|
|
|
|
|
#logger.debug(f"audio feature augmentation time: {feature_aug_time}")
|
|
|
|
|
|
|
|
return specgram, transcript_part
|
|
|
|
return specgram, transcript_part
|
|
|
|
|
|
|
|
|
|
|
|
def __call__(self, batch):
|
|
|
|
def __call__(self, batch):
|
|
|
@ -283,16 +271,6 @@ class SpeechCollator():
|
|
|
|
return utts, padded_audios, audio_lens, padded_texts, text_lens
|
|
|
|
return utts, padded_audios, audio_lens, padded_texts, text_lens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @property
|
|
|
|
|
|
|
|
# def text_feature(self):
|
|
|
|
|
|
|
|
# return self._speech_featurizer.text_feature
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @property
|
|
|
|
|
|
|
|
# def stride_ms(self):
|
|
|
|
|
|
|
|
# return self._speech_featurizer.stride_ms
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
###########
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
@property
|
|
|
|
def manifest(self):
|
|
|
|
def manifest(self):
|
|
|
|