|
|
@ -154,8 +154,8 @@ class SpeechCollator():
|
|
|
|
random_seed (int, optional): for random generator. Defaults to 0.
|
|
|
|
random_seed (int, optional): for random generator. Defaults to 0.
|
|
|
|
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
|
|
|
|
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
|
|
|
|
if ``keep_transcription_text`` is False, text is token ids else is raw string.
|
|
|
|
if ``keep_transcription_text`` is False, text is token ids else is raw string.
|
|
|
|
|
|
|
|
|
|
|
|
Do augmentations
|
|
|
|
Do augmentations
|
|
|
|
Padding audio features with zeros to make them have the same shape (or
|
|
|
|
Padding audio features with zeros to make them have the same shape (or
|
|
|
|
a user-defined shape) within one batch.
|
|
|
|
a user-defined shape) within one batch.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
@ -271,7 +271,7 @@ class SpeechCollator():
|
|
|
|
utts.append(utt)
|
|
|
|
utts.append(utt)
|
|
|
|
# audio
|
|
|
|
# audio
|
|
|
|
audios.append(audio) # [T, D]
|
|
|
|
audios.append(audio) # [T, D]
|
|
|
|
audio_lens.append(audio.shape[1])
|
|
|
|
audio_lens.append(audio.shape[0])
|
|
|
|
# text
|
|
|
|
# text
|
|
|
|
# for training, text is token ids
|
|
|
|
# for training, text is token ids
|
|
|
|
# else text is string, convert to unicode ord
|
|
|
|
# else text is string, convert to unicode ord
|
|
|
|