@ -271,7 +271,7 @@ class SpeechCollator():
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[1])
audio_lens.append(audio.shape[0])
# text
# During training, text is already a sequence of token ids;
# otherwise text is a string and is converted to Unicode code points (ord).