Fix the spectrogram shape bug: features are now laid out as [T, D] (time frames x feature dimension)

pull/673/head
Haoxin Ma 4 years ago
parent 043127b6fd
commit 82ca0f6549

@ -175,6 +175,7 @@ class AudioFeaturizer(object):
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy."""
# Returns the spectrogram as [T, D]: time frames x frequency bins.
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
@ -190,8 +191,12 @@ class AudioFeaturizer(object):
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.log(specgram[:ind, :] + eps)
specgram = np.log(specgram[:ind, :] + eps)
specgram = np.transpose(specgram) #T,D
return specgram
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
@ -294,6 +299,7 @@ class AudioFeaturizer(object):
ceplifter=22,
useEnergy=True,
winfunc='povey')
mfcc_feat = np.transpose(mfcc_feat)
if delta_delta:
mfcc_feat = self._concat_delta_delta(mfcc_feat)

@ -131,8 +131,8 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file."""
mean, istd = load_cmvn(filepath, filetype='json')
self._mean = np.expand_dims(mean, axis=-1)
self._istd = np.expand_dims(istd, axis=-1)
self._mean = np.expand_dims(mean, axis=0)
self._istd = np.expand_dims(istd, axis=0)
def write_to_file(self, filepath):
"""Write the mean and stddev to the file.

@ -326,10 +326,8 @@ class SpeechCollator():
audio, text = self.process_feature_and_transform(audio, text)
#utt
utts.append(utt)
# audio
# print("---debug---")
# print(audio.shape)
audio=audio.T
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
@ -358,7 +356,7 @@ class SpeechCollator():
self.randomize_feature_parameters(min(audio_lens), n_bins)
for i in range(len(padded_audios)):
if not self._randomize_each_batch:
self.randomize_feature_parameters(n_bins, audio_lens[i])
self.randomize_feature_parameters(audio_lens[i], n_bins)
padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
return utts, padded_audios, audio_lens, padded_texts, text_lens

@ -11,7 +11,7 @@ data:
max_output_input_ratio: .inf
collator:
batch_size: 32 #64 # one gpu
batch_size: 64 # one gpu
randomize_each_batch: False
mean_std_filepath: data/mean_std.json
unit_type: char

Loading…
Cancel
Save