diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index f209d305d..b537e7335 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -175,6 +175,7 @@ class AudioFeaturizer(object): max_freq=None, eps=1e-14): """Compute the linear spectrogram from FFT energy.""" + # return T,D if max_freq is None: max_freq = sample_rate / 2 if max_freq > sample_rate / 2: @@ -190,8 +191,12 @@ class AudioFeaturizer(object): window_size=window_size, stride_size=stride_size, sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 - return np.log(specgram[:ind, :] + eps) + specgram = np.log(specgram[:ind, :] + eps) + + specgram = np.transpose(specgram) #T,D + return specgram def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" @@ -294,6 +299,7 @@ class AudioFeaturizer(object): ceplifter=22, useEnergy=True, winfunc='povey') + mfcc_feat = np.transpose(mfcc_feat) if delta_delta: mfcc_feat = self._concat_delta_delta(mfcc_feat) diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 287b51e58..0bf24edd1 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -131,8 +131,8 @@ class FeatureNormalizer(object): def _read_mean_std_from_file(self, filepath, eps=1e-20): """Load mean and std from file.""" mean, istd = load_cmvn(filepath, filetype='json') - self._mean = np.expand_dims(mean, axis=-1) - self._istd = np.expand_dims(istd, axis=-1) + self._mean = np.expand_dims(mean, axis=0) + self._istd = np.expand_dims(istd, axis=0) def write_to_file(self, filepath): """Write the mean and stddev to the file. diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index f105acc06..514dc2cc3 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -326,10 +326,8 @@ class SpeechCollator(): audio, text = self.process_feature_and_transform(audio, text) #utt utts.append(utt) - # audio # print("---debug---") # print(audio.shape) - audio=audio.T audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) # text @@ -358,7 +356,7 @@ class SpeechCollator(): self.randomize_feature_parameters(min(audio_lens), n_bins) for i in range(len(padded_audios)): if not self._randomize_each_batch: - self.randomize_feature_parameters(n_bins, audio_lens[i]) + self.randomize_feature_parameters(audio_lens[i], n_bins) padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i]) return utts, padded_audios, audio_lens, padded_texts, text_lens diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 1fe21a406..e7a5c6dcf 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: .inf collator: - batch_size: 32 #64 # one gpu + batch_size: 64 # one gpu randomize_each_batch: False mean_std_filepath: data/mean_std.json unit_type: char