Merge pull request #1432 from Jackwaterveg/fix

[Bug fix] fix resample
pull/1439/head
TianYuan 4 years ago committed by GitHub
commit 70ebbfd89f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio = audio[:, 0] audio = audio[:, 0]
# pcm16 -> pcm 32 # pcm16 -> pcm 32
audio = self._pcm16to32(audio) audio = self._pcm16to32(audio)
audio = librosa.resample(audio, audio_sample_rate, audio = librosa.resample(
self.sample_rate) audio,
orig_sr=audio_sample_rate,
target_sr=self.sample_rate)
audio_sample_rate = self.sample_rate audio_sample_rate = self.sample_rate
# pcm32 -> pcm 16 # pcm32 -> pcm 16
audio = self._pcm32to16(audio) audio = self._pcm32to16(audio)

@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output, # Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used. # but actually only the ratio is used.
y = librosa.resample(x, ratio, 1, res_type=self.res_type) y = librosa.resample(
x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
if self.keep_length: if self.keep_length:
diff = abs(len(x) - len(y)) diff = abs(len(x) - len(y))

@ -38,7 +38,7 @@ def stft(x,
x = np.stack( x = np.stack(
[ [
librosa.stft( librosa.stft(
x[:, ch], y=x[:, ch],
n_fft=n_fft, n_fft=n_fft,
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x = np.stack( x = np.stack(
[ [
librosa.istft( librosa.istft(
x[:, ch].T, # [Time, Freq] -> [Freq, Time] y=x[:, ch].T, # [Time, Freq] -> [Freq, Time]
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
window=window, window=window,
@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq) # spc: (Time, Channel, Freq) or (Time, Freq)
spc = np.abs(x_stft) spc = np.abs(x_stft)
# mel_basis: (Mel_freq, Freq) # mel_basis: (Mel_freq, Freq)
mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) mel_basis = librosa.filters.mel(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance. the number of mel spectrogram frames in each partial utterance.
min_pad_coverage : int min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames. when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise, then the last partial utterance will be considered, as if we padded the audio. Otherwise,
@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns Returns
---------- ----------
the waveform slices and mel spectrogram slices as lists of array slices. the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances. Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
""" """
assert 0 <= overlap < 1 assert 0 <= overlap < 1
@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match # Resample if numpy.array is passed and sr does not match
if source_sr is not None and source_sr != self.sampling_rate: if source_sr is not None and source_sr != self.sampling_rate:
wav = librosa.resample(wav, source_sr, self.sampling_rate) wav = librosa.resample(
wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization # loudness normalization
wav = normalize_volume( wav = normalize_volume(
@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def melspectrogram(self, wav): def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram( mel = librosa.feature.melspectrogram(
wav, y=wav,
sr=self.sampling_rate, sr=self.sampling_rate,
n_fft=self.n_fft, n_fft=self.n_fft,
hop_length=self.hop_length, hop_length=self.hop_length,

Loading…
Cancel
Save