fix resample

pull/1432/head
huangyuxin 3 years ago
parent e0280ff949
commit 9a55783aa0

@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio = audio[:, 0] audio = audio[:, 0]
# pcm16 -> pcm 32 # pcm16 -> pcm 32
audio = self._pcm16to32(audio) audio = self._pcm16to32(audio)
audio = librosa.resample(audio, audio_sample_rate, audio = librosa.resample(
self.sample_rate) audio,
orig_sr=audio_sample_rate,
target_sr=self.sample_rate)
audio_sample_rate = self.sample_rate audio_sample_rate = self.sample_rate
# pcm32 -> pcm 16 # pcm32 -> pcm 16
audio = self._pcm32to16(audio) audio = self._pcm32to16(audio)

@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output, # Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used. # but actually only the ratio is used.
y = librosa.resample(x, ratio, 1, res_type=self.res_type) y = librosa.resample(
x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
if self.keep_length: if self.keep_length:
diff = abs(len(x) - len(y)) diff = abs(len(x) - len(y))

@ -38,7 +38,7 @@ def stft(x,
x = np.stack( x = np.stack(
[ [
librosa.stft( librosa.stft(
x[:, ch], y=x[:, ch],
n_fft=n_fft, n_fft=n_fft,
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x = np.stack( x = np.stack(
[ [
librosa.istft( librosa.istft(
x[:, ch].T, # [Time, Freq] -> [Freq, Time] y=x[:, ch].T, # [Time, Freq] -> [Freq, Time]
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
window=window, window=window,
@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq) # spc: (Time, Channel, Freq) or (Time, Freq)
spc = np.abs(x_stft) spc = np.abs(x_stft)
# mel_basis: (Mel_freq, Freq) # mel_basis: (Mel_freq, Freq)
mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) mel_basis = librosa.filters.mel(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance. the number of mel spectrogram frames in each partial utterance.
min_pad_coverage : int min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames. when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise, then the last partial utterance will be considered, as if we padded the audio. Otherwise,
@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns Returns
---------- ----------
the waveform slices and mel spectrogram slices as lists of array slices. the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances. Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
""" """
assert 0 <= overlap < 1 assert 0 <= overlap < 1
@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match # Resample if numpy.array is passed and sr does not match
if source_sr is not None and source_sr != self.sampling_rate: if source_sr is not None and source_sr != self.sampling_rate:
wav = librosa.resample(wav, source_sr, self.sampling_rate) wav = librosa.resample(
wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization # loudness normalization
wav = normalize_volume( wav = normalize_volume(
@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def melspectrogram(self, wav): def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram( mel = librosa.feature.melspectrogram(
wav, y=wav,
sr=self.sampling_rate, sr=self.sampling_rate,
n_fft=self.n_fft, n_fft=self.n_fft,
hop_length=self.hop_length, hop_length=self.hop_length,

Loading…
Cancel
Save