diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 6e14e0d6..ef769fbc 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor): audio = audio[:, 0] # pcm16 -> pcm 32 audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) + audio = librosa.resample( + audio, + orig_sr=audio_sample_rate, + target_sr=self.sample_rate) audio_sample_rate = self.sample_rate # pcm32 -> pcm 16 audio = self._pcm32to16(audio) diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 226885f3..9e41b824 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -90,7 +90,8 @@ class SpeedPerturbation(): # Note1: resample requires the sampling-rate of input and output, # but actually only the ratio is used. - y = librosa.resample(x, ratio, 1, res_type=self.res_type) + y = librosa.resample( + x, orig_sr=ratio, target_sr=1, res_type=self.res_type) if self.keep_length: diff = abs(len(x) - len(y)) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index a6346c34..988fd627 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -38,7 +38,7 @@ def stft(x, x = np.stack( [ librosa.stft( - x[:, ch], + y=x[:, ch], n_fft=n_fft, hop_length=n_shift, win_length=win_length, @@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): x = np.stack( [ librosa.istft( - x[:, ch].T, # [Time, Freq] -> [Freq, Time] + y=x[:, ch].T, # [Time, Freq] -> [Freq, Time] hop_length=n_shift, win_length=win_length, window=window, @@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft, # spc: (Time, Channel, Freq) or (Time, Freq) spc = np.abs(x_stft) # mel_basis: (Mel_freq, Freq) - mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) + mel_basis = librosa.filters.mel( + sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py index 2d6bbe34..1ab0419e 100644 --- a/paddlespeech/vector/exps/ge2e/audio_processor.py +++ b/paddlespeech/vector/exps/ge2e/audio_processor.py @@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int, partial_utterance_n_frames : int the number of mel spectrogram frames in each partial utterance. - min_pad_coverage : int + min_pad_coverage : int when reaching the last partial utterance, it may or may not have enough frames. If at least of are present, then the last partial utterance will be considered, as if we padded the audio. Otherwise, @@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int, by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. Returns ---------- - the waveform slices and mel spectrogram slices as lists of array slices. + the waveform slices and mel spectrogram slices as lists of array slices. Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. """ assert 0 <= overlap < 1 @@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object): # Resample if numpy.array is passed and sr does not match if source_sr is not None and source_sr != self.sampling_rate: - wav = librosa.resample(wav, source_sr, self.sampling_rate) + wav = librosa.resample( + wav, orig_sr=source_sr, target_sr=self.sampling_rate) # loudness normalization wav = normalize_volume( @@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object): def melspectrogram(self, wav): mel = librosa.feature.melspectrogram( - wav, + y=wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length,