From 9a55783aa0821a454b14feb3fa17b6bd5c8d9d44 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Wed, 9 Feb 2022 09:47:37 +0000
Subject: [PATCH] fix resample

---
 paddlespeech/cli/asr/infer.py                    | 6 ++++--
 paddlespeech/s2t/transform/perturb.py            | 3 ++-
 paddlespeech/s2t/transform/spectrogram.py        | 7 ++++---
 paddlespeech/vector/exps/ge2e/audio_processor.py | 9 +++++----
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 6e14e0d6..ef769fbc 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
                 audio = audio[:, 0]
             # pcm16 -> pcm 32
             audio = self._pcm16to32(audio)
-            audio = librosa.resample(audio, audio_sample_rate,
-                                     self.sample_rate)
+            audio = librosa.resample(
+                audio,
+                orig_sr=audio_sample_rate,
+                target_sr=self.sample_rate)
             audio_sample_rate = self.sample_rate
             # pcm32 -> pcm 16
             audio = self._pcm32to16(audio)
diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py
index 226885f3..9e41b824 100644
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -90,7 +90,8 @@ class SpeedPerturbation():
 
         # Note1: resample requires the sampling-rate of input and output,
         #        but actually only the ratio is used.
-        y = librosa.resample(x, ratio, 1, res_type=self.res_type)
+        y = librosa.resample(
+            x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
 
         if self.keep_length:
             diff = abs(len(x) - len(y))
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index a6346c34..988fd627 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -38,7 +38,7 @@ def stft(x,
         x = np.stack(
             [
                 librosa.stft(
-                    x[:, ch],
+                    y=x[:, ch],
                     n_fft=n_fft,
                     hop_length=n_shift,
                     win_length=win_length,
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
         x = np.stack(
             [
                 librosa.istft(
-                    x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                    stft_matrix=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                     hop_length=n_shift,
                     win_length=win_length,
                     window=window,
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
     # spc: (Time, Channel, Freq) or (Time, Freq)
     spc = np.abs(x_stft)
     # mel_basis: (Mel_freq, Freq)
-    mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
+    mel_basis = librosa.filters.mel(
+        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
     # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
     lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))
 
diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py
index 2d6bbe34..1ab0419e 100644
--- a/paddlespeech/vector/exps/ge2e/audio_processor.py
+++ b/paddlespeech/vector/exps/ge2e/audio_processor.py
@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
     partial_utterance_n_frames : int
         the number of mel spectrogram frames in each partial utterance.
 
-    min_pad_coverage : int 
+    min_pad_coverage : int
         when reaching the last partial utterance, it may or may not have enough frames. If at least
         <min_pad_coverage> of <partial_utterance_n_frames> are present, then the last partial
         utterance will be considered, as if we padded the audio. Otherwise,
@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
         by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
     Returns
     ----------
-    the waveform slices and mel spectrogram slices as lists of array slices. 
+    the waveform slices and mel spectrogram slices as lists of array slices.
        Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
     """
     assert 0 <= overlap < 1
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
 
         # Resample if numpy.array is passed and sr does not match
         if source_sr is not None and source_sr != self.sampling_rate:
-            wav = librosa.resample(wav, source_sr, self.sampling_rate)
+            wav = librosa.resample(
+                wav, orig_sr=source_sr, target_sr=self.sampling_rate)
 
         # loudness normalization
         wav = normalize_volume(
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
 
     def melspectrogram(self, wav):
         mel = librosa.feature.melspectrogram(
-            wav,
+            y=wav,
             sr=self.sampling_rate,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
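Background for this patch: librosa 0.10 made most parameters keyword-only, e.g. resample's signature is now resample(y, *, orig_sr, target_sr, ...), so the old positional calls removed above raise a TypeError there; the same applies to filters.mel, feature.melspectrogram, and stft. The keyword form introduced by this patch also runs unchanged on librosa 0.8/0.9. A minimal sketch of the pattern outside PaddleSpeech, assuming a hypothetical input file sample.wav and a 16 kHz target rate:

    import librosa

    target_sr = 16000
    # sr=None keeps the file's native sampling rate instead of
    # resampling to librosa's 22050 Hz default during load.
    audio, orig_sr = librosa.load("sample.wav", sr=None, mono=True)

    if orig_sr != target_sr:
        # Keyword arguments are accepted on librosa 0.8/0.9 and are
        # mandatory on librosa >= 0.10, where orig_sr/target_sr
        # became keyword-only.
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)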