Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into rename_tacotron2

4 years ago · 30085ac229
parent 25347bb6a3 70ebbfd89f
commit 30085ac229
4 changed files with 15 additions and 10 deletions
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
                    audio = audio[:, 0]
                # pcm16 -> pcm 32
                audio = self._pcm16to32(audio)
-                audio = librosa.resample(audio, audio_sample_rate,
-                                         self.sample_rate)
+                audio = librosa.resample(
+                    audio,
+                    orig_sr=audio_sample_rate,
+                    target_sr=self.sample_rate)
                audio_sample_rate = self.sample_rate
                # pcm32 -> pcm 16
                audio = self._pcm32to16(audio)
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -90,7 +90,8 @@ class SpeedPerturbation():

        # Note1: resample requires the sampling-rate of input and output,
        #        but actually only the ratio is used.
-        y = librosa.resample(x, ratio, 1, res_type=self.res_type)
+        y = librosa.resample(
+            x, orig_sr=ratio, target_sr=1, res_type=self.res_type)

        if self.keep_length:
            diff = abs(len(x) - len(y))
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -38,7 +38,7 @@ def stft(x,
    x = np.stack(
        [
            librosa.stft(
-                x[:, ch],
+                y=x[:, ch],
                n_fft=n_fft,
                hop_length=n_shift,
                win_length=win_length,
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
    x = np.stack(
        [
            librosa.istft(
-                x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                y=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                hop_length=n_shift,
                win_length=win_length,
                window=window,
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
    # spc: (Time, Channel, Freq) or (Time, Freq)
    spc = np.abs(x_stft)
    # mel_basis: (Mel_freq, Freq)
-    mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
+    mel_basis = librosa.filters.mel(
+        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
    lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

--- a/paddlespeech/vector/exps/ge2e/audio_processor.py
+++ b/paddlespeech/vector/exps/ge2e/audio_processor.py
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):

        # Resample if numpy.array is passed and sr does not match
        if source_sr is not None and source_sr != self.sampling_rate:
-            wav = librosa.resample(wav, source_sr, self.sampling_rate)
+            wav = librosa.resample(
+                wav, orig_sr=source_sr, target_sr=self.sampling_rate)

        # loudness normalization
        wav = normalize_volume(
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):

    def melspectrogram(self, wav):
        mel = librosa.feature.melspectrogram(
-            wav,
+            y=wav,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,