Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into rename_tacotron2

pull/1436/head
TianYuan 3 years ago
commit 30085ac229

@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio = audio[:, 0]
# pcm16 -> pcm 32
audio = self._pcm16to32(audio)
audio = librosa.resample(audio, audio_sample_rate,
self.sample_rate)
audio = librosa.resample(
audio,
orig_sr=audio_sample_rate,
target_sr=self.sample_rate)
audio_sample_rate = self.sample_rate
# pcm32 -> pcm 16
audio = self._pcm32to16(audio)

@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used.
y = librosa.resample(x, ratio, 1, res_type=self.res_type)
y = librosa.resample(
x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
if self.keep_length:
diff = abs(len(x) - len(y))

@ -38,7 +38,7 @@ def stft(x,
x = np.stack(
[
librosa.stft(
x[:, ch],
y=x[:, ch],
n_fft=n_fft,
hop_length=n_shift,
win_length=win_length,
@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x = np.stack(
[
librosa.istft(
x[:, ch].T, # [Time, Freq] -> [Freq, Time]
y=x[:, ch].T, # [Time, Freq] -> [Freq, Time]
hop_length=n_shift,
win_length=win_length,
window=window,
@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq)
spc = np.abs(x_stft)
# mel_basis: (Mel_freq, Freq)
mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
mel_basis = librosa.filters.mel(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance.
min_pad_coverage : int
min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns
----------
the waveform slices and mel spectrogram slices as lists of array slices.
the waveform slices and mel spectrogram slices as lists of array slices.
Index the waveform and the mel spectrogram, respectively, with these slices to obtain the partial utterances.
"""
assert 0 <= overlap < 1
@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match
if source_sr is not None and source_sr != self.sampling_rate:
wav = librosa.resample(wav, source_sr, self.sampling_rate)
wav = librosa.resample(
wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization
wav = normalize_volume(
@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram(
wav,
y=wav,
sr=self.sampling_rate,
n_fft=self.n_fft,
hop_length=self.hop_length,

Loading…
Cancel
Save