PaddleSpeech/paddlespeech/t2s/datasets/vocoder_batch_fn.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle

from paddlespeech.t2s.audio.codec import encode_mu_law
from paddlespeech.t2s.audio.codec import float_2_label
from paddlespeech.t2s.audio.codec import label_2_float


class Clip(object):
    """Collate functor for training vocoders.
    """

    def __init__(
            self,
            batch_max_steps=20480,
            hop_size=256,
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.
        Args:

            batch_max_steps (int): The maximum length of input signal in batch.
            hop_size (int): Hop size of auxiliary features.
            aux_context_window (int): Context window size for auxiliary feature conv.

        """
        if batch_max_steps % hop_size != 0:
            batch_max_steps += -(batch_max_steps % hop_size)
        assert batch_max_steps % hop_size == 0
        self.batch_max_steps = batch_max_steps
        self.batch_max_frames = batch_max_steps // hop_size
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window

        # set useful values in random cutting
        self.start_offset = aux_context_window
        self.end_offset = -(self.batch_max_frames + aux_context_window)
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window

    def __call__(self, batch):
        """Convert into batch tensors.

        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).

        Returns:
            Tensor:
                Target signal batch (B, 1, T).
            Tensor:
                Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
        """
        # check length
        batch = [
            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
        xs, cs = [b[0] for b in batch], [b[1] for b in batch]

        # make batch with random cut
        c_lengths = [c.shape[0] for c in cs]
        start_frames = np.array([
            np.random.randint(self.start_offset, cl + self.end_offset)
            for cl in c_lengths
        ])
        x_starts = start_frames * self.hop_size
        x_ends = x_starts + self.batch_max_steps

        c_starts = start_frames - self.aux_context_window
        c_ends = start_frames + self.batch_max_frames + self.aux_context_window
        y_batch = np.stack(
            [x[start:end] for x, start, end in zip(xs, x_starts, x_ends)])
        c_batch = np.stack(
            [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])

        # convert each batch to tensor, assume that each item in batch has the same length
        y_batch = paddle.to_tensor(
            y_batch, dtype=paddle.float32).unsqueeze(1)  # (B, 1, T)
        c_batch = paddle.to_tensor(
            c_batch, dtype=paddle.float32).transpose([0, 2, 1])  # (B, C, T')

        return y_batch, c_batch

    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.

        Note:
            Basically we assume that the length of x and c are adjusted
            through preprocessing stage, but if we use other library processed
            features, this process will be needed.

        """
        if len(x) < c.shape[0] * self.hop_size:
            x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
        elif len(x) > c.shape[0] * self.hop_size:
            x = x[:c.shape[0] * self.hop_size]
        # check the legnth is valid
        assert len(x) == c.shape[
            0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"

        return x, c


class WaveRNNClip(Clip):
    def __init__(self,
                 mode: str='RAW',
                 batch_max_steps: int=4500,
                 hop_size: int=300,
                 aux_context_window: int=2,
                 bits: int=9,
                 mu_law: bool=True):
        self.mode = mode
        self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
        self.batch_max_steps = batch_max_steps
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window
        self.mu_law = mu_law
        self.batch_max_frames = batch_max_steps // hop_size
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
        if self.mode == 'MOL':
            self.bits = 16
        else:
            self.bits = bits

    def to_quant(self, wav):
        if self.mode == 'RAW':
            if self.mu_law:
                quant = encode_mu_law(wav, mu=2**self.bits)
            else:
                quant = float_2_label(wav, bits=self.bits)
        elif self.mode == 'MOL':
            quant = float_2_label(wav, bits=16)
        quant = quant.astype(np.int64)
        return quant

    def __call__(self, batch):
        # voc_pad = 2  this will pad the input so that the resnet can 'see' wider than input length
        # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
        """Convert into batch tensors.
        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).

        Returns:
            Tensor: Input signal batch (B, 1, T).
            Tensor: Target signal batch (B, 1, T).
            Tensor: Auxiliary feature batch (B, C, T'), 
                where T = (T' - 2 * aux_context_window) * hop_size.

        """
        # check length
        batch = [
            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
        wav, mel = [b[0] for b in batch], [b[1] for b in batch]
        # mel 此处需要转置
        mel = [x.T for x in mel]
        max_offsets = [
            x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
            for x in mel
        ]
        # the slice point of mel selecting randomly 
        mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
        # the slice point of wav selecting randomly, which is behind 2(=pad) frames 
        sig_offsets = [(offset + self.aux_context_window) * self.hop_size
                       for offset in mel_offsets]
        # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad
        mels = [
            x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
            for i, x in enumerate(mel)
        ]
        # label.shape[1] = voc_seq_len + 1
        wav = [self.to_quant(x) for x in wav]

        labels = [
            x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
            for i, x in enumerate(wav)
        ]

        mels = np.stack(mels).astype(np.float32)
        labels = np.stack(labels).astype(np.int64)

        mels = paddle.to_tensor(mels)
        labels = paddle.to_tensor(labels, dtype='int64')
        # x is input, y is label
        x = labels[:, :self.batch_max_steps]
        y = labels[:, 1:]
        '''
        mode = RAW:
            mu_law = True:
                quant: bits = 9   0, 1, 2, ..., 509, 510, 511  int
            mu_law = False
                quant bits = 9    [0， 511]  float
        mode = MOL:
            quant: bits = 16  [0. 65536]  float
        '''
        # x should be normalizes in.[0, 1] in RAW mode
        x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
        # y should be normalizes in.[0, 1] in MOL mode
        if self.mode == 'MOL':
            y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)

        return x, y, mels


# for paddleslim


class Clip_static(Clip):
    """Collate functor for training vocoders.
    """

    def __call__(self, batch):
        """Convert into batch tensors.

        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).

        Returns: 
            Dict[str, np.array]:
                Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
        """
        # check length
        batch = [
            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
        xs, cs = [b[0] for b in batch], [b[1] for b in batch]

        # make batch with random cut
        c_lengths = [c.shape[0] for c in cs]
        start_frames = np.array([
            np.random.randint(self.start_offset, cl + self.end_offset)
            for cl in c_lengths
        ])

        c_starts = start_frames - self.aux_context_window
        c_ends = start_frames + self.batch_max_frames + self.aux_context_window
        c_batch = np.stack(
            [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
        # infer axis (T',C) is different with train axis (B, C, T')
        # c_batch = c_batch.transpose([0, 2, 1])  # (B, C, T')
        # do not need batch axis in infer
        c_batch = c_batch[0]
        batch = {"logmel": c_batch}
        return batch