PaddleSpeech/audio/paddleaudio/datasets/ravdess.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple

from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset

__all__ = ['RAVDESS']


class RAVDESS(AudioClassificationDataset):
    """
    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
    lexically-matched statements in a neutral North American accent. Speech emotions
    includes calm, happy, sad, angry, fearful, surprise, and disgust expressions.
    Each expression is produced at two levels of emotional intensity (normal, strong),
    with an additional neutral expression.

    Reference:
        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
        A dynamic, multimodal set of facial and vocal expressions in North American English
        https://doi.org/10.1371/journal.pone.0196391
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
            'md5':
            '5411230427d67a21e18aa4d466e6d1b9',
        },
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
            'md5':
            'bc696df654c87fed845eb13823edef8a',
        },
    ]
    label_list = [
        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
        'surprised'
    ]
    meta_info = collections.namedtuple(
        'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repitition', 'actor'))
    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Ags:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specify the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extrace of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(RAVDESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        ret = []
        for file in files:
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('-')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        if not os.path.isdir(self.speech_path) and not os.path.isdir(
                self.song_path):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(self.speech_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        for root, _, files in os.walk(self.song_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            _, _, emotion, _, _, _, _ = sample
            target = int(emotion) - 1
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(wav_files[idx])
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels