PaddleSpeech/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_j...

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert the PaddleSpeech jsonline format data to csv format data in voxceleb experiment.
Currently, the speaker identification training process uses the csv format.
"""
import argparse
import csv
import json
import os
import random

import tqdm
from paddleaudio.backends import soundfile_load as load_audio
from yacs.config import CfgNode

from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks

logger = Log(__name__).getlog()


def prepare_csv(wav_files, output_file, config, split_chunks=True):
    """Prepare the csv file according to the wav files.

    Every element of ``wav_files`` is one jsonline record. The fields read
    here are ``utt`` (utterance name), ``feat`` (wav file path) and
    ``feat_shape`` (its first entry is used as the utterance duration).

    Args:
        wav_files (list): jsonline strings, one per utterance
        output_file (str): the output csv file
        config (CfgNode): yaml configuration content; ``chunk_duration`` is
            read when ``split_chunks`` is True
        split_chunks (bool, optional): audio split flag. When True one row is
            written per chunk returned by ``get_chunks``; otherwise a single
            row spanning the whole waveform is written. Defaults to True.
    """
    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one. exist_ok=True avoids
    # failing when the directory appears between a check and the creation.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    csv_lines = []
    header = ["utt_id", "duration", "wav", "start", "stop", "label"]
    # voxceleb meta info for each training utterance segment.
    # We extract a segment from an utterance to train on, and the segment
    # spans the samples between start and stop in the original wav file.
    # The fields of the meta info are:
    # utt_id: the utterance segment name, unique within the dataset
    # duration: the total utterance duration
    # wav: utterance file path (presumably absolute; depends on the manifest)
    # start: first sample of the segment in the original wav file
    # stop: end sample of the segment in the original wav file
    # label: the utterance segment's label name,
    #        which is the speaker name in the speaker verification domain
    for item in tqdm.tqdm(wav_files, total=len(wav_files)):
        item = json.loads(item.strip())
        # drop the ".wav" text from the utterance name
        audio_id = item['utt'].replace(".wav", "")
        audio_duration = item['feat_shape'][0]
        wav_file = item['feat']
        # the speaker name is the part of the id before the first '-'
        label = audio_id.split('-')[0]
        # the sample rate is needed for chunking, the waveform length for
        # the whole-utterance row
        waveform, sr = load_audio(wav_file)
        if split_chunks:
            uniq_chunks_list = get_chunks(config.chunk_duration, audio_id,
                                          audio_duration)
            for chunk in uniq_chunks_list:
                # the last two '_'-separated fields of the chunk name are the
                # start and end timestamps; multiplying by the sample rate
                # turns them into sample indices
                s, e = chunk.split("_")[-2:]
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                # id, duration, wav, start, stop, label
                csv_lines.append([
                    chunk, audio_duration, wav_file, start_sample, end_sample,
                    label
                ])
        else:
            csv_lines.append([
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], label
            ])

    # The csv module requires newline="" on the file object; otherwise the
    # writer's "\r\n" terminator is translated again on platforms that use
    # "\r\n" line endings.
    with open(output_file, mode="w", newline="") as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        csv_writer.writerows(csv_lines)


def get_enroll_test_list(dataset_list, verification_file):
    """Get the enroll and test utterance list from all the voxceleb1 test utterance dataset.
       Generally, we get the enroll and test utterances from the verification file.
       The verification file format is as follows:
       target/nontarget enroll-utt test-utt,
       we set 0 as nontarget and 1 as target, eg:
       0 a.wav b.wav
       1 a.wav a.wav

    Args:
        dataset_list (list): all the jsonline dataset files to get the test utterances from
        verification_file (str): voxceleb1 trial file

    Returns:
        tuple: ``(enroll_files, test_files)``, two sorted lists holding the raw
            jsonline strings whose ``utt`` appears as an enroll / test
            utterance in the trial file.

    Raises:
        ValueError: if a non-blank trial line does not have exactly three
            whitespace-separated fields.
    """
    logger.info(f"verification file: {verification_file}")
    enroll_audios = set()
    test_audios = set()
    with open(verification_file, 'r') as f:
        for line in f:
            # split() without argument tolerates tabs and repeated spaces
            fields = line.split()
            if not fields:
                # blank lines (e.g. trailing newlines) carry no trial
                continue
            if len(fields) != 3:
                raise ValueError(
                    f"malformed trial line in {verification_file}: {line!r}")
            _, enroll_file, test_file = fields
            # trial entries use '/' between path parts, while the 'utt'
            # field compared below uses '-'
            enroll_audios.add(enroll_file.replace('/', '-'))
            test_audios.add(test_file.replace('/', '-'))

    enroll_files = []
    test_files = []
    for dataset in dataset_list:
        with open(dataset, 'r') as f:
            for line in f:
                if not line.strip():
                    # skip blank lines instead of failing in json.loads
                    continue
                # audio_id may be in enroll and test at the same time
                # eg: 1 a.wav a.wav
                # the audio a.wav is enroll and test file at the same time
                audio_id = json.loads(line.strip())['utt']
                if audio_id in enroll_audios:
                    enroll_files.append(line)
                if audio_id in test_audios:
                    test_files.append(line)

    enroll_files = sorted(enroll_files)
    test_files = sorted(test_files)

    return enroll_files, test_files


def get_train_dev_list(dataset_list, target_dir, split_ratio):
    """Split all training utterances into a train list and a dev list.

       The first ``split_ratio`` share of the shuffled utterances becomes the
       train set and the rest (1 - split_ratio) becomes the dev set.
       As a side effect the sorted speaker names are written, one
       "<name> <index>" pair per line, to <target_dir>/meta/label2id.txt.

    Args:
        dataset_list (list): jsonline dataset files holding all the utterances
        target_dir (str): the target directory; its "meta" sub-directory is
                          created here to store the label2id.txt file
        split_ratio (float): train dataset ratio in all utterance list

    Returns:
        tuple: ``(train_files, dev_files)``, two lists of stripped jsonline strings
    """
    logger.info("start to get train and dev utt list")
    meta_dir = os.path.join(target_dir, "meta")
    if not os.path.exists(meta_dir):
        os.makedirs(meta_dir)

    utterances = []
    speaker_names = set()
    for manifest in dataset_list:
        with open(manifest, 'r') as reader:
            for raw_line in reader:
                record = raw_line.strip()
                # the label is the speaker name stored under 'utt2spk'
                speaker_names.add(json.loads(record)['utt2spk'])
                utterances.append(record)
    ordered_speakers = sorted(speaker_names)
    logger.info(
        f"we get {len(ordered_speakers)} speakers from all the train dataset")

    # the position in the sorted speaker list is used as the label id
    label_path = os.path.join(meta_dir, "label2id.txt")
    with open(label_path, 'w') as writer:
        writer.writelines(f'{name} {index}\n'
                          for index, name in enumerate(ordered_speakers))
    logger.info(f'we store the speakers to {label_path}')

    # everything before the cut point is train data, the rest is dev data
    cut = int(split_ratio * len(utterances))
    # sort before shuffling so the shuffle starts from the same order
    # whatever order the manifests were read in
    utterances.sort()
    random.shuffle(utterances)
    train_files = utterances[:cut]
    dev_files = utterances[cut:]
    logger.info(
        f"we get train utterances: {len(train_files)}, dev utterance: {len(dev_files)}"
    )
    return train_files, dev_files


def prepare_data(args, config):
    """Convert the jsonline manifests to the csv files used by the experiment.

    Writes enroll.csv, test.csv, train.csv and dev.csv under
    <args.target_dir>/csv unless ``config.skip_prep`` is set.

    Args:
        args (argparse.Namespace): script args; ``train``, ``test`` and
            ``target_dir`` are read
        config (CfgNode): yaml configuration content; ``seed``, ``skip_prep``,
            ``verification_file`` and ``split_ratio`` are read
    """
    # stage 0: seed the random module (done even when preparation is skipped)
    random.seed(config.seed)

    # nothing to do when the external config sets the skip_prep flag
    if config.skip_prep:
        return

    # stage 1: enroll and test csv files, written as whole utterances
    logger.info("start to prepare the data csv file")
    enroll_files, test_files = get_enroll_test_list(
        [args.test], verification_file=config.verification_file)
    csv_dir = os.path.join(args.target_dir, "csv")
    for utterances, csv_name in ((enroll_files, "enroll.csv"),
                                 (test_files, "test.csv")):
        prepare_csv(
            utterances,
            os.path.join(csv_dir, csv_name),
            config,
            split_chunks=False)

    # stage 2: train and dev csv files
    #          config.split_ratio is the train share, the rest is the dev set;
    #          these use prepare_csv's default split_chunks setting
    logger.info("start to prepare the data csv file")
    train_files, dev_files = get_train_dev_list(
        args.train, target_dir=args.target_dir, split_ratio=config.split_ratio)
    for utterances, csv_name in ((train_files, "train.csv"),
                                 (dev_files, "dev.csv")):
        prepare_csv(utterances, os.path.join(csv_dir, csv_name), config)


if __name__ == "__main__":
    # command line interface; the module docstring is the description text
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--train",
        nargs='+',
        required=True,
        help="The jsonline files list for train.")
    cli.add_argument(
        "--test",
        required=True,
        help="The jsonline file for test")
    cli.add_argument(
        "--target_dir",
        required=True,
        help="The target directory stores the csv files and meta file.")
    cli.add_argument(
        "--config",
        type=str,
        required=True,
        help="configuration file")
    args = cli.parse_args()

    # load the yaml config file; new_allowed lets the file introduce keys
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)

    # convert the jsonline files to csv files
    prepare_data(args, config)