You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_j...

252 lines
9.6 KiB

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert the PaddleSpeech jsonline format data to csv format data in voxceleb experiment.
Currently, Speaker Identificaton Training process use csv format.
"""
import argparse
import csv
import json
import os
import random
import tqdm
from paddleaudio.backends import soundfile_load as load_audio
from yacs.config import CfgNode
from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks
logger = Log(__name__).getlog()
def prepare_csv(wav_files, output_file, config, split_chunks=True):
"""Prepare the csv file according the wav files
Args:
wav_files (list): all the audio list to prepare the csv file
output_file (str): the output csv file
config (CfgNode): yaml configuration content
split_chunks (bool, optional): audio split flag. Defaults to True.
"""
if not os.path.exists(os.path.dirname(output_file)):
os.makedirs(os.path.dirname(output_file))
csv_lines = []
header = ["utt_id", "duration", "wav", "start", "stop", "label"]
# voxceleb meta info for each training utterance segment
# we extract a segment from a utterance to train
# and the segment' period is between start and stop time point in the original wav file
# each field in the meta info means as follows:
# utt_id: the utterance segment name, which is uniq in training dataset
# duration: the total utterance time
# wav: utterance file path, which should be absoulute path
# start: start point in the original wav file sample point range
# stop: stop point in the original wav file sample point range
# label: the utterance segment's label name,
# which is speaker name in speaker verification domain
for item in tqdm.tqdm(wav_files, total=len(wav_files)):
item = json.loads(item.strip())
audio_id = item['utt'].replace(".wav",
"") # we remove the wav suffix name
audio_duration = item['feat_shape'][0]
wav_file = item['feat']
label = audio_id.split('-')[
0] # speaker name in speaker verification domain
waveform, sr = load_audio(wav_file)
if split_chunks:
uniq_chunks_list = get_chunks(config.chunk_duration, audio_id,
audio_duration)
for chunk in uniq_chunks_list:
s, e = chunk.split("_")[-2:] # Timestamps of start and end
start_sample = int(float(s) * sr)
end_sample = int(float(e) * sr)
# id, duration, wav, start, stop, label
# in vector, the label in speaker id
csv_lines.append([
chunk, audio_duration, wav_file, start_sample, end_sample,
label
])
else:
csv_lines.append([
audio_id, audio_duration, wav_file, 0, waveform.shape[0], label
])
with open(output_file, mode="w") as csv_f:
csv_writer = csv.writer(
csv_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(header)
for line in csv_lines:
csv_writer.writerow(line)
def get_enroll_test_list(dataset_list, verification_file):
"""Get the enroll and test utterance list from all the voxceleb1 test utterance dataset.
Generally, we get the enroll and test utterances from the verfification file.
The verification file format as follows:
target/nontarget enroll-utt test-utt,
we set 0 as nontarget and 1 as target, eg:
0 a.wav b.wav
1 a.wav a.wav
Args:
dataset_list (list): all the dataset to get the test utterances
verification_file (str): voxceleb1 trial file
"""
logger.info(f"verification file: {verification_file}")
enroll_audios = set()
test_audios = set()
with open(verification_file, 'r') as f:
for line in f:
_, enroll_file, test_file = line.strip().split(' ')
enroll_audios.add('-'.join(enroll_file.split('/')))
test_audios.add('-'.join(test_file.split('/')))
enroll_files = []
test_files = []
for dataset in dataset_list:
with open(dataset, 'r') as f:
for line in f:
# audio_id may be in enroll and test at the same time
# eg: 1 a.wav a.wav
# the audio a.wav is enroll and test file at the same time
audio_id = json.loads(line.strip())['utt']
if audio_id in enroll_audios:
enroll_files.append(line)
if audio_id in test_audios:
test_files.append(line)
enroll_files = sorted(enroll_files)
test_files = sorted(test_files)
return enroll_files, test_files
def get_train_dev_list(dataset_list, target_dir, split_ratio):
"""Get the train and dev utterance list from all the training utterance dataset.
Generally, we use the split_ratio as the train dataset ratio,
and the remaining utterance (ratio is 1 - split_ratio) is the dev dataset
Args:
dataset_list (list): all the dataset to get the all utterances
target_dir (str): the target train and dev directory,
we will create the csv directory to store the {train,dev}.csv file
split_ratio (float): train dataset ratio in all utterance list
"""
logger.info("start to get train and dev utt list")
if not os.path.exists(os.path.join(target_dir, "meta")):
os.makedirs(os.path.join(target_dir, "meta"))
audio_files = []
speakers = set()
for dataset in dataset_list:
with open(dataset, 'r') as f:
for line in f:
# the label is speaker name
label_name = json.loads(line.strip())['utt2spk']
speakers.add(label_name)
audio_files.append(line.strip())
speakers = sorted(speakers)
logger.info(f"we get {len(speakers)} speakers from all the train dataset")
with open(os.path.join(target_dir, "meta", "label2id.txt"), 'w') as f:
for label_id, label_name in enumerate(speakers):
f.write(f'{label_name} {label_id}\n')
logger.info(
f'we store the speakers to {os.path.join(target_dir, "meta", "label2id.txt")}'
)
# the split_ratio is for train dataset
# the remaining is for dev dataset
split_idx = int(split_ratio * len(audio_files))
audio_files = sorted(audio_files)
random.shuffle(audio_files)
train_files, dev_files = audio_files[:split_idx], audio_files[split_idx:]
logger.info(
f"we get train utterances: {len(train_files)}, dev utterance: {len(dev_files)}"
)
return train_files, dev_files
def prepare_data(args, config):
"""Convert the jsonline format to csv format
Args:
args (argparse.Namespace): scripts args
config (CfgNode): yaml configuration content
"""
# stage0: set the random seed
random.seed(config.seed)
# if external config set the skip_prep flat, we will do nothing
if config.skip_prep:
return
# stage 1: prepare the enroll and test csv file
# And we generate the speaker to label file label2id.txt
logger.info("start to prepare the data csv file")
enroll_files, test_files = get_enroll_test_list(
[args.test], verification_file=config.verification_file)
prepare_csv(
enroll_files,
os.path.join(args.target_dir, "csv", "enroll.csv"),
config,
split_chunks=False)
prepare_csv(
test_files,
os.path.join(args.target_dir, "csv", "test.csv"),
config,
split_chunks=False)
# stage 2: prepare the train and dev csv file
# we get the train dataset ratio as config.split_ratio
# and the remaining is dev dataset
logger.info("start to prepare the data csv file")
train_files, dev_files = get_train_dev_list(
args.train, target_dir=args.target_dir, split_ratio=config.split_ratio)
prepare_csv(train_files,
os.path.join(args.target_dir, "csv", "train.csv"), config)
prepare_csv(dev_files,
os.path.join(args.target_dir, "csv", "dev.csv"), config)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--train",
required=True,
nargs='+',
help="The jsonline files list for train.")
parser.add_argument(
"--test", required=True, help="The jsonline file for test")
parser.add_argument(
"--target_dir",
default=None,
required=True,
help="The target directory stores the csv files and meta file.")
parser.add_argument(
"--config",
default=None,
required=True,
type=str,
help="configuration file")
args = parser.parse_args()
# parse the yaml config file
config = CfgNode(new_allowed=True)
if args.config:
config.merge_from_file(args.config)
# prepare the csv file from jsonlines files
prepare_data(args, config)