refactor voxceleb2 data download, test=doc

pull/1630/head
xiongxinlei 3 years ago
parent ebfe3e6b13
commit 38e4e9c893

@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
# we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
if not os.path.exists(os.path.join(target_dir, "wav")):
# download all dataset part
print("start to download the vox1 dev zip package")
print(f"start to download the vox1 zip package to {target_dir}")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(

@ -22,10 +22,12 @@ import codecs
import glob
import json
import os
import subprocess
from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
# dev data
DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
DEV_LIST = {
"vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
"vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
"vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
"vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
"vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
"vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
"vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
"vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
}
DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"
# test data
TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@ -68,6 +80,14 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
"""Generate the voxceleb2 dataset manifest file.
We will create the ${manifest_path_prefix}.vox2 as the final manifest file
The dev and test wav info will be put in one manifest file.
Args:
data_dir (str): voxceleb2 wav directory, which includes the dev and test subsets
manifest_path_prefix (str): manifest file prefix
"""
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
data_path = os.path.join(data_dir, "**", "*.wav")
@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
print(f"{total_sec / total_num} sec/utt", file=f)
def download_dataset(url, md5sum, target_dir, dataset):
def download_dataset(base_url, data_list, target_data, target_dir, dataset):
"""Download the voxceleb2 zip package
Args:
base_url (str): the base url for downloading the voxceleb2 dataset
data_list (dict): the dataset part zip package and the md5 value
target_data (str): the final dataset zip info
target_dir (str): the dataset stored directory
dataset (str): the dataset name, dev or test
Raises:
RuntimeError: if the md5sum check of the target zip file fails
"""
if not os.path.exists(target_dir):
os.makedirs(target_dir)
@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
# but the test dataset will unzip to aac
# so we create the ${target_dir}/test directory and unzip the m4a files into it
if not os.path.exists(os.path.join(target_dir, dataset)):
filepath = download(url, md5sum, target_dir)
print(f"start to download the vox2 zip package to {target_dir}")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(
url=download_url,
md5sum=data_list[zip_part],
target_dir=target_dir)
# concatenate all the downloaded parts into the target zip file
all_target_part, target_name, target_md5sum = target_data.split()
target_name = os.path.join(target_dir, target_name)
if not os.path.exists(target_name):
pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
target_name)
subprocess.call(pack_part_cmd, shell=True)
# check the target zip file md5sum
if not check_md5sum(target_name, target_md5sum):
raise RuntimeError("{} MD5 checkssum failed".format(target_name))
else:
print("Check {} md5sum successfully".format(target_name))
if dataset == "test":
unzip(filepath, os.path.join(target_dir, "test"))
# we need make the test directory
unzip(target_name, os.path.join(target_dir, "test"))
else:
# unzip the dev zip package, which will create the dev directory
unzip(target_name, target_dir)
def main():
@ -142,14 +199,16 @@ def main():
print("download: {}".format(args.download))
if args.download:
download_dataset(
url=DEV_DATA_URL,
md5sum=DEV_MD5SUM,
base_url=BASE_URL,
data_list=DEV_LIST,
target_data=DEV_TARGET_DATA,
target_dir=args.target_dir,
dataset="dev")
download_dataset(
url=TEST_DATA_URL,
md5sum=TEST_MD5SUM,
base_url=BASE_URL,
data_list=TEST_LIST,
target_data=TEST_TARGET_DATA,
target_dir=args.target_dir,
dataset="test")

@ -1,8 +1,6 @@
###########################################
# Data #
###########################################
# we should explicitly specify the wav path of vox2 audio data converted from m4a
vox2_base_path:
augment: True
batch_size: 32
num_workers: 2
@ -30,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
# if we want use another model, please choose another configuration yaml file
model:
input_size: 80
# "channels": [512, 512, 512, 512, 1536],
channels: [1024, 1024, 1024, 1024, 3072]
kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1]
@ -42,8 +39,8 @@ model:
###########################################
seed: 1986 # according from speechbrain configuration
epochs: 10
save_interval: 1
log_interval: 1
save_interval: 10
log_interval: 10
learning_rate: 1e-8

@ -0,0 +1,53 @@
###########################################
# Data #
###########################################
augment: True
batch_size: 16
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# currently, we only support fbank
sr: 16000 # sample rate
n_mels: 80
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
###########################################################
# MODEL SETTING #
###########################################################
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
# if we want to use another model, please choose another configuration yaml file
model:
input_size: 80
channels: [512, 512, 512, 512, 1536]
kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1]
attention_channels: 128
lin_neurons: 192
###########################################
# Training #
###########################################
seed: 1986 # taken from the speechbrain configuration
epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8
###########################################
# Testing #
###########################################
global_embedding_norm: True
embedding_mean_norm: True
embedding_std_norm: False

@ -38,7 +38,10 @@ mkdir -p ${TARGET_DIR}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# we will generate the manifest.{dev, test} file in ${dir}/vox1/ directory
# we will generate the manifest.{dev,test} file from ${TARGET_DIR}/voxceleb/vox1/{dev,test} directory
# and generate the meta info and download the trial file
# manifest.dev: 148642
# manifest.test: 4847
echo "Start to download vox1 dataset and generate the manifest files "
python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
--manifest_prefix="${dir}/vox1/manifest" \
@ -53,6 +56,8 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download voxceleb2 data
# we will download the data and unzip the package
# and we will store the m4a file in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
echo "start to download vox2 dataset"
python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
--download \
@ -99,7 +104,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Currently, our training system use csv file for dataset
echo "convert the json format to csv format to be compatible with training process"
python3 local/make_vox_csv_dataset_from_json.py\
--train "${dir}/vox1/manifest.dev" \
--train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2"\
--test "${dir}/vox1/manifest.test" \
--target_dir "${dir}/vox/" \
--config ${conf_path}

@ -18,24 +18,22 @@ set -e
#######################################################################
# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
# voxceleb2 data is in m4a format, so you need to convert the m4a to wav yourself with the script local/convert.sh
# stage 1: train the speaker identification model
# stage 2: test speaker identification
# stage 3: extract the training embeding to train the LDA and PLDA
# stage 3: (todo) extract the training embedding to train the LDA and PLDA
######################################################################
# we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 dataset
# default the dataset will be stored in the ~/.paddleaudio/
# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
# export PPAUDIO_HOME=
# and put all of them to ${MAIN_ROOT}/datasets/vox2
# we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
stage=0
stop_stage=50
# data directory
# if we set the variable ${dir}, we will store the wav info to this directory
# otherwise, we will store the wav info to vox1 and vox2 directory respectively
# otherwise, we will store the wav info to data/vox1 and data/vox2 directory respectively
# vox2 wav path, we must convert the m4a format to wav format
dir=data/ # data info directory
@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
# if [ $stage -le 3 ]; then
# # stage 2: extract the training embeding to train the LDA and PLDA
# # stage 3: extract the training embedding to train the LDA and PLDA
# # todo: extract the training embedding
# fi

Loading…
Cancel
Save