refactor voxceleb2 data download, test=doc

pull/1630/head
xiongxinlei 4 years ago
parent ebfe3e6b13
commit 38e4e9c893

@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
# we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
if not os.path.exists(os.path.join(target_dir, "wav")): if not os.path.exists(os.path.join(target_dir, "wav")):
# download all dataset part # download all dataset part
print("start to download the vox1 dev zip package") print(f"start to download the vox1 zip package to {target_dir}")
for zip_part in data_list.keys(): for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part download_url = " --no-check-certificate " + base_url + "/" + zip_part
download( download(

@ -22,10 +22,12 @@ import codecs
import glob import glob
import json import json
import os import os
import subprocess
from pathlib import Path from pathlib import Path
import soundfile import soundfile
from utils.utility import check_md5sum
from utils.utility import download from utils.utility import download
from utils.utility import unzip from utils.utility import unzip
@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/" BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
# dev data # dev data
DEV_DATA_URL = BASE_URL + '/vox2_aac.zip' DEV_LIST = {
DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402" "vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
"vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
"vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
"vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
"vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
"vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
"vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
"vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
}
DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"
# test data # test data
TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip' TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312" TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
@ -68,6 +80,14 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix): def create_manifest(data_dir, manifest_path_prefix):
"""Generate the voxceleb2 dataset manifest file.
We will create the ${manifest_path_prefix}.vox2 as the final manifest file
The dev and test wav info will be put in one manifest file.
Args:
data_dir (str): voxceleb2 wav directory, which include dev and test subdataset
manifest_path_prefix (str): manifest file prefix
"""
print("Creating manifest %s ..." % manifest_path_prefix) print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = [] json_lines = []
data_path = os.path.join(data_dir, "**", "*.wav") data_path = os.path.join(data_dir, "**", "*.wav")
@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
print(f"{total_sec / total_num} sec/utt", file=f) print(f"{total_sec / total_num} sec/utt", file=f)
def download_dataset(url, md5sum, target_dir, dataset): def download_dataset(base_url, data_list, target_data, target_dir, dataset):
"""Download the voxceleb2 zip package
Args:
base_url (str): the voxceleb2 dataset download baseline url
data_list (dict): the dataset part zip package and the md5 value
target_data (str): the final dataset zip info
target_dir (str): the dataset stored directory
dataset (str): the dataset name, dev or test
Raises:
RuntimeError: if the md5sum check of the packed zip file fails
"""
if not os.path.exists(target_dir): if not os.path.exists(target_dir):
os.makedirs(target_dir) os.makedirs(target_dir)
@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
# but the test dataset will unzip to aac # but the test dataset will unzip to aac
# so, we create the ${target_dir}/test and unzip the m4a to test dir # so, we create the ${target_dir}/test and unzip the m4a to test dir
if not os.path.exists(os.path.join(target_dir, dataset)): if not os.path.exists(os.path.join(target_dir, dataset)):
filepath = download(url, md5sum, target_dir) print(f"start to download the vox2 zip package to {target_dir}")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(
url=download_url,
md5sum=data_list[zip_part],
target_dir=target_dir)
# pack the all part to target zip file
all_target_part, target_name, target_md5sum = target_data.split()
target_name = os.path.join(target_dir, target_name)
if not os.path.exists(target_name):
pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
target_name)
subprocess.call(pack_part_cmd, shell=True)
# check the target zip file md5sum
if not check_md5sum(target_name, target_md5sum):
raise RuntimeError("{} MD5 checksum failed".format(target_name))
else:
print("Check {} md5sum successfully".format(target_name))
if dataset == "test": if dataset == "test":
unzip(filepath, os.path.join(target_dir, "test")) # we need to make the test directory
unzip(target_name, os.path.join(target_dir, "test"))
else:
# unzip the dev zip package; this will create the dev directory
unzip(target_name, target_dir)
def main(): def main():
@ -142,14 +199,16 @@ def main():
print("download: {}".format(args.download)) print("download: {}".format(args.download))
if args.download: if args.download:
download_dataset( download_dataset(
url=DEV_DATA_URL, base_url=BASE_URL,
md5sum=DEV_MD5SUM, data_list=DEV_LIST,
target_data=DEV_TARGET_DATA,
target_dir=args.target_dir, target_dir=args.target_dir,
dataset="dev") dataset="dev")
download_dataset( download_dataset(
url=TEST_DATA_URL, base_url=BASE_URL,
md5sum=TEST_MD5SUM, data_list=TEST_LIST,
target_data=TEST_TARGET_DATA,
target_dir=args.target_dir, target_dir=args.target_dir,
dataset="test") dataset="test")

@ -1,8 +1,6 @@
########################################### ###########################################
# Data # # Data #
########################################### ###########################################
# we should explicitly specify the wav path of vox2 audio data converted from m4a
vox2_base_path:
augment: True augment: True
batch_size: 32 batch_size: 32
num_workers: 2 num_workers: 2
@ -30,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
# if we want use another model, please choose another configuration yaml file # if we want use another model, please choose another configuration yaml file
model: model:
input_size: 80 input_size: 80
# "channels": [512, 512, 512, 512, 1536],
channels: [1024, 1024, 1024, 1024, 3072] channels: [1024, 1024, 1024, 1024, 3072]
kernel_sizes: [5, 3, 3, 3, 1] kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1] dilations: [1, 2, 3, 4, 1]
@ -42,8 +39,8 @@ model:
########################################### ###########################################
seed: 1986 # according from speechbrain configuration seed: 1986 # according from speechbrain configuration
epochs: 10 epochs: 10
save_interval: 1 save_interval: 10
log_interval: 1 log_interval: 10
learning_rate: 1e-8 learning_rate: 1e-8

@ -0,0 +1,53 @@
###########################################
# Data #
###########################################
augment: True
batch_size: 16
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# currently, we only support fbank
sr: 16000 # sample rate
n_mels: 80
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
###########################################################
# MODEL SETTING #
###########################################################
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
# if we want use another model, please choose another configuration yaml file
model:
input_size: 80
channels: [512, 512, 512, 512, 1536]
kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1]
attention_channels: 128
lin_neurons: 192
###########################################
# Training #
###########################################
seed: 1986 # according from speechbrain configuration
epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8
###########################################
# Testing #
###########################################
global_embedding_norm: True
embedding_mean_norm: True
embedding_std_norm: False

@ -38,7 +38,10 @@ mkdir -p ${TARGET_DIR}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests # download data, generate manifests
# we will generate the manifest.{dev, test} file in ${dir}/vox1/ directory # we will generate the manifest.{dev,test} file from ${TARGET_DIR}/voxceleb/vox1/{dev,test} directory
# and generate the meta info and download the trial file
# manifest.dev: 148642
# manifest.test: 4847
echo "Start to download vox1 dataset and generate the manifest files " echo "Start to download vox1 dataset and generate the manifest files "
python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
--manifest_prefix="${dir}/vox1/manifest" \ --manifest_prefix="${dir}/vox1/manifest" \
@ -53,6 +56,8 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download voxceleb2 data # download voxceleb2 data
# we will download the data and unzip the package
# and we will store the m4a file in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
echo "start to download vox2 dataset" echo "start to download vox2 dataset"
python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \ python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
--download \ --download \
@ -99,7 +104,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# Currently, our training system use csv file for dataset # Currently, our training system use csv file for dataset
echo "convert the json format to csv format to be compatible with training process" echo "convert the json format to csv format to be compatible with training process"
python3 local/make_vox_csv_dataset_from_json.py\ python3 local/make_vox_csv_dataset_from_json.py\
--train "${dir}/vox1/manifest.dev" \ --train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2"\
--test "${dir}/vox1/manifest.test" \ --test "${dir}/vox1/manifest.test" \
--target_dir "${dir}/vox/" \ --target_dir "${dir}/vox/" \
--config ${conf_path} --config ${conf_path}

@ -18,24 +18,22 @@ set -e
####################################################################### #######################################################################
# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh # voxceleb2 data is m4a format, so we need to convert the m4a to wav yourself with the script local/convert.sh
# stage 1: train the speaker identification model # stage 1: train the speaker identification model
# stage 2: test speaker identification # stage 2: test speaker identification
# stage 3: extract the training embedding to train the LDA and PLDA # stage 3: (todo)extract the training embedding to train the LDA and PLDA
###################################################################### ######################################################################
# we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset
# default the dataset will be stored in the ~/.paddleaudio/
# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself # the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
# and put all of them to ${PPAUDIO_HOME}/datasets/vox2 # and put all of them to ${MAIN_ROOT}/datasets/vox2
# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav # we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
# export PPAUDIO_HOME=
stage=0 stage=0
stop_stage=50 stop_stage=50
# data directory # data directory
# if we set the variable ${dir}, we will store the wav info to this directory # if we set the variable ${dir}, we will store the wav info to this directory
# otherwise, we will store the wav info to vox1 and vox2 directory respectively # otherwise, we will store the wav info to data/vox1 and data/vox2 directory respectively
# vox2 wav path, we must convert the m4a format to wav format # vox2 wav path, we must convert the m4a format to wav format
dir=data/ # data info directory dir=data/ # data info directory
@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi fi
# if [ $stage -le 3 ]; then # if [ $stage -le 3 ]; then
# # stage 2: extract the training embedding to train the LDA and PLDA # # stage 3: extract the training embedding to train the LDA and PLDA
# # todo: extract the training embedding # # todo: extract the training embedding
# fi # fi

Loading…
Cancel
Save