# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing as mp
from functools import partial
from pathlib import Path
from typing import List

import numpy as np
from tqdm import tqdm

from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
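
# This module prepares training features for the GE2E speaker encoder. Every
# utterance found in the supported multispeaker corpora is preprocessed by a
# SpeakerVerificationPreprocessor, converted to a mel spectrogram, and saved
# as a .npy file in a per-speaker output directory, together with a
# _sources.txt manifest that maps each feature file back to its source audio.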

def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):
    # Load and preprocess the waveform
    input_path, output_path = path_pair
    wav = processor.preprocess_wav(input_path)
    if len(wav) == 0:
        return

    # Create the mel spectrogram, discard those that are too short
    frames = processor.melspectrogram(wav)
    if len(frames) < processor.partial_n_frames:
        return

    np.save(output_path, frames)

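
# Note on the saved features: frames is the 2-D array returned by
# processor.melspectrogram(), with time on the first axis (it is compared
# against processor.partial_n_frames above); the second axis is presumably
# the processor's mel-band dimension.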

def _process_speaker(speaker_dir: Path,
                     processor: SpeakerVerificationPreprocessor,
                     datasets_root: Path,
                     output_dir: Path,
                     pattern: str,
                     skip_existing: bool=False):
    # datasets_root is a reference path used to compute speaker_name.
    # We prepend the dataset name to the speaker id because we are mixing
    # several multispeaker datasets together.
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
    speaker_output_dir = output_dir / speaker_name
    speaker_output_dir.mkdir(parents=True, exist_ok=True)

    # load the existing file set (used when skip_existing is True)
    sources_fpath = speaker_output_dir / "_sources.txt"
    if sources_fpath.exists():
        try:
            with sources_fpath.open("rt") as sources_file:
                existing_names = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_names = set()
    else:
        existing_names = set()

    sources_file = sources_fpath.open("at" if skip_existing else "wt")
    for in_fpath in speaker_dir.rglob(pattern):
        out_name = "_".join(
            in_fpath.relative_to(speaker_dir).with_suffix(".npy").parts)
        if skip_existing and out_name in existing_names:
            continue
        out_fpath = speaker_output_dir / out_name
        _process_utterance((in_fpath, out_fpath), processor)
        sources_file.write(f"{out_name},{in_fpath}\n")

    sources_file.close()

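
# For illustration (hypothetical paths): with datasets_root=/data and
# output_dir=/features, a LibriSpeech speaker directory
#     /data/LibriSpeech/train-other-500/1234/
# is written to
#     /features/LibriSpeech_train-other-500_1234/
# which holds one <relative_path_with_underscores>.npy file per utterance and
# a _sources.txt manifest of "<output name>,<input path>" lines.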

def _process_dataset(processor: SpeakerVerificationPreprocessor,
                     datasets_root: Path,
                     speaker_dirs: List[Path],
                     dataset_name: str,
                     output_dir: Path,
                     pattern: str,
                     skip_existing: bool=False):
    print(
        f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")

    _func = partial(
        _process_speaker,
        processor=processor,
        datasets_root=datasets_root,
        output_dir=output_dir,
        pattern=pattern,
        skip_existing=skip_existing)

    # preprocess speakers in parallel with a pool of 16 worker processes,
    # showing per-speaker progress with tqdm
    with mp.Pool(16) as pool:
        list(
            tqdm(
                pool.imap(_func, speaker_dirs),
                dataset_name,
                len(speaker_dirs),
                unit="speakers"))
    print(f"Done preprocessing {dataset_name}.")


def process_librispeech(processor,
                        datasets_root,
                        output_dir,
                        skip_existing=False):
    dataset_name = "LibriSpeech/train-other-500"
    dataset_root = datasets_root / dataset_name
    speaker_dirs = list(dataset_root.glob("*"))
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.flac", skip_existing)


def process_voxceleb1(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb1"
    dataset_root = datasets_root / dataset_name

    anglophone_nationalities = ["australia", "canada", "ireland", "uk", "usa"]
    with dataset_root.joinpath("vox1_meta.csv").open("rt") as metafile:
        metadata = [line.strip().split("\t") for line in metafile][1:]

    # speaker id -> nationality
    nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
    keep_speaker_ids = [
        speaker_id for speaker_id, nationality in nationalities.items()
        if nationality.lower() in anglophone_nationalities
    ]
    print(
        "VoxCeleb1: using samples from {} (presumed anglophone) speakers out of {}."
        .format(len(keep_speaker_ids), len(nationalities)))

    speaker_dirs = list((dataset_root / "wav").glob("*"))
    speaker_dirs = [
        speaker_dir for speaker_dir in speaker_dirs
        if speaker_dir.name in keep_speaker_ids
    ]
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)


def process_voxceleb2(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "VoxCeleb2"
    dataset_root = datasets_root / dataset_name
    # There is no nationality in the metadata for VoxCeleb2, so keep all speakers.
    speaker_dirs = list((dataset_root / "wav").glob("*"))
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)


def process_aidatatang_200zh(processor,
                             datasets_root,
                             output_dir,
                             skip_existing=False):
    dataset_name = "aidatatang_200zh/train"
    dataset_root = datasets_root / dataset_name

    speaker_dirs = list(dataset_root.glob("*"))
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)


def process_magicdata(processor, datasets_root, output_dir,
                      skip_existing=False):
    dataset_name = "magicdata/train"
    dataset_root = datasets_root / dataset_name

    speaker_dirs = list(dataset_root.glob("*"))
    _process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
                     output_dir, "*.wav", skip_existing)
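

# A minimal driver sketch (hypothetical; not part of the original pipeline). It
# assumes a SpeakerVerificationPreprocessor has already been constructed from
# the experiment configuration (its constructor arguments are not shown here)
# and only illustrates how the per-dataset helpers above are meant to be called.
def _example_preprocess_all(processor: SpeakerVerificationPreprocessor,
                            datasets_root: Path,
                            output_dir: Path,
                            skip_existing: bool=True):
    # every helper resolves its own corpus folder under datasets_root and
    # writes per-speaker .npy features plus a _sources.txt manifest
    output_dir.mkdir(parents=True, exist_ok=True)
    process_librispeech(processor, datasets_root, output_dir, skip_existing)
    process_voxceleb1(processor, datasets_root, output_dir, skip_existing)
    process_voxceleb2(processor, datasets_root, output_dir, skip_existing)
    process_aidatatang_200zh(processor, datasets_root, output_dir, skip_existing)
    process_magicdata(processor, datasets_root, output_dir, skip_existing)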