# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse from pathlib import Path from audio_processor import SpeakerVerificationPreprocessor from parakeet.exps.ge2e.config import get_cfg_defaults from parakeet.exps.ge2e.dataset_processors import process_aidatatang_200zh from parakeet.exps.ge2e.dataset_processors import process_librispeech from parakeet.exps.ge2e.dataset_processors import process_magicdata from parakeet.exps.ge2e.dataset_processors import process_voxceleb1 from parakeet.exps.ge2e.dataset_processors import process_voxceleb2 if __name__ == "__main__": parser = argparse.ArgumentParser( description="preprocess dataset for speaker verification task") parser.add_argument( "--datasets_root", type=Path, help="Path to the directory containing your LibriSpeech, LibriTTS and VoxCeleb datasets." ) parser.add_argument( "--output_dir", type=Path, help="Path to save processed dataset.") parser.add_argument( "--dataset_names", type=str, default="librispeech_other,voxceleb1,voxceleb2", help="comma-separated list of names of the datasets you want to preprocess. only " "the train set of these datastes will be used. Possible names: librispeech_other, " "voxceleb1, voxceleb2, aidatatang_200zh, magicdata.") parser.add_argument( "--skip_existing", action="store_true", help="Whether to skip ouput files with the same name. Useful if this script was interrupted." ) parser.add_argument( "--no_trim", action="store_true", help="Preprocess audio without trimming silences (not recommended).") args = parser.parse_args() if not args.no_trim: try: import webrtcvad print(webrtcvad.__version__) except Exception as e: raise ModuleNotFoundError( "Package 'webrtcvad' not found. This package enables " "noise removal and is recommended. Please install and " "try again. If installation fails, " "use --no_trim to disable this error message.") del args.no_trim args.datasets = [item.strip() for item in args.dataset_names.split(",")] if not hasattr(args, "output_dir"): args.output_dir = args.dataset_root / "SV2TTS" / "encoder" args.output_dir = args.output_dir.expanduser() args.datasets_root = args.datasets_root.expanduser() assert args.datasets_root.exists() args.output_dir.mkdir(exist_ok=True, parents=True) config = get_cfg_defaults() print(args) c = config.data processor = SpeakerVerificationPreprocessor( sampling_rate=c.sampling_rate, audio_norm_target_dBFS=c.audio_norm_target_dBFS, vad_window_length=c.vad_window_length, vad_moving_average_width=c.vad_moving_average_width, vad_max_silence_length=c.vad_max_silence_length, mel_window_length=c.mel_window_length, mel_window_step=c.mel_window_step, n_mels=c.n_mels, partial_n_frames=c.partial_n_frames, min_pad_coverage=c.min_pad_coverage, partial_overlap_ratio=c.min_pad_coverage, ) preprocess_func = { "librispeech_other": process_librispeech, "voxceleb1": process_voxceleb1, "voxceleb2": process_voxceleb2, "aidatatang_200zh": process_aidatatang_200zh, "magicdata": process_magicdata, } for dataset in args.datasets: print("Preprocessing %s" % dataset) preprocess_func[dataset](processor, args.datasets_root, args.output_dir, args.skip_existing)