pull/1437/head
commit
303bc4b18b
@ -1,11 +1,46 @@
# Changelog

Date: 2022-1-29, Author: yt605155624.
Add features to: T2S:
- Update aishell3 vc0 with new Tacotron2.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419

***

Date: 2022-1-29, Author: yt605155624.
Add features to: T2S:
- Add ljspeech Tacotron2.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416

***

Date: 2022-1-24, Author: yt605155624.
Add features to: T2S:
- Add csmsc WaveRNN.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379

***

Date: 2022-1-19, Author: yt605155624.
Add features to: T2S:
- Add csmsc Tacotron2.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314

***

Date: 2022-1-10, Author: Jackwaterveg.
Add features to: CLI:
- Support English (librispeech/asr1/transformer).
- Support choosing `decode_method` for conformer and transformer models.
- Refactor the config, using the unified config.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297

***

Date: 2022-1-17, Author: Jackwaterveg.
Add features to: CLI:
- Support deepspeech2 online/offline model (aishell).
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356

***

Date: 2022-1-24, Author: Jackwaterveg.
Add features to: ctc_decoders:
- Support online ctc prefix-beam search decoder.
- Unified ctc online decoder and ctc offline decoder.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/821

***
@ -0,0 +1,10 @@
# [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/)
VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube.

VoxCeleb contains speech from speakers spanning a wide range of different ethnicities, accents, professions and ages.
All speaking face-tracks are captured "in the wild", with background chatter, laughter, overlapping speech, pose variation and different lighting conditions.
VoxCeleb consists of both audio and video. Each segment is at least 3 seconds long.

The dataset consists of two versions, VoxCeleb1 and VoxCeleb2. Each version has its own train/test split. For each we provide YouTube URLs, face detections and tracks, audio files, cropped face videos and speaker meta-data. There is no overlap between the two versions.

For more details, please refer to http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
@ -0,0 +1,188 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare VoxCeleb1 dataset.

Create manifest files.
A manifest file is a json-format file in which each line contains the
meta data (i.e. audio filepath, transcript and audio duration)
of one audio file in the data set.

Researchers should download the VoxCeleb1 dataset themselves
through the Google form to get the username & password, and then unpack the data.
"""
import argparse
import codecs
import glob
import json
import os
import subprocess
from pathlib import Path

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip

# by default, all the data will be downloaded into the current ./voxceleb1 directory
DATA_HOME = os.path.expanduser('.')

# if you use http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url,
# you need to get the username & password via the google form

# if you use https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url,
# you need to use --no-check-certificate to connect to the target download url

BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a"

# dev data
DEV_LIST = {
    "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96",
    "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020",
    "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512",
    "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19",
}
DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b"

# test data
TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"

# kaldi trial
# this trial file is organized by kaldi according to the official file,
# which is a little different from the official trial veri_test2.txt
KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/voxceleb1/",
    type=str,
    help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")

args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_path = os.path.join(data_dir, "wav", "**", "*.wav")
    total_sec = 0.0
    total_text = 0.0
    total_num = 0
    speakers = set()
    for audio_path in glob.glob(data_path, recursive=True):
        audio_id = "-".join(audio_path.split("/")[-3:])
        utt2spk = audio_path.split("/")[-3]
        duration = soundfile.info(audio_path).duration
        text = ""
        json_lines.append(
            json.dumps(
                {
                    "utt": audio_id,
                    "utt2spk": str(utt2spk),
                    "feat": audio_path,
                    "feat_shape": (duration, ),
                    "text": text  # compatible with asr data format
                },
                ensure_ascii=False))

        total_sec += duration
        total_text += len(text)
        total_num += 1
        speakers.add(utt2spk)

    # data_dir_name refers to dev or test
    # voxceleb1 is given explicitly in the path
    data_dir_name = Path(data_dir).name
    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
        for line in json_lines:
            f.write(line + "\n")

    manifest_dir = os.path.dirname(manifest_path_prefix)
    meta_path = os.path.join(manifest_dir, "voxceleb1." +
                             data_dir_name) + ".meta"
    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
        print(f"{total_num} utts", file=f)
        print(f"{len(speakers)} speakers", file=f)
        print(f"{total_sec / (60 * 60)} h", file=f)
        print(f"{total_text} text", file=f)
        print(f"{total_text / total_sec} text/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(base_url, data_list, target_dir, manifest_path,
                    target_data):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    # if the wav directory already exists, there is nothing to download or unpack
    if not os.path.exists(os.path.join(target_dir, "wav")):
        # download all parts of the dataset
        for zip_part in data_list.keys():
            download_url = " --no-check-certificate " + base_url + "/" + zip_part
            download(
                url=download_url,
                md5sum=data_list[zip_part],
                target_dir=target_dir)

        # concatenate all parts into the target zip file
        all_target_part, target_name, target_md5sum = target_data.split()
        target_name = os.path.join(target_dir, target_name)
        if not os.path.exists(target_name):
            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
                                                    target_name)
            subprocess.call(pack_part_cmd, shell=True)

        # check the md5sum of the target zip file
        if not check_md5sum(target_name, target_md5sum):
            raise RuntimeError("{} MD5 checksum failed".format(target_name))
        else:
            print("Check {} md5sum successfully".format(target_name))

        # unzip the target zip file
        if target_name.endswith(".zip"):
            unzip(target_name, target_dir)

    # create the manifest file
    create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        base_url=BASE_URL,
        data_list=DEV_LIST,
        target_dir=os.path.join(args.target_dir, "dev"),
        manifest_path=args.manifest_prefix,
        target_data=DEV_TARGET_DATA)

    prepare_dataset(
        base_url=BASE_URL,
        data_list=TEST_LIST,
        target_dir=os.path.join(args.target_dir, "test"),
        manifest_path=args.manifest_prefix,
        target_data=TEST_TARGET_DATA)

    print("Manifest prepare done!")


if __name__ == '__main__':
    main()
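For reference, each manifest line written by `create_manifest` above is one JSON object. A minimal sketch of reading such a manifest back and summarizing per-speaker duration (the path `manifest.dev` is a hypothetical example; adjust it to your `--manifest_prefix`):

```python
import json
from collections import defaultdict

def summarize_manifest(manifest_path):
    """Read a manifest written by create_manifest and report duration per speaker."""
    per_spk = defaultdict(float)
    with open(manifest_path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)  # keys: utt, utt2spk, feat, feat_shape, text
            per_spk[record["utt2spk"]] += record["feat_shape"][0]  # duration in seconds
    for spk, sec in sorted(per_spk.items()):
        print(f"{spk}\t{sec / 60:.1f} min")

# summarize_manifest("manifest.dev")
```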
@ -0,0 +1,86 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate (Hz).
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5 ms
win_length: 1200   # Window length (samples). 50 ms
                   # If set to null, it will be the same as n_fft.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################
model: # keyword arguments for the selected model
    embed_dim: 512              # char or phn embedding dimension
    elayers: 1                  # number of blstm layers in encoder
    eunits: 512                 # number of blstm units
    econv_layers: 3             # number of convolutional layers in encoder
    econv_chans: 512            # number of channels in convolutional layer
    econv_filts: 5              # filter size of convolutional layer
    atype: location             # attention function type
    adim: 512                   # attention dimension
    aconv_chans: 32             # number of channels in convolutional layer of attention
    aconv_filts: 15             # filter size of convolutional layer of attention
    cumulate_att_w: True        # whether to cumulate attention weight
    dlayers: 2                  # number of lstm layers in decoder
    dunits: 1024                # number of lstm units in decoder
    prenet_layers: 2            # number of layers in prenet
    prenet_units: 256           # number of units in prenet
    postnet_layers: 5           # number of layers in postnet
    postnet_chans: 512          # number of channels in postnet
    postnet_filts: 5            # filter size of postnet layer
    output_activation: null     # activation function for the final output
    use_batch_norm: True        # whether to use batch normalization in encoder
    use_concate: True           # whether to concatenate encoder embedding with decoder outputs
    use_residual: False         # whether to use residual connection in encoder
    dropout_rate: 0.5           # dropout rate
    zoneout_rate: 0.1           # zoneout rate
    reduction_factor: 1         # reduction factor
    spk_embed_dim: 256          # speaker embedding dimension
    spk_embed_integration_type: concat # how to integrate speaker embedding


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True             # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 5.0           # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: True    # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4   # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0  # strength of guided attention loss


##########################################################
#                    OPTIMIZER SETTING                   #
##########################################################
optimizer:
    optim: adam            # optimizer type
    learning_rate: 1.0e-03 # learning rate
    epsilon: 1.0e-06       # epsilon
    weight_decay: 0.0      # weight decay coefficient

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 42
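Since `use_guided_attn_loss` is enabled above, a short sketch of the penalty behind `guided_attn_loss_sigma` may help. It follows the common Tachibana-style formulation (W[t, n] = 1 − exp(−(n/N − t/T)² / 2σ²)) and is an illustration only, not the code used by the trainer:

```python
import numpy as np

def guided_attention_weight(text_len, mel_len, sigma=0.4):
    """Penalty matrix W[t, n]: small near the diagonal, large far from it.

    The guided attention loss averages W * attention over valid positions,
    so strongly non-monotonic attention is penalized.
    """
    n = np.arange(text_len) / text_len   # normalized input (text) positions
    t = np.arange(mel_len) / mel_len     # normalized output (frame) positions
    grid_t, grid_n = np.meshgrid(t, n, indexing="ij")
    return 1.0 - np.exp(-((grid_n - grid_t) ** 2) / (2 * sigma ** 2))

W = guided_attention_weight(text_len=50, mel_len=200, sigma=0.4)
print(W.shape)  # (200, 50); multiply elementwise with the attention matrix and average
```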
@ -1,36 +1,72 @@
 #!/bin/bash
 
-stage=0
+stage=3
 stop_stage=100
 
-input=$1
-preprocess_path=$2
-alignment=$3
-ge2e_ckpt_path=$4
+config_path=$1
+ge2e_ckpt_path=$2
 
+# gen speaker embedding
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
-        --input=${input}/wav \
-        --output=${preprocess_path}/embed \
+        --input=~/datasets/data_aishell3/train/wav/ \
+        --output=dump/embed \
         --checkpoint_path=${ge2e_ckpt_path}
 fi
 
+# copy from tts3/preprocess
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "Process wav ..."
-    python3 ${BIN_DIR}/process_wav.py \
-        --input=${input}/wav \
-        --output=${preprocess_path}/normalized_wav \
-        --alignment=${alignment}
+    # get durations from MFA's result
+    echo "Generate durations.txt from MFA results ..."
+    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+        --inputdir=./aishell3_alignment_tone \
+        --output durations.txt \
+        --config=${config_path}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/preprocess_transcription.py \
-        --input=${input} \
-        --output=${preprocess_path}
+    # extract features
+    echo "Extract features ..."
+    python3 ${BIN_DIR}/preprocess.py \
+        --dataset=aishell3 \
+        --rootdir=~/datasets/data_aishell3/ \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
+        --config=${config_path} \
+        --num-cpu=20 \
+        --cut-sil=True \
+        --spk_emb_dir=dump/embed
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    python3 ${BIN_DIR}/extract_mel.py \
-        --input=${preprocess_path}/normalized_wav \
-        --output=${preprocess_path}/mel
+    # get features' stats (mean and std)
+    echo "Get features' stats ..."
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="speech"
 fi
 
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # normalize and convert phone to id; dev and test should use train's stats
+    echo "Normalize ..."
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+fi
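The "dev and test should use train's stats" comment in the normalize stage above reflects a standard design choice: the mean and standard deviation are estimated on the training split only and then applied to every split, so evaluation data never leaks into the normalization statistics. A minimal numpy sketch of that idea (illustrative only; the recipe's `normalize.py` operates on the dumped metadata instead):

```python
import numpy as np

# toy mel-spectrogram features: (num_frames, n_mels)
train_feats = np.random.randn(1000, 80) * 2.0 + 1.0
dev_feats = np.random.randn(200, 80) * 2.0 + 1.0

# statistics come from the training split only
mean = train_feats.mean(axis=0)
std = train_feats.std(axis=0)

# the same train statistics normalize every split (train, dev, test)
train_norm = (train_feats - mean) / std
dev_norm = (dev_feats - mean) / std
print(dev_norm.mean(), dev_norm.std())  # roughly 0 and 1, but not exactly
```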
@ -0,0 +1,22 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --am=tacotron2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=${train_output_path}/test \
    --phones_dict=dump/phone_id_map.txt \
    --speaker_dict=dump/speaker_id_map.txt \
    --voice-cloning=True
@ -1,9 +1,13 @@
 #!/bin/bash
 
-preprocess_path=$1
+config_path=$1
 train_output_path=$2
 
 python3 ${BIN_DIR}/train.py \
-    --data=${preprocess_path} \
-    --output=${train_output_path} \
-    --ngpu=1
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=2 \
+    --phones-dict=dump/phone_id_map.txt \
+    --voice-cloning=True
@ -1,14 +1,24 @@
 #!/bin/bash
 
-ge2e_params_path=$1
-tacotron2_params_path=$2
-waveflow_params_path=$3
-vc_input=$4
-vc_output=$5
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+ge2e_params_path=$4
+ref_audio_dir=$5
 
-python3 ${BIN_DIR}/voice_cloning.py \
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../voice_cloning.py \
+    --am=tacotron2_aishell3 \
+    --am_config=${config_path} \
+    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+    --am_stat=dump/train/speech_stats.npy \
+    --voc=pwgan_aishell3 \
+    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
     --ge2e_params_path=${ge2e_params_path} \
-    --tacotron2_params_path=${tacotron2_params_path} \
-    --waveflow_params_path=${waveflow_params_path} \
-    --input-dir=${vc_input} \
-    --output-dir=${vc_output}
+    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
+    --input-dir=${ref_audio_dir} \
+    --output-dir=${train_output_path}/vc_syn \
+    --phones-dict=dump/phone_id_map.txt
@ -0,0 +1,3 @@
# Speaker Diarization on AMI corpus

* sd0 - speaker diarization by AHC/SC based on x-vectors
@ -0,0 +1 @@
results
@ -0,0 +1,13 @@
# Speaker Diarization on AMI corpus

## About the AMI corpus
"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See the [AMI overview](http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) for more details.

## About the example
The script performs diarization using x-vectors (TDNN, ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of different clustering methods: AHC and spectral clustering.

## How to Run
Use the following command to run diarization on the AMI corpus.
`bash ./run.sh`

## Results (DER) coming soon! :)
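To make the clustering step concrete, here is a small, self-contained sketch of clustering utterance-level speaker embeddings with AHC and spectral clustering. It uses scikit-learn and random vectors purely for illustration; it is not the implementation used by this recipe, which clusters real x-vectors extracted from AMI:

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering, SpectralClustering

rng = np.random.default_rng(0)
# fake "x-vectors": 3 speakers, 20 segments each, 192-dim embeddings
embeddings = np.concatenate(
    [rng.normal(loc=i * 3.0, scale=1.0, size=(20, 192)) for i in range(3)])

# AHC: agglomerative hierarchical clustering with a known speaker count
ahc_labels = AgglomerativeClustering(n_clusters=3).fit_predict(embeddings)

# spectral clustering over a nearest-neighbor affinity graph
sc_labels = SpectralClustering(
    n_clusters=3, affinity="nearest_neighbors", random_state=0).fit_predict(embeddings)

print(ahc_labels[:10], sc_labels[:10])
# each segment now carries a speaker label; mapped back to time stamps this yields an RTTM
```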
@ -0,0 +1,572 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Data preparation.

Download: http://groups.inf.ed.ac.uk/ami/download/

Prepares metadata files (JSON) from the manual annotations "segments/" using the RTTM format (Oracle VAD).

Authors
 * qingenz123@126.com (Qingen ZHAO) 2022

"""
import os
import logging
import argparse
import xml.etree.ElementTree as et
import glob
import json
from ami_splits import get_AMI_split
from distutils.util import strtobool

from dataio import (
    load_pkl,
    save_pkl, )

logger = logging.getLogger(__name__)
SAMPLERATE = 16000


def prepare_ami(
        data_folder,
        manual_annot_folder,
        save_folder,
        ref_rttm_dir,
        meta_data_dir,
        split_type="full_corpus_asr",
        skip_TNO=True,
        mic_type="Mix-Headset",
        vad_type="oracle",
        max_subseg_dur=3.0,
        overlap=1.5, ):
    """
    Prepares reference RTTM and JSON files for the AMI dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original amicorpus is stored.
    manual_annot_folder : str
        Directory where the manual annotations are stored.
    save_folder : str
        The save directory in results.
    ref_rttm_dir : str
        Directory to store reference RTTM files.
    meta_data_dir : str
        Directory to store the meta data (json) files.
    split_type : str
        Standard dataset split. See ami_splits.py for more information.
        Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr"
    skip_TNO: bool
        Skips TNO meeting recordings if True.
    mic_type : str
        Type of microphone to be used.
    vad_type : str
        Type of VAD. Kept for future when VAD will be added.
    max_subseg_dur : float
        Duration in seconds of the subsegments to be prepared from larger segments.
    overlap : float
        Overlap duration in seconds between adjacent subsegments.

    Example
    -------
    >>> from dataset.ami.ami_prepare import prepare_ami
    >>> data_folder = '/home/data/ami/amicorpus/'
    >>> manual_annot_folder = '/home/data/ami/ami_public_manual/'
    >>> save_folder = './results/'
    >>> ref_rttm_dir = './results/ref_rttms'
    >>> meta_data_dir = './results/metadata'
    >>> prepare_ami(data_folder, manual_annot_folder, save_folder,
    ...             ref_rttm_dir, meta_data_dir,
    ...             split_type='full_corpus_asr', mic_type='Mix-Headset')
    """

    # Meta files
    meta_files = [
        os.path.join(meta_data_dir, "ami_train." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_dev." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_eval." + mic_type + ".subsegs.json"),
    ]

    # Create configuration for easily skipping the data_preparation stage
    conf = {
        "data_folder": data_folder,
        "save_folder": save_folder,
        "ref_rttm_dir": ref_rttm_dir,
        "meta_data_dir": meta_data_dir,
        "split_type": split_type,
        "skip_TNO": skip_TNO,
        "mic_type": mic_type,
        "vad": vad_type,
        "max_subseg_dur": max_subseg_dur,
        "overlap": overlap,
        "meta_files": meta_files,
    }

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Set the output option file.
    opt_file = "opt_ami_prepare." + mic_type + ".pkl"

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder, conf, meta_files, opt_file):
        logger.info(
            "Skipping data preparation, as it was completed in a previous run.")
        return

    msg = "\tCreating meta-data file for the AMI Dataset.."
    logger.debug(msg)

    # Get the split
    train_set, dev_set, eval_set = get_AMI_split(split_type)

    # Prepare RTTM from XML (manual annotations) and store it as ground truth
    # Create the ref_RTTM directory
    if not os.path.exists(ref_rttm_dir):
        os.makedirs(ref_rttm_dir)

    # Create reference RTTM files
    splits = ["train", "dev", "eval"]
    for i in splits:
        rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
        if i == "train":
            prepare_segs_for_RTTM(
                train_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO, )
        if i == "dev":
            prepare_segs_for_RTTM(
                dev_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO, )
        if i == "eval":
            prepare_segs_for_RTTM(
                eval_set,
                rttm_file,
                data_folder,
                manual_annot_folder,
                i,
                skip_TNO, )

    # Create meta_files for the splits
    if not os.path.exists(meta_data_dir):
        os.makedirs(meta_data_dir)

    for i in splits:
        rttm_file = ref_rttm_dir + "/fullref_ami_" + i + ".rttm"
        meta_filename_prefix = "ami_" + i
        prepare_metadata(
            rttm_file,
            meta_data_dir,
            data_folder,
            meta_filename_prefix,
            max_subseg_dur,
            overlap,
            mic_type, )

    save_opt_file = os.path.join(save_folder, opt_file)
    save_pkl(conf, save_opt_file)


def get_RTTM_per_rec(segs, spkrs_list, rec_id):
    """Prepares the RTTM lines for one recording.
    """

    rttm = []

    # Prepare header
    for spkr_id in spkrs_list:
        # e.g. SPKR-INFO ES2008c 0 <NA> <NA> <NA> unknown ES2008c.A_PM <NA> <NA>
        line = ("SPKR-INFO " + rec_id + " 0 <NA> <NA> <NA> unknown " + spkr_id +
                " <NA> <NA>")
        rttm.append(line)

    # Append remaining lines
    for row in segs:
        # e.g. SPEAKER ES2008c 0 37.880 0.590 <NA> <NA> ES2008c.A_PM <NA> <NA>

        if float(row[1]) < float(row[0]):
            msg1 = (
                "Possibly Incorrect Annotation Found!! transcriber_start (%s) > transcriber_end (%s)"
                % (row[0], row[1]))
            msg2 = (
                "Excluding this incorrect row from the RTTM : %s, %s, %s, %s" %
                (rec_id, row[0], str(round(float(row[1]) - float(row[0]), 4)),
                 str(row[2]), ))
            logger.info(msg1)
            logger.info(msg2)
            continue

        line = ("SPEAKER " + rec_id + " 0 " + str(round(float(row[0]), 4)) + " "
                + str(round(float(row[1]) - float(row[0]), 4)) + " <NA> <NA> " +
                str(row[2]) + " <NA> <NA>")
        rttm.append(line)

    return rttm


def prepare_segs_for_RTTM(list_ids, out_rttm_file, audio_dir, annot_dir,
                          split_type, skip_TNO):

    RTTM = []  # Stores all RTTM lines clubbed together for a given dataset split

    for main_meet_id in list_ids:

        # Skip TNO meetings from dev and eval sets
        if (main_meet_id.startswith("TS") and split_type != "train" and
                skip_TNO is True):
            msg = ("Skipping TNO meeting in AMI " + str(split_type) + " set : "
                   + str(main_meet_id))
            logger.info(msg)
            continue

        list_sessions = glob.glob(audio_dir + "/" + main_meet_id + "*")
        list_sessions.sort()

        for sess in list_sessions:
            rec_id = os.path.basename(sess)
            path = annot_dir + "/segments/" + rec_id
            f = path + ".*.segments.xml"
            list_spkr_xmls = glob.glob(f)
            list_spkr_xmls.sort()  # A, B, C, D, E etc (Speakers)
            segs = []
            spkrs_list = []  # non-scenario recordings contain 3-5 speakers

            for spkr_xml_file in list_spkr_xmls:

                # Speaker ID
                spkr = os.path.basename(spkr_xml_file).split(".")[1]
                spkr_ID = rec_id + "." + spkr
                spkrs_list.append(spkr_ID)

                # Parse xml tree
                tree = et.parse(spkr_xml_file)
                root = tree.getroot()

                # Start, end and speaker_ID from xml file
                segs = segs + [[
                    elem.attrib["transcriber_start"],
                    elem.attrib["transcriber_end"],
                    spkr_ID,
                ] for elem in root.iter("segment")]

            # Sort rows by start time (per recording)
            segs.sort(key=lambda x: float(x[0]))

            rttm_per_rec = get_RTTM_per_rec(segs, spkrs_list, rec_id)
            RTTM = RTTM + rttm_per_rec

    # Write one RTTM as ground truth. For example, "fullref_eval.rttm"
    with open(out_rttm_file, "w") as f:
        for item in RTTM:
            f.write("%s\n" % item)


def is_overlapped(end1, start2):
    """Returns True if the two segments overlap.

    Arguments
    ---------
    end1 : float
        End time of the first segment.
    start2 : float
        Start time of the second segment.
    """

    if start2 > end1:
        return False
    else:
        return True


def merge_rttm_intervals(rttm_segs):
    """Merges adjacent segments in the RTTM if they overlap.
    """
    # For one recording
    # rec_id = rttm_segs[0][1]
    rttm_segs.sort(key=lambda x: float(x[3]))

    # first_seg = rttm_segs[0] # first interval.. as it is
    merged_segs = [rttm_segs[0]]
    strt = float(rttm_segs[0][3])
    end = float(rttm_segs[0][3]) + float(rttm_segs[0][4])

    for row in rttm_segs[1:]:
        s = float(row[3])
        e = float(row[3]) + float(row[4])

        if is_overlapped(end, s):
            # Update only the end. The start stays the same as in the last segment.
            # Just update the last row in merged_segs.
            end = max(end, e)
            merged_segs[-1][3] = str(round(strt, 4))
            merged_segs[-1][4] = str(round((end - strt), 4))
            merged_segs[-1][7] = "overlap"  # previous_row[7] + '-' + row[7]
        else:
            # Add a new disjoint segment
            strt = s
            end = e
            merged_segs.append(row)  # this will have 1 spkr ID

    return merged_segs


def get_subsegments(merged_segs, max_subseg_dur=3.0, overlap=1.5):
    """Divides bigger segments into smaller sub-segments.
    """

    shift = max_subseg_dur - overlap
    subsegments = []

    # These rows are in RTTM format
    for row in merged_segs:
        seg_dur = float(row[4])
        rec_id = row[1]

        if seg_dur > max_subseg_dur:
            num_subsegs = int(seg_dur / shift)
            # Taking 0.01 sec as a small step
            seg_start = float(row[3])
            seg_end = seg_start + seg_dur

            # Now divide this segment (new_row) into smaller subsegments
            for i in range(num_subsegs):
                subseg_start = seg_start + i * shift
                subseg_end = min(subseg_start + max_subseg_dur - 0.01, seg_end)
                subseg_dur = subseg_end - subseg_start

                new_row = [
                    "SPEAKER",
                    rec_id,
                    "0",
                    str(round(float(subseg_start), 4)),
                    str(round(float(subseg_dur), 4)),
                    "<NA>",
                    "<NA>",
                    row[7],
                    "<NA>",
                    "<NA>",
                ]

                subsegments.append(new_row)

                # Break if exceeding the boundary
                if subseg_end >= seg_end:
                    break
        else:
            subsegments.append(row)

    return subsegments


def prepare_metadata(rttm_file, save_dir, data_dir, filename, max_subseg_dur,
                     overlap, mic_type):
    # Read RTTM, get unique meeting_IDs (from RTTM headers)
    # For each meeting ID: select that meetID -> merge -> subsegment -> json -> append

    # Read RTTM
    RTTM = []
    with open(rttm_file, "r") as f:
        for line in f:
            entry = line[:-1]
            RTTM.append(entry)

    spkr_info = filter(lambda x: x.startswith("SPKR-INFO"), RTTM)
    rec_ids = list(set([row.split(" ")[1] for row in spkr_info]))
    rec_ids.sort()  # sorting just to make the JSON appear in proper sequence

    # For each recording merge segments and then perform subsegmentation
    MERGED_SEGMENTS = []
    SUBSEGMENTS = []
    for rec_id in rec_ids:
        segs_iter = filter(lambda x: x.startswith("SPEAKER " + str(rec_id)),
                           RTTM)
        gt_rttm_segs = [row.split(" ") for row in segs_iter]

        # Merge, subsegment and then convert to json format.
        merged_segs = merge_rttm_intervals(
            gt_rttm_segs)  # We lose speaker_ID after merging
        MERGED_SEGMENTS = MERGED_SEGMENTS + merged_segs

        # Divide segments into smaller sub-segments
        subsegs = get_subsegments(merged_segs, max_subseg_dur, overlap)
        SUBSEGMENTS = SUBSEGMENTS + subsegs

    # Write segments AND sub-segments (in RTTM format)
    segs_file = save_dir + "/" + filename + ".segments.rttm"
    subsegment_file = save_dir + "/" + filename + ".subsegments.rttm"

    with open(segs_file, "w") as f:
        for row in MERGED_SEGMENTS:
            line_str = " ".join(row)
            f.write("%s\n" % line_str)

    with open(subsegment_file, "w") as f:
        for row in SUBSEGMENTS:
            line_str = " ".join(row)
            f.write("%s\n" % line_str)

    # Create JSON from subsegments
    json_dict = {}
    for row in SUBSEGMENTS:
        rec_id = row[1]
        strt = str(round(float(row[3]), 4))
        end = str(round((float(row[3]) + float(row[4])), 4))
        subsegment_ID = rec_id + "_" + strt + "_" + end
        dur = row[4]
        start_sample = int(float(strt) * SAMPLERATE)
        end_sample = int(float(end) * SAMPLERATE)

        # If multi-mic audio is selected
        if mic_type == "Array1":
            wav_file_base_path = (data_dir + "/" + rec_id + "/audio/" + rec_id +
                                  "." + mic_type + "-")

            f = []  # adding all 8 mics
            for i in range(8):
                f.append(wav_file_base_path + str(i + 1).zfill(2) + ".wav")
            audio_files_path_list = f

            # Note: key "files" with 's' is used for multi-mic
            json_dict[subsegment_ID] = {
                "wav": {
                    "files": audio_files_path_list,
                    "duration": float(dur),
                    "start": int(start_sample),
                    "stop": int(end_sample),
                },
            }
        else:
            # Single mic audio
            wav_file_path = (data_dir + "/" + rec_id + "/audio/" + rec_id + "."
                             + mic_type + ".wav")

            # Note: key "file" without 's' is used for single-mic
            json_dict[subsegment_ID] = {
                "wav": {
                    "file": wav_file_path,
                    "duration": float(dur),
                    "start": int(start_sample),
                    "stop": int(end_sample),
                },
            }

    out_json_file = save_dir + "/" + filename + "." + mic_type + ".subsegs.json"
    with open(out_json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    msg = "%s JSON prepared" % (out_json_file)
    logger.debug(msg)


def skip(save_folder, conf, meta_files, opt_file):
    """
    Detects whether the AMI data preparation has already been done.
    If the preparation has been done, we can skip it.

    Returns
    -------
    bool
        if True, the preparation phase can be skipped.
        if False, it must be done.
    """
    # Checking if meta (json) files are available
    skip = True
    for file_path in meta_files:
        if not os.path.isfile(file_path):
            skip = False

    # Checking saved options
    save_opt_file = os.path.join(save_folder, opt_file)
    if skip is True:
        if os.path.isfile(save_opt_file):
            opts_old = load_pkl(save_opt_file)
            if opts_old == conf:
                skip = True
            else:
                skip = False
        else:
            skip = False

    return skip


if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        prog='python ami_prepare.py --data_folder /home/data/ami/amicorpus \
            --manual_annot_folder /home/data/ami/ami_public_manual_1.6.2 \
            --save_folder ./results/ --ref_rttm_dir ./results/ref_rttms \
            --meta_data_dir ./results/metadata',
        description='AMI Data preparation')
    parser.add_argument(
        '--data_folder',
        required=True,
        help='Path to the folder where the original amicorpus is stored')
    parser.add_argument(
        '--manual_annot_folder',
        required=True,
        help='Directory where the manual annotations are stored')
    parser.add_argument(
        '--save_folder', required=True, help='The save directory in results')
    parser.add_argument(
        '--ref_rttm_dir',
        required=True,
        help='Directory to store reference RTTM files')
    parser.add_argument(
        '--meta_data_dir',
        required=True,
        help='Directory to store the meta data (json) files')
    parser.add_argument(
        '--split_type',
        default="full_corpus_asr",
        help='Standard dataset split. See ami_splits.py for more information')
    parser.add_argument(
        '--skip_TNO',
        default=True,
        type=strtobool,
        help='Skips TNO meeting recordings if True')
    parser.add_argument(
        '--mic_type',
        default="Mix-Headset",
        help='Type of microphone to be used')
    parser.add_argument(
        '--vad_type',
        default="oracle",
        help='Type of VAD. Kept for future when VAD will be added')
    parser.add_argument(
        '--max_subseg_dur',
        default=3.0,
        type=float,
        help='Duration in seconds of a subsegments to be prepared from larger segments'
    )
    parser.add_argument(
        '--overlap',
        default=1.5,
        type=float,
        help='Overlap duration in seconds between adjacent subsegments')

    args = parser.parse_args()

    # forward all CLI options so that --split_type, --mic_type, etc. take effect
    prepare_ami(
        args.data_folder,
        args.manual_annot_folder,
        args.save_folder,
        args.ref_rttm_dir,
        args.meta_data_dir,
        split_type=args.split_type,
        skip_TNO=args.skip_TNO,
        mic_type=args.mic_type,
        vad_type=args.vad_type,
        max_subseg_dur=args.max_subseg_dur,
        overlap=args.overlap, )
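As a sanity check on the sub-segmentation parameters above (`max_subseg_dur=3.0`, `overlap=1.5`), the sketch below reproduces the splitting arithmetic of `get_subsegments` on a single toy segment; it is illustrative only and independent of the script:

```python
# A 7-second speech segment starting at t=10.0 s, split with the defaults
max_subseg_dur, overlap = 3.0, 1.5
shift = max_subseg_dur - overlap          # 1.5 s step between sub-segment starts
seg_start, seg_dur = 10.0, 7.0
seg_end = seg_start + seg_dur

subsegs = []
for i in range(int(seg_dur / shift)):     # int(7.0 / 1.5) = 4 sub-segments
    start = seg_start + i * shift
    end = min(start + max_subseg_dur - 0.01, seg_end)
    subsegs.append((round(start, 2), round(end, 2)))
    if end >= seg_end:
        break

print(subsegs)
# [(10.0, 12.99), (11.5, 14.49), (13.0, 15.99), (14.5, 17.0)]
```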
@ -0,0 +1,234 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The AMI corpus contains 100 hours of meeting recordings.
This script returns the standard train, dev and eval split for the AMI corpus.
For more information on the dataset please refer to http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml

Authors
 * qingenz123@126.com (Qingen ZHAO) 2022

"""

ALLOWED_OPTIONS = ["scenario_only", "full_corpus", "full_corpus_asr"]


def get_AMI_split(split_option):
    """
    Prepares train, dev, and test sets for the given split_option.

    Arguments
    ---------
    split_option: str
        The standard split option.
        Allowed options: "scenario_only", "full_corpus", "full_corpus_asr"

    Returns
    -------
    Meeting IDs for train, dev, and test sets for the given split_option
    """

    if split_option not in ALLOWED_OPTIONS:
        print(
            f'Invalid split "{split_option}" requested!\nValid split_options are: ',
            ALLOWED_OPTIONS, )
        return

    if split_option == "scenario_only":

        train_set = [
            "ES2002", "ES2005", "ES2006", "ES2007", "ES2008", "ES2009",
            "ES2010", "ES2012", "ES2013", "ES2015", "ES2016", "IS1000",
            "IS1001", "IS1002", "IS1003", "IS1004", "IS1005", "IS1006",
            "IS1007", "TS3005", "TS3008", "TS3009", "TS3010", "TS3011",
            "TS3012",
        ]

        dev_set = [
            "ES2003", "ES2011", "IS1008", "TS3004", "TS3006",
        ]

        test_set = [
            "ES2004", "ES2014", "IS1009", "TS3003", "TS3007",
        ]

    if split_option == "full_corpus":
        # List of train: SA (TRAINING PART OF SEEN DATA)
        train_set = [
            "ES2002", "ES2005", "ES2006", "ES2007", "ES2008", "ES2009",
            "ES2010", "ES2012", "ES2013", "ES2015", "ES2016", "IS1000",
            "IS1001", "IS1002", "IS1003", "IS1004", "IS1005", "IS1006",
            "IS1007", "TS3005", "TS3008", "TS3009", "TS3010", "TS3011",
            "TS3012", "EN2001", "EN2003", "EN2004", "EN2005", "EN2006",
            "EN2009", "IN1001", "IN1002", "IN1005", "IN1007", "IN1008",
            "IN1009", "IN1012", "IN1013", "IN1014", "IN1016",
        ]

        # List of dev: SB (DEV PART OF SEEN DATA)
        dev_set = [
            "ES2003", "ES2011", "IS1008", "TS3004", "TS3006", "IB4001",
            "IB4002", "IB4003", "IB4004", "IB4010", "IB4011",
        ]

        # List of test: SC (UNSEEN DATA FOR EVALUATION)
        # Note that IB4005 does not appear because it has speakers in common with two sets of data.
        test_set = [
            "ES2004", "ES2014", "IS1009", "TS3003", "TS3007", "EN2002",
        ]

    if split_option == "full_corpus_asr":
        train_set = [
            "ES2002", "ES2003", "ES2005", "ES2006", "ES2007", "ES2008",
            "ES2009", "ES2010", "ES2012", "ES2013", "ES2014", "ES2015",
            "ES2016", "IS1000", "IS1001", "IS1002", "IS1003", "IS1004",
            "IS1005", "IS1006", "IS1007", "TS3005", "TS3006", "TS3007",
            "TS3008", "TS3009", "TS3010", "TS3011", "TS3012", "EN2001",
            "EN2003", "EN2004", "EN2005", "EN2006", "EN2009", "IN1001",
            "IN1002", "IN1005", "IN1007", "IN1008", "IN1009", "IN1012",
            "IN1013", "IN1014", "IN1016",
        ]

        dev_set = [
            "ES2011", "IS1008", "TS3004", "IB4001", "IB4002", "IB4003",
            "IB4004", "IB4010", "IB4011",
        ]

        test_set = [
            "ES2004", "IS1009", "TS3003", "EN2002",
        ]

    return train_set, dev_set, test_set
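A quick usage sketch for the helper above (illustrative only; each returned ID is a meeting prefix that expands to the individual session recordings, e.g. ES2002a-d):

```python
from ami_splits import get_AMI_split

train_set, dev_set, test_set = get_AMI_split("full_corpus_asr")
print(len(train_set), "train /", len(dev_set), "dev /", len(test_set), "eval meetings")
assert not set(dev_set) & set(test_set)  # the splits are disjoint
```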
@ -0,0 +1,49 @@
#!/bin/bash

stage=1

TARGET_DIR=${MAIN_ROOT}/dataset/ami
data_folder=${TARGET_DIR}/amicorpus  #e.g., /path/to/amicorpus/
manual_annot_folder=${TARGET_DIR}/ami_public_manual_1.6.2  #e.g., /path/to/ami_public_manual_1.6.2/

save_folder=${MAIN_ROOT}/examples/ami/sd0/data
ref_rttm_dir=${save_folder}/ref_rttms
meta_data_dir=${save_folder}/metadata

set=L

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
set -u
set -o pipefail

mkdir -p ${save_folder}

if [ ${stage} -le 0 ]; then
    # Download the AMI corpus. You need around 10 GB of free space to get the whole data.
    # The signals are too large to package in this way,
    # so you need to use the chooser to indicate which ones you wish to download.
    echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data."
    echo "Annotations: AMI manual annotations v1.6.2"
    echo "Signals:"
    echo "1) Select one or more AMI meetings: for the IDs please follow ./local/ami_splits.py"
    echo "2) Select media streams: just select Headset mix"
    exit 0;
fi

if [ ${stage} -le 1 ]; then
    echo "AMI Data preparation"

    python local/ami_prepare.py --data_folder ${data_folder} \
            --manual_annot_folder ${manual_annot_folder} \
            --save_folder ${save_folder} --ref_rttm_dir ${ref_rttm_dir} \
            --meta_data_dir ${meta_data_dir}

    if [ $? -ne 0 ]; then
        echo "Prepare AMI failed. Please check the log message."
        exit 1
    fi
fi

echo "AMI data preparation done."
exit 0
@ -0,0 +1,97 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Data reading and writing.

Authors
 * qingenz123@126.com (Qingen ZHAO) 2022

"""
import os
import pickle
import time


def save_pkl(obj, file):
    """Save an object in pkl format.

    Arguments
    ---------
    obj : object
        Object to save in pkl format
    file : str
        Path to the output file

    Example
    -------
    >>> tmpfile = os.path.join(getfixture('tmpdir'), "example.pkl")
    >>> save_pkl([1, 2, 3, 4, 5], tmpfile)
    >>> load_pkl(tmpfile)
    [1, 2, 3, 4, 5]
    """
    with open(file, "wb") as f:
        pickle.dump(obj, f)


def load_pickle(pickle_path):
    """Utility function for loading .pkl pickle files.

    Arguments
    ---------
    pickle_path : str
        Path to pickle file.

    Returns
    -------
    out : object
        Python object loaded from pickle.
    """
    with open(pickle_path, "rb") as f:
        out = pickle.load(f)
    return out


def load_pkl(file):
    """Loads a pkl file.

    For an example, see `save_pkl`.

    Arguments
    ---------
    file : str
        Path to the input pkl file.

    Returns
    -------
    The loaded object.
    """

    # Deals with the situation where two processes are trying
    # to access the same label dictionary by creating a lock
    count = 100
    while count > 0:
        if os.path.isfile(file + ".lock"):
            time.sleep(1)
            count -= 1
        else:
            break

    try:
        open(file + ".lock", "w").close()
        with open(file, "rb") as f:
            return pickle.load(f)
    finally:
        if os.path.isfile(file + ".lock"):
            os.remove(file + ".lock")
@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

# model exp
#MODEL=ECAPA_TDNN
#export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}/bin
@ -0,0 +1,14 @@
#!/bin/bash

. path.sh || exit 1;
set -e

stage=1

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

if [ ${stage} -le 1 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
fi
@ -0,0 +1 @@
../../../utils
@ -0,0 +1,250 @@
# Tacotron2 with CSMSC
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).

## Dataset
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).

### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2; the durations from MFA are not needed here.
You can download the alignment from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.

## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - synthesize waveform from a text file.

```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

```text
dump
├── dev
│   ├── norm
│   └── raw
├── phone_id_map.txt
├── speaker_id_map.txt
├── test
│   ├── norm
│   └── raw
└── train
    ├── norm
    ├── raw
    └── speech_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.

Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the speaker, and the id of each utterance.
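For readers who want to inspect the dumped data, a small sketch of reading one record from a `metadata.jsonl` file (each line is a JSON object; the exact field names are whatever the preprocessing scripts wrote, so the snippet prints the keys rather than assuming them):

```python
import json

with open("dump/train/norm/metadata.jsonl", encoding="utf-8") as f:
    first = json.loads(f.readline())

print(sorted(first.keys()))  # e.g. phones, durations, feature path, lengths, speaker, utt id
```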
|
||||
### Model Training
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
Here's the complete help message.
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--ngpu NGPU] [--phones-dict PHONES_DICT]
|
||||
|
||||
Train a Tacotron2 model.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG tacotron2 config file.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
```
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
5. `--phones-dict` is the path of the phone vocabulary file.
|
||||
|
||||
### Synthesizing
|
||||
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
|
||||
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
|
||||
```bash
|
||||
unzip pwg_baker_ckpt_0.4.zip
|
||||
```
|
||||
Parallel WaveGAN checkpoint contains files listed below.
|
||||
```text
|
||||
pwg_baker_ckpt_0.4
|
||||
├── pwg_default.yaml # default config used to train parallel wavegan
|
||||
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
|
||||
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
|
||||
```
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h]
|
||||
[--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
|
||||
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
|
||||
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
|
||||
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
|
||||
[--voice-cloning VOICE_CLONING]
|
||||
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
|
||||
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
|
||||
[--voc_stat VOC_STAT] [--ngpu NGPU]
|
||||
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
|
||||
|
||||
Synthesize with acoustic model & vocoder
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
|
||||
Choose acoustic model type of tts task.
|
||||
--am_config AM_CONFIG
|
||||
Config of acoustic model. Use deault config when it is
|
||||
None.
|
||||
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
|
||||
--am_stat AM_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training acoustic model.
|
||||
--phones_dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones_dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--speaker_dict SPEAKER_DICT
|
||||
speaker id map file.
|
||||
--voice-cloning VOICE_CLONING
|
||||
whether training voice cloning model.
|
||||
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
|
||||
Choose vocoder type of tts task.
|
||||
--voc_config VOC_CONFIG
|
||||
Config of voc. Use deault config when it is None.
|
||||
--voc_ckpt VOC_CKPT Checkpoint file of voc.
|
||||
--voc_stat VOC_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training voc.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
--test_metadata TEST_METADATA
|
||||
test metadata.
|
||||
--output_dir OUTPUT_DIR
|
||||
output dir.
|
||||
```
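For reference, a direct call to `synthesize.py` for this example might look like the sketch below (the acoustic-model checkpoint name and the stats/dict paths assume the default layout of this recipe and the Parallel WaveGAN checkpoint unzipped above; adjust them to your own run):
```bash
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --am=tacotron2_csmsc \
    --am_config=conf/default.yaml \
    --am_ckpt=exp/default/checkpoints/snapshot_iter_30600.pdz \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_csmsc \
    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=exp/default/test \
    --phones_dict=dump/phone_id_map.txt
```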
|
||||
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveforms from a text file.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize_e2e.py [-h]
|
||||
[--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
|
||||
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
|
||||
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
|
||||
[--tones_dict TONES_DICT]
|
||||
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
|
||||
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
|
||||
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
|
||||
[--voc_stat VOC_STAT] [--lang LANG]
|
||||
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
|
||||
[--text TEXT] [--output_dir OUTPUT_DIR]
|
||||
|
||||
Synthesize with acoustic model & vocoder
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
|
||||
Choose acoustic model type of tts task.
|
||||
--am_config AM_CONFIG
|
||||
Config of acoustic model. Use deault config when it is
|
||||
None.
|
||||
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
|
||||
--am_stat AM_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training acoustic model.
|
||||
--phones_dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones_dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--speaker_dict SPEAKER_DICT
|
||||
speaker id map file.
|
||||
--spk_id SPK_ID spk id for multi speaker acoustic model
|
||||
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
|
||||
Choose vocoder type of tts task.
|
||||
--voc_config VOC_CONFIG
|
||||
Config of voc. Use deault config when it is None.
|
||||
--voc_ckpt VOC_CKPT Checkpoint file of voc.
|
||||
--voc_stat VOC_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training voc.
|
||||
--lang LANG Choose model language. zh or en
|
||||
--inference_dir INFERENCE_DIR
|
||||
dir to save inference models
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
|
||||
--output_dir OUTPUT_DIR
|
||||
output dir.
|
||||
```
|
||||
1. `--am` is the acoustic model type, in the format {model_name}_{dataset}.
|
||||
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
|
||||
3. `--voc` is the vocoder type, in the format {model_name}_{dataset}.
|
||||
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the Parallel WaveGAN pretrained model.
|
||||
5. `--lang` is the model language, which can be `zh` or `en`.
|
||||
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
|
||||
7. `--text` is the text file, which contains sentences to synthesize.
|
||||
8. `--output_dir` is the directory to save synthesized audio files.
|
||||
9. `--ngpu` is the number of GPUs to use; if `ngpu` == 0, the CPU is used.
|
||||
|
||||
|
||||
## Pretrained Model
|
||||
Pretrained Tacotron2 model with no silence at the edges of audio:
|
||||
- [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)
|
||||
|
||||
The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip).
|
||||
|
||||
|
||||
Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss | eval/attn_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 1(gpu) x 30600|0.57185|0.39614|0.14642|0.029|5.8e-05|
|
||||
|
||||
The Tacotron2 checkpoint contains the files listed below.
|
||||
```text
|
||||
tacotron2_csmsc_ckpt_0.2.0
|
||||
├── default.yaml # default config used to train Tacotron2
|
||||
├── phone_id_map.txt # phone vocabulary file when training Tacotron2
|
||||
├── snapshot_iter_30600.pdz # model parameters and optimizer states
|
||||
└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
|
||||
```
|
||||
You can use the following script to synthesize `${BIN_DIR}/../sentences.txt` using the pretrained Tacotron2 and Parallel WaveGAN models.
|
||||
```bash
|
||||
source path.sh
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=tacotron2_csmsc \
|
||||
--am_config=tacotron2_csmsc_ckpt_0.2.0/default.yaml \
|
||||
--am_ckpt=tacotron2_csmsc_ckpt_0.2.0/snapshot_iter_30600.pdz \
|
||||
--am_stat=tacotron2_csmsc_ckpt_0.2.0/speech_stats.npy \
|
||||
--voc=pwgan_csmsc \
|
||||
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=exp/default/test_e2e \
|
||||
--inference_dir=exp/default/inference \
|
||||
--phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
|
||||
```
|
@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
|
||||
train_output_path=$1
|
||||
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=tacotron2_csmsc \
|
||||
--voc=pwgan_csmsc \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
fi
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=tacotron2_csmsc \
|
||||
--voc=mb_melgan_csmsc \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
fi
|
||||
|
||||
# style melgan
|
||||
# style melgan's dygraph-to-static-graph conversion is not ready yet
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=tacotron2_csmsc \
|
||||
--voc=style_melgan_csmsc \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=tacotron2_csmsc \
|
||||
--voc=hifigan_csmsc \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
fi
|
@ -0,0 +1,67 @@
|
||||
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 24000 # Sampling rate.
|
||||
n_fft: 2048 # FFT size (samples).
|
||||
n_shift: 300 # Hop size (samples). 12.5ms
|
||||
win_length: 1200 # Window length (samples). 50ms
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
|
||||
mu_law: True                  # Recommended to suppress noise if using raw bits.
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
rnn_dims: 512 # Hidden dims of RNN Layers.
|
||||
fc_dims: 512
|
||||
bits: 9 # Bit depth of signal
|
||||
aux_context_window: 2 # Context window size for auxiliary feature.
|
||||
# If set to 2, previous 2 and future 2 frames will be considered.
|
||||
aux_channels: 80 # Number of channels for auxiliary feature conv.
|
||||
# Must be the same as num_mels.
|
||||
upsample_scales: [4, 5, 3, 5]   # Upsampling scales. Product of these must equal the hop size (4 * 5 * 3 * 5 = 300), same as PWGAN here.
|
||||
compute_dims: 128 # Dims of Conv1D in MelResNet.
|
||||
res_out_dims: 128 # Dims of output in MelResNet.
|
||||
res_blocks: 10 # Number of residual blocks.
|
||||
mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)
|
||||
inference:
|
||||
gen_batched: True         # whether to generate samples in batch mode
|
||||
target: 12000 # target number of samples to be generated in each batch entry
|
||||
overlap: 600 # number of samples for crossfading between batches
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 64 # Batch size.
|
||||
batch_max_steps: 4500       # Length of each audio clip in the batch (samples). Must be divisible by hop_size (4500 / 300 = 15 frames).
|
||||
num_workers: 2 # Number of workers in DataLoader.
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
grad_clip: 4.0
|
||||
learning_rate: 1.0e-4
|
||||
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
|
||||
train_max_steps: 400000 # Number of training steps.
|
||||
save_interval_steps: 5000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 1000 # Interval steps to evaluate the network.
|
||||
gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples
|
||||
generate_num: 5 # number of samples to generate at each checkpoint
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_snapshots: 10 # max number of snapshots to keep while training
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
config_path=$1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./baker_alignment_tone \
|
||||
--output=durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
|
||||
--rootdir=~/datasets/BZNSYP/ \
|
||||
--dataset=baker \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--cut-sil=True \
|
||||
--num-cpu=20
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="feats"
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# normalize, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
|
||||
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
|
||||
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
fi
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--config=${config_path} \
|
||||
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=${train_output_path}/test
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
export PYTHONDONTWRITEBYTECODE=1
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
MODEL=wavernn
|
||||
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
|
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=0,1
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/default.yaml
|
||||
train_output_path=exp/default
|
||||
test_input=dump/dump_gta_test
|
||||
ckpt_name=snapshot_iter_100000.pdz
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
@ -0,0 +1,87 @@
|
||||
# This configuration is for Paddle to train Tacotron 2. Compared to the
|
||||
# original paper, this configuration additionally uses the guided attention
|
||||
# loss to accelerate the learning of the diagonal attention. It requires
|
||||
# only a single GPU with 12 GB memory and it takes ~1 day to finish the
|
||||
# training on Titan V.
|
||||
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 22050 # Sampling rate.
|
||||
n_fft: 1024 # FFT size (samples).
|
||||
n_shift: 256 # Hop size (samples). 11.6ms
|
||||
win_length: null # Window length (samples).
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model: # keyword arguments for the selected model
|
||||
embed_dim: 512 # char or phn embedding dimension
|
||||
elayers: 1 # number of blstm layers in encoder
|
||||
eunits: 512 # number of blstm units
|
||||
econv_layers: 3 # number of convolutional layers in encoder
|
||||
econv_chans: 512 # number of channels in convolutional layer
|
||||
econv_filts: 5 # filter size of convolutional layer
|
||||
atype: location # attention function type
|
||||
adim: 512 # attention dimension
|
||||
aconv_chans: 32 # number of channels in convolutional layer of attention
|
||||
aconv_filts: 15 # filter size of convolutional layer of attention
|
||||
cumulate_att_w: True # whether to cumulate attention weight
|
||||
dlayers: 2 # number of lstm layers in decoder
|
||||
dunits: 1024 # number of lstm units in decoder
|
||||
prenet_layers: 2 # number of layers in prenet
|
||||
prenet_units: 256 # number of units in prenet
|
||||
postnet_layers: 5 # number of layers in postnet
|
||||
postnet_chans: 512 # number of channels in postnet
|
||||
postnet_filts: 5 # filter size of postnet layer
|
||||
output_activation: null # activation function for the final output
|
||||
use_batch_norm: True # whether to use batch normalization in encoder
|
||||
use_concate: True # whether to concatenate encoder embedding with decoder outputs
|
||||
use_residual: False # whether to use residual connection in encoder
|
||||
dropout_rate: 0.5 # dropout rate
|
||||
zoneout_rate: 0.1 # zoneout rate
|
||||
reduction_factor: 1 # reduction factor
|
||||
spk_embed_dim: null # speaker embedding dimension
|
||||
|
||||
|
||||
###########################################################
|
||||
# UPDATER SETTING #
|
||||
###########################################################
|
||||
updater:
|
||||
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
|
||||
use_guided_attn_loss: True # whether to use guided attention loss
|
||||
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
|
||||
guided_attn_loss_lambda: 1.0 # strength of guided attention loss
|
||||
|
||||
|
||||
##########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
##########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 1.0e-03 # learning rate
|
||||
epsilon: 1.0e-06 # epsilon
|
||||
weight_decay: 0.0 # weight decay coefficient
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 300
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 42
|
@ -1,8 +1,62 @@
|
||||
#!/bin/bash
|
||||
|
||||
preprocess_path=$1
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
python3 ${BIN_DIR}/preprocess.py \
|
||||
--input=~/datasets/LJSpeech-1.1 \
|
||||
--output=${preprocess_path} \
|
||||
-v \
|
||||
config_path=$1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./ljspeech_alignment \
|
||||
--output=durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/preprocess.py \
|
||||
--dataset=ljspeech \
|
||||
--rootdir=~/datasets/LJSpeech-1.1/ \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--num-cpu=20 \
|
||||
--cut-sil=True
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="speech"
|
||||
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# normalize and convert phone to id, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
|
||||
python3 ${BIN_DIR}/normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--speech-stats=dump/train/speech_stats.npy \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--speaker-dict=dump/speaker_id_map.txt
|
||||
fi
|
||||
|
@ -1,11 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
train_output_path=$1
|
||||
ckpt_name=$2
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
python3 ${BIN_DIR}/synthesize.py \
|
||||
--config=${train_output_path}/config.yaml \
|
||||
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--input=${BIN_DIR}/../sentences_en.txt \
|
||||
--output=${train_output_path}/test \
|
||||
--ngpu=1
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=tacotron2_ljspeech \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=pwgan_ljspeech \
|
||||
--voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
|
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
# TODO: dygraph to static graph is not good for tacotron2_ljspeech now
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=tacotron2_ljspeech \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=pwgan_ljspeech \
|
||||
--voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
|
||||
--lang=en \
|
||||
--text=${BIN_DIR}/../sentences_en.txt \
|
||||
--output_dir=${train_output_path}/test_e2e \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
# --inference_dir=${train_output_path}/inference
|
@ -1,9 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
preprocess_path=$1
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
python3 ${BIN_DIR}/train.py \
|
||||
--data=${preprocess_path} \
|
||||
--output=${train_output_path} \
|
||||
--ngpu=1 \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=1 \
|
||||
--phones-dict=dump/phone_id_map.txt
|
@ -0,0 +1,8 @@
|
||||
|
||||
For dataset info, refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about)
|
||||
|
||||
sv0 - speaker verification with softmax backend etc., all Python code
|
||||
for more info, refer to sv0/readme.txt
|
||||
|
||||
sv1 - depends on Kaldi; speaker verification with plda/sc backend,
|
||||
for more info, refer to sv1/readme.txt
|
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/python3
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Make VoxCeleb1 trial of kaldi format
|
||||
this script creates the test trial in Kaldi trial format from the Kaldi trial voxceleb1_test_v2.txt
or the official trial veri_test2.txt
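
Example usage (the script filename and file paths below are illustrative):
    python3 make_voxceleb_kaldi_trial.py --voxceleb_trial veri_test2.txt --trial data/test/trial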
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import os
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--voxceleb_trial",
|
||||
default="voxceleb1_test_v2",
|
||||
type=str,
|
||||
help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt")
|
||||
parser.add_argument("--trial",
|
||||
default="data/test/trial",
|
||||
type=str,
|
||||
help="Kaldi format trial file")
|
||||
args = parser.parse_args()
|
||||
|
||||
def main(voxceleb_trial, trial):
|
||||
"""
|
||||
VoxCeleb provides several trial files, whose format is different from the Kaldi format.
|
||||
|
||||
The meaning of the VoxCeleb format is as follows:
|
||||
--------------------------------
|
||||
target_or_nontarget path1 path2
|
||||
--------------------------------
|
||||
target_or_nontarget is an integer: 1 (target)    path1 and path2 are from the same speaker
                                   0 (nontarget) path1 and path2 are from different speakers
|
||||
path1: spkr_id/rec_id/name
|
||||
path2: spkr_id/rec_id/name
|
||||
|
||||
The meaning of the Kaldi format is as follows:
|
||||
---------------------------------------
|
||||
utt_id1 utt_id2 target_or_nontarget
|
||||
---------------------------------------
|
||||
utt_id1: utterance identification or speaker identification
|
||||
utt_id2: utterance identification or speaker identification
|
||||
target_or_nontarget is a string: 'target'    utt_id1 and utt_id2 are from the same speaker
                                 'nontarget' utt_id1 and utt_id2 are from different speakers
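
For example (the utterance IDs below are illustrative), the VoxCeleb line
    1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav
is converted to the Kaldi line
    id10270-x6uYqmx31kE-00001.wav id10270-8jEAjG6SegY-00008.wav target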
|
||||
"""
|
||||
print("Start convert the voxceleb trial to kaldi format")
|
||||
if not os.path.exists(voxceleb_trial):
|
||||
raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial))
|
||||
|
||||
trial_dirname = os.path.dirname(trial)
|
||||
if not os.path.exists(trial_dirname):
|
||||
os.mkdir(trial_dirname)
|
||||
|
||||
with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
|
||||
codecs.open(trial, 'w', encoding='utf-8') as w:
|
||||
for line in f:
|
||||
target_or_nontarget, path1, path2 = line.strip().split()
|
||||
|
||||
utt_id1 = "-".join(path1.split("/"))
|
||||
utt_id2 = "-".join(path2.split("/"))
|
||||
target = "nontarget"
|
||||
if int(target_or_nontarget):
|
||||
target = "target"
|
||||
w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
|
||||
print("Convert the voxceleb trial to kaldi format successfully")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(args.voxceleb_trial, args.trial)
|
@ -0,0 +1,51 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
|
||||
# x: [0: 2**bit-1], return: [-1, 1]
|
||||
def label_2_float(x, bits):
|
||||
return 2 * x / (2**bits - 1.) - 1.
|
||||
|
||||
|
||||
#x: [-1, 1], return: [0, 2**bits-1]
|
||||
def float_2_label(x, bits):
|
||||
assert abs(x).max() <= 1.0
|
||||
x = (x + 1.) * (2**bits - 1) / 2
|
||||
return x.clip(0, 2**bits - 1)
|
||||
|
||||
|
||||
# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1]
|
||||
# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
|
||||
# be careful: the input `mu` here is one greater (+1) than the μ in the link above
|
||||
def encode_mu_law(x, mu):
|
||||
mu = mu - 1
|
||||
fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
|
||||
return np.floor((fx + 1) / 2 * mu + 0.5)
|
||||
|
||||
|
||||
# from_labels = True:
|
||||
# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1]
|
||||
# from_labels = False:
|
||||
# y: [-1, 1], return: [-1, 1]
|
||||
def decode_mu_law(y, mu, from_labels=True):
|
||||
# TODO: get rid of log2 - makes no sense
|
||||
if from_labels:
|
||||
y = label_2_float(y, math.log2(mu))
|
||||
mu = mu - 1
|
||||
x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
|
||||
return x
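
# Illustrative usage sketch (not part of this module): with bits = 9 and
# mu = 2**bits = 512, encode_mu_law maps a float waveform in [-1, 1] to
# integer labels in [0, 511], and decode_mu_law(..., from_labels=True)
# approximately recovers the waveform:
#
#     x = np.linspace(-1.0, 1.0, num=5)
#     labels = encode_mu_law(x, mu=512)
#     x_hat = decode_mu_law(paddle.to_tensor(labels, dtype='float32'), mu=512)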
|
@ -1,328 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from operator import itemgetter
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
import jsonlines
|
||||
import librosa
|
||||
import numpy as np
|
||||
import tqdm
|
||||
import yaml
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.t2s.data.get_feats import LogMelFBank
|
||||
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
|
||||
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
|
||||
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
|
||||
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
|
||||
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
|
||||
|
||||
|
||||
def process_sentence(config: Dict[str, Any],
|
||||
fp: Path,
|
||||
sentences: Dict,
|
||||
output_dir: Path,
|
||||
mel_extractor=None,
|
||||
cut_sil: bool=True,
|
||||
spk_emb_dir: Path=None):
|
||||
utt_id = fp.stem
|
||||
# for vctk
|
||||
if utt_id.endswith("_mic2"):
|
||||
utt_id = utt_id[:-5]
|
||||
record = None
|
||||
if utt_id in sentences:
|
||||
# reading, resampling may occur
|
||||
wav, _ = librosa.load(str(fp), sr=config.fs)
|
||||
if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
|
||||
return record
|
||||
assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
|
||||
assert np.abs(wav).max(
|
||||
) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
|
||||
phones = sentences[utt_id][0]
|
||||
durations = sentences[utt_id][1]
|
||||
speaker = sentences[utt_id][2]
|
||||
d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
|
||||
# little imprecise than use *.TextGrid directly
|
||||
times = librosa.frames_to_time(
|
||||
d_cumsum, sr=config.fs, hop_length=config.n_shift)
|
||||
if cut_sil:
|
||||
start = 0
|
||||
end = d_cumsum[-1]
|
||||
if phones[0] == "sil" and len(durations) > 1:
|
||||
start = times[1]
|
||||
durations = durations[1:]
|
||||
phones = phones[1:]
|
||||
if phones[-1] == 'sil' and len(durations) > 1:
|
||||
end = times[-2]
|
||||
durations = durations[:-1]
|
||||
phones = phones[:-1]
|
||||
sentences[utt_id][0] = phones
|
||||
sentences[utt_id][1] = durations
|
||||
start, end = librosa.time_to_samples([start, end], sr=config.fs)
|
||||
wav = wav[start:end]
|
||||
# extract mel feats
|
||||
logmel = mel_extractor.get_log_mel_fbank(wav)
|
||||
# change duration according to mel_length
|
||||
compare_duration_and_mel_length(sentences, utt_id, logmel)
|
||||
phones = sentences[utt_id][0]
|
||||
durations = sentences[utt_id][1]
|
||||
num_frames = logmel.shape[0]
|
||||
assert sum(durations) == num_frames
|
||||
mel_dir = output_dir / "data_speech"
|
||||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
mel_path = mel_dir / (utt_id + "_speech.npy")
|
||||
np.save(mel_path, logmel)
|
||||
record = {
|
||||
"utt_id": utt_id,
|
||||
"phones": phones,
|
||||
"text_lengths": len(phones),
|
||||
"speech_lengths": num_frames,
|
||||
"speech": str(mel_path),
|
||||
"speaker": speaker
|
||||
}
|
||||
if spk_emb_dir:
|
||||
if speaker in os.listdir(spk_emb_dir):
|
||||
embed_name = utt_id + ".npy"
|
||||
embed_path = spk_emb_dir / speaker / embed_name
|
||||
if embed_path.is_file():
|
||||
record["spk_emb"] = str(embed_path)
|
||||
else:
|
||||
return None
|
||||
return record
|
||||
|
||||
|
||||
def process_sentences(config,
|
||||
fps: List[Path],
|
||||
sentences: Dict,
|
||||
output_dir: Path,
|
||||
mel_extractor=None,
|
||||
nprocs: int=1,
|
||||
cut_sil: bool=True,
|
||||
spk_emb_dir: Path=None):
|
||||
if nprocs == 1:
|
||||
results = []
|
||||
for fp in fps:
|
||||
record = process_sentence(config, fp, sentences, output_dir,
|
||||
mel_extractor, cut_sil, spk_emb_dir)
|
||||
if record:
|
||||
results.append(record)
|
||||
else:
|
||||
with ThreadPoolExecutor(nprocs) as pool:
|
||||
futures = []
|
||||
with tqdm.tqdm(total=len(fps)) as progress:
|
||||
for fp in fps:
|
||||
future = pool.submit(process_sentence, config, fp,
|
||||
sentences, output_dir, mel_extractor,
|
||||
cut_sil, spk_emb_dir)
|
||||
future.add_done_callback(lambda p: progress.update())
|
||||
futures.append(future)
|
||||
|
||||
results = []
|
||||
for ft in futures:
|
||||
record = ft.result()
|
||||
if record:
|
||||
results.append(record)
|
||||
|
||||
results.sort(key=itemgetter("utt_id"))
|
||||
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
|
||||
for item in results:
|
||||
writer.write(item)
|
||||
print("Done")
|
||||
|
||||
|
||||
def main():
|
||||
# parse config and args
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess audio and then extract features.")
|
||||
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
default="baker",
|
||||
type=str,
|
||||
help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
|
||||
|
||||
parser.add_argument(
|
||||
"--rootdir", default=None, type=str, help="directory to dataset.")
|
||||
|
||||
parser.add_argument(
|
||||
"--dumpdir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="directory to dump feature files.")
|
||||
parser.add_argument(
|
||||
"--dur-file", default=None, type=str, help="path to durations.txt.")
|
||||
|
||||
parser.add_argument("--config", type=str, help="fastspeech2 config file.")
|
||||
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
type=int,
|
||||
default=1,
|
||||
help="logging level. higher is more logging. (default=1)")
|
||||
parser.add_argument(
|
||||
"--num-cpu", type=int, default=1, help="number of process.")
|
||||
|
||||
def str2bool(str):
|
||||
return True if str.lower() == 'true' else False
|
||||
|
||||
parser.add_argument(
|
||||
"--cut-sil",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="whether cut sil in the edge of audio")
|
||||
|
||||
parser.add_argument(
|
||||
"--spk_emb_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
help="directory to speaker embedding files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
rootdir = Path(args.rootdir).expanduser()
|
||||
dumpdir = Path(args.dumpdir).expanduser()
|
||||
# use absolute path
|
||||
dumpdir = dumpdir.resolve()
|
||||
dumpdir.mkdir(parents=True, exist_ok=True)
|
||||
dur_file = Path(args.dur_file).expanduser()
|
||||
|
||||
if args.spk_emb_dir:
|
||||
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
|
||||
else:
|
||||
spk_emb_dir = None
|
||||
|
||||
assert rootdir.is_dir()
|
||||
assert dur_file.is_file()
|
||||
|
||||
with open(args.config, 'rt') as f:
|
||||
config = CfgNode(yaml.safe_load(f))
|
||||
|
||||
if args.verbose > 1:
|
||||
print(vars(args))
|
||||
print(config)
|
||||
|
||||
sentences, speaker_set = get_phn_dur(dur_file)
|
||||
|
||||
merge_silence(sentences)
|
||||
phone_id_map_path = dumpdir / "phone_id_map.txt"
|
||||
speaker_id_map_path = dumpdir / "speaker_id_map.txt"
|
||||
get_input_token(sentences, phone_id_map_path, args.dataset)
|
||||
get_spk_id_map(speaker_set, speaker_id_map_path)
|
||||
|
||||
if args.dataset == "baker":
|
||||
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
|
||||
# split data into 3 sections
|
||||
num_train = 9800
|
||||
num_dev = 100
|
||||
train_wav_files = wav_files[:num_train]
|
||||
dev_wav_files = wav_files[num_train:num_train + num_dev]
|
||||
test_wav_files = wav_files[num_train + num_dev:]
|
||||
elif args.dataset == "aishell3":
|
||||
sub_num_dev = 5
|
||||
wav_dir = rootdir / "train" / "wav"
|
||||
train_wav_files = []
|
||||
dev_wav_files = []
|
||||
test_wav_files = []
|
||||
for speaker in os.listdir(wav_dir):
|
||||
wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
|
||||
if len(wav_files) > 100:
|
||||
train_wav_files += wav_files[:-sub_num_dev * 2]
|
||||
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
|
||||
test_wav_files += wav_files[-sub_num_dev:]
|
||||
else:
|
||||
train_wav_files += wav_files
|
||||
|
||||
elif args.dataset == "ljspeech":
|
||||
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
|
||||
# split data into 3 sections
|
||||
num_train = 12900
|
||||
num_dev = 100
|
||||
train_wav_files = wav_files[:num_train]
|
||||
dev_wav_files = wav_files[num_train:num_train + num_dev]
|
||||
test_wav_files = wav_files[num_train + num_dev:]
|
||||
elif args.dataset == "vctk":
|
||||
sub_num_dev = 5
|
||||
wav_dir = rootdir / "wav48_silence_trimmed"
|
||||
train_wav_files = []
|
||||
dev_wav_files = []
|
||||
test_wav_files = []
|
||||
for speaker in os.listdir(wav_dir):
|
||||
wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
|
||||
if len(wav_files) > 100:
|
||||
train_wav_files += wav_files[:-sub_num_dev * 2]
|
||||
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
|
||||
test_wav_files += wav_files[-sub_num_dev:]
|
||||
else:
|
||||
train_wav_files += wav_files
|
||||
|
||||
else:
|
||||
print("dataset should in {baker, aishell3, ljspeech, vctk} now!")
|
||||
|
||||
train_dump_dir = dumpdir / "train" / "raw"
|
||||
train_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
dev_dump_dir = dumpdir / "dev" / "raw"
|
||||
dev_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
test_dump_dir = dumpdir / "test" / "raw"
|
||||
test_dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Extractor
|
||||
mel_extractor = LogMelFBank(
|
||||
sr=config.fs,
|
||||
n_fft=config.n_fft,
|
||||
hop_length=config.n_shift,
|
||||
win_length=config.win_length,
|
||||
window=config.window,
|
||||
n_mels=config.n_mels,
|
||||
fmin=config.fmin,
|
||||
fmax=config.fmax)
|
||||
|
||||
# process for the 3 sections
|
||||
if train_wav_files:
|
||||
process_sentences(
|
||||
config,
|
||||
train_wav_files,
|
||||
sentences,
|
||||
train_dump_dir,
|
||||
mel_extractor,
|
||||
nprocs=args.num_cpu,
|
||||
cut_sil=args.cut_sil,
|
||||
spk_emb_dir=spk_emb_dir)
|
||||
if dev_wav_files:
|
||||
process_sentences(
|
||||
config,
|
||||
dev_wav_files,
|
||||
sentences,
|
||||
dev_dump_dir,
|
||||
mel_extractor,
|
||||
cut_sil=args.cut_sil,
|
||||
spk_emb_dir=spk_emb_dir)
|
||||
if test_wav_files:
|
||||
process_sentences(
|
||||
config,
|
||||
test_wav_files,
|
||||
sentences,
|
||||
test_dump_dir,
|
||||
mel_extractor,
|
||||
nprocs=args.num_cpu,
|
||||
cut_sil=args.cut_sil,
|
||||
spk_emb_dir=spk_emb_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|