[vec][score] add plda model, test=doc fix #1667
commit
6446f72cab
@ -0,0 +1,164 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Prepare VoxCeleb2 dataset
|
||||
|
||||
Download and unpack the voxceleb2 data files.
|
||||
Voxceleb2 data is stored as the m4a format,
|
||||
so we need convert the m4a to wav with the convert.sh scripts
|
||||
"""
|
||||
import argparse
|
||||
import codecs
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile
|
||||
|
||||
from utils.utility import download
|
||||
from utils.utility import unzip
|
||||
|
||||
# all the data will be download in the current data/voxceleb directory default
|
||||
DATA_HOME = os.path.expanduser('.')
|
||||
|
||||
BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
|
||||
|
||||
# dev data
|
||||
DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
|
||||
DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
|
||||
|
||||
# test data
|
||||
TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
|
||||
TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--target_dir",
|
||||
default=DATA_HOME + "/voxceleb2/",
|
||||
type=str,
|
||||
help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--manifest_prefix",
|
||||
default="manifest",
|
||||
type=str,
|
||||
help="Filepath prefix for output manifests. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--download",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Download the voxceleb2 dataset. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--generate",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Generate the manifest files. (default: %(default)s)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path_prefix):
    """Generate a manifest json file and a meta summary for ``data_dir``.

    Each manifest line is a json record with the utterance id, speaker
    label, wav path, duration and an empty transcript (kept for
    compatibility with the asr data format).

    Args:
        data_dir (str): directory containing the converted wav files
            (layout assumed: <spk>/<session>/<utt>.wav — TODO confirm).
        manifest_path_prefix (str): filepath prefix for the output
            manifest; the data dir name (e.g. dev/test) is appended.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_path = os.path.join(data_dir, "**", "*.wav")
    total_sec = 0.0
    total_text = 0.0
    total_num = 0
    speakers = set()
    for audio_path in glob.glob(data_path, recursive=True):
        # utt id joins the last three path components: <spk>-<session>-<utt>
        audio_id = "-".join(audio_path.split("/")[-3:])
        utt2spk = audio_path.split("/")[-3]
        duration = soundfile.info(audio_path).duration
        text = ""
        json_lines.append(
            json.dumps(
                {
                    "utt": audio_id,
                    "utt2spk": str(utt2spk),
                    "feat": audio_path,
                    "feat_shape": (duration, ),
                    "text": text  # compatible with asr data format
                },
                ensure_ascii=False))

        total_sec += duration
        total_text += len(text)
        total_num += 1
        speakers.add(utt2spk)

    # data_dir_name refers to dev or test;
    # "voxceleb2" is given explicitly in the path
    data_dir_name = Path(data_dir).name
    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name

    manifest_dir = os.path.dirname(manifest_path_prefix)
    # guard: dirname is "" when the prefix has no directory component;
    # os.makedirs("") would raise FileNotFoundError
    if manifest_dir and not os.path.exists(manifest_dir):
        os.makedirs(manifest_dir)
    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
        for line in json_lines:
            f.write(line + "\n")

    meta_path = os.path.join(manifest_dir,
                             "voxceleb2." + data_dir_name) + ".meta"
    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
        print(f"{total_num} utts", file=f)
        print(f"{len(speakers)} speakers", file=f)
        print(f"{total_sec / (60 * 60)} h", file=f)
        print(f"{total_text} text", file=f)
        # guard against an empty data dir to avoid ZeroDivisionError
        if total_sec > 0:
            print(f"{total_text / total_sec} text/sec", file=f)
        if total_num > 0:
            print(f"{total_sec / total_num} sec/utt", file=f)
|
||||
|
||||
|
||||
def download_dataset(url, md5sum, target_dir, dataset):
    """Download and unpack one VoxCeleb2 archive (dev or test).

    Args:
        url (str): archive url (BASE_URL already embeds wget flags).
        md5sum (str): expected md5 checksum of the archive.
        target_dir (str): directory in which to store the archive.
        dataset (str): "dev" or "test"; also the unpack sub-directory name.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # wav directory already exists, it need do nothing
    print("target dir {}".format(os.path.join(target_dir, dataset)))
    # unzip the dev dataset will create the dev and unzip the m4a to dev dir
    # but the test dataset will unzip to aac
    # so, wo create the ${target_dir}/test and unzip the m4a to test dir
    if not os.path.exists(os.path.join(target_dir, dataset)):
        filepath = download(url, md5sum, target_dir)
        # NOTE(review): only the test archive is extracted here; the dev
        # archive is downloaded but never unzipped — presumably handled
        # elsewhere (e.g. the convert.sh script). TODO confirm.
        if dataset == "test":
            unzip(filepath, os.path.join(target_dir, "test"))
|
||||
|
||||
|
||||
def main():
    """Entry point: optionally download VoxCeleb2 and build manifests."""
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    # Download and unpack both splits (dev first, then test) when requested.
    print("download: {}".format(args.download))
    if args.download:
        subsets = (
            ("dev", DEV_DATA_URL, DEV_MD5SUM),
            ("test", TEST_DATA_URL, TEST_MD5SUM), )
        for subset_name, subset_url, subset_md5 in subsets:
            download_dataset(
                url=subset_url,
                md5sum=subset_md5,
                target_dir=args.target_dir,
                dataset=subset_name)

        print("VoxCeleb2 download is done!")

    if args.generate:
        create_manifest(
            args.target_dir, manifest_path_prefix=args.manifest_prefix)


if __name__ == '__main__':
    main()
|
@ -0,0 +1,88 @@
|
||||
version: '3.5'
|
||||
|
||||
services:
|
||||
etcd:
|
||||
container_name: milvus-etcd
|
||||
image: quay.io/coreos/etcd:v3.5.0
|
||||
networks:
|
||||
app_net:
|
||||
environment:
|
||||
- ETCD_AUTO_COMPACTION_MODE=revision
|
||||
- ETCD_AUTO_COMPACTION_RETENTION=1000
|
||||
- ETCD_QUOTA_BACKEND_BYTES=4294967296
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
|
||||
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
||||
|
||||
minio:
|
||||
container_name: milvus-minio
|
||||
image: minio/minio:RELEASE.2020-12-03T00-03-10Z
|
||||
networks:
|
||||
app_net:
|
||||
environment:
|
||||
MINIO_ACCESS_KEY: minioadmin
|
||||
MINIO_SECRET_KEY: minioadmin
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
|
||||
command: minio server /minio_data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
standalone:
|
||||
container_name: milvus-standalone
|
||||
image: milvusdb/milvus:v2.0.1
|
||||
networks:
|
||||
app_net:
|
||||
ipv4_address: 172.16.23.10
|
||||
command: ["milvus", "run", "standalone"]
|
||||
environment:
|
||||
ETCD_ENDPOINTS: etcd:2379
|
||||
MINIO_ADDRESS: minio:9000
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
|
||||
ports:
|
||||
- "19530:19530"
|
||||
depends_on:
|
||||
- "etcd"
|
||||
- "minio"
|
||||
|
||||
mysql:
|
||||
container_name: audio-mysql
|
||||
image: mysql:5.7
|
||||
networks:
|
||||
app_net:
|
||||
ipv4_address: 172.16.23.11
|
||||
environment:
|
||||
- MYSQL_ROOT_PASSWORD=123456
|
||||
volumes:
|
||||
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/mysql:/var/lib/mysql
|
||||
ports:
|
||||
- "3306:3306"
|
||||
|
||||
webclient:
|
||||
container_name: audio-webclient
|
||||
image: paddlepaddle/paddlespeech-audio-search-client:2.3
|
||||
networks:
|
||||
app_net:
|
||||
ipv4_address: 172.16.23.13
|
||||
environment:
|
||||
API_URL: 'http://127.0.0.1:8002'
|
||||
ports:
|
||||
- "8068:80"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost/"]
|
||||
interval: 30s
|
||||
timeout: 20s
|
||||
retries: 3
|
||||
|
||||
networks:
|
||||
app_net:
|
||||
driver: bridge
|
||||
ipam:
|
||||
driver: default
|
||||
config:
|
||||
- subnet: 172.16.23.0/24
|
||||
gateway: 172.16.23.1
|
After Width: | Height: | Size: 29 KiB |
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 33 KiB |
After Width: | Height: | Size: 81 KiB |
@ -0,0 +1,13 @@
|
||||
diskcache==5.2.1
|
||||
dtaidistance==2.3.1
|
||||
fastapi
|
||||
librosa==0.8.0
|
||||
numpy==1.21.0
|
||||
pydantic
|
||||
pymilvus==2.0.1
|
||||
pymysql
|
||||
python-multipart
|
||||
soundfile==0.10.3.post1
|
||||
starlette
|
||||
typing
|
||||
uvicorn
|
@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os

############### Milvus Configuration ###############
# Milvus server address and collection/index parameters used by
# milvus_helpers; every value can be overridden via environment variables.
MILVUS_HOST = os.getenv("MILVUS_HOST", "127.0.0.1")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", "19530"))
# Dimensionality of the speaker embedding vectors stored in Milvus.
VECTOR_DIMENSION = int(os.getenv("VECTOR_DIMENSION", "192"))
INDEX_FILE_SIZE = int(os.getenv("INDEX_FILE_SIZE", "1024"))
# Distance metric used for vector search (L2 by default).
METRIC_TYPE = os.getenv("METRIC_TYPE", "L2")
# Collection/table name used when the caller does not supply one.
DEFAULT_TABLE = os.getenv("DEFAULT_TABLE", "audio_table")
# Number of nearest neighbours returned by a search.
TOP_K = int(os.getenv("TOP_K", "10"))

############### MySQL Configuration ###############
# Connection settings for the MySQL instance that maps milvus ids to paths.
MYSQL_HOST = os.getenv("MYSQL_HOST", "127.0.0.1")
MYSQL_PORT = int(os.getenv("MYSQL_PORT", "3306"))
MYSQL_USER = os.getenv("MYSQL_USER", "root")
MYSQL_PWD = os.getenv("MYSQL_PWD", "123456")
MYSQL_DB = os.getenv("MYSQL_DB", "mysql")

############### Data Path ###############
# Directory where uploaded audio files are saved by the web service.
UPLOAD_PATH = os.getenv("UPLOAD_PATH", "tmp/audio-data")

############### Number of Log Files ###############
# NOTE(review): this env var name is lower-case "logs_num", unlike the
# other upper-case settings — confirm this is intentional.
LOGS_NUM = int(os.getenv("logs_num", "0"))
|
@ -0,0 +1,34 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
|
||||
from logs import LOGGER
|
||||
from paddlespeech.cli import VectorExecutor
|
||||
|
||||
# Single shared executor instance; created once at import time.
vector_executor = VectorExecutor()


def get_audio_embedding(path):
    """Compute an L2-normalised speaker embedding for the audio at ``path``.

    Returns the embedding as a plain python list, or None on any failure.
    """
    try:
        raw_embedding = vector_executor(
            audio_file=path, model='ecapatdnn_voxceleb12')
        # Normalise to unit length so L2 search behaves like cosine.
        normalised = raw_embedding / np.linalg.norm(raw_embedding)
        return normalised.tolist()
    except Exception as e:
        LOGGER.error(f"Error with embedding:{e}")
        return None
|
@ -0,0 +1,168 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import uvicorn
|
||||
from config import UPLOAD_PATH
|
||||
from diskcache import Cache
|
||||
from fastapi import FastAPI
|
||||
from fastapi import File
|
||||
from fastapi import UploadFile
|
||||
from logs import LOGGER
|
||||
from milvus_helpers import MilvusHelper
|
||||
from mysql_helpers import MySQLHelper
|
||||
from operations.count import do_count
|
||||
from operations.drop import do_drop
|
||||
from operations.load import do_load
|
||||
from operations.search import do_search
|
||||
from pydantic import BaseModel
|
||||
from starlette.middleware.cors import CORSMiddleware
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import FileResponse
|
||||
|
||||
app = FastAPI()
# Allow cross-origin requests from any host so the web client can call
# this API from a different origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"])

# Shared service clients, created once at import time.
# NOTE(review): MODEL is never used in this module — possibly dead code.
MODEL = None
MILVUS_CLI = MilvusHelper()
MYSQL_CLI = MySQLHelper()

# Mkdir 'tmp/audio-data' (upload destination for /audio/search)
if not os.path.exists(UPLOAD_PATH):
    os.makedirs(UPLOAD_PATH)
    LOGGER.info(f"Mkdir the path: {UPLOAD_PATH}")
|
||||
|
||||
|
||||
@app.get('/data')
def audio_path(audio_path):
    """Serve an audio file from a path on the server.

    NOTE(security review): the path comes straight from the query string
    and is not validated, so any file readable by the process can be
    fetched (path traversal). Restrict to UPLOAD_PATH before exposing
    this endpoint publicly.
    """
    # Get the audio file
    try:
        LOGGER.info(f"Successfully load audio: {audio_path}")
        return FileResponse(audio_path)
    except Exception as e:
        LOGGER.error(f"upload audio error: {e}")
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
@app.get('/progress')
def get_progress():
    """Report data-loading progress ("current: X, total: Y") from the
    disk cache that do_load updates."""
    try:
        progress_cache = Cache('./tmp')
        current, total = progress_cache['current'], progress_cache['total']
        return f"current: {current}, total: {total}"
    except Exception as e:
        LOGGER.error(f"Upload data error: {e}")
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
class Item(BaseModel):
    """Request body for /audio/load: optional table name and a file path."""
    # Table: target collection/table name; defaults to DEFAULT_TABLE downstream
    Table: Optional[str] = None
    # File: server-side directory containing the audio files to index
    File: str
|
||||
|
||||
|
||||
@app.post('/audio/load')
async def load_audios(item: Item):
    """Index every audio file under ``item.File`` into Milvus/MySQL.

    Returns a status dict on success, or a (dict, 400) tuple on failure.
    """
    # Insert all the audio files under the file path to Milvus/MySQL
    try:
        total_num = do_load(item.Table, item.File, MILVUS_CLI, MYSQL_CLI)
        LOGGER.info(f"Successfully loaded data, total count: {total_num}")
        return {'status': True, 'msg': "Successfully loaded data!"}
    except Exception as e:
        LOGGER.error(e)
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
@app.post('/audio/search')
async def search_audio(request: Request,
                       table_name: str=None,
                       audio: UploadFile=File(...)):
    """Search an uploaded audio file against the Milvus/MySQL index.

    The upload is saved under UPLOAD_PATH, embedded, and the nearest
    matches are returned as (path, (basename, distance)) pairs.
    """
    # Search the uploaded audio in Milvus/MySQL
    try:
        # Save the upload data to server.
        content = await audio.read()
        query_audio_path = os.path.join(UPLOAD_PATH, audio.filename)
        with open(query_audio_path, "wb+") as f:
            f.write(content)
        host = request.headers['host']
        _, paths, distances = do_search(host, table_name, query_audio_path,
                                        MILVUS_CLI, MYSQL_CLI)
        names = []
        for path, score in zip(paths, distances):
            names.append(os.path.basename(path))
            LOGGER.info(f"search result {path}, score {score}")
        res = dict(zip(paths, zip(names, distances)))
        # Sort results by distance metric, closest distances first
        # NOTE(review): reverse=True puts the LARGEST distance first, which
        # contradicts "closest first" for an L2 metric — confirm intent.
        res = sorted(res.items(), key=lambda item: item[1][1], reverse=True)
        LOGGER.info("Successfully searched similar audio!")
        return res
    except Exception as e:
        LOGGER.error(e)
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
@app.post('/audio/search/local')
async def search_local_audio(request: Request,
                             query_audio_path: str,
                             table_name: str=None):
    """Search an audio file already present on the server against the
    Milvus/MySQL index; same result shape as /audio/search."""
    # Search the uploaded audio in Milvus/MySQL
    try:
        host = request.headers['host']
        _, paths, distances = do_search(host, table_name, query_audio_path,
                                        MILVUS_CLI, MYSQL_CLI)
        names = []
        for path, score in zip(paths, distances):
            names.append(os.path.basename(path))
            LOGGER.info(f"search result {path}, score {score}")
        res = dict(zip(paths, zip(names, distances)))
        # Sort results by distance metric, closest distances first
        # NOTE(review): reverse=True puts the LARGEST distance first, which
        # contradicts "closest first" for an L2 metric — confirm intent.
        res = sorted(res.items(), key=lambda item: item[1][1], reverse=True)
        LOGGER.info("Successfully searched similar audio!")
        return res
    except Exception as e:
        LOGGER.error(e)
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
@app.get('/audio/count')
async def count_audio(table_name: str=None):
    """Return the total number of vectors in the given (or default) table."""
    # Returns the total number of vectors in the system
    try:
        num = do_count(table_name, MILVUS_CLI)
        LOGGER.info("Successfully count the number of data!")
        return num
    except Exception as e:
        LOGGER.error(e)
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
@app.post('/audio/drop')
async def drop_tables(table_name: str=None):
    """Drop the Milvus collection and the matching MySQL table."""
    # Delete the collection of Milvus and MySQL
    try:
        status = do_drop(table_name, MILVUS_CLI, MYSQL_CLI)
        LOGGER.info("Successfully drop tables in Milvus and MySQL!")
        return status
    except Exception as e:
        LOGGER.error(e)
        return {'status': False, 'msg': e}, 400
|
||||
|
||||
|
||||
# Run the API server on all interfaces, port 8002.
if __name__ == '__main__':
    uvicorn.run(app=app, host='0.0.0.0', port=8002)
|
@ -0,0 +1,185 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
from config import METRIC_TYPE
|
||||
from config import MILVUS_HOST
|
||||
from config import MILVUS_PORT
|
||||
from config import VECTOR_DIMENSION
|
||||
from logs import LOGGER
|
||||
from pymilvus import Collection
|
||||
from pymilvus import CollectionSchema
|
||||
from pymilvus import connections
|
||||
from pymilvus import DataType
|
||||
from pymilvus import FieldSchema
|
||||
from pymilvus import utility
|
||||
|
||||
|
||||
class MilvusHelper:
    """
    the basic operations of PyMilvus

    # This example shows how to:
    # 1. connect to Milvus server
    # 2. create a collection
    # 3. insert entities
    # 4. create index
    # 5. search
    # 6. delete a collection

    All methods call sys.exit(1) on failure instead of raising, so the
    process terminates on any Milvus error.
    """

    def __init__(self):
        # Connect to the Milvus server once at construction time; nothing
        # else can work without a connection, so failure is fatal.
        try:
            self.collection = None
            connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
            LOGGER.debug(
                f"Successfully connect to Milvus with IP:{MILVUS_HOST} and PORT:{MILVUS_PORT}"
            )
        except Exception as e:
            LOGGER.error(f"Failed to connect Milvus: {e}")
            sys.exit(1)

    def set_collection(self, collection_name):
        # Bind self.collection to an existing collection; fatal if absent.
        try:
            if self.has_collection(collection_name):
                self.collection = Collection(name=collection_name)
            else:
                raise Exception(
                    f"There is no collection named:{collection_name}")
        except Exception as e:
            LOGGER.error(f"Failed to set collection in Milvus: {e}")
            sys.exit(1)

    def has_collection(self, collection_name):
        # Return if Milvus has the collection
        try:
            return utility.has_collection(collection_name)
        except Exception as e:
            LOGGER.error(f"Failed to check state of collection in Milvus: {e}")
            sys.exit(1)

    def create_collection(self, collection_name):
        # Create milvus collection if not exists
        try:
            if not self.has_collection(collection_name):
                # schema: auto-generated INT64 primary key plus a float
                # vector field of VECTOR_DIMENSION dims.
                # NOTE(review): "descrition" is a typo for "description";
                # presumably ignored as an unknown kwarg — confirm against
                # the pymilvus FieldSchema API.
                field1 = FieldSchema(
                    name="id",
                    dtype=DataType.INT64,
                    descrition="int64",
                    is_primary=True,
                    auto_id=True)
                field2 = FieldSchema(
                    name="embedding",
                    dtype=DataType.FLOAT_VECTOR,
                    descrition="speaker embeddings",
                    dim=VECTOR_DIMENSION,
                    is_primary=False)
                schema = CollectionSchema(
                    fields=[field1, field2], description="embeddings info")
                self.collection = Collection(
                    name=collection_name, schema=schema)
                LOGGER.debug(f"Create Milvus collection: {collection_name}")
            else:
                self.set_collection(collection_name)
            return "OK"
        except Exception as e:
            LOGGER.error(f"Failed to create collection in Milvus: {e}")
            sys.exit(1)

    def insert(self, collection_name, vectors):
        # Batch insert vectors to milvus collection; returns primary keys.
        try:
            self.create_collection(collection_name)
            data = [vectors]
            self.set_collection(collection_name)
            mr = self.collection.insert(data)
            ids = mr.primary_keys
            # load() makes the inserted vectors searchable
            self.collection.load()
            LOGGER.debug(
                f"Insert vectors to Milvus in collection: {collection_name} with {len(vectors)} rows"
            )
            return ids
        except Exception as e:
            LOGGER.error(f"Failed to insert data to Milvus: {e}")
            sys.exit(1)

    def create_index(self, collection_name):
        # Create IVF_SQ8 index on the "embedding" field of the collection
        try:
            self.set_collection(collection_name)
            default_index = {
                "index_type": "IVF_SQ8",
                "metric_type": METRIC_TYPE,
                "params": {
                    "nlist": 16384
                }
            }
            status = self.collection.create_index(
                field_name="embedding", index_params=default_index)
            # status.code == 0 indicates success for this client version
            if not status.code:
                LOGGER.debug(
                    f"Successfully create index in collection:{collection_name} with param:{default_index}"
                )
                return status
            else:
                raise Exception(status.message)
        except Exception as e:
            LOGGER.error(f"Failed to create index: {e}")
            sys.exit(1)

    def delete_collection(self, collection_name):
        # Delete Milvus collection
        try:
            self.set_collection(collection_name)
            self.collection.drop()
            LOGGER.debug("Successfully drop collection!")
            return "ok"
        except Exception as e:
            LOGGER.error(f"Failed to drop collection: {e}")
            sys.exit(1)

    def search_vectors(self, collection_name, vectors, top_k):
        # Search vector in milvus collection; returns raw pymilvus results.
        try:
            self.set_collection(collection_name)
            search_params = {
                "metric_type": METRIC_TYPE,
                "params": {
                    "nprobe": 16
                }
            }
            res = self.collection.search(
                vectors,
                anns_field="embedding",
                param=search_params,
                limit=top_k)
            LOGGER.debug(f"Successfully search in collection: {res}")
            return res
        except Exception as e:
            LOGGER.error(f"Failed to search vectors in Milvus: {e}")
            sys.exit(1)

    def count(self, collection_name):
        # Get the number of entities in the milvus collection
        try:
            self.set_collection(collection_name)
            num = self.collection.num_entities
            LOGGER.debug(
                f"Successfully get the num:{num} of the collection:{collection_name}"
            )
            return num
        except Exception as e:
            LOGGER.error(f"Failed to count vectors in Milvus: {e}")
            sys.exit(1)
|
@ -0,0 +1,133 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
import pymysql
|
||||
from config import MYSQL_DB
|
||||
from config import MYSQL_HOST
|
||||
from config import MYSQL_PORT
|
||||
from config import MYSQL_PWD
|
||||
from config import MYSQL_USER
|
||||
from logs import LOGGER
|
||||
|
||||
|
||||
class MySQLHelper():
    """
    the basic operations of PyMySQL

    # This example shows how to:
    # 1. connect to MySQL server
    # 2. create a table
    # 3. insert data to table
    # 4. search by milvus ids
    # 5. delete table

    NOTE(security review): every method builds SQL via string
    concatenation (table names cannot be parameterized in MySQL), so
    table_name and ids must come from trusted sources only.
    """

    def __init__(self):
        # Open the connection once; local_infile enables LOAD DATA LOCAL.
        self.conn = pymysql.connect(
            host=MYSQL_HOST,
            user=MYSQL_USER,
            port=MYSQL_PORT,
            password=MYSQL_PWD,
            database=MYSQL_DB,
            local_infile=True)
        self.cursor = self.conn.cursor()

    def test_connection(self):
        # Ping the server and transparently reconnect if the ping fails.
        try:
            self.conn.ping()
        except Exception:
            self.conn = pymysql.connect(
                host=MYSQL_HOST,
                user=MYSQL_USER,
                port=MYSQL_PORT,
                password=MYSQL_PWD,
                database=MYSQL_DB,
                local_infile=True)
            self.cursor = self.conn.cursor()

    def create_mysql_table(self, table_name):
        # Create mysql table if not exists
        self.test_connection()
        sql = "create table if not exists " + table_name + "(milvus_id TEXT, audio_path TEXT);"
        try:
            self.cursor.execute(sql)
            LOGGER.debug(f"MYSQL create table: {table_name} with sql: {sql}")
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)

    def load_data_to_mysql(self, table_name, data):
        # Batch insert (milvus_id, audio_path) pairs into the table.
        self.test_connection()
        sql = "insert into " + table_name + " (milvus_id,audio_path) values (%s,%s);"
        try:
            self.cursor.executemany(sql, data)
            self.conn.commit()
            LOGGER.debug(
                f"MYSQL loads data to table: {table_name} successfully")
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)

    def search_by_milvus_ids(self, ids, table_name):
        # Get the audio_path for each milvus id, preserving the input id
        # order via ORDER BY FIELD(...).
        self.test_connection()
        str_ids = str(ids).replace('[', '').replace(']', '')
        sql = "select audio_path from " + table_name + " where milvus_id in (" + str_ids + ") order by field (milvus_id," + str_ids + ");"
        try:
            self.cursor.execute(sql)
            results = self.cursor.fetchall()
            # flatten single-column rows into a plain list of paths
            results = [res[0] for res in results]
            LOGGER.debug("MYSQL search by milvus id.")
            return results
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)

    def delete_table(self, table_name):
        # Delete mysql table if exists
        self.test_connection()
        sql = "drop table if exists " + table_name + ";"
        try:
            self.cursor.execute(sql)
            LOGGER.debug(f"MYSQL delete table:{table_name}")
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)

    def delete_all_data(self, table_name):
        # Delete all the data in mysql table
        self.test_connection()
        sql = 'delete from ' + table_name + ';'
        try:
            self.cursor.execute(sql)
            self.conn.commit()
            LOGGER.debug(f"MYSQL delete all data in table:{table_name}")
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)

    def count_table(self, table_name):
        # Get the row count of the mysql table
        self.test_connection()
        sql = "select count(milvus_id) from " + table_name + ";"
        try:
            self.cursor.execute(sql)
            results = self.cursor.fetchall()
            LOGGER.debug(f"MYSQL count table:{table_name}")
            return results[0][0]
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,33 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
from config import DEFAULT_TABLE
|
||||
from logs import LOGGER
|
||||
|
||||
|
||||
def do_count(table_name, milvus_cli):
    """
    Returns the total number of vectors in the system
    """
    table_name = table_name or DEFAULT_TABLE
    try:
        # None signals "collection does not exist" to the caller
        if not milvus_cli.has_collection(table_name):
            return None
        return milvus_cli.count(table_name)
    except Exception as e:
        LOGGER.error(f"Error attempting to count table {e}")
        sys.exit(1)
|
@ -0,0 +1,34 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
from config import DEFAULT_TABLE
|
||||
from logs import LOGGER
|
||||
|
||||
|
||||
def do_drop(table_name, milvus_cli, mysql_cli):
    """
    Drop a collection from Milvus and its companion table from MySQL.

    Returns the Milvus drop status, or a message when the collection is
    missing. Exits the process on unexpected errors.
    """
    table_name = table_name or DEFAULT_TABLE
    try:
        if not milvus_cli.has_collection(table_name):
            return "Collection is not exist"
        drop_status = milvus_cli.delete_collection(table_name)
        mysql_cli.delete_table(table_name)
        return drop_status
    except Exception as e:
        LOGGER.error(f"Error attempting to drop table: {e}")
        sys.exit(1)
|
@ -0,0 +1,84 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import sys
|
||||
|
||||
from config import DEFAULT_TABLE
|
||||
from diskcache import Cache
|
||||
from encode import get_audio_embedding
|
||||
from logs import LOGGER
|
||||
|
||||
|
||||
def get_audios(path):
    """
    Recursively list all supported audio files under *path*.

    Supported extensions: .wav, .mp3, .ogg, .flac, .m4a
    (the previous docstring incorrectly said "wav and aif").

    Args:
        path: Root directory to scan.

    Returns:
        A list of full paths to every matching audio file.
    """
    supported_formats = {".wav", ".mp3", ".ogg", ".flac", ".m4a"}
    # Plain os.walk loop replaces the original double comprehension,
    # which also shadowed the builtin `dir`.
    audio_files = []
    for root, _, files in os.walk(path):
        for file_name in files:
            if os.path.splitext(file_name)[1] in supported_formats:
                audio_files.append(os.path.join(root, file_name))
    return audio_files
|
||||
|
||||
|
||||
def extract_features(audio_dir):
    """
    Extract an embedding vector for every audio file under *audio_dir*.

    Progress is mirrored into a diskcache ('total' / 'current' keys) so a
    separate endpoint can report it. Audios whose embedding comes back as
    None are skipped.

    Returns:
        A pair (feats, names) where names are the audio paths encoded to
        bytes. Exits the process on unexpected errors.
    """
    try:
        progress_cache = Cache('./tmp')
        embeddings, encoded_paths = [], []
        audio_paths = get_audios(audio_dir)
        total = len(audio_paths)
        progress_cache['total'] = total
        for index, audio_path in enumerate(audio_paths):
            embedding = get_audio_embedding(audio_path)
            if embedding is None:
                continue
            embeddings.append(embedding)
            encoded_paths.append(audio_path.encode())
            progress_cache['current'] = index + 1
            print(
                f"Extracting feature from audio No. {index + 1} , {total} audios in total"
            )
        return embeddings, encoded_paths
    except Exception as e:
        LOGGER.error(f"Error with extracting feature from audio {e}")
        sys.exit(1)
|
||||
|
||||
|
||||
def format_data(ids, names):
    """
    Pair each Milvus vector id (stringified) with its audio name.

    Args:
        ids: Sequence of vector ids returned by Milvus.
        names: Sequence of audio names, parallel to *ids*.

    Returns:
        A list of (str(id), name) tuples ready for MySQL insertion.
    """
    # zip replaces the original index loop; with mismatched lengths it
    # truncates to the shorter sequence instead of raising IndexError.
    return [(str(vec_id), name) for vec_id, name in zip(ids, names)]
|
||||
|
||||
|
||||
def do_load(table_name, audio_dir, milvus_cli, mysql_cli):
    """
    Import audio vectors into Milvus and their metadata into MySQL.

    Extracts embeddings from every audio under *audio_dir*, inserts them
    into the (possibly default) Milvus collection, builds the index, and
    mirrors (id, name) rows into MySQL.

    Returns:
        The number of vectors inserted.
    """
    table_name = table_name or DEFAULT_TABLE
    embeddings, audio_names = extract_features(audio_dir)
    inserted_ids = milvus_cli.insert(table_name, embeddings)
    milvus_cli.create_index(table_name)
    mysql_cli.create_mysql_table(table_name)
    mysql_cli.load_data_to_mysql(table_name,
                                 format_data(inserted_ids, audio_names))
    return len(inserted_ids)
|
@ -0,0 +1,41 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
from config import DEFAULT_TABLE
|
||||
from config import TOP_K
|
||||
from encode import get_audio_embedding
|
||||
from logs import LOGGER
|
||||
|
||||
|
||||
def do_search(host, table_name, audio_path, milvus_cli, mysql_cli):
    """
    Search Milvus for audios similar to *audio_path* and resolve results.

    Returns:
        (vids, paths, distances): matched vector ids, download URLs served
        by this host's /data endpoint, and similarity percentages.
    Exits the process on unexpected errors.
    """
    try:
        table_name = table_name or DEFAULT_TABLE
        query_embedding = get_audio_embedding(audio_path)
        hits = milvus_cli.search_vectors(table_name, [query_embedding], TOP_K)
        top_hits = hits[0]
        vids = [str(hit.id) for hit in top_hits]
        paths = mysql_cli.search_by_milvus_ids(vids, table_name)
        distances = [hit.distance for hit in top_hits]
        for i, raw_path in enumerate(paths):
            # Rewrite the stored path into a URL the /data endpoint serves.
            paths[i] = "http://" + str(host) + "/data?audio_path=" + str(
                raw_path)
            # Convert the raw distance into a similarity percentage.
            distances[i] = (1 - distances[i]) * 100
        return vids, paths, distances
    except Exception as e:
        LOGGER.error(f"Error with search: {e}")
        sys.exit(1)
|
@ -0,0 +1,95 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from fastapi.testclient import TestClient
|
||||
from main import app
|
||||
|
||||
from utils.utility import download
|
||||
from utils.utility import unpack
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
def download_audio_data():
    """
    Download and unpack the example audio archive used by these tests.

    Fetches example_audio.tar.gz (md5-verified by utils.utility.download)
    into the current directory and extracts it, producing the
    ./example_audio folder the other tests reference.
    """
    url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz"
    md5sum = "52ac69316c1aa1fdef84da7dd2c67b39"
    target_dir = "./"
    filepath = download(url, md5sum, target_dir)
    # NOTE(review): third arg True presumably deletes the archive after
    # unpacking — confirm against utils.utility.unpack.
    unpack(filepath, target_dir, True)
|
||||
|
||||
|
||||
def test_drop():
    """
    Drop the audio collection from Milvus and MySQL through the API.
    """
    drop_response = client.post("/audio/drop")
    assert drop_response.status_code == 200
|
||||
|
||||
|
||||
def test_load():
    """
    Load every audio file under ./example_audio into Milvus/MySQL.
    """
    load_response = client.post("/audio/load", json={"File": "./example_audio"})
    assert load_response.status_code == 200
    expected_body = {'status': True, 'msg': "Successfully loaded data!"}
    assert load_response.json() == expected_body
|
||||
|
||||
|
||||
def test_progress():
    """
    Get the progress of dealing with data.

    NOTE(review): the expected string hard-codes 20 processed files, which
    couples this test to the exact contents of the example_audio archive —
    confirm whenever the fixture archive changes.
    """
    response = client.get("/progress")
    assert response.status_code == 200
    assert response.json() == "current: 20, total: 20"
|
||||
|
||||
|
||||
def test_count():
    """
    Returns the total number of vectors in the system.
    """
    # Leading slash added for consistency with every other endpoint call
    # in this file; requests resolves both forms to the same URL against
    # the TestClient base_url.
    response = client.get("/audio/count")
    assert response.status_code == 200
    # The example_audio fixture archive contains 20 files.
    assert response.json() == 20
|
||||
|
||||
|
||||
def test_search():
    """
    Search for audios similar to ./example_audio/test.wav.
    """
    search_response = client.post(
        "/audio/search/local?query_audio_path=.%2Fexample_audio%2Ftest.wav")
    assert search_response.status_code == 200
    # The endpoint returns the top-10 matches.
    assert len(search_response.json()) == 10
|
||||
|
||||
|
||||
def test_data():
    """
    Fetch an audio file back through the /data endpoint.
    """
    data_response = client.get("/data?audio_path=.%2Fexample_audio%2Ftest.wav")
    assert data_response.status_code == 200
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the suite as a plain script: fetch the fixture data first, then
    # exercise load -> count -> search, and finally drop to clean up.
    # (test_progress and test_data run only under pytest.)
    download_audio_data()
    test_load()
    test_count()
    test_search()
    test_drop()
|
@ -0,0 +1,158 @@
|
||||
([简体中文](./README_cn.md)|English)
|
||||
# Speaker Verification
|
||||
|
||||
## Introduction
|
||||
|
||||
Speaker Verification refers to the problem of getting a speaker embedding from an audio.
|
||||
|
||||
This demo is an implementation to extract speaker embedding from a specific audio file. It can be done by a single command or a few lines in python using `PaddleSpeech`.
|
||||
|
||||
## Usage
|
||||
### 1. Installation
|
||||
see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
|
||||
|
||||
You can choose one way from easy, medium and hard to install paddlespeech.
|
||||
|
||||
### 2. Prepare Input File
|
||||
The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model.
|
||||
|
||||
Here are sample files for this demo that can be downloaded:
|
||||
```bash
|
||||
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
|
||||
```
|
||||
|
||||
### 3. Usage
|
||||
- Command Line(Recommended)
|
||||
```bash
|
||||
paddlespeech vector --task spk --input 85236145389.wav
|
||||
|
||||
echo -e "demo1 85236145389.wav" > vec.job
|
||||
paddlespeech vector --task spk --input vec.job
|
||||
|
||||
echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
|
||||
```
|
||||
|
||||
Usage:
|
||||
```bash
|
||||
paddlespeech vector --help
|
||||
```
|
||||
Arguments:
|
||||
- `input`(required): Audio file to recognize.
|
||||
- `model`: Model type of vector task. Default: `ecapatdnn_voxceleb12`.
|
||||
- `sample_rate`: Sample rate of the model. Default: `16000`.
|
||||
- `config`: Config of vector task. Use pretrained model when it is None. Default: `None`.
|
||||
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
|
||||
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
|
||||
|
||||
Output:
|
||||
|
||||
```bash
|
||||
demo [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
|
||||
-3.04878 1.611095 10.127234 -10.534177 -15.821609
|
||||
1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
|
||||
-11.343508 2.3385992 -8.719341 14.213509 15.404744
|
||||
-0.39327756 6.338786 2.688887 8.7104025 17.469526
|
||||
-8.77959 7.0576906 4.648855 -1.3089896 -23.294737
|
||||
8.013747 13.891729 -9.926753 5.655307 -5.9422326
|
||||
-22.842539 0.6293588 -18.46266 -10.811862 9.8192625
|
||||
3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
|
||||
1.7594414 -0.6485091 4.485623 2.0207152 7.264915
|
||||
-6.40137 23.63524 2.9711294 -22.708025 9.93719
|
||||
20.354511 -10.324688 -0.700492 -8.783211 -5.27593
|
||||
15.999649 3.3004563 12.747926 15.429879 4.7849145
|
||||
5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
|
||||
15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
|
||||
-9.224193 14.568347 -10.568833 4.982321 -4.342062
|
||||
0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
|
||||
-6.680575 0.4757669 -5.035051 -6.7964664 16.865469
|
||||
-11.54324 7.681869 0.44475392 9.708182 -8.932846
|
||||
0.4123232 -4.361452 1.3948607 9.511665 0.11667654
|
||||
2.9079323 6.049952 9.275183 -18.078873 6.2983274
|
||||
-0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
|
||||
4.010979 11.000591 -2.8873312 7.1352735 -16.79663
|
||||
18.495346 -14.293832 7.89578 2.2714825 22.976387
|
||||
-4.875734 -3.0836344 -2.9999814 13.751918 6.448228
|
||||
-11.924197 2.171869 2.0423572 -6.173772 10.778437
|
||||
25.77281 -4.9495463 14.57806 0.3044315 2.6132357
|
||||
-7.591999 -2.076944 9.025118 1.7834753 -3.1799617
|
||||
-4.9401326 23.465864 5.1685796 -9.018578 9.037825
|
||||
-4.4150195 6.859591 -12.274467 -0.88911164 5.186309
|
||||
-3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
|
||||
-12.397416 -12.719869 -1.395601 2.1150916 5.7381287
|
||||
-4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
|
||||
8.731719 -20.778936 -11.495662 5.8033476 -4.752041
|
||||
10.833007 -6.717991 4.504732 13.4244375 1.1306485
|
||||
7.3435574 1.400918 14.704036 -9.501399 7.2315617
|
||||
-6.417456 1.3333273 11.872697 -0.30664724 8.8845
|
||||
6.5569253 4.7948146 0.03662816 -8.704245 6.224871
|
||||
-3.2701402 -11.508579 ]
|
||||
```
|
||||
|
||||
- Python API
|
||||
```python
|
||||
import paddle
|
||||
from paddlespeech.cli import VectorExecutor
|
||||
|
||||
vector_executor = VectorExecutor()
|
||||
audio_emb = vector_executor(
|
||||
model='ecapatdnn_voxceleb12',
|
||||
sample_rate=16000,
|
||||
config=None,
|
||||
ckpt_path=None,
|
||||
audio_file='./85236145389.wav',
|
||||
force_yes=False,
|
||||
device=paddle.get_device())
|
||||
print('Audio embedding Result: \n{}'.format(audio_emb))
|
||||
```
|
||||
|
||||
Output:
|
||||
```bash
|
||||
# Vector Result:
|
||||
[ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
|
||||
-3.04878 1.611095 10.127234 -10.534177 -15.821609
|
||||
1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
|
||||
-11.343508 2.3385992 -8.719341 14.213509 15.404744
|
||||
-0.39327756 6.338786 2.688887 8.7104025 17.469526
|
||||
-8.77959 7.0576906 4.648855 -1.3089896 -23.294737
|
||||
8.013747 13.891729 -9.926753 5.655307 -5.9422326
|
||||
-22.842539 0.6293588 -18.46266 -10.811862 9.8192625
|
||||
3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
|
||||
1.7594414 -0.6485091 4.485623 2.0207152 7.264915
|
||||
-6.40137 23.63524 2.9711294 -22.708025 9.93719
|
||||
20.354511 -10.324688 -0.700492 -8.783211 -5.27593
|
||||
15.999649 3.3004563 12.747926 15.429879 4.7849145
|
||||
5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
|
||||
15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
|
||||
-9.224193 14.568347 -10.568833 4.982321 -4.342062
|
||||
0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
|
||||
-6.680575 0.4757669 -5.035051 -6.7964664 16.865469
|
||||
-11.54324 7.681869 0.44475392 9.708182 -8.932846
|
||||
0.4123232 -4.361452 1.3948607 9.511665 0.11667654
|
||||
2.9079323 6.049952 9.275183 -18.078873 6.2983274
|
||||
-0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
|
||||
4.010979 11.000591 -2.8873312 7.1352735 -16.79663
|
||||
18.495346 -14.293832 7.89578 2.2714825 22.976387
|
||||
-4.875734 -3.0836344 -2.9999814 13.751918 6.448228
|
||||
-11.924197 2.171869 2.0423572 -6.173772 10.778437
|
||||
25.77281 -4.9495463 14.57806 0.3044315 2.6132357
|
||||
-7.591999 -2.076944 9.025118 1.7834753 -3.1799617
|
||||
-4.9401326 23.465864 5.1685796 -9.018578 9.037825
|
||||
-4.4150195 6.859591 -12.274467 -0.88911164 5.186309
|
||||
-3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
|
||||
-12.397416 -12.719869 -1.395601 2.1150916 5.7381287
|
||||
-4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
|
||||
8.731719 -20.778936 -11.495662 5.8033476 -4.752041
|
||||
10.833007 -6.717991 4.504732 13.4244375 1.1306485
|
||||
7.3435574 1.400918 14.704036 -9.501399 7.2315617
|
||||
-6.417456 1.3333273 11.872697 -0.30664724 8.8845
|
||||
6.5569253 4.7948146 0.03662816 -8.704245 6.224871
|
||||
-3.2701402 -11.508579 ]
|
||||
```
|
||||
|
||||
### 4. Pretrained Models
|
||||
|
||||
Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API:
|
||||
|
||||
| Model | Sample Rate |
|
||||
| :--- | :---: |
|
||||
| ecapatdnn_voxceleb12 | 16k |
|
@ -0,0 +1,6 @@
|
||||
#!/bin/bash

wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav

# vector: extract a speaker embedding from the downloaded sample
# (the old comment said "asr", but this is the vector/spk task)
paddlespeech vector --task spk --input ./85236145389.wav
|
@ -0,0 +1 @@
|
||||
*.wav
|
@ -0,0 +1,4 @@
|
||||
#!/bin/bash

# Download two sample audios, then send zh.wav to a running
# classification server (expects the server on 127.0.0.1:8090).
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --topk 1
|
@ -0,0 +1,156 @@
|
||||
# HiFiGAN with AISHELL-3
|
||||
This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [AISHELL-3](http://www.aishelltech.com/aishell_3).
|
||||
|
||||
AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus that could be used to train multi-speaker Text-to-Speech (TTS) systems.
|
||||
## Dataset
|
||||
### Download and Extract
|
||||
Download AISHELL-3.
|
||||
```bash
|
||||
wget https://www.openslr.org/resources/93/data_aishell3.tgz
|
||||
```
|
||||
Extract AISHELL-3.
|
||||
```bash
|
||||
mkdir data_aishell3
|
||||
tar zxvf data_aishell3.tgz -C data_aishell3
|
||||
```
|
||||
### Get MFA Result and Extract
|
||||
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
|
||||
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
|
||||
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/data_aishell3`.
|
||||
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset.
|
||||
3. train the model.
|
||||
4. synthesize wavs.
|
||||
- synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage; for example, running the following command will only preprocess the dataset.
|
||||
```bash
|
||||
./run.sh --stage 0 --stop-stage 0
|
||||
```
|
||||
### Data Preprocessing
|
||||
```bash
|
||||
./local/preprocess.sh ${conf_path}
|
||||
```
|
||||
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── norm
|
||||
├── raw
|
||||
└── feats_stats.npy
|
||||
```
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.
|
||||
|
||||
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance.
|
||||
|
||||
### Model Training
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
Here's the complete help message.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
|
||||
[--run-benchmark RUN_BENCHMARK]
|
||||
[--profiler_options PROFILER_OPTIONS]
|
||||
|
||||
Train a ParallelWaveGAN model.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file to overwrite default config.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
|
||||
benchmark:
|
||||
arguments related to benchmark.
|
||||
|
||||
--batch-size BATCH_SIZE
|
||||
batch size.
|
||||
--max-iter MAX_ITER train max steps.
|
||||
--run-benchmark RUN_BENCHMARK
|
||||
runing benchmark or not, if True, use the --batch-size
|
||||
and --max-iter.
|
||||
--profiler_options PROFILER_OPTIONS
|
||||
The option of profiler, which should be in format
|
||||
"key1=value1;key2=value2;key3=value3".
|
||||
```
|
||||
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
|
||||
### Synthesizing
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
|
||||
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
|
||||
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
|
||||
|
||||
Synthesize with GANVocoder.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--generator-type GENERATOR_TYPE
|
||||
type of GANVocoder, should in {pwgan, mb_melgan,
|
||||
style_melgan, } now
|
||||
--config CONFIG GANVocoder config file.
|
||||
--checkpoint CHECKPOINT
|
||||
snapshot to load.
|
||||
--test-metadata TEST_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
```
|
||||
|
||||
1. `--config` config file. You should use the same config with which the model is trained.
|
||||
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
|
||||
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
|
||||
4. `--output-dir` is the directory to save the synthesized audio files.
|
||||
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
## Pretrained Models
|
||||
The pretrained model can be downloaded here [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip).
|
||||
|
||||
|
||||
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
|
||||
:-------------:| :------------:| :-----: | :-----: | :--------:
|
||||
default| 1(gpu) x 2500000|24.060|0.1068|7.499
|
||||
|
||||
HiFiGAN checkpoint contains files listed below.
|
||||
|
||||
```text
|
||||
hifigan_aishell3_ckpt_0.2.0
|
||||
├── default.yaml # default config used to train hifigan
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
|
||||
└── snapshot_iter_2500000.pdz # generator parameters of hifigan
|
||||
```
|
||||
|
||||
## Acknowledgement
|
||||
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
|
@ -0,0 +1,168 @@
|
||||
# This is the configuration file for AISHELL-3 dataset.
|
||||
# This configuration is based on HiFiGAN V1, which is
|
||||
# an official configuration. But I found that the optimizer
|
||||
# setting does not work well with my implementation.
|
||||
# So I changed optimizer settings as follows:
|
||||
# - AdamW -> Adam
|
||||
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
|
||||
# - Scheduler: ExponentialLR -> MultiStepLR
|
||||
# To match the shift size difference, the upsample scales
|
||||
# is also modified from the original 256 shift setting.
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 24000 # Sampling rate.
|
||||
n_fft: 2048 # FFT size (samples).
|
||||
n_shift: 300 # Hop size (samples). 12.5ms
|
||||
win_length: 1200 # Window length (samples). 50ms
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
|
||||
|
||||
###########################################################
|
||||
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
generator_params:
|
||||
in_channels: 80 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
channels: 512 # Number of initial channels.
|
||||
kernel_size: 7 # Kernel size of initial and final conv layers.
|
||||
upsample_scales: [5, 5, 4, 3] # Upsampling scales.
|
||||
upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
|
||||
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
|
||||
resblock_dilations: # Dilations for residual blocks.
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
|
||||
bias: True # Whether to use bias parameter in conv.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
|
||||
nonlinear_activation_params: # Nonlinear activation parameters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
discriminator_params:
|
||||
scales: 3 # Number of multi-scale discriminator.
|
||||
scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
|
||||
scale_downsample_pooling_params:
|
||||
kernel_size: 4 # Pooling kernel size.
|
||||
stride: 2 # Pooling stride.
|
||||
padding: 2 # Padding size.
|
||||
scale_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
|
||||
channels: 128 # Initial number of channels.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
max_groups: 16 # Maximum number of groups in downsampling conv layers.
|
||||
bias: True
|
||||
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
follow_official_norm: True # Whether to follow the official norm setting.
|
||||
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
|
||||
period_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [5, 3] # List of kernel sizes.
|
||||
channels: 32 # Initial number of channels.
|
||||
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
bias: True # Whether to use bias parameter in conv layer.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params: # Nonlinear activation parameters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
use_spectral_norm: False # Whether to apply spectral normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# STFT LOSS SETTING #
|
||||
###########################################################
|
||||
use_stft_loss: False # Whether to use multi-resolution STFT loss.
|
||||
use_mel_loss: True # Whether to use Mel-spectrogram loss.
|
||||
mel_loss_params:
|
||||
fs: 24000
|
||||
fft_size: 2048
|
||||
hop_size: 300
|
||||
win_length: 1200
|
||||
window: "hann"
|
||||
num_mels: 80
|
||||
fmin: 0
|
||||
fmax: 12000
|
||||
log_base: null
|
||||
generator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
discriminator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
use_feat_match_loss: True
|
||||
feat_match_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
average_by_layers: False # Whether to average loss by #layers in each discriminator.
|
||||
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
|
||||
|
||||
###########################################################
|
||||
# ADVERSARIAL LOSS SETTING #
|
||||
###########################################################
|
||||
lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
|
||||
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
|
||||
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 16 # Batch size.
|
||||
batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size.
|
||||
num_workers: 2 # Number of workers in DataLoader.
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER & SCHEDULER SETTING #
|
||||
###########################################################
|
||||
generator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Generator's weight decay coefficient.
|
||||
generator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Generator's learning rate.
|
||||
gamma: 0.5 # Generator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
generator_grad_norm: -1 # Generator's gradient norm.
|
||||
discriminator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Discriminator's weight decay coefficient.
|
||||
discriminator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Discriminator's learning rate.
|
||||
gamma: 0.5 # Discriminator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
discriminator_grad_norm: -1 # Discriminator's gradient norm.
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||
train_max_steps: 2500000 # Number of training steps.
|
||||
save_interval_steps: 5000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 1000 # Interval steps to evaluate the network.
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_snapshots: 10 # max number of snapshots to keep while training
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
config_path=$1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./aishell3_alignment_tone \
|
||||
--output=durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/../preprocess.py \
|
||||
--rootdir=~/datasets/data_aishell3/ \
|
||||
--dataset=aishell3 \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--cut-sil=True \
|
||||
--num-cpu=20
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="feats"
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# normalize, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
fi
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--config=${config_path} \
|
||||
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--generator-type=hifigan
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
export PYTHONDONTWRITEBYTECODE=1
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
MODEL=hifigan
|
||||
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=0
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/default.yaml
|
||||
train_output_path=exp/default
|
||||
ckpt_name=snapshot_iter_5000.pdz
|
||||
|
||||
# with the following command, you can choose the stage range you want to run
|
||||
# such as `./run.sh --stage 0 --stop-stage 0`
|
||||
# this can not be mixed use with `$1`, `$2` ...
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
@ -0,0 +1,107 @@
|
||||
# use CNND
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
|
||||
fs: 24000 # sr
|
||||
n_fft: 2048 # FFT size (samples).
|
||||
n_shift: 300 # Hop size (samples). 12.5ms
|
||||
win_length: 1200 # Window length (samples). 50ms
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
|
||||
# Only used for feats_type != raw
|
||||
|
||||
fmin: 80 # Minimum frequency of Mel basis.
|
||||
fmax: 7600 # Maximum frequency of Mel basis.
|
||||
n_mels: 80 # The number of mel basis.
|
||||
|
||||
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||
f0min: 80 # Minimum f0 for pitch extraction.
|
||||
f0max: 400 # Maximum f0 for pitch extraction.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 4
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
adim: 384 # attention dimension
|
||||
aheads: 2 # number of attention heads
|
||||
elayers: 4 # number of encoder layers
|
||||
eunits: 1536 # number of encoder ff units
|
||||
dlayers: 4 # number of decoder layers
|
||||
dunits: 1536 # number of decoder ff units
|
||||
positionwise_layer_type: conv1d # type of position-wise layer
|
||||
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||
duration_predictor_kernel_size: 3 # filter size of duration predictor
|
||||
postnet_layers: 5 # number of layers of postnset
|
||||
postnet_filts: 5 # filter size of conv layers in postnet
|
||||
postnet_chans: 256 # number of channels of conv layers in postnet
|
||||
use_scaled_pos_enc: True # whether to use scaled positional encoding
|
||||
encoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
decoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
reduction_factor: 1 # reduction factor
|
||||
encoder_type: transformer # encoder type
|
||||
decoder_type: cnndecoder # decoder type
|
||||
init_type: xavier_uniform # initialization type
|
||||
init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
|
||||
init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
|
||||
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
|
||||
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
|
||||
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
|
||||
cnn_dec_dropout_rate: 0.2 # dropout rate for cnn decoder layer
|
||||
cnn_postnet_dropout_rate: 0.2
|
||||
cnn_postnet_resblock_kernel_sizes: [256, 256] # kernel sizes for residual block of cnn_postnet
|
||||
cnn_postnet_kernel_size: 5 # kernel size of cnn_postnet
|
||||
cnn_decoder_embedding_dim: 256
|
||||
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
|
||||
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
|
||||
pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
|
||||
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
|
||||
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
|
||||
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
|
||||
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
|
||||
energy_predictor_layers: 2 # number of conv layers in energy predictor
|
||||
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
|
||||
energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
|
||||
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
|
||||
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
|
||||
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
|
||||
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
|
||||
|
||||
|
||||
|
||||
###########################################################
|
||||
# UPDATER SETTING #
|
||||
###########################################################
|
||||
updater:
|
||||
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 0.001 # learning rate
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 1000
|
||||
num_snapshots: 5
|
||||
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 10086
|
@ -0,0 +1,92 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
# pwgan
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_streaming.py \
|
||||
--am=fastspeech2_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=pwgan_csmsc \
|
||||
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e_streaming \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--am_streaming=True
|
||||
fi
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_streaming.py \
|
||||
--am=fastspeech2_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=mb_melgan_csmsc \
|
||||
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
|
||||
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e_streaming \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--am_streaming=True
|
||||
fi
|
||||
|
||||
# the pretrained models haven't release now
|
||||
# style melgan
|
||||
# style melgan's Dygraph to Static Graph is not ready now
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_streaming.py \
|
||||
--am=fastspeech2_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=style_melgan_csmsc \
|
||||
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
|
||||
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e_streaming \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--am_streaming=True
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "in hifigan syn_e2e"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize_streaming.py \
|
||||
--am=fastspeech2_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=hifigan_csmsc \
|
||||
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
|
||||
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e_streaming \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--am_streaming=True
|
||||
fi
|
@ -0,0 +1,48 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=0,1
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/cnndecoder.yaml
|
||||
train_output_path=exp/cnndecoder
|
||||
ckpt_name=snapshot_iter_153.pdz
|
||||
|
||||
# with the following command, you can choose the stage range you want to run
|
||||
# such as `./run.sh --stage 0 --stop-stage 0`
|
||||
# this can not be mixed use with `$1`, `$2` ...
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize, vocoder is pwgan
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# synthesize_e2e, vocoder is pwgan
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
||||
# inference with static model
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
# synthesize_e2e, vocoder is pwgan
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
||||
|
@ -0,0 +1,148 @@
|
||||
# HiFiGAN with the LJSpeech-1.1
|
||||
This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
|
||||
## Dataset
|
||||
### Download and Extract
|
||||
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
|
||||
### Get MFA Result and Extract
|
||||
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio.
|
||||
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
|
||||
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
|
||||
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset.
|
||||
3. train the model.
|
||||
4. synthesize wavs.
|
||||
- synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
|
||||
```bash
|
||||
./run.sh --stage 0 --stop-stage 0
|
||||
```
|
||||
### Data Preprocessing
|
||||
```bash
|
||||
./local/preprocess.sh ${conf_path}
|
||||
```
|
||||
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── norm
|
||||
├── raw
|
||||
└── feats_stats.npy
|
||||
```
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.
|
||||
|
||||
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance.
|
||||
|
||||
### Model Training
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
Here's the complete help message.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
|
||||
[--run-benchmark RUN_BENCHMARK]
|
||||
[--profiler_options PROFILER_OPTIONS]
|
||||
|
||||
Train a ParallelWaveGAN model.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file to overwrite default config.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
|
||||
benchmark:
|
||||
arguments related to benchmark.
|
||||
|
||||
--batch-size BATCH_SIZE
|
||||
batch size.
|
||||
--max-iter MAX_ITER train max steps.
|
||||
--run-benchmark RUN_BENCHMARK
|
||||
runing benchmark or not, if True, use the --batch-size
|
||||
and --max-iter.
|
||||
--profiler_options PROFILER_OPTIONS
|
||||
The option of profiler, which should be in format
|
||||
"key1=value1;key2=value2;key3=value3".
|
||||
```
|
||||
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
|
||||
### Synthesizing
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
|
||||
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
|
||||
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
|
||||
|
||||
Synthesize with GANVocoder.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--generator-type GENERATOR_TYPE
|
||||
type of GANVocoder, should in {pwgan, mb_melgan,
|
||||
style_melgan, } now
|
||||
--config CONFIG GANVocoder config file.
|
||||
--checkpoint CHECKPOINT
|
||||
snapshot to load.
|
||||
--test-metadata TEST_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
```
|
||||
|
||||
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
|
||||
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
|
||||
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
|
||||
4. `--output-dir` is the directory to save the synthesized audio files.
|
||||
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
|
||||
## Pretrained Model
|
||||
The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip).
|
||||
|
||||
|
||||
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
|
||||
:-------------:| :------------:| :-----: | :-----: | :--------:
|
||||
default| 1(gpu) x 2500000|24.492|0.115|7.227
|
||||
|
||||
HiFiGAN checkpoint contains files listed below.
|
||||
|
||||
```text
|
||||
hifigan_ljspeech_ckpt_0.2.0
|
||||
├── default.yaml # default config used to train hifigan
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
|
||||
└── snapshot_iter_2500000.pdz # generator parameters of hifigan
|
||||
```
|
||||
|
||||
|
||||
## Acknowledgement
|
||||
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
|
@ -0,0 +1,167 @@
|
||||
# This is the configuration file for LJSpeech dataset.
|
||||
# This configuration is based on HiFiGAN V1, which is an official configuration.
|
||||
# But I found that the optimizer setting does not work well with my implementation.
|
||||
# So I changed optimizer settings as follows:
|
||||
# - AdamW -> Adam
|
||||
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
|
||||
# - Scheduler: ExponentialLR -> MultiStepLR
|
||||
# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting.
|
||||
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 22050 # Sampling rate.
|
||||
n_fft: 1024 # FFT size (samples).
|
||||
n_shift: 256 # Hop size (samples). 11.6ms
|
||||
win_length: null # Window length (samples).
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
|
||||
|
||||
###########################################################
|
||||
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
generator_params:
|
||||
in_channels: 80 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
channels: 512 # Number of initial channels.
|
||||
kernel_size: 7 # Kernel size of initial and final conv layers.
|
||||
upsample_scales: [8, 8, 2, 2] # Upsampling scales.
|
||||
upsample_kernel_sizes: [16, 16, 4, 4] # Kernel size for upsampling layers.
|
||||
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
|
||||
resblock_dilations: # Dilations for residual blocks.
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
|
||||
bias: True # Whether to use bias parameter in conv.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
|
||||
nonlinear_activation_params: # Nonlinear activation paramters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
discriminator_params:
|
||||
scales: 3 # Number of multi-scale discriminator.
|
||||
scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
|
||||
scale_downsample_pooling_params:
|
||||
kernel_size: 4 # Pooling kernel size.
|
||||
stride: 2 # Pooling stride.
|
||||
padding: 2 # Padding size.
|
||||
scale_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
|
||||
channels: 128 # Initial number of channels.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
max_groups: 16 # Maximum number of groups in downsampling conv layers.
|
||||
bias: True
|
||||
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
follow_official_norm: True # Whether to follow the official norm setting.
|
||||
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
|
||||
period_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [5, 3] # List of kernel sizes.
|
||||
channels: 32 # Initial number of channels.
|
||||
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
bias: True # Whether to use bias parameter in conv layer."
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params: # Nonlinear activation paramters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
use_spectral_norm: False # Whether to apply spectral normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# STFT LOSS SETTING #
|
||||
###########################################################
|
||||
use_stft_loss: False # Whether to use multi-resolution STFT loss.
|
||||
use_mel_loss: True # Whether to use Mel-spectrogram loss.
|
||||
mel_loss_params:
|
||||
fs: 22050
|
||||
fft_size: 1024
|
||||
hop_size: 256
|
||||
win_length: null
|
||||
window: "hann"
|
||||
num_mels: 80
|
||||
fmin: 0
|
||||
fmax: 11025
|
||||
log_base: null
|
||||
generator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
discriminator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
use_feat_match_loss: True
|
||||
feat_match_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
average_by_layers: False # Whether to average loss by #layers in each discriminator.
|
||||
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
|
||||
|
||||
###########################################################
|
||||
# ADVERSARIAL LOSS SETTING #
|
||||
###########################################################
|
||||
lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
|
||||
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
|
||||
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss..
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 16 # Batch size.
|
||||
batch_max_steps: 8192 # Length of each audio in batch. Make sure dividable by hop_size.
|
||||
num_workers: 2 # Number of workers in DataLoader.
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER & SCHEDULER SETTING #
|
||||
###########################################################
|
||||
generator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Generator's weight decay coefficient.
|
||||
generator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Generator's learning rate.
|
||||
gamma: 0.5 # Generator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
generator_grad_norm: -1 # Generator's gradient norm.
|
||||
discriminator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Discriminator's weight decay coefficient.
|
||||
discriminator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Discriminator's learning rate.
|
||||
gamma: 0.5 # Discriminator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
discriminator_grad_norm: -1 # Discriminator's gradient norm.
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
generator_train_start_steps: 1 # Number of steps to start to train discriminator.
|
||||
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||
train_max_steps: 2500000 # Number of training steps.
|
||||
save_interval_steps: 5000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 1000 # Interval steps to evaluate the network.
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_snapshots: 10 # max number of snapshots to keep while training
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
config_path=$1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./ljspeech_alignment \
|
||||
--output=durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/../preprocess.py \
|
||||
--rootdir=~/datasets/LJSpeech-1.1/ \
|
||||
--dataset=ljspeech \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--cut-sil=True \
|
||||
--num-cpu=20
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="feats"
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# normalize, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
fi
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--config=${config_path} \
|
||||
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--generator-type=hifigan
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
export PYTHONDONTWRITEBYTECODE=1
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
MODEL=hifigan
|
||||
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=0,1
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/default.yaml
|
||||
train_output_path=exp/default
|
||||
ckpt_name=snapshot_iter_5000.pdz
|
||||
|
||||
# with the following command, you can choose the stage range you want to run
|
||||
# such as `./run.sh --stage 0 --stop-stage 0`
|
||||
# this can not be mixed use with `$1`, `$2` ...
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
@ -0,0 +1,153 @@
|
||||
# HiFiGAN with VCTK
|
||||
This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443).
|
||||
|
||||
## Dataset
|
||||
### Download and Extract
|
||||
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`.
|
||||
|
||||
### Get MFA Result and Extract
|
||||
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio.
|
||||
You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
|
||||
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)):
|
||||
1. `p315`, because of no text for it.
|
||||
2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.
|
||||
|
||||
## Get Started
|
||||
Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
|
||||
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
|
||||
Run the command below to
|
||||
1. **source path**.
|
||||
2. preprocess the dataset.
|
||||
3. train the model.
|
||||
4. synthesize wavs.
|
||||
- synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
|
||||
```bash
|
||||
./run.sh --stage 0 --stop-stage 0
|
||||
```
|
||||
### Data Preprocessing
|
||||
```bash
|
||||
./local/preprocess.sh ${conf_path}
|
||||
```
|
||||
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── norm
|
||||
├── raw
|
||||
└── feats_stats.npy
|
||||
```
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.
|
||||
|
||||
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance.
|
||||
|
||||
### Model Training
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
|
||||
```
|
||||
`./local/train.sh` calls `${BIN_DIR}/train.py`.
|
||||
Here's the complete help message.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
|
||||
[--run-benchmark RUN_BENCHMARK]
|
||||
[--profiler_options PROFILER_OPTIONS]
|
||||
|
||||
Train a ParallelWaveGAN model.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file to overwrite default config.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
|
||||
benchmark:
|
||||
arguments related to benchmark.
|
||||
|
||||
--batch-size BATCH_SIZE
|
||||
batch size.
|
||||
--max-iter MAX_ITER train max steps.
|
||||
--run-benchmark RUN_BENCHMARK
|
||||
runing benchmark or not, if True, use the --batch-size
|
||||
and --max-iter.
|
||||
--profiler_options PROFILER_OPTIONS
|
||||
The option of profiler, which should be in format
|
||||
"key1=value1;key2=value2;key3=value3".
|
||||
```
|
||||
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
|
||||
### Synthesizing
|
||||
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
|
||||
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
|
||||
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
|
||||
|
||||
Synthesize with GANVocoder.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--generator-type GENERATOR_TYPE
|
||||
type of GANVocoder, should in {pwgan, mb_melgan,
|
||||
style_melgan, } now
|
||||
--config CONFIG GANVocoder config file.
|
||||
--checkpoint CHECKPOINT
|
||||
snapshot to load.
|
||||
--test-metadata TEST_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--ngpu NGPU if ngpu == 0, use cpu.
|
||||
```
|
||||
|
||||
|
||||
1. `--config` config file. You should use the same config with which the model is trained.
|
||||
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
|
||||
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
|
||||
4. `--output-dir` is the directory to save the synthesized audio files.
|
||||
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
|
||||
|
||||
## Pretrained Model
|
||||
The pretrained model can be downloaded here [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip).
|
||||
|
||||
|
||||
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
|
||||
:-------------:| :------------:| :-----: | :-----: | :--------:
|
||||
default| 1(gpu) x 2500000|58.092|0.1234|24.384
|
||||
|
||||
HiFiGAN checkpoint contains files listed below.
|
||||
|
||||
```text
|
||||
hifigan_vctk_ckpt_0.2.0
|
||||
├── default.yaml # default config used to train hifigan
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
|
||||
└── snapshot_iter_2500000.pdz # generator parameters of hifigan
|
||||
```
|
||||
|
||||
## Acknowledgement
|
||||
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
|
@ -0,0 +1,168 @@
|
||||
# This is the configuration file for VCTK dataset.
|
||||
# This configuration is based on HiFiGAN V1, which is
|
||||
# an official configuration. But I found that the optimizer
|
||||
# setting does not work well with my implementation.
|
||||
# So I changed optimizer settings as follows:
|
||||
# - AdamW -> Adam
|
||||
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
|
||||
# - Scheduler: ExponentialLR -> MultiStepLR
|
||||
# To match the shift size difference, the upsample scales
|
||||
# is also modified from the original 256 shift setting.
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 24000 # Sampling rate.
|
||||
n_fft: 2048 # FFT size (samples).
|
||||
n_shift: 300 # Hop size (samples). 12.5ms
|
||||
win_length: 1200 # Window length (samples). 50ms
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
|
||||
|
||||
###########################################################
|
||||
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
generator_params:
|
||||
in_channels: 80 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
channels: 512 # Number of initial channels.
|
||||
kernel_size: 7 # Kernel size of initial and final conv layers.
|
||||
upsample_scales: [5, 5, 4, 3] # Upsampling scales.
|
||||
upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
|
||||
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
|
||||
resblock_dilations: # Dilations for residual blocks.
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
- [1, 3, 5]
|
||||
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
|
||||
bias: True # Whether to use bias parameter in conv.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
|
||||
nonlinear_activation_params: # Nonlinear activation paramters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
discriminator_params:
|
||||
scales: 3 # Number of multi-scale discriminator.
|
||||
scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
|
||||
scale_downsample_pooling_params:
|
||||
kernel_size: 4 # Pooling kernel size.
|
||||
stride: 2 # Pooling stride.
|
||||
padding: 2 # Padding size.
|
||||
scale_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
|
||||
channels: 128 # Initial number of channels.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
max_groups: 16 # Maximum number of groups in downsampling conv layers.
|
||||
bias: True
|
||||
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
follow_official_norm: True # Whether to follow the official norm setting.
|
||||
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
|
||||
period_discriminator_params:
|
||||
in_channels: 1 # Number of input channels.
|
||||
out_channels: 1 # Number of output channels.
|
||||
kernel_sizes: [5, 3] # List of kernel sizes.
|
||||
channels: 32 # Initial number of channels.
|
||||
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
|
||||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
|
||||
bias: True # Whether to use bias parameter in conv layer."
|
||||
nonlinear_activation: "leakyrelu" # Nonlinear activation.
|
||||
nonlinear_activation_params: # Nonlinear activation paramters.
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: True # Whether to apply weight normalization.
|
||||
use_spectral_norm: False # Whether to apply spectral normalization.
|
||||
|
||||
|
||||
###########################################################
|
||||
# STFT LOSS SETTING #
|
||||
###########################################################
|
||||
use_stft_loss: False # Whether to use multi-resolution STFT loss.
|
||||
use_mel_loss: True # Whether to use Mel-spectrogram loss.
|
||||
mel_loss_params:
|
||||
fs: 24000
|
||||
fft_size: 2048
|
||||
hop_size: 300
|
||||
win_length: 1200
|
||||
window: "hann"
|
||||
num_mels: 80
|
||||
fmin: 0
|
||||
fmax: 12000
|
||||
log_base: null
|
||||
generator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
discriminator_adv_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
use_feat_match_loss: True
|
||||
feat_match_loss_params:
|
||||
average_by_discriminators: False # Whether to average loss by #discriminators.
|
||||
average_by_layers: False # Whether to average loss by #layers in each discriminator.
|
||||
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
|
||||
|
||||
###########################################################
|
||||
# ADVERSARIAL LOSS SETTING #
|
||||
###########################################################
|
||||
lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
|
||||
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
|
||||
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss..
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 16 # Batch size.
|
||||
batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size.
|
||||
num_workers: 2 # Number of workers in DataLoader.
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER & SCHEDULER SETTING #
|
||||
###########################################################
|
||||
generator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Generator's weight decay coefficient.
|
||||
generator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Generator's learning rate.
|
||||
gamma: 0.5 # Generator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
generator_grad_norm: -1 # Generator's gradient norm.
|
||||
discriminator_optimizer_params:
|
||||
beta1: 0.5
|
||||
beta2: 0.9
|
||||
weight_decay: 0.0 # Discriminator's weight decay coefficient.
|
||||
discriminator_scheduler_params:
|
||||
learning_rate: 2.0e-4 # Discriminator's learning rate.
|
||||
gamma: 0.5 # Discriminator's scheduler gamma.
|
||||
milestones: # At each milestone, lr will be multiplied by gamma.
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
discriminator_grad_norm: -1 # Discriminator's gradient norm.
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
generator_train_start_steps: 1 # Number of steps to start to train discriminator.
|
||||
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||
train_max_steps: 2500000 # Number of training steps.
|
||||
save_interval_steps: 5000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 1000 # Interval steps to evaluate the network.
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_snapshots: 10 # max number of snapshots to keep while training
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
config_path=$1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# get durations from MFA's result
|
||||
echo "Generate durations.txt from MFA results ..."
|
||||
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
|
||||
--inputdir=./vctk_alignment \
|
||||
--output=durations.txt \
|
||||
--config=${config_path}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# extract features
|
||||
echo "Extract features ..."
|
||||
python3 ${BIN_DIR}/../preprocess.py \
|
||||
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
|
||||
--dataset=vctk \
|
||||
--dumpdir=dump \
|
||||
--dur-file=durations.txt \
|
||||
--config=${config_path} \
|
||||
--cut-sil=True \
|
||||
--num-cpu=20
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# get features' stats(mean and std)
|
||||
echo "Get features' stats ..."
|
||||
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--field-name="feats"
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# normalize, dev and test should use train's stats
|
||||
echo "Normalize ..."
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/train/raw/metadata.jsonl \
|
||||
--dumpdir=dump/train/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/dev/raw/metadata.jsonl \
|
||||
--dumpdir=dump/dev/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
|
||||
python3 ${BIN_DIR}/../normalize.py \
|
||||
--metadata=dump/test/raw/metadata.jsonl \
|
||||
--dumpdir=dump/test/norm \
|
||||
--stats=dump/train/feats_stats.npy
|
||||
fi
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--config=${config_path} \
|
||||
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--test-metadata=dump/test/norm/metadata.jsonl \
|
||||
--output-dir=${train_output_path}/test \
|
||||
--generator-type=hifigan
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
|
||||
FLAGS_cudnn_exhaustive_search=true \
|
||||
FLAGS_conv_workspace_size_limit=4000 \
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
export PYTHONDONTWRITEBYTECODE=1
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
MODEL=hifigan
|
||||
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=0
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/default.yaml
|
||||
train_output_path=exp/default
|
||||
ckpt_name=snapshot_iter_5000.pdz
|
||||
|
||||
# with the following command, you can choose the stage range you want to run
|
||||
# such as `./run.sh --stage 0 --stop-stage 0`
|
||||
# this can not be mixed use with `$1`, `$2` ...
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
@ -0,0 +1,7 @@
|
||||
# VoxCeleb
|
||||
|
||||
## ECAPA-TDNN
|
||||
|
||||
| Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm |
|
||||
| --- | --- | --- | --- | --- | --- | --- | ---- |
|
||||
| ECAPA-TDNN | 85M | 0.1.1 | conf/ecapa_tdnn.yaml |192 | test | 1.15 | 1.06 |
|
@ -0,0 +1,52 @@
|
||||
###########################################
|
||||
# Data #
|
||||
###########################################
|
||||
# we should explicitly specify the wav path of vox2 audio data converted from m4a
|
||||
vox2_base_path:
|
||||
augment: True
|
||||
batch_size: 16
|
||||
num_workers: 2
|
||||
num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
|
||||
shuffle: True
|
||||
random_chunk: True
|
||||
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
# currently, we only support fbank
|
||||
sr: 16000 # sample rate
|
||||
n_mels: 80
|
||||
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
|
||||
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
|
||||
# if we want use another model, please choose another configuration yaml file
|
||||
model:
|
||||
input_size: 80
|
||||
# "channels": [512, 512, 512, 512, 1536],
|
||||
channels: [1024, 1024, 1024, 1024, 3072]
|
||||
kernel_sizes: [5, 3, 3, 3, 1]
|
||||
dilations: [1, 2, 3, 4, 1]
|
||||
attention_channels: 128
|
||||
lin_neurons: 192
|
||||
|
||||
###########################################
|
||||
# Training #
|
||||
###########################################
|
||||
seed: 1986 # according from speechbrain configuration
|
||||
epochs: 10
|
||||
save_interval: 1
|
||||
log_interval: 1
|
||||
learning_rate: 1e-8
|
||||
|
||||
|
||||
###########################################
|
||||
# Testing #
|
||||
###########################################
|
||||
global_embedding_norm: True
|
||||
embedding_mean_norm: True
|
||||
embedding_std_norm: False
|
||||
|
@ -0,0 +1,58 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
stage=1
|
||||
stop_stage=100
|
||||
|
||||
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
|
||||
|
||||
if [ $# -ne 2 ] ; then
|
||||
echo "Usage: $0 [options] <data-dir> <conf-path>";
|
||||
echo "e.g.: $0 ./data/ conf/ecapa_tdnn.yaml"
|
||||
echo "Options: "
|
||||
echo " --stage <stage|-1> # Used to run a partially-completed data process from somewhere in the middle."
|
||||
echo " --stop-stage <stop-stage|100> # Used to run a partially-completed data process stop stage in the middle"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
dir=$1
|
||||
conf_path=$2
|
||||
mkdir -p ${dir}
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
|
||||
# we should use the local/convert.sh convert m4a to wav
|
||||
python3 local/data_prepare.py \
|
||||
--data-dir ${dir} \
|
||||
--config ${conf_path}
|
||||
fi
|
||||
|
||||
TARGET_DIR=${MAIN_ROOT}/dataset
|
||||
mkdir -p ${TARGET_DIR}
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# download data, generate manifests
|
||||
python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
|
||||
--manifest_prefix="data/vox1/manifest" \
|
||||
--target_dir="${TARGET_DIR}/voxceleb/vox1/"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Prepare voxceleb failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# for dataset in train dev test; do
|
||||
# mv data/manifest.${dataset} data/manifest.${dataset}.raw
|
||||
# done
|
||||
fi
|
@ -0,0 +1,70 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
import paddle
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddleaudio.datasets.voxceleb import VoxCeleb
|
||||
from paddlespeech.s2t.utils.log import Log
|
||||
from paddlespeech.vector.io.augment import build_augment_pipeline
|
||||
from paddlespeech.vector.training.seeding import seed_everything
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
def main(args, config):
|
||||
|
||||
# stage0: set the cpu device, all data prepare process will be done in cpu mode
|
||||
paddle.set_device("cpu")
|
||||
# set the random seed, it is a must for multiprocess training
|
||||
seed_everything(config.seed)
|
||||
|
||||
# stage 1: generate the voxceleb csv file
|
||||
# Note: this may occurs c++ execption, but the program will execute fine
|
||||
# so we ignore the execption
|
||||
# we explicitly pass the vox2 base path to data prepare and generate the audio info
|
||||
logger.info("start to generate the voxceleb dataset info")
|
||||
train_dataset = VoxCeleb(
|
||||
'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
|
||||
|
||||
# stage 2: generate the augment noise csv file
|
||||
if config.augment:
|
||||
logger.info("start to generate the augment dataset info")
|
||||
augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# yapf: disable
|
||||
parser = argparse.ArgumentParser(__doc__)
|
||||
parser.add_argument("--data-dir",
|
||||
default="./data/",
|
||||
type=str,
|
||||
help="data directory")
|
||||
parser.add_argument("--config",
|
||||
default=None,
|
||||
type=str,
|
||||
help="configuration file")
|
||||
args = parser.parse_args()
|
||||
# yapf: enable
|
||||
|
||||
# https://yaml.org/type/float.html
|
||||
config = CfgNode(new_allowed=True)
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
|
||||
config.freeze()
|
||||
print(config)
|
||||
|
||||
main(args, config)
|
@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
. ./path.sh
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
|
||||
conf_path=conf/ecapa_tdnn.yaml
|
||||
audio_path="demo/voxceleb/00001.wav"
|
||||
use_gpu=true
|
||||
|
||||
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
|
||||
|
||||
if [ $# -ne 0 ] ; then
|
||||
echo "Usage: $0 [options]";
|
||||
echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
|
||||
echo "Options: "
|
||||
echo " --use-gpu <true,false|true> # specify is gpu is to be used for training"
|
||||
echo " --stage <stage|-1> # Used to run a partially-completed data process from somewhere in the middle."
|
||||
echo " --stop-stage <stop-stage|100> # Used to run a partially-completed data process stop stage in the middle"
|
||||
echo " --exp-dir # experiment directorh, where is has the model.pdparams"
|
||||
echo " --conf-path # configuration file for extracting the embedding"
|
||||
echo " --audio-path # audio-path, which will be processed to extract the embedding"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# set the test device
|
||||
device="cpu"
|
||||
if ${use_gpu}; then
|
||||
device="gpu"
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# extract the audio embedding
|
||||
python3 ${BIN_DIR}/extract_emb.py --device ${device} \
|
||||
--config ${conf_path} \
|
||||
--audio-path ${audio_path} --load-checkpoint ${exp_dir}
|
||||
fi
|
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

stage=1
stop_stage=100
# NOTE(review): use_gpu is parsed below but never consumed in this script
# (test.py is invoked without a --device flag) — confirm whether it should be forwarded.
use_gpu=true  # if true, we run on GPU.

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required.
if [ $# -ne 3 ] ; then
    echo "Usage: $0 [options] <data-dir> <exp-dir> <conf-path>";
    echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
    echo "Options: "
    echo "  --use-gpu <true,false|true>   # specify if gpu is to be used for testing"
    echo "  --stage <stage|-1>            # Used to run a partially-completed data process from somewhere in the middle."
    echo "  --stop-stage <stop-stage|100> # Used to run a partially-completed data process stop stage in the middle"
    exit 1;
fi

dir=$1       # data info directory (train/dev/enroll/test csv files)
exp_dir=$2   # experiment directory holding the trained checkpoint
conf_path=$3 # model/config yaml

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test the model and compute the eer metrics
    python3 ${BIN_DIR}/test.py \
            --data-dir ${dir} \
            --load-checkpoint ${exp_dir} \
            --config ${conf_path}
fi
@ -0,0 +1,61 @@
|
||||
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

stage=0
stop_stage=100
use_gpu=true  # if true, we run on GPU.

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required.
if [ $# -ne 3 ] ; then
    echo "Usage: $0 [options] <data-dir> <exp-dir> <conf-path>";
    echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
    echo "Options: "
    echo "  --use-gpu <true,false|true>   # specify if gpu is to be used for training"
    echo "  --stage <stage|-1>            # Used to run a partially-completed data process from somewhere in the middle."
    echo "  --stop-stage <stop-stage|100> # Used to run a partially-completed data process stop stage in the middle"
    exit 1;
fi

dir=$1       # data info directory
exp_dir=$2   # experiment directory for checkpoints and logs
conf_path=$3 # model/config yaml

# get the gpu nums for training
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

# setting training device
device="cpu"
if ${use_gpu}; then
    device="gpu"
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the speaker identification task with voxceleb data
    # and we will create the trained model parameters in ${exp_dir}/model.pdparams as the soft link
    # Note: we will store the log file in exp/log directory
    python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
            ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \
            --data-dir ${dir} --config ${conf_path}

    # Check the training command's status right here: after the stage-gated
    # `fi`, $? would reflect the `if` compound command, not python3.
    if [ $? -ne 0 ]; then
        echo "Failed in training!"
        exit 1
    fi
fi

exit 0
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Root of the repository, three levels up from this recipe directory.
# Use $(...) instead of deprecated backtick command substitution.
export MAIN_ROOT=$(realpath ${PWD}/../../../)

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

# Entry-point scripts (train.py/test.py/extract_emb.py) for this model.
MODEL=ecapa_tdnn
export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}
@ -0,0 +1,69 @@
|
||||
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

. ./path.sh
set -e

#######################################################################
# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
#          voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
# stage 1: train the speaker identification model
# stage 2: test speaker identification
# stage 3: extract the training embedding to train the LDA and PLDA
######################################################################

# we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 dataset
# default the dataset will be stored in the ~/.paddleaudio/
# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
# export PPAUDIO_HOME=
stage=0
stop_stage=50

# data directory
# if we set the variable ${dir}, we will store the wav info to this directory
# otherwise, we will store the wav info to vox1 and vox2 directory respectively
# vox2 wav path, we must convert the m4a format to wav format
dir=data/ # data info directory

exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
conf_path=conf/ecapa_tdnn.yaml
gpus=0,1,2,3

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

mkdir -p ${exp_dir}

if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
    bash ./local/data.sh ${dir} ${conf_path} || exit 1;
fi

if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # stage 1: train the speaker identification model
    CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
fi

if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # stage 2: get the speaker verification scores with cosine function
    # now we only support use cosine to get the scores
    CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
fi

# if [ $stage -le 3 ]; then
#     # stage 3: extract the training embedding to train the LDA and PLDA
#     # todo: extract the training embedding
# fi
@ -0,0 +1 @@
|
||||
../../../utils/
|
@ -0,0 +1,2 @@
|
||||
.eggs
|
||||
*.wav
|
@ -1,5 +1,9 @@
|
||||
# Changelog
|
||||
|
||||
Date: 2022-3-15, Author: Xiaojie Chen.
|
||||
- kaldi and librosa mfcc, fbank, spectrogram.
|
||||
- unit test and benchmark.
|
||||
|
||||
Date: 2022-2-25, Author: Hui Zhang.
|
||||
- Refactor architecture.
|
||||
- dtw distance and mcd style dtw.
|
||||
|
@ -0,0 +1,7 @@
|
||||
# PaddleAudio
|
||||
|
||||
PaddleAudio is an audio library for PaddlePaddle.
|
||||
|
||||
## Install
|
||||
|
||||
`pip install .`
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue