commit
0f7ede11ef
@ -0,0 +1 @@
|
||||
*.wav
|
@ -1,4 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
|
||||
|
||||
# asr
|
||||
paddlespeech asr --input ./zh.wav
|
||||
|
||||
|
||||
# asr + punc
|
||||
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
|
@ -1,25 +1,107 @@
|
||||
# This is the parameter configuration file for PaddleSpeech Serving.
|
||||
|
||||
##################################################################
|
||||
# SERVER SETTING #
|
||||
##################################################################
|
||||
host: '0.0.0.0'
|
||||
#################################################################################
|
||||
# SERVER SETTING #
|
||||
#################################################################################
|
||||
host: 127.0.0.1
|
||||
port: 8090
|
||||
|
||||
##################################################################
|
||||
# CONFIG FILE #
|
||||
##################################################################
|
||||
# The engine_type of speech task needs to keep the same type as the config file of speech task.
|
||||
# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml'
|
||||
# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml'
|
||||
#
|
||||
# add engine type (Options: python, inference)
|
||||
engine_type:
|
||||
asr: 'inference'
|
||||
tts: 'inference'
|
||||
|
||||
# add engine backend type (Options: asr, tts) and config file here.
|
||||
# Adding a speech task to engine_backend means starting the service.
|
||||
engine_backend:
|
||||
asr: 'conf/asr/asr_pd.yaml'
|
||||
tts: 'conf/tts/tts_pd.yaml'
|
||||
# The task format in the engine_list is: <speech task>_<engine type>
|
||||
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
|
||||
|
||||
engine_list: ['asr_python', 'tts_python']
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
################### speech task: asr; engine_type: python #######################
|
||||
asr_python:
|
||||
model: 'conformer_wenetspeech'
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path: # [optional]
|
||||
ckpt_path: # [optional]
|
||||
decode_method: 'attention_rescoring'
|
||||
force_yes: True
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
|
||||
|
||||
################### speech task: asr; engine_type: inference #######################
|
||||
asr_inference:
|
||||
# model_type choices=['deepspeech2offline_aishell']
|
||||
model_type: 'deepspeech2offline_aishell'
|
||||
am_model: # the pdmodel file of am static model [optional]
|
||||
am_params: # the pdiparams file of am static model [optional]
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path:
|
||||
decode_method:
|
||||
force_yes: True
|
||||
|
||||
am_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
|
||||
################### speech task: tts; engine_type: python #######################
|
||||
tts_python:
|
||||
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
|
||||
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
|
||||
# 'fastspeech2_vctk']
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
|
||||
# 'pwgan_vctk', 'mb_melgan_csmsc']
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
|
||||
|
||||
################### speech task: tts; engine_type: inference #######################
|
||||
tts_inference:
|
||||
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_model: # the pdmodel file of your am static model (XX.pdmodel)
|
||||
am_params: # the pdiparams file of your am static model (XX.pdiparams)
|
||||
am_sample_rate: 24000
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
am_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
# voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
|
||||
voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
|
||||
voc_sample_rate: 24000
|
||||
|
||||
voc_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
|
||||
|
@ -1,8 +0,0 @@
|
||||
model: 'conformer_wenetspeech'
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path: # [optional]
|
||||
ckpt_path: # [optional]
|
||||
decode_method: 'attention_rescoring'
|
||||
force_yes: True
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
@ -1,25 +0,0 @@
|
||||
# This is the parameter configuration file for ASR server.
|
||||
# These are the static models that support paddle inference.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['deepspeech2offline_aishell'] TODO
|
||||
##################################################################
|
||||
model_type: 'deepspeech2offline_aishell'
|
||||
am_model: # the pdmodel file of am static model [optional]
|
||||
am_params: # the pdiparams file of am static model [optional]
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path:
|
||||
decode_method:
|
||||
force_yes: True
|
||||
|
||||
am_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: True
|
||||
switch_ir_optim: True
|
||||
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
@ -1,32 +0,0 @@
|
||||
# This is the parameter configuration file for TTS server.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
|
||||
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
|
||||
# 'fastspeech2_vctk']
|
||||
##################################################################
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
##################################################################
|
||||
# VOCODER SETTING #
|
||||
# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
|
||||
# 'pwgan_vctk', 'mb_melgan_csmsc']
|
||||
##################################################################
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
||||
lang: 'zh'
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
@ -1,40 +0,0 @@
|
||||
# This is the parameter configuration file for TTS server.
|
||||
# These are the static models that support paddle inference.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
|
||||
##################################################################
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_model: # the pdmodel file of your am static model (XX.pdmodel)
|
||||
am_params: # the pdiparams file of your am static model (XX.pdiparams)
|
||||
am_sample_rate: 24000
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
am_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: False
|
||||
switch_ir_optim: False
|
||||
|
||||
|
||||
##################################################################
|
||||
# VOCODER SETTING #
|
||||
# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
|
||||
##################################################################
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
|
||||
voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
|
||||
voc_sample_rate: 24000
|
||||
|
||||
voc_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: False
|
||||
switch_ir_optim: False
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
||||
lang: 'zh'
|
@ -1,3 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
paddlespeech_server start --config_file ./conf/application.yaml
|
||||
paddlespeech_server start --config_file ./conf/application.yaml
|
||||
|
@ -1,3 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
# single process
|
||||
paddlespeech tts --input 今天的天气不错啊
|
||||
|
||||
# Batch process
|
||||
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
|
@ -0,0 +1,369 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a1e738e0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 获取测试的 logit 数据"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "29d3368b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"hlens.npy\n",
|
||||
"logits.npy\n",
|
||||
"ys_lens.npy\n",
|
||||
"ys_pad.npy\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!mkdir -p ./test_data\n",
|
||||
"!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n",
|
||||
"!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "240caf1d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"data_dir=\"./test_data\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "91bad949",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
|
||||
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
|
||||
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
|
||||
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4cef2f15",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 使用 torch 的 ctc loss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "90612004",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'1.10.1+cu102'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"torch.__version__"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "00799f97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def torch_ctc_loss(use_cpu):\n",
|
||||
" if use_cpu:\n",
|
||||
" device = torch.device(\"cpu\")\n",
|
||||
" else:\n",
|
||||
" device = torch.device(\"cuda\")\n",
|
||||
"\n",
|
||||
" reduction_type = \"sum\" \n",
|
||||
"\n",
|
||||
" ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n",
|
||||
"\n",
|
||||
" ys_hat = torch.tensor(logits_np, device = device)\n",
|
||||
" ys_pad = torch.tensor(ys_pad_np, device = device)\n",
|
||||
" hlens = torch.tensor(hlens_np, device = device)\n",
|
||||
" ys_lens = torch.tensor(ys_lens_np, device = device)\n",
|
||||
"\n",
|
||||
" ys_hat = ys_hat.transpose(0, 1)\n",
|
||||
" \n",
|
||||
" # 开始计算时间\n",
|
||||
" start_time = time.time()\n",
|
||||
" ys_hat = ys_hat.log_softmax(2)\n",
|
||||
" loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n",
|
||||
" end_time = time.time()\n",
|
||||
" \n",
|
||||
" loss = loss / ys_hat.size(1)\n",
|
||||
" return end_time - start_time, loss.item()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ba47b5a4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 使用 paddle 的 ctc loss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6882a06e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'2.2.2'"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"paddle.__version__"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3cfa3b7c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def paddle_ctc_loss(use_cpu): \n",
|
||||
" import paddle.nn as pn\n",
|
||||
" if use_cpu:\n",
|
||||
" device = \"cpu\"\n",
|
||||
" else:\n",
|
||||
" device = \"gpu\"\n",
|
||||
"\n",
|
||||
" paddle.set_device(device)\n",
|
||||
"\n",
|
||||
" logits = paddle.to_tensor(logits_np)\n",
|
||||
" ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
|
||||
" hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
|
||||
" ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
|
||||
"\n",
|
||||
" logits = logits.transpose([1,0,2])\n",
|
||||
"\n",
|
||||
" ctc_loss = pn.CTCLoss(reduction='sum')\n",
|
||||
" # 开始计算时间\n",
|
||||
" start_time = time.time()\n",
|
||||
" pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n",
|
||||
" end_time = time.time()\n",
|
||||
" \n",
|
||||
" pn_loss = pn_loss / logits.shape[1]\n",
|
||||
" return end_time - start_time, pn_loss.item()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "40413ef9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU, iteration 10\n",
|
||||
"torch_ctc_loss 159.17137145996094\n",
|
||||
"paddle_ctc_loss 159.16574096679688\n",
|
||||
"paddle average time 1.718252992630005\n",
|
||||
"torch average time 0.17536230087280275\n",
|
||||
"paddle time / torch time (cpu) 9.798303193320452\n",
|
||||
"\n",
|
||||
"GPU, iteration 10\n",
|
||||
"torch_ctc_loss 159.172119140625\n",
|
||||
"paddle_ctc_loss 159.17205810546875\n",
|
||||
"paddle average time 0.018606925010681154\n",
|
||||
"torch average time 0.0026710033416748047\n",
|
||||
"paddle time / torch time (gpu) 6.966267963938231\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 使用 CPU\n",
|
||||
"\n",
|
||||
"iteration = 10\n",
|
||||
"use_cpu = True\n",
|
||||
"torch_total_time = 0\n",
|
||||
"paddle_total_time = 0\n",
|
||||
"for _ in range(iteration):\n",
|
||||
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
|
||||
" torch_total_time += cost_time\n",
|
||||
"for _ in range(iteration):\n",
|
||||
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
|
||||
" paddle_total_time += cost_time\n",
|
||||
"print (\"CPU, iteration\", iteration)\n",
|
||||
"print (\"torch_ctc_loss\", torch_loss)\n",
|
||||
"print (\"paddle_ctc_loss\", paddle_loss)\n",
|
||||
"print (\"paddle average time\", paddle_total_time / iteration)\n",
|
||||
"print (\"torch average time\", torch_total_time / iteration)\n",
|
||||
"print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n",
|
||||
"\n",
|
||||
"print (\"\")\n",
|
||||
"\n",
|
||||
"# 使用 GPU\n",
|
||||
"\n",
|
||||
"use_cpu = False\n",
|
||||
"torch_total_time = 0\n",
|
||||
"paddle_total_time = 0\n",
|
||||
"for _ in range(iteration):\n",
|
||||
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
|
||||
" torch_total_time += cost_time\n",
|
||||
"for _ in range(iteration):\n",
|
||||
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
|
||||
" paddle_total_time += cost_time\n",
|
||||
"print (\"GPU, iteration\", iteration)\n",
|
||||
"print (\"torch_ctc_loss\", torch_loss)\n",
|
||||
"print (\"paddle_ctc_loss\", paddle_loss)\n",
|
||||
"print (\"paddle average time\", paddle_total_time / iteration)\n",
|
||||
"print (\"torch average time\", torch_total_time / iteration)\n",
|
||||
"print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7cdf8697",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "73fad81d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
|
||||
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
|
||||
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
|
||||
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "2b41e45d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n",
|
||||
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n",
|
||||
"2022-02-25 11:34:34.144 | INFO | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n",
|
||||
"loss 159.17205810546875\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n",
|
||||
" format(lhs_dtype, rhs_dtype, lhs_dtype))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"use_cpu = False\n",
|
||||
"\n",
|
||||
"from paddlespeech.s2t.modules.loss import CTCLoss\n",
|
||||
"\n",
|
||||
"if use_cpu:\n",
|
||||
" device = \"cpu\"\n",
|
||||
"else:\n",
|
||||
" device = \"gpu\"\n",
|
||||
"\n",
|
||||
"paddle.set_device(device)\n",
|
||||
"\n",
|
||||
"blank_id=0\n",
|
||||
"reduction_type='sum'\n",
|
||||
"batch_average= True\n",
|
||||
"grad_norm_type='instance'\n",
|
||||
"\n",
|
||||
"criterion = CTCLoss(\n",
|
||||
" blank=blank_id,\n",
|
||||
" reduction=reduction_type,\n",
|
||||
" batch_average=batch_average,\n",
|
||||
" grad_norm_type=grad_norm_type)\n",
|
||||
"\n",
|
||||
"logits = paddle.to_tensor(logits_np)\n",
|
||||
"ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
|
||||
"hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
|
||||
"ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
|
||||
"\n",
|
||||
"pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n",
|
||||
"print(\"loss\", pn_ctc_loss.item())\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "de525d38",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 结论\n",
|
||||
"在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n",
|
||||
"在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.87 倍\n",
|
||||
"\n",
|
||||
"## 其他结论\n",
|
||||
"torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,110 @@
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
|
||||
fs: 24000 # sr
|
||||
n_fft: 2048 # FFT size (samples).
|
||||
n_shift: 300 # Hop size (samples). 12.5ms
|
||||
win_length: 1200 # Window length (samples). 50ms
|
||||
# If set to null, it will be the same as n_fft.
|
||||
window: "hann" # Window function.
|
||||
|
||||
# Only used for feats_type != raw
|
||||
|
||||
fmin: 80 # Minimum frequency of Mel basis.
|
||||
fmax: 7600 # Maximum frequency of Mel basis.
|
||||
n_mels: 80 # The number of mel basis.
|
||||
|
||||
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||
f0min: 80 # Minimum f0 for pitch extraction.
|
||||
f0max: 400 # Maximum f0 for pitch extraction.
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 32
|
||||
num_workers: 4
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
adim: 384 # attention dimension
|
||||
aheads: 2 # number of attention heads
|
||||
elayers: 4 # number of encoder layers
|
||||
eunits: 1536 # number of encoder ff units
|
||||
dlayers: 4 # number of decoder layers
|
||||
dunits: 1536 # number of decoder ff units
|
||||
positionwise_layer_type: conv1d # type of position-wise layer
|
||||
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||
duration_predictor_kernel_size: 3 # filter size of duration predictor
|
||||
postnet_layers: 5 # number of layers of postnet
|
||||
postnet_filts: 5 # filter size of conv layers in postnet
|
||||
postnet_chans: 256 # number of channels of conv layers in postnet
|
||||
encoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
decoder_normalize_before: True # whether to perform layer normalization before the input
|
||||
reduction_factor: 1 # reduction factor
|
||||
encoder_type: conformer # encoder type
|
||||
decoder_type: conformer # decoder type
|
||||
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
|
||||
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
|
||||
conformer_activation_type: swish # conformer activation type
|
||||
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
|
||||
use_cnn_in_conformer: true # whether to use CNN in conformer
|
||||
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
|
||||
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
|
||||
init_type: xavier_uniform # initialization type
|
||||
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
|
||||
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
|
||||
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
|
||||
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
|
||||
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
|
||||
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
|
||||
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
|
||||
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
|
||||
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
|
||||
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
|
||||
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
|
||||
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
|
||||
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
|
||||
energy_predictor_layers: 2 # number of conv layers in energy predictor
|
||||
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
|
||||
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
|
||||
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
|
||||
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
|
||||
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
|
||||
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
|
||||
spk_embed_dim: 256 # speaker embedding dimension
|
||||
spk_embed_integration_type: concat # speaker embedding integration type
|
||||
|
||||
|
||||
###########################################################
|
||||
# UPDATER SETTING #
|
||||
###########################################################
|
||||
updater:
|
||||
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||
|
||||
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 0.001 # learning rate
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 1000
|
||||
num_snapshots: 5
|
||||
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 10086
|
@ -1,20 +1,105 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
# pwgan
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=pwgan_csmsc \
|
||||
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt
|
||||
fi
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
# Stage 1: run synthesize.py on the test metadata with the speedyspeech_csmsc
# acoustic model and the mb_melgan_csmsc vocoder. Runs only when
# stage <= 1 <= stop_stage.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # The FLAGS_* assignments are environment variables scoped to this one
    # python3 invocation. Every continuation backslash must be preceded by a
    # space so adjacent options are never joined into a single argument.
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi
|
||||
|
||||
# style melgan
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=style_melgan_csmsc \
|
||||
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
|
||||
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "in hifigan syn"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=hifigan_csmsc \
|
||||
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
|
||||
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt
|
||||
fi
|
||||
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=pwgan_csmsc \
|
||||
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt
|
||||
# wavernn
|
||||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
||||
echo "in wavernn syn"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/speech_stats.npy \
|
||||
--voc=wavernn_csmsc \
|
||||
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
|
||||
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
|
||||
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--phones_dict=dump/phone_id_map.txt
|
||||
fi
|
||||
|
@ -1 +1,5 @@
|
||||
# Changelog
|
||||
|
||||
Date: 2022-2-25, Author: Hui Zhang.
|
||||
- Refactor architecture.
|
||||
- dtw distance and mcd style dtw
|
||||
|
@ -1,170 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from numpy import ndarray as array
|
||||
|
||||
from ..backends import depth_convert
|
||||
from ..utils import ParameterError
|
||||
|
||||
__all__ = [
|
||||
'depth_augment',
|
||||
'spect_augment',
|
||||
'random_crop1d',
|
||||
'random_crop2d',
|
||||
'adaptive_spect_augment',
|
||||
]
|
||||
|
||||
|
||||
def randint(high: int) -> int:
    """Draw one random integer from the half-open range [0, high).

    Helper for the random data augmentation routines.
    """
    sample = np.random.randint(0, high=high)
    return int(sample)
|
||||
|
||||
|
||||
def rand() -> float:
    """Draw one random floating-point number from the half-open range [0, 1).

    Helper for the random data augmentation routines.

    Returns:
        A Python float.
    """
    # Calling np.random.rand() without a shape yields a scalar directly.
    # The previous float(np.random.rand(1)) converted a 1-element array to a
    # scalar, which recent NumPy versions deprecate.
    return float(np.random.rand())
|
||||
|
||||
|
||||
def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """Audio bit-depth augmentation.

    One entry of ``choices`` is picked at random, weighted by ``probs``. The
    signal is converted to that depth and then back to its original dtype,
    to simulate the distortion brought by quantization.

    Parameters:
        y: the input signal; its dtype is used as the depth to convert back to.
        choices: candidate depths handed to ``depth_convert``.
        probs: selection probability of each entry in ``choices``.
    Returns:
        The signal after the round-trip depth conversion.
    """
    assert len(probs) == len(choices), \
        'number of choices {} must be equal to size of probs {}'.format(
            len(choices), len(probs))
    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype
    quantized = depth_convert(y, target_depth)
    return depth_convert(quantized, original_depth)
|
||||
|
||||
|
||||
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adaptive spectrogram augmentation.

    The strength of the augmentation is governed by the parameter ``level``,
    ranging from 0 to 1, with 0 meaning no augmentation. Both the number of
    masks (``int(10 * level)`` per axis) and their width
    (``int(axis_length * level * 0.5)``) scale with ``level``.

    Parameters:
        spect: 2D spectrogram. It is modified in place and also returned.
        tempo_axis: 0 if the time axis is the first axis; any other value
            treats the second axis as time.
        level: augmentation strength.
    Returns:
        The same ``spect`` object with the masked regions set to 0.
    """
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
    time_first = tempo_axis == 0
    if time_first:
        n_time, n_freq = spect.shape
    else:
        n_freq, n_time = spect.shape

    time_width = int(n_time * level * 0.5)
    freq_width = int(n_freq * level * 0.5)
    time_mask_count = int(10 * level)
    freq_mask_count = int(10 * level)

    # Time masks are always drawn before frequency masks.
    for _ in range(time_mask_count):
        begin = randint(n_time - time_width)
        band = slice(begin, begin + time_width)
        if time_first:
            spect[band, :] = 0
        else:
            spect[:, band] = 0

    for _ in range(freq_mask_count):
        begin = randint(n_freq - freq_width)
        band = slice(begin, begin + freq_width)
        if time_first:
            spect[:, band] = 0
        else:
            spect[band, :] = 0

    return spect
|
||||
|
||||
|
||||
def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation along both the time and frequency axes.

    The number of masks per axis is drawn uniformly from ``[0, max_*_mask)``
    and a single width per axis is drawn from ``[0, max_*_mask_width)``.
    Each mask zeroes a randomly positioned band of that width.

    Reference: presumably SpecAugment; the original docstring left this
    section blank, so confirm before citing.

    Parameters:
        spect: 2D spectrogram. It is modified in place and also returned.
        tempo_axis: 0 if the time axis is the first axis; any other value
            treats the second axis as time.
        max_time_mask: exclusive upper bound on the number of time masks.
        max_freq_mask: exclusive upper bound on the number of frequency masks.
        max_time_mask_width: exclusive upper bound on the time mask width.
        max_freq_mask_width: exclusive upper bound on the frequency mask width.
    Returns:
        The same ``spect`` object with the masked regions set to 0.
    """
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
    time_first = tempo_axis == 0
    if time_first:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    # Keep the original order of random draws.
    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)

    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    # On short inputs the drawn width can reach or exceed the axis length,
    # which would make the start-index draw below fail. Clamp the width so
    # the exclusive bound passed to randint stays positive; inputs on which
    # the draw already succeeded are unaffected.
    time_mask_width = min(time_mask_width, max(nt - 1, 0))
    freq_mask_width = min(freq_mask_width, max(nf - 1, 0))
    if nt == 0:
        num_time_mask = 0  # nothing to mask along an empty axis
    if nf == 0:
        num_freq_mask = 0

    for _ in range(num_time_mask):
        start = randint(nt - time_mask_width)
        band = slice(start, start + time_mask_width)
        if time_first:
            spect[band, :] = 0
        else:
            spect[:, band] = 0

    for _ in range(num_freq_mask):
        start = randint(nf - freq_mask_width)
        band = slice(start, start + freq_mask_width)
        if time_first:
            spect[:, band] = 0
        else:
            spect[band, :] = 0

    return spect
|
||||
|
||||
|
||||
def random_crop1d(y: array, crop_len: int) -> array:
    """Do random cropping on a 1D input signal.

    The input is a 1D signal, typically a sound waveform.

    Parameters:
        y: the 1D input signal.
        crop_len: the number of samples to keep.
    Returns:
        A slice of ``y`` of length ``crop_len`` starting at a random offset.
    Raises:
        ParameterError: if ``y`` is not one-dimensional.
    """
    if y.ndim != 1:
        # The original had this message as a bare string statement, so the
        # check never had any effect.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    # Valid start offsets are 0..n-crop_len inclusive; randint's bound is
    # exclusive, hence the +1. This also makes crop_len == n work.
    idx = randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
|
||||
|
||||
|
||||
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """Do random cropping for a 2D array, typically a spectrogram.

    The cropping is done along the temporal axis of the time-freq input.

    Parameters:
        s: the input array.
        crop_len: the number of entries to keep along ``tempo_axis``.
        tempo_axis: the axis to crop along.
    Returns:
        ``s`` restricted to a random window of length ``crop_len`` along
        ``tempo_axis``; all other axes are kept whole.
    Raises:
        ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start offsets are 0..n-crop_len inclusive; randint's bound is
    # exclusive, hence the +1. This also makes crop_len == n work.
    idx = randint(high=n - crop_len + 1)
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
|
@ -1,461 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import math
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
from .window import get_window
|
||||
|
||||
__all__ = [
|
||||
'Spectrogram',
|
||||
'MelSpectrogram',
|
||||
'LogMelSpectrogram',
|
||||
]
|
||||
|
||||
|
||||
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert Hz to Mels.

    Parameters:
        freq: the input tensor of arbitrary shape, or a single floating point number.
        htk: use the HTK formula ``2595 * log10(1 + f / 700)``.
            The default value is False, which uses a scale that is linear
            below 1000 Hz and logarithmic above.
    Returns:
        The frequencies represented in Mel-scale, as a tensor when ``freq``
        is a tensor and as a float otherwise.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below min_log_hz.
    linear_mels = (freq - f_min) / f_sp

    if is_tensor:
        # 1e-10 prevents nan from log(0)
        log_mels = min_log_mel + paddle.log(
            freq / min_log_hz + 1e-10) / logstep
        in_log_region = (freq > min_log_hz).astype(freq.dtype)
        # Blend the two pieces with a 0/1 mask
        # (will replace by masked_fill OP in future).
        return log_mels * in_log_region + linear_mels * (1 - in_log_region)

    if freq >= min_log_hz:
        return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return linear_mels
|
||||
|
||||
|
||||
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert mel bin numbers to frequencies.

    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
        htk: use the HTK formula ``700 * (10 ** (m / 2595) - 1)``.
    Returns:
        The frequencies represented in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below min_log_mel.
    linear_hz = f_min + f_sp * mel

    if isinstance(mel, paddle.Tensor):
        log_hz = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        in_log_region = (mel > min_log_mel).astype(mel.dtype)
        # Blend the two pieces with a 0/1 mask
        # (will replace by masked_fill OP in future).
        return log_hz * in_log_region + linear_hz * (1 - in_log_region)

    if mel >= min_log_mel:
        return min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return linear_hz
|
||||
|
||||
|
||||
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute frequencies that are uniformly spaced on the Mel scale.

    Parameters:
        n_mels(int): number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk(bool): whether to use htk formula.
        dtype(str): the datatype of the return frequencies.
    Returns:
        A tensor of ``n_mels`` frequencies in Hz: the points are spaced evenly
        in Mel between ``f_min`` and ``f_max`` and converted back to Hz.
    """
    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_low = hz_to_mel(f_min, htk=htk)
    mel_high = hz_to_mel(f_max, htk=htk)
    mel_points = paddle.linspace(mel_low, mel_high, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
|
||||
|
||||
|
||||
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute fourier frequencies.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        dtype(str): the datatype of the return frequencies.
    Returns:
        A tensor of ``1 + n_fft // 2`` frequencies in Hz, evenly spaced from
        0 to ``sr / 2``.
    """
    nyquist = float(sr) / 2
    num_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, nyquist, num_bins, dtype=dtype)
|
||||
|
||||
|
||||
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute fbank matrix.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
            If None, ``sr / 2`` is used.
        htk: whether to use htk formula.
        norm(str|float): 'slaney' scales each filter by ``2 / bandwidth``;
            an int or float applies p-norm normalization with ``p=norm`` along
            the last axis; any other value leaves the weights unnormalized.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Initialize the weights
    num_bins = int(1 + n_fft // 2)
    weights = paddle.zeros((n_mels, num_bins), dtype=dtype)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    # Equivalent of np.diff(mel_f) and np.subtract.outer(mel_f, fftfreqs).
    fdiff = mel_f[1:] - mel_f[:-1]
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)

    for band in range(n_mels):
        # lower and upper slopes for all bins
        rising = -ramps[band] / fdiff[band]
        falling = ramps[band + 2] / fdiff[band + 1]

        # .. then intersect them with each other and zero
        triangle = paddle.minimum(rising, falling)
        weights[band] = paddle.maximum(paddle.zeros_like(rising), triangle)

    # Slaney-style mel is scaled to be approx constant energy per channel
    if norm == 'slaney':
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
|
||||
|
||||
|
||||
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.

    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db level
            of the signal will be pulled up accordingly. Otherwise, the db level
            is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin).
        top_db(float): if given, output values lower than ``max - top_db`` are
            raised to ``max - top_db``, where ``max`` is the largest value of
            the dB spectrogram.
    Returns:
        The spectrogram in log-scale.
    Raises:
        Exception: if ``amin`` or ``ref_value`` is not strictly positive, or
            ``top_db`` is negative.
    shape:
        input: any shape
        output: same as input
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")

    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")

    ones = paddle.ones_like(magnitude)
    floored = paddle.maximum(ones * amin, magnitude)
    log_spec = 10.0 * paddle.log10(floored)
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise Exception("top_db must be non-negative")
    floor_db = ones * (log_spec.max() - top_db)
    return paddle.maximum(log_spec, floor_db)
|
||||
|
||||
|
||||
class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute spectrogram of a given signal, typically an audio waveform.
        The spectrogram is computed here as the squared magnitude of the
        short-time Fourier transform.

        Parameters:
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, passed
                through to ``paddle.signal.stft``. The original documentation
                states that None means win_length//4.
                The default value is None.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The original documentation lists the following supported names: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'. The default value is 'reflect'.
            dtype(str): the data type of the window.
        """
        super().__init__()

        if win_length is None:
            win_length = n_fft

        # Build the window once and bind every STFT option up front, so
        # forward() only has to supply the signal.
        stft_options = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=get_window(window, win_length, fftbins=True, dtype=dtype),
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_options)

    def forward(self, x):
        """Return the squared magnitude of the STFT of ``x``."""
        return paddle.square(paddle.abs(self._stft(x)))
|
||||
|
||||
|
||||
class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Compute the melspectrogram of a given signal, typically an audio waveform.
        The melspectrogram is also known as filterbank or fbank feature in audio community.
        It is computed by multiplying spectrogram with Mel filter bank matrix.

        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, forwarded
                to ``Spectrogram``. The original documentation states that None
                means win_length//4.
                The default value is None.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The original documentation lists the following supported names: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
                If None, ``sr // 2`` is used to build the filter bank, while
                the ``f_max`` attribute keeps the value that was passed in.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of the window and of the fbank matrix.
                The original documentation suggests float64 for better
                numerical accuracy.
        """
        super().__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # Keep the configuration as given (f_max may still be None here).
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm

        if f_max is None:
            f_max = sr // 2

        fbank = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)  # float64 for better numerical results
        # Assign first, then register under the same name, exactly as the
        # original did.
        self.fbank_matrix = fbank
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x):
        """Return the mel spectrogram: fbank matrix times spectrogram of ``x``."""
        power_spec = self._spectrogram(x)
        return paddle.matmul(self.fbank_matrix, power_spec)
|
||||
|
||||
|
||||
class LogMelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
        typically an audio waveform.

        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, forwarded
                to ``MelSpectrogram``. The original documentation states that
                None means win_length//4.
                The default value is None.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the Fourier transform.
                The original documentation lists the following supported names: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value(float): the reference value. If smaller than 1.0, the db level
                of the signal will be pulled up accordingly. Otherwise, the db level
                is pushed down.
            amin(float): the minimum value of input magnitude, below which the input
                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db(float): forwarded to ``power_to_db`` together with
                ``ref_value`` and ``amin``; None disables the top_db clipping.
            dtype(str): the datatype of the window and of the fbank matrix.
                The original documentation suggests float64 for better
                numerical accuracy.
        """
        super().__init__()

        self._melspectrogram = MelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)

        # Parameters of the power-to-dB conversion applied in forward().
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db

    def forward(self, x):
        """Return the mel spectrogram of ``x`` converted to dB."""
        mel_power = self._melspectrogram(x)
        return power_to_db(
            mel_power,
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
|
@ -0,0 +1,22 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from . import compliance
|
||||
from . import datasets
|
||||
from . import features
|
||||
from . import functional
|
||||
from . import io
|
||||
from . import metric
|
||||
from . import sox_effects
|
||||
from .backends import load
|
||||
from .backends import save
|
@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .soundfile_backend import depth_convert
|
||||
from .soundfile_backend import load
|
||||
from .soundfile_backend import normalize
|
||||
from .soundfile_backend import resample
|
||||
from .soundfile_backend import save
|
||||
from .soundfile_backend import to_mono
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,638 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from torchaudio(https://github.com/pytorch/audio)
|
||||
import math
|
||||
from typing import Tuple
|
||||
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
|
||||
from ..functional import create_dct
|
||||
from ..functional.window import get_window
|
||||
|
||||
__all__ = [
|
||||
'spectrogram',
|
||||
'fbank',
|
||||
'mfcc',
|
||||
]
|
||||
|
||||
# Names of the window types understood by _feature_window_function.
HANNING = 'hann'
HAMMING = 'hamming'
POVEY = 'povey'  # built from a 'hann' window raised to the power 0.85
RECTANGULAR = 'rect'
BLACKMAN = 'blackman'
|
||||
|
||||
|
||||
def _get_epsilon(dtype):
    """Return the constant 1e-07 as a tensor of the given dtype."""
    epsilon = paddle.to_tensor(1e-07, dtype=dtype)
    return epsilon
|
||||
|
||||
|
||||
def _next_power_of_2(x: int) -> int:
|
||||
return 1 if x == 0 else 2**(x - 1).bit_length()
|
||||
|
||||
|
||||
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Slice a 1D waveform into overlapping frames.

    Parameters:
        waveform: 1D tensor of samples.
        window_size: number of samples per frame.
        window_shift: number of samples between the starts of adjacent frames.
        snip_edges: if True, only frames lying fully inside the waveform are
            produced. If False, the waveform is extended with reversed copies
            of itself before framing.
    Returns:
        A 2D tensor with one frame per row, or an empty (0, 0) tensor when
        ``snip_edges`` is True and the waveform is shorter than one window.
    """
    assert waveform.dim() == 1
    num_samples = waveform.shape[0]

    if snip_edges:
        if num_samples < window_size:
            return paddle.empty((0, 0), dtype=waveform.dtype)
        num_frames = 1 + (num_samples - window_size) // window_shift
    else:
        mirrored = paddle.flip(waveform, [0])
        num_frames = (num_samples + (window_shift // 2)) // window_shift
        pad = window_size // 2 - window_shift // 2
        if pad > 0:
            # Prepend the last `pad` samples of the reversed signal and
            # append the full reversed signal.
            pieces = (mirrored[-pad:], waveform, mirrored)
        else:
            # pad <= 0: drop the first -pad samples instead of prepending.
            pieces = (waveform[-pad:], mirrored)
        waveform = paddle.concat(pieces, axis=0)

    frames = paddle.signal.frame(waveform, window_size, window_shift)
    # Keep the first num_frames frames and put them on the leading axis.
    return frames[:, :num_frames].T
|
||||
|
||||
|
||||
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Build the 1D window of the requested type.

    Parameters:
        window_type: one of HANNING, HAMMING, POVEY, RECTANGULAR or BLACKMAN.
        window_size: number of samples in the window.
        blackman_coeff: coefficient used only by the BLACKMAN window.
        dtype: datatype of the returned window.
    Returns:
        A tensor holding ``window_size`` window values.
    Raises:
        Exception: if ``window_type`` is not one of the supported names.
    """
    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)

    if window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)

    if window_type == POVEY:
        # A hann window raised to the power 0.85.
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        return hann.pow(0.85)

    if window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)

    if window_type == BLACKMAN:
        a = 2 * math.pi / (window_size - 1)
        positions = paddle.arange(window_size, dtype=dtype)
        blackman = (blackman_coeff - 0.5 * paddle.cos(a * positions) +
                    (0.5 - blackman_coeff) * paddle.cos(2 * a * positions))
        return blackman.astype(dtype)

    raise Exception('Invalid window type ' + window_type)
|
||||
|
||||
|
||||
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Compute the log energy of each frame.

    Parameters:
        strided_input: 2D tensor of frames; the energy is the sum of squares
            along axis 1.
        epsilon: lower bound applied to the energy before taking the log.
        energy_floor: if non-zero, the log energy is additionally floored at
            ``log(energy_floor)``.
    Returns:
        A tensor with one log-energy value per frame.
    """
    frame_energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(frame_energy, epsilon).log()
    if energy_floor == 0.0:
        return log_energy
    log_floor = paddle.to_tensor(
        math.log(energy_floor), dtype=strided_input.dtype)
    return paddle.maximum(log_energy, log_floor)
|
||||
|
||||
|
||||
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing parameters in samples.

    Parameters:
        waveform: 2D tensor indexed as (channel, sample).
        channel: index of the channel to use; negative values are treated as 0.
        sr: sample rate.
        frame_shift: frame shift in milliseconds.
        frame_length: frame length in milliseconds.
        round_to_power_of_two: if True, the padded window size is the next
            power of two of the window size.
        preemphasis_coefficient: only validated here; must lie in [0, 1].
    Returns:
        A tuple ``(waveform, window_shift, window_size, padded_window_size)``
        where ``waveform`` is the selected 1D channel.
    Raises:
        AssertionError: if any of the arguments is out of range.
    """
    channel = max(channel, 0)
    num_channels = waveform.shape[0]
    assert channel < num_channels, (
        'Invalid channel {} for size {}'.format(channel, num_channels))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    assert 2 <= window_size <= len(waveform), (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         len(waveform)))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
        ' use `round_to_power_of_two` or change `frame_length`'
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
|
||||
|
||||
|
||||
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame a waveform and apply dither, DC removal, pre-emphasis and windowing.

    Args:
        waveform (Tensor): Single-channel waveform.
        padded_window_size (int): Frame size after zero-padding on the right.
        window_size (int): Frame size in samples.
        window_shift (int): Shift between adjacent frames in samples.
        window_type (str): Window name passed to `_feature_window_function`.
        blackman_coeff (float): Coefficient for the Blackman window.
        snip_edges (bool): Edge handling flag passed to `_get_strided`.
        raw_energy (bool): If True the log energy is computed before
            pre-emphasis and windowing, otherwise after windowing/padding.
        energy_floor (float): Floor passed to `_get_log_energy`.
        dither (float): Scale of the Gaussian noise added to the frames;
            0.0 disables dithering.
        remove_dc_offset (bool): Whether to subtract each frame's mean.
        preemphasis_coefficient (float): Pre-emphasis coefficient; 0.0
            disables pre-emphasis.

    Returns:
        Tuple[Tensor, Tensor]: The processed frames of shape
            (m, padded_window_size) and the per-frame log energy of shape (m).
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift,
                                 snip_edges)

    if dither != 0.0:
        # Draw the noise directly from a standard normal distribution.
        # The previous code fed a single uniform sample into both factors of
        # the Box-Muller formula, which requires two independent samples and
        # therefore did not produce Gaussian noise.
        rand_gauss = paddle.randn(strided_input.shape, dtype=dtype)
        strided_input = strided_input + rand_gauss * dither

    if remove_dc_offset:
        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
        strided_input = strided_input - row_means

    if raw_energy:
        # Energy of the frames before pre-emphasis and windowing.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Replicate-pad one sample on the left so that every sample has a
        # predecessor, then subtract the scaled predecessor.
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        strided_input = (strided_input - preemphasis_coefficient *
                         offset_strided_input[:, :-1])

    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    strided_input = strided_input * window_function  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        padding_right = padded_window_size - window_size
        strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        # Energy of the fully processed (windowed and padded) frames.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # size (m)

    return strided_input, signal_log_energy
|
||||
|
||||
|
||||
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Optionally remove the mean taken over axis 0 from `tensor`.

    Args:
        tensor (Tensor): Input features.
        subtract_mean (bool): When False the input is returned untouched.

    Returns:
        Tensor: `tensor` itself, or `tensor` minus its mean over axis 0.
    """
    if not subtract_mean:
        return tensor
    return tensor - paddle.mean(tensor, axis=0).unsqueeze(0)
|
||||
|
||||
|
||||
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute a Kaldi-style log power spectrogram of one waveform channel.

    The selected channel is framed and windowed, transformed with a real FFT
    and converted to a floored log power spectrum. The first bin of every
    frame is then overwritten with that frame's log energy. The output is
    intended to be identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel to process; negative values select channel 0. Defaults to -1.
        dither (float, optional): Dithering constant; 0.0 disables dithering. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy; 0.0 disables the floor. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient for the input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of every frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad every frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop the samples at the end of the waveform that cannot fill a
            single frame. Otherwise reflect padding is applied to the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output. Defaults to False.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1), where the number of
            frames m depends on frame_length and frame_shift.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, frame_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Real FFT of every frame: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames)

    # Floored log power: (m, padded_window_size // 2 + 1)
    power = spectrum.abs().pow(2.)
    log_power = paddle.maximum(power, epsilon).log()
    # The first bin carries the frame's log energy instead.
    log_power[:, 0] = frame_log_energy

    return _subtract_column_mean(log_power, subtract_mean)
|
||||
|
||||
|
||||
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
    """Convert a single mel value to a frequency: 700 * (exp(mel / 1127) - 1)."""
    growth = math.exp(mel_freq / 1127.0)
    return 700.0 * (growth - 1.0)
|
||||
|
||||
|
||||
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    """Element-wise mel to frequency conversion: 700 * (exp(mel / 1127) - 1)."""
    scaled = mel_freq / 1127.0
    return 700.0 * (scaled.exp() - 1.0)
|
||||
|
||||
|
||||
def _mel_scale_scalar(freq: float) -> float:
    """Convert a single frequency to the mel scale: 1127 * ln(1 + freq / 700)."""
    ratio = 1.0 + freq / 700.0
    return 1127.0 * math.log(ratio)
|
||||
|
||||
|
||||
def _mel_scale(freq: Tensor) -> Tensor:
    """Element-wise frequency to mel conversion: 1127 * ln(1 + freq / 700)."""
    ratio = 1.0 + freq / 700.0
    return 1127.0 * ratio.log()
|
||||
|
||||
|
||||
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply the piecewise-linear VTLN warping function to frequencies.

    Frequencies between the two inflection points are scaled by
    `1 / vtln_warp_factor`; the segments below and above them are linear
    pieces anchored at `low_freq` and `high_freq`. Frequencies outside
    [low_freq, high_freq] are returned unchanged.

    Args:
        vtln_low_cutoff (float): Lower inflection point; must exceed `low_freq`.
        vtln_high_cutoff (float): Upper inflection point; must be below `high_freq`.
        low_freq (float): Lower cut-off frequency of the mel banks.
        high_freq (float): Upper cut-off frequency of the mel banks.
        vtln_warp_factor (float): VTLN warp factor.
        freq (Tensor): Frequencies to warp.

    Returns:
        Tensor: Warped frequencies with the same shape as `freq`.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'

    # Inflection points of the warping function and their warped images.
    low_knee = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    high_knee = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    warped_low_knee = scale * low_knee
    warped_high_knee = scale * high_knee
    assert low_knee > low_freq and high_knee < high_freq

    # Slopes of the outer segments.
    scale_left = (warped_low_knee - low_freq) / (low_knee - low_freq)
    scale_right = (high_freq - warped_high_knee) / (high_freq - high_knee)
    warped = paddle.empty_like(freq)

    below_low = paddle.less_than(freq, paddle.to_tensor(low_freq))
    above_high = paddle.greater_than(freq, paddle.to_tensor(high_freq))
    out_of_range = below_low | above_high
    left_of_low_knee = paddle.less_than(freq, paddle.to_tensor(low_knee))
    left_of_high_knee = paddle.less_than(freq, paddle.to_tensor(high_knee))
    right_of_high_knee = paddle.greater_equal(freq,
                                              paddle.to_tensor(high_knee))

    # The masks overlap, so the assignment order matters: later writes
    # override earlier ones.
    warped[right_of_high_knee] = high_freq + scale_right * (
        freq[right_of_high_knee] - high_freq)
    warped[left_of_high_knee] = scale * freq[left_of_high_knee]
    warped[left_of_low_knee] = low_freq + scale_left * (
        freq[left_of_low_knee] - low_freq)
    warped[out_of_range] = freq[out_of_range]

    return warped
|
||||
|
||||
|
||||
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """Warp mel-scale frequencies with the VTLN warping function.

    The input is converted from mel to linear frequency, warped by
    `_vtln_warp_freq`, and converted back to the mel scale.

    Args:
        vtln_low_cutoff (float): Lower inflection point of the warping function.
        vtln_high_cutoff (float): Upper inflection point of the warping function.
        low_freq: Lower cut-off frequency of the mel banks.
        high_freq (float): Upper cut-off frequency of the mel banks.
        vtln_warp_factor (float): VTLN warp factor.
        mel_freq (Tensor): Frequencies on the mel scale.

    Returns:
        Tensor: Warped frequencies on the mel scale.
    """
    linear_freq = _inverse_mel_scale(mel_freq)
    warped_freq = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
                                  high_freq, vtln_warp_factor, linear_freq)
    return _mel_scale(warped_freq)
|
||||
|
||||
|
||||
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build the triangular mel filter bank.

    Args:
        num_bins (int): Number of mel bins; at least 3.
        window_length_padded (int): Padded frame size; must be even.
        sample_freq (float): Sample rate of the signal.
        low_freq (float): Lower cut-off frequency.
        high_freq (float): Upper cut-off frequency; values <= 0 are taken as
            an offset from the Nyquist frequency.
        vtln_low (float): Lower inflection point of the VTLN warping function.
        vtln_high (float): Upper inflection point of the VTLN warping
            function; negative values are an offset from Nyquist.
        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping.

    Returns:
        Tuple[Tensor, Tensor]: The filter weights of shape
            (num_bins, num_fft_bins) and the centre frequencies of the bins
            with shape (num_bins, 1).

    Raises:
        AssertionError: If the options are inconsistent.
    """
    # Three bins are allowed, as the message says (was `> 3`).
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # NOTE(review): true division makes this a float that is later handed to
    # `paddle.arange`; kept as in the original - confirm whether `//` is meant.
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # The bins are spaced evenly on the mel scale; neighbouring triangles
    # overlap by half their width.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    # `bin_index` (formerly `bin`) no longer shadows the builtin.
    bin_index = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin_index * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin_index + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin_index + 2.0) * mel_freq_delta  # (num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins, 1)
    # (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)

    # (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Without warping left < center < right holds, so the triangle is the
        # positive part of the lower of the two slopes.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping the slopes are selected explicitly per region.
        bins = paddle.zeros_like(up_slope)
        # left_mel < mel <= center_mel: the centre belongs to the rising
        # slope (weight 1), as in Kaldi; a strict `<` left it at zero.
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_equal(
            mel, center_mel)
        # center_mel < mel < right_mel
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
|
||||
|
||||
|
||||
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute Kaldi-style mel filter bank features of one waveform channel.

    The selected channel is framed and windowed, its magnitude (or power)
    spectrum is projected onto a triangular mel filter bank, and the result is
    optionally converted to the log domain. The output is intended to be
    identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel to process; negative values select channel 0. Defaults to -1.
        dither (float, optional): Dithering constant; 0.0 disables dithering. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy; 0.0 disables the floor. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency; values <= 0 are an offset from the
            Nyquist frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True the energy column is appended last instead of first
            (only relevant when use_energy is True). Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient for the input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of every frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad every frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop the samples at the end of the waveform that cannot fill a
            single frame. Otherwise reflect padding is applied to the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output. Defaults to False.
        use_energy (bool, optional): Add a column holding the frame log energy to the output. Defaults to False.
        use_log_fbank (bool, optional): Return log filter bank energies when True. Defaults to True.
        use_power (bool, optional): Whether to use the power instead of the magnitude spectrum. Defaults to True.
        vtln_high (float, optional): High inflection point of the piecewise linear VTLN warping function;
            negative values are an offset from Nyquist. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor; 1.0 disables warping. Defaults to 1.0.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A filter bank tensor with shape (m, n_mels), or (m, n_mels + 1) when use_energy is True.
    """
    dtype = waveform.dtype

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, frame_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Magnitude or power spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames).abs()
    if use_power:
        spectrum = spectrum.pow(2.)

    # Mel filter weights: (n_mels, padded_window_size // 2)
    mel_filters, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                    high_freq, vtln_low, vtln_high, vtln_warp)
    mel_filters = mel_filters.astype(dtype)

    # Append one zero column so that the filters match the number of
    # spectrum bins: (n_mels, padded_window_size // 2 + 1)
    mel_filters = paddle.nn.functional.pad(
        mel_filters.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # Project the spectrum onto the filters: (m, n_mels)
    features = paddle.mm(spectrum, mel_filters.T)
    if use_log_fbank:
        features = paddle.maximum(features, _get_epsilon(dtype)).log()

    if use_energy:
        energy_column = frame_log_energy.unsqueeze(1)  # (m, 1)
        if htk_compat:
            columns = (features, energy_column)
        else:
            columns = (energy_column, features)
        # (m, n_mels + 1)
        features = paddle.concat(columns, axis=1)

    return _subtract_column_mean(features, subtract_mean)
|
||||
|
||||
|
||||
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Return the DCT matrix used to turn mel energies into cepstra.

    A square 'ortho' DCT matrix is created with `create_dct`, its first
    column is overwritten with sqrt(1 / n_mels), and the first `n_mfcc`
    columns are kept.

    Args:
        n_mfcc (int): Number of cepstral coefficients to keep.
        n_mels (int): Number of mel bins.

    Returns:
        Tensor: DCT matrix of shape (n_mels, n_mfcc).
    """
    first_column = math.sqrt(1 / float(n_mels))
    full_dct = create_dct(n_mels, n_mels, 'ortho')
    full_dct[:, 0] = first_column
    return full_dct[:, :n_mfcc]  # (n_mels, n_mfcc)
|
||||
|
||||
|
||||
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Return the cepstral liftering weights.

    The weight of coefficient i is
    1 + 0.5 * cepstral_lifter * sin(pi * i / cepstral_lifter).

    Args:
        n_mfcc (int): Number of cepstral coefficients.
        cepstral_lifter (float): Liftering constant; must be non-zero.

    Returns:
        Tensor: Weights of shape (n_mfcc).
    """
    index = paddle.arange(n_mfcc)
    phase = math.pi * index / cepstral_lifter
    amplitude = 0.5 * cepstral_lifter
    return 1.0 + amplitude * paddle.sin(phase)
|
||||
|
||||
|
||||
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute Kaldi-style mel frequency cepstral coefficients of one waveform channel.

    Log power filter bank features are computed with `fbank`, transformed with
    a DCT, optionally liftered, and the first coefficient is optionally
    replaced by the frame log energy. The output is intended to be identical
    to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Liftering constant that scales the output MFCCs; 0.0 disables
            liftering. Defaults to 22.0.
        channel (int, optional): Channel to process; negative values select channel 0. Defaults to -1.
        dither (float, optional): Dithering constant; 0.0 disables dithering. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy; 0.0 disables the floor. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency; values <= 0 are an offset from the
            Nyquist frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True the first coefficient (or the energy) is moved to the last
            column. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra in the output; must not exceed n_mels. Defaults to 13.
        n_mels (int, optional): Number of mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient for the input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of every frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad every frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop the samples at the end of the waveform that cannot fill a
            single frame. Otherwise reflect padding is applied to the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output. Defaults to False.
        use_energy (bool, optional): Replace the first coefficient by the frame log energy. Defaults to False.
        vtln_high (float, optional): High inflection point of the piecewise linear VTLN warping function;
            negative values are an offset from Nyquist. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor; 1.0 disables warping. Defaults to 1.0.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc).

    Raises:
        AssertionError: If n_mfcc is larger than n_mels.
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    dtype = waveform.dtype

    # Log power filter banks: (m, n_mels + use_energy). Mean subtraction is
    # postponed until the very end of this function.
    feature = fbank(
        waveform=waveform,
        sr=sr,
        channel=channel,
        n_mels=n_mels,
        low_freq=low_freq,
        high_freq=high_freq,
        frame_length=frame_length,
        frame_shift=frame_shift,
        window_type=window_type,
        blackman_coeff=blackman_coeff,
        round_to_power_of_two=round_to_power_of_two,
        snip_edges=snip_edges,
        dither=dither,
        remove_dc_offset=remove_dc_offset,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        energy_floor=energy_floor,
        use_energy=use_energy,
        htk_compat=htk_compat,
        use_log_fbank=True,
        use_power=True,
        subtract_mean=False,
        vtln_low=vtln_low,
        vtln_high=vtln_high,
        vtln_warp=vtln_warp)

    if use_energy:
        # `fbank` puts the energy last with htk_compat and first otherwise;
        # split it from the mel columns.
        energy_column = n_mels if htk_compat else 0
        first_mel_column = 0 if htk_compat else 1
        signal_log_energy = feature[:, energy_column]  # (m)
        feature = feature[:, first_mel_column:(n_mels + first_mel_column)]

    # (n_mels, n_mfcc)
    dct = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)

    # (m, n_mfcc)
    feature = feature.matmul(dct)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter.astype(dtype=dtype)

    if use_energy:
        # The energy takes the place of the first coefficient.
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move the first column to the end.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            energy *= math.sqrt(2)

        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)
|
@ -0,0 +1,344 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
from ..functional import compute_fbank_matrix
|
||||
from ..functional import create_dct
|
||||
from ..functional import power_to_db
|
||||
from ..functional.window import get_window
|
||||
|
||||
__all__ = [
|
||||
'Spectrogram',
|
||||
'MelSpectrogram',
|
||||
'LogMelSpectrogram',
|
||||
'MFCC',
|
||||
]
|
||||
|
||||
|
||||
class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute the power spectrogram of a given signal, typically an audio waveform.

        The output is the squared magnitude of the short-time Fourier
        transform computed by `paddle.signal.stft`.

        Parameters:
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length (int|None): the hop length of the short time FFT. It is passed on to
                `paddle.signal.stft` unchanged, so None selects that function's default.
                The default value is None.
            win_length (int|None): the window length of the short time FFT. If None, it is set to n_fft.
                The default value is None.
            window (str): the name of the window function applied to the signal before the Fourier
                transform; it is resolved by `get_window`. The window names documented for this layer are
                'hamming','hann','kaiser','gaussian','exponential','triang','bohman','blackman','cosine',
                'tukey','taylor'.
                The default value is 'hann'.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'. The default value is 'reflect'.
            dtype (str): the data type of the window.
        """
        super().__init__()

        win_length = n_fft if win_length is None else win_length

        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)
        # Bind every STFT option once so that `forward` only supplies the signal.
        stft_options = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_options)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x):
        """Return the squared magnitude of the STFT of `x`."""
        return paddle.square(paddle.abs(self._stft(x)))
|
||||
|
||||
|
||||
class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Compute the mel spectrogram of a given signal, typically an audio waveform.

        The mel spectrogram is also known as filterbank or fbank feature in the audio community.
        It is computed by multiplying the mel filter bank matrix with the spectrogram.

        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length (int|None): the hop length of the short time FFT, passed on to `Spectrogram`.
                The default value is None.
            win_length (int|None): the window length of the short time FFT, passed on to `Spectrogram`.
                The default value is None.
            window (str): the name of the window function applied to the signal before the Fourier
                transform. The window names documented for this layer are 'hamming','hann','kaiser',
                'gaussian','exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels (int): the number of mel bins. The default value is 64.
            f_min (float): the lower cut-off frequency, below which the filter response is zero.
                The default value is 50.0.
            f_max (float|None): the upper cut-off frequency, above which the filter response is zero.
                If None, sr // 2 is used for the filter bank. The default value is None.
            htk (bool): whether to use the HTK formula in computing the fbank matrix.
            norm (str|float): the normalization type in computing the fbank matrix. Slaney-style is used
                by default. You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype (str): the data type of the window and of the fbank matrix.
        """
        super().__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # Keep the constructor arguments as given (f_max may stay None).
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm

        # The filter bank itself falls back to sr // 2 as upper cut-off.
        fbank_f_max = sr // 2 if f_max is None else f_max
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=fbank_f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x):
        """Return the mel spectrogram of `x`: fbank matrix times spectrogram."""
        return paddle.matmul(self.fbank_matrix, self._spectrogram(x))
|
||||
|
||||
|
||||
class LogMelSpectrogram(nn.Layer):
|
||||
def __init__(self,
|
||||
sr: int=22050,
|
||||
n_fft: int=512,
|
||||
hop_length: Optional[int]=None,
|
||||
win_length: Optional[int]=None,
|
||||
window: str='hann',
|
||||
center: bool=True,
|
||||
pad_mode: str='reflect',
|
||||
n_mels: int=64,
|
||||
f_min: float=50.0,
|
||||
f_max: Optional[float]=None,
|
||||
htk: bool=False,
|
||||
norm: Union[str, float]='slaney',
|
||||
ref_value: float=1.0,
|
||||
amin: float=1e-10,
|
||||
top_db: Optional[float]=None,
|
||||
dtype: str=paddle.float32):
|
||||
"""Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
|
||||
typically an audio waveform.
|
||||
Parameters:
|
||||
sr (int): the audio sample rate.
|
||||
The default value is 22050.
|
||||
n_fft (int): the number of frequency components of the discrete Fourier transform.
|
||||
The default value is 2048,
|
||||
hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
|
||||
The default value is None.
|
||||
win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
|
||||
The default value is None.
|
||||
window (str): the name of the window function applied to the single before the Fourier transform.
|
||||
The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
|
||||
'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
|
||||
The default value is 'hann'
|
||||
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
|
||||
If False, frame t begins at x[t * hop_length]
|
||||
The default value is True
|
||||
pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
|
||||
and 'constant'.
|
||||
The default value is 'reflect'.
|
||||
n_mels (int): the mel bins.
|
||||
f_min (float): the lower cut-off frequency, below which the filter response is zero.
|
||||
f_max (float): the upper cut-off frequency, above which the filter response is zeros.
|
||||
htk (bool): whether to use HTK formula in computing fbank matrix.
|
||||
norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
|
||||
You can specify norm=1.0/2.0 to use customized p-norm normalization.
|
||||
ref_value (float): the reference value. If smaller than 1.0, the db level
|
||||
amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
|
||||
Otherwise, the db level is pushed down.
|
||||
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
|
||||
e.g., 1e-3.
|
||||
top_db (float): the maximum db value of resulting spectrum, above which the
|
||||
spectrum is clipped(to top_db).
|
||||
dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
|
||||
accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
|
||||
"""
|
||||
super(LogMelSpectrogram, self).__init__()
|
||||
|
||||
self._melspectrogram = MelSpectrogram(
|
||||
sr=sr,
|
||||
n_fft=n_fft,
|
||||
hop_length=hop_length,
|
||||
win_length=win_length,
|
||||
window=window,
|
||||
center=center,
|
||||
pad_mode=pad_mode,
|
||||
n_mels=n_mels,
|
||||
f_min=f_min,
|
||||
f_max=f_max,
|
||||
htk=htk,
|
||||
norm=norm,
|
||||
dtype=dtype)
|
||||
|
||||
self.ref_value = ref_value
|
||||
self.amin = amin
|
||||
self.top_db = top_db
|
||||
|
||||
def forward(self, x):
    """Compute the log-scaled mel spectrogram of ``x``.

    The input is passed through the wrapped ``MelSpectrogram`` layer and the
    result is converted to decibels by ``power_to_db`` using the
    ``ref_value``, ``amin`` and ``top_db`` stored on this layer.

    Parameters:
        x: the input forwarded unchanged to the wrapped ``MelSpectrogram``.
    Returns:
        The mel spectrogram in log (dB) scale.
    """
    mel = self._melspectrogram(x)
    return power_to_db(
        mel, ref_value=self.ref_value, amin=self.amin, top_db=self.top_db)
|
||||
|
||||
|
||||
class MFCC(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute mel frequency cepstral coefficients (MFCCs) of given waveforms.

        The layer chains a ``LogMelSpectrogram`` with a DCT matrix built by
        ``create_dct``; every argument except ``n_mfcc`` is forwarded to
        ``LogMelSpectrogram``.

        Parameters:
            sr (int): the audio sample rate. The default value is 22050.
            n_mfcc (int, optional): number of cepstra in MFCC. Must not exceed
                ``n_mels``. Defaults to 40.
            n_fft (int): the number of frequency components of the discrete
                Fourier transform. The default value is 512.
            hop_length (int|None): the hop length of the short time FFT. If None,
                it is set to win_length//4. The default value is None.
            win_length (int|None): the window length of the short time FFT. If
                None, it is set to the same value as n_fft. The default value is None.
            window (str): the name of the window function applied to the signal
                before the Fourier transform. The following window names are
                supported: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential',
                'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
                The default value is 'hann'.
            center (bool): if True, the signal is padded so that frame t is
                centered at x[t * hop_length]. If False, frame t begins at
                x[t * hop_length]. The default value is True.
            pad_mode (str): the mode to pad the signal if necessary. The supported
                modes are 'reflect' and 'constant'. The default value is 'reflect'.
            n_mels (int): the number of mel bins. The default value is 64.
            f_min (float): the lower cut-off frequency, below which the filter
                response is zero. The default value is 50.0.
            f_max (float|None): the upper cut-off frequency, above which the
                filter response is zero. The default value is None.
            htk (bool): whether to use the HTK formula in computing the fbank matrix.
            norm (str|float): the normalization type in computing the fbank matrix.
                Slaney-style is used by default. You can specify norm=1.0/2.0 to
                use customized p-norm normalization.
            ref_value (float): the reference value. If smaller than 1.0, the db
                level of the signal will be pulled up accordingly. Otherwise, the
                db level is pushed down.
            amin (float): the minimum value of input magnitude, below which the
                input magnitude is clipped (to amin). For numerical stability,
                set amin to a larger value, e.g., 1e-3.
            top_db (float|None): forwarded to ``LogMelSpectrogram`` to limit the
                dynamic range of the log spectrum. The default value is None.
            dtype (str): the datatype of the fbank and DCT matrices used in the
                transform. Use float64 to increase numerical accuracy. Note that
                the final transform will be conducted in float32 regardless of
                dtype of fbank matrix.
        """
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)

        # Everything but n_mfcc configures the underlying log-mel front end.
        front_end_kwargs = dict(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        self._log_melspectrogram = LogMelSpectrogram(**front_end_kwargs)

        # DCT matrix of shape (n_mels, n_mfcc); assigned first, then registered
        # as a buffer under the same name, exactly as the layer has always done.
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x):
        """Return the MFCCs of ``x``.

        The log-mel feature is treated as a rank-3 tensor whose axis 1 is the
        mel axis: it is transposed so the mel axis is last, multiplied by the
        (n_mels, n_mfcc) DCT matrix, and transposed back.
        """
        log_mel = self._log_melspectrogram(x)
        mel_last = log_mel.transpose((0, 2, 1))
        cepstra = paddle.matmul(mel_last, self.dct_matrix)
        # Back to coefficient-major layout: (B, n_mfcc, L)
        return cepstra.transpose((0, 2, 1))
|
@ -0,0 +1,20 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .functional import compute_fbank_matrix
|
||||
from .functional import create_dct
|
||||
from .functional import fft_frequencies
|
||||
from .functional import hz_to_mel
|
||||
from .functional import mel_frequencies
|
||||
from .functional import mel_to_hz
|
||||
from .functional import power_to_db
|
@ -0,0 +1,265 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from librosa(https://github.com/librosa/librosa)
|
||||
import math
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
|
||||
__all__ = [
|
||||
'hz_to_mel',
|
||||
'mel_to_hz',
|
||||
'mel_frequencies',
|
||||
'fft_frequencies',
|
||||
'compute_fbank_matrix',
|
||||
'power_to_db',
|
||||
'create_dct',
|
||||
]
|
||||
|
||||
|
||||
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert frequencies in Hz to the Mel scale.

    Parameters:
        freq: a tensor of arbitrary shape, or a single floating point number.
        htk: if True use the HTK formula ``2595 * log10(1 + f / 700)``;
            otherwise use the piecewise scale that is linear below 1000 Hz
            and logarithmic above it. The default value is False.
    Returns:
        The frequencies represented in Mel-scale, of the same kind (tensor or
        float) as ``freq``.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Linear region
    f_min = 0.0
    f_sp = 200.0 / 3
    linear = (freq - f_min) / f_sp

    # Log region
    knee_hz = 1000.0  # beginning of log region (Hz)
    knee_mel = (knee_hz - f_min) / f_sp  # same point expressed in Mels
    log_step = math.log(6.4) / 27.0  # step size for log region

    if not is_tensor:
        if freq >= knee_hz:
            return knee_mel + math.log(freq / knee_hz + 1e-10) / log_step
        return linear

    # The 1e-10 offset keeps log() finite for zero-valued entries.
    log_part = knee_mel + paddle.log(freq / knee_hz + 1e-10) / log_step
    # Blend both regions arithmetically with a 0/1 mask
    # (will replace by masked_fill OP in future).
    above_knee = (freq > knee_hz).astype(freq.dtype)
    return log_part * above_knee + linear * (1 - above_knee)
|
||||
|
||||
|
||||
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert Mel-scale values to frequencies in Hz.

    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or
            a floating point number.
        htk: if True use the HTK formula ``700 * (10**(m / 2595) - 1)``;
            otherwise invert the piecewise linear/logarithmic scale.
    Returns:
        The frequencies represented in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Linear region
    f_min = 0.0
    f_sp = 200.0 / 3
    linear = f_min + f_sp * mel

    # Nonlinear (log) region
    knee_hz = 1000.0  # beginning of log region (Hz)
    knee_mel = (knee_hz - f_min) / f_sp  # same point expressed in Mels
    log_step = math.log(6.4) / 27.0  # step size for log region

    if not isinstance(mel, paddle.Tensor):
        if mel >= knee_mel:
            return knee_hz * math.exp(log_step * (mel - knee_mel))
        return linear

    log_part = knee_hz * paddle.exp(log_step * (mel - knee_mel))
    # Blend both regions arithmetically with a 0/1 mask
    # (will replace by masked_fill OP in future).
    above_knee = (mel > knee_mel).astype(mel.dtype)
    return log_part * above_knee + linear * (1 - above_knee)
|
||||
|
||||
|
||||
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute frequencies that are uniformly spaced on the Mel scale.

    Parameters:
        n_mels(int): number of points to generate.
        f_min(float): the lowest frequency of the grid, in Hz.
        f_max(float): the highest frequency of the grid, in Hz.
        htk(bool): whether to use the HTK formula for the Hz/Mel conversions.
        dtype(str): the datatype of the returned frequencies.
    Returns:
        A tensor of ``n_mels`` frequencies in Hz: the points are spaced evenly
        between the Mel values of ``f_min`` and ``f_max`` and then converted
        back to Hz.
    """
    # Band edges expressed on the Mel scale
    mel_lo, mel_hi = (hz_to_mel(edge, htk=htk) for edge in (f_min, f_max))
    mel_grid = paddle.linspace(mel_lo, mel_hi, n_mels, dtype=dtype)
    return mel_to_hz(mel_grid, htk=htk)
|
||||
|
||||
|
||||
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute the center frequencies of the FFT bins.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        dtype(str): the datatype of the returned frequencies.
    Returns:
        A tensor of ``1 + n_fft // 2`` frequencies in Hz, evenly spaced from 0
        to ``sr / 2``.
    """
    half_rate = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, half_rate, n_bins, dtype=dtype)
|
||||
|
||||
|
||||
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute the mel filter bank (fbank) matrix.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter
            response is zero.
        f_max(float|None): the upper cut-off frequency, above which the filter
            response is zero. If None, ``sr / 2`` is used.
        htk(bool): whether to use the HTK formula.
        norm(str|float): 'slaney' scales each filter by 2 / (width of its
            band in Hz); an int or float applies p-norm normalization with
            ``p=norm`` along the frequency axis; any other value leaves the
            filters unnormalized.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2

    n_bins = int(1 + n_fft // 2)
    weights = paddle.zeros((n_mels, n_bins), dtype=dtype)

    # Center freqs of each FFT bin
    bin_hz = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # n_mels triangular filters need n_mels + 2 edge frequencies
    edge_hz = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    edge_gap = edge_hz[1:] - edge_hz[:-1]  # np.diff(edge_hz)
    # np.subtract.outer(edge_hz, bin_hz)
    ramps = edge_hz.unsqueeze(1) - bin_hz.unsqueeze(0)

    for band in range(n_mels):
        # Rising and falling slopes of this band's triangle, for all bins
        rising = -ramps[band] / edge_gap[band]
        falling = ramps[band + 2] / edge_gap[band + 1]

        # .. then intersect them with each other and zero
        triangle = paddle.minimum(rising, falling)
        weights[band] = paddle.maximum(paddle.zeros_like(rising), triangle)

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (edge_hz[2:n_mels + 2] - edge_hz[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
|
||||
|
||||
|
||||
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Computes ``10 * log10(max(x, amin)) - 10 * log10(max(ref_value, amin))``.

    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db
            level of the signal will be pulled up accordingly. Otherwise, the
            db level is pushed down.
        amin(float): the minimum value of input magnitude, below which the
            input magnitude is clipped (to amin).
        top_db(float|None): if given, values lower than
            ``max(output) - top_db`` are raised to that floor, limiting the
            dynamic range of the result to ``top_db`` dB below its peak.
    Returns:
        The spectrogram in log-scale.
    Raises:
        Exception: if ``amin`` or ``ref_value`` is not strictly positive, or
            if ``top_db`` is negative.
    shape:
        input: any shape
        output: same as input
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")

    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")

    ones = paddle.ones_like(magnitude)
    floored = paddle.maximum(ones * amin, magnitude)
    db = 10.0 * paddle.log10(floored)
    db -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is None:
        return db

    if top_db < 0:
        raise Exception("top_db must be non-negative")
    floor_db = ones * (db.max() - top_db)
    return paddle.maximum(db, floor_db)
|
||||
|
||||
|
||||
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
    """Create a discrete cosine transform (DCT) matrix.

    Parameters:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (str, optional): Normalization type, either 'ortho' or None.
            With None the cosine basis is multiplied by 2; with 'ortho' the
            first basis row is scaled by 1/sqrt(2) and the whole matrix by
            sqrt(2 / n_mels). Defaults to 'ortho'.
        dtype (str, optional): the datatype of the returned matrix.
    Returns:
        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coef_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    angle_step = math.pi / float(n_mels)
    # size (n_mfcc, n_mels)
    basis = paddle.cos(angle_step * (mel_index + 0.5) * coef_index)

    if norm is None:
        basis *= 2.0
        return basis.T

    assert norm == "ortho"
    basis[0] *= 1.0 / math.sqrt(2.0)
    basis *= math.sqrt(2.0 / float(n_mels))
    return basis.T
|
@ -0,0 +1,15 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .dtw import dtw_distance
|
||||
from .mcd import mcd_distance
|
@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import numpy as np
|
||||
from dtaidistance import dtw_ndim
|
||||
|
||||
__all__ = [
|
||||
'dtw_distance',
|
||||
]
|
||||
|
||||
|
||||
def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
    """Dynamic Time Warping (DTW) distance between two sequences.

    Thin wrapper that delegates to ``dtaidistance.dtw_ndim.distance``. As
    described for that routine, a compact matrix is kept instead of the full
    warping paths matrix, and dynamic programming is used to compute:

        wps[i, j] = (s1[i]-s2[j])**2 + min(
                        wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
                        wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
                        wps[i-1, j-1])            // diagonal   / match
        dtw = sqrt(wps[-1, -1])

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]

    Returns:
        float: dtw distance
    """
    distance = dtw_ndim.distance(xs, ys)
    return distance
|
@ -0,0 +1,48 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import mcd.metrics_fast as mt
|
||||
import numpy as np
|
||||
from mcd import dtw
|
||||
|
||||
__all__ = [
|
||||
'mcd_distance',
|
||||
]
|
||||
|
||||
|
||||
def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
    """Mel cepstral distortion (MCD) computed along a DTW alignment.

    Delegates to ``mcd.dtw.dtw`` and returns the minimum cost it reports; the
    alignment path it also returns is discarded. The dynamic program is
    described as:

        wps[i, j] = cost_fn(xs[i], ys[j]) + min(
                        wps[i-1, j  ],  // vertical   / insertion / expansion
                        wps[i  , j-1],  // horizontal / deletion  / compression
                        wps[i-1, j-1])  // diagonal   / match
        dtw = sqrt(wps[-1, -1])

    Default cost function (``mcd.metrics_fast.logSpecDbDist``), described as:

        logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)

        def logSpecDbDist(x, y):
            diff = x - y
            return logSpecDbConst * math.sqrt(np.inner(diff, diff))

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]
        cost_fn: per-frame cost callable passed to ``dtw.dtw``.

    Returns:
        float: dtw distance
    """
    total_cost, _alignment = dtw.dtw(xs, ys, cost_fn)
    return total_cost
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,25 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .download import decompress
|
||||
from .download import download_and_decompress
|
||||
from .download import load_state_dict_from_url
|
||||
from .env import DATA_HOME
|
||||
from .env import MODEL_HOME
|
||||
from .env import PPAUDIO_HOME
|
||||
from .env import USER_HOME
|
||||
from .error import ParameterError
|
||||
from .log import Logger
|
||||
from .log import logger
|
||||
from .time import seconds_to_hms
|
||||
from .time import Timer
|
@ -0,0 +1,193 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
from typing import List
|
||||
|
||||
from prettytable import PrettyTable
|
||||
|
||||
from ..log import logger
|
||||
from ..utils import cli_register
|
||||
from ..utils import stats_wrapper
|
||||
|
||||
__all__ = ['StatsExecutor']
|
||||
|
||||
# Per-task layout of pretrained-model names: the '-'-separated parts of each
# value become the table column headers, and each model key is split on '-'
# to fill the matching columns.
model_name_format = dict(
    asr='Model-Language-Sample Rate',
    cls='Model-Sample Rate',
    st='Model-Source language-Target language',
    text='Model-Task-Language',
    tts='Model-Language', )
|
||||
|
||||
|
||||
@cli_register(
    name='paddlespeech.stats',
    description='Get speech tasks support models list.')
class StatsExecutor():
    """List the pretrained models available for a speech task.

    Usable from the command line through ``execute`` and from Python through
    ``__call__``. Both print a table whose columns come from
    ``model_name_format`` and whose rows are the keys of the task's
    ``pretrained_models`` mapping.
    """

    def __init__(self):
        super(StatsExecutor, self).__init__()

        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']

        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
        # NOTE: `default` has no effect while `required=True`; it is kept so
        # the parser definition stays as before.
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
            choices=['asr', 'cls', 'st', 'text', 'tts'],
            help='Choose speech task.',
            required=True)

    def show_support_models(self, pretrained_models: dict):
        """Print a table with one row per key of ``pretrained_models``.

        Reads ``self.task`` to pick the column headers. Both the header
        string and every key are split on '-'.
        """
        fields = model_name_format[self.task].split("-")
        table = PrettyTable(fields)
        for key in pretrained_models:
            table.add_row(key.split("-"))
        print(table)

    def _import_pretrained_models(self) -> dict:
        """Import and return ``pretrained_models`` for ``self.task``.

        The imports are done lazily so that only the selected task's package
        is loaded.
        """
        if self.task == 'asr':
            from ..asr.infer import pretrained_models
        elif self.task == 'cls':
            from ..cls.infer import pretrained_models
        elif self.task == 'st':
            from ..st.infer import pretrained_models
        elif self.task == 'text':
            from ..text.infer import pretrained_models
        elif self.task == 'tts':
            from ..tts.infer import pretrained_models
        else:
            raise ValueError('Unsupported speech task: %s' % self.task)
        return pretrained_models

    def _list_models(self, report_info, report_error) -> bool:
        """Show the models of ``self.task``; return True on success.

        ``report_info`` and ``report_error`` are one-argument callables used
        to emit the banner and the failure message respectively.

        Only ``Exception`` is caught: ``KeyboardInterrupt`` and ``SystemExit``
        propagate instead of being reported as a listing failure.
        """
        label = self.task.upper()
        try:
            pretrained_models = self._import_pretrained_models()
            report_info(
                "Here is the list of %s pretrained models released by PaddleSpeech that can be used by command line and python API"
                % label)
            self.show_support_models(pretrained_models)
            return True
        except Exception:
            report_error("Failed to get the list of %s pretrained models." %
                         label)
            return False

    def execute(self, argv: List[str]) -> bool:
        """
            Command line entry.

            Parses ``argv`` for ``--task`` and lists that task's models,
            reporting through ``logger``. Returns True on success and False
            when the task is unknown or the listing fails.
        """
        parser_args = self.parser.parse_args(argv)
        self.task = parser_args.task
        if self.task not in self.task_choices:
            logger.error(
                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
            )
            return False

        return self._list_models(logger.info, logger.error)

    @stats_wrapper
    def __call__(
            self,
            task: str=None, ):
        """
            Python API to call an executor.

            Lists the models of ``task``, reporting through ``print``.
            Returns None.
        """
        self.task = task
        if self.task not in self.task_choices:
            print(
                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
            )
            return

        self._list_models(print, print)
|
@ -1,25 +1,107 @@
|
||||
# This is the parameter configuration file for PaddleSpeech Serving.
|
||||
|
||||
##################################################################
|
||||
# SERVER SETTING #
|
||||
##################################################################
|
||||
host: '0.0.0.0'
|
||||
#################################################################################
|
||||
# SERVER SETTING #
|
||||
#################################################################################
|
||||
host: 127.0.0.1
|
||||
port: 8090
|
||||
|
||||
##################################################################
|
||||
# CONFIG FILE #
|
||||
##################################################################
|
||||
# The engine_type of speech task needs to keep the same type as the config file of speech task.
|
||||
# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml'
|
||||
# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml'
|
||||
#
|
||||
# add engine type (Options: python, inference)
|
||||
engine_type:
|
||||
asr: 'python'
|
||||
tts: 'python'
|
||||
|
||||
# add engine backend type (Options: asr, tts) and config file here.
|
||||
# Adding a speech task to engine_backend means starting the service.
|
||||
engine_backend:
|
||||
asr: 'conf/asr/asr.yaml'
|
||||
tts: 'conf/tts/tts.yaml'
|
||||
# The task format in the engine_list is: <speech task>_<engine type>
|
||||
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
|
||||
|
||||
engine_list: ['asr_python', 'tts_python']
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
################### speech task: asr; engine_type: python #######################
|
||||
asr_python:
|
||||
model: 'conformer_wenetspeech'
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path: # [optional]
|
||||
ckpt_path: # [optional]
|
||||
decode_method: 'attention_rescoring'
|
||||
force_yes: True
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
|
||||
|
||||
################### speech task: asr; engine_type: inference #######################
|
||||
asr_inference:
|
||||
# model_type choices=['deepspeech2offline_aishell']
|
||||
model_type: 'deepspeech2offline_aishell'
|
||||
am_model: # the pdmodel file of am static model [optional]
|
||||
am_params: # the pdiparams file of am static model [optional]
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path:
|
||||
decode_method:
|
||||
force_yes: True
|
||||
|
||||
am_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
|
||||
################### speech task: tts; engine_type: python #######################
|
||||
tts_python:
|
||||
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
|
||||
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
|
||||
# 'fastspeech2_vctk']
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
|
||||
# 'pwgan_vctk', 'mb_melgan_csmsc']
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
|
||||
|
||||
################### speech task: tts; engine_type: inference #######################
|
||||
tts_inference:
|
||||
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_model: # the pdmodel file of your am static model (XX.pdmodel)
|
||||
am_params: # the pdiparams file of your am static model (XX.pdiparams)
|
||||
am_sample_rate: 24000
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
am_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
# voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
|
||||
voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
|
||||
voc_sample_rate: 24000
|
||||
|
||||
voc_predictor_conf:
|
||||
device: # set 'gpu:id' or 'cpu'
|
||||
switch_ir_optim: True
|
||||
glog_info: False # True -> print glog
|
||||
summary: True # False -> do not show predictor config
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
|
||||
|
@ -1,8 +0,0 @@
|
||||
model: 'conformer_wenetspeech'
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path: # [optional]
|
||||
ckpt_path: # [optional]
|
||||
decode_method: 'attention_rescoring'
|
||||
force_yes: True
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
@ -1,25 +0,0 @@
|
||||
# This is the parameter configuration file for ASR server.
|
||||
# These are the static models that support paddle inference.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['deepspeech2offline_aishell'] TODO
|
||||
##################################################################
|
||||
model_type: 'deepspeech2offline_aishell'
|
||||
am_model: # the pdmodel file of am static model [optional]
|
||||
am_params: # the pdiparams file of am static model [optional]
|
||||
lang: 'zh'
|
||||
sample_rate: 16000
|
||||
cfg_path:
|
||||
decode_method:
|
||||
force_yes: True
|
||||
|
||||
am_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: True
|
||||
switch_ir_optim: True
|
||||
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
@ -1,32 +0,0 @@
|
||||
# This is the parameter configuration file for TTS server.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
|
||||
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
|
||||
# 'fastspeech2_vctk']
|
||||
##################################################################
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
##################################################################
|
||||
# VOCODER SETTING #
|
||||
# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
|
||||
# 'pwgan_vctk', 'mb_melgan_csmsc']
|
||||
##################################################################
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
||||
lang: 'zh'
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
@ -1,40 +0,0 @@
|
||||
# This is the parameter configuration file for TTS server.
|
||||
# These are the static models that support paddle inference.
|
||||
|
||||
##################################################################
|
||||
# ACOUSTIC MODEL SETTING #
|
||||
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
|
||||
##################################################################
|
||||
am: 'fastspeech2_csmsc'
|
||||
am_model: # the pdmodel file of your am static model (XX.pdmodel)
|
||||
am_params: # the pdiparams file of your am static model (XX.pdiparams)
|
||||
am_sample_rate: 24000 # must match the model
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
am_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: False
|
||||
switch_ir_optim: False
|
||||
|
||||
|
||||
##################################################################
|
||||
# VOCODER SETTING #
|
||||
# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
|
||||
##################################################################
|
||||
voc: 'pwgan_csmsc'
|
||||
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
|
||||
voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
|
||||
voc_sample_rate: 24000 # must match the model
|
||||
|
||||
voc_predictor_conf:
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
enable_mkldnn: False
|
||||
switch_ir_optim: False
|
||||
|
||||
##################################################################
|
||||
# OTHERS #
|
||||
##################################################################
|
||||
lang: 'zh'
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue