Merge branch 'develop' into fix_varbase

pull/833/head
Hui Zhang 4 years ago committed by GitHub
commit 282914f45b

@@ -1,5 +1,3 @@
-[Chinese version](README_cn.md)
 # PaddlePaddle Speech to Any toolkit

 ![License](https://img.shields.io/badge/license-Apache%202-red.svg)
@@ -11,7 +9,7 @@
 ## Features

-See [feature list](doc/src/feature_list.md) for more information.
+See [feature list](docs/src/feature_list.md) for more information.

 ## Setup
@@ -20,20 +18,20 @@ All tested under:
 * python>=3.7
 * paddlepaddle>=2.2.0rc

-Please see [install](doc/src/install.md).
+Please see [install](docs/src/install.md).

 ## Getting Started

-Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
+Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).

 ## More Information

-* [Data Prepration](doc/src/data_preparation.md)
-* [Data Augmentation](doc/src/augmentation.md)
-* [Ngram LM](doc/src/ngram_lm.md)
-* [Benchmark](doc/src/benchmark.md)
-* [Relased Model](doc/src/released_model.md)
+* [Data Preparation](docs/src/data_preparation.md)
+* [Data Augmentation](docs/src/augmentation.md)
+* [Ngram LM](docs/src/ngram_lm.md)
+* [Benchmark](docs/src/benchmark.md)
+* [Released Model](docs/src/released_model.md)

 ## Questions and Help
@@ -47,4 +45,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
 ## Acknowledgement

-We depends on many open source repos. See [References](doc/src/reference.md) for more information.
+We depend on many open source repos. See [References](docs/src/reference.md) for more information.

@@ -1,49 +0,0 @@ (README_cn.md deleted; translated from Chinese)
[English](README.md)

# PaddlePaddle Speech to Any toolkit

![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)

*DeepSpeech* is an open-source project for an end-to-end automatic speech recognition engine built on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in industrial applications and academic research, covering training, inference, and deployment.

## Features

See the [feature list](doc/src/feature_list.md).

## Installation

Tested and verified under the following environments:

* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.2.0rc

See [install](doc/src/install.md).

## Getting Started

See [Getting Started](doc/src/getting_started.md) and the [tiny egs](examples/tiny/s0/README.md).

## More Information

* [Data Preparation](doc/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md)
* [Language Model](doc/src/ngram_lm.md)
* [Benchmark](doc/src/benchmark.md)
* [Released Model](doc/src/released_model.md)

## Questions and Help

You are welcome to ask questions in [GitHub Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and to report bugs in [GitHub Issues](https://github.com/PaddlePaddle/models/issues). Contributions to this project are also welcome.

## License

DeepSpeech is released under the [Apache-2.0 License](./LICENSE).

## Acknowledgement

We drew on a number of excellent open-source repositories during development; see [References](doc/src/reference.md) for details.

@@ -1,191 +0,0 @@ (tune.py deleted)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam search parameters tuning for DeepSpeech2 model."""
import functools
import sys
import numpy as np
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments
def tune(config, args):
    """Tune parameters alpha and beta incrementally."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    config.defrost()
    config.data.manifest = config.data.dev_manifest
    config.data.augmentation_config = ""
    config.data.keep_transcription_text = True
    dev_dataset = ManifestDataset.from_config(config)

    valid_loader = DataLoader(
        dev_dataset,
        batch_size=config.data.batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=SpeechCollator(keep_transcription_text=True))

    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                             args.checkpoint_path)
    model.eval()

    # decoders only accept string encoded in utf-8
    vocab_list = valid_loader.dataset.vocab_list
    errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]

    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize external scorer
    model.decoder.init_decode(args.alpha_from, args.beta_from,
                              config.decoding.lang_model_path, vocab_list,
                              config.decoding.decoding_method)

    # incrementally tune parameters over multiple batches
    print("start tuning ...")
    for infer_data in valid_loader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break

        def ordid2token(texts, texts_len):
            """Convert ord()-style ids back to chr() characters."""
            trans = []
            for text, n in zip(texts, texts_len):
                n = n.numpy().item()
                ids = text[:n]
                trans.append(''.join([chr(i) for i in ids]))
            return trans

        audio, audio_len, text, text_len = infer_data
        target_transcripts = ordid2token(text, text_len)
        num_ins += audio.shape[0]

        # model infer
        eouts, eouts_len = model.encoder(audio, audio_len)
        probs = model.decoder.softmax(eouts)

        # grid search
        for index, (alpha, beta) in enumerate(params_grid):
            print(f"tuning: alpha={alpha} beta={beta}")
            result_transcripts = model.decoder.decode_probs(
                probs.numpy(), eouts_len, vocab_list,
                config.decoding.decoding_method,
                config.decoding.lang_model_path, alpha, beta,
                config.decoding.beam_size, config.decoding.cutoff_prob,
                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)

            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                # accumulate the length of references of every batch
                # in the first iteration
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref

            err_ave[index] = err_sum[index] / len_refs
            if index % 2 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
        print("tuning: one grid done!")

        # output on-line tuning result at the end of current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" %
              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
               "%.3f" % params_grid[min_index][1],
               config.decoding.error_rate_type, err_ave_min))

        cur_batch += 1

    # output WER/CER at every (alpha, beta)
    print("\nFinal %s:\n" % config.decoding.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f" %
              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
               config.decoding.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
          (cur_batch, "%.3f" % params_grid[min_index][0],
           "%.3f" % params_grid[min_index][1]))
    print("finish tuning")


def main(config, args):
    tune(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    add_arg = functools.partial(add_arguments, argparser=parser)
    add_arg('num_batches', int, -1, "# of batches tuning on. "
            "Default -1, on whole dev set.")
    add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
    add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
    add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
    add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
    add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
    add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
    add_arg('batch_size', int, 256, "# of samples per batch.")
    add_arg('beam_size', int, 500, "Beam search width.")
    add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
    add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
    add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.data.batch_size = args.batch_size
    config.decoding.beam_size = args.beam_size
    config.decoding.num_proc_bsearch = args.num_proc_bsearch
    config.decoding.cutoff_prob = args.cutoff_prob
    config.decoding.cutoff_top_n = args.cutoff_top_n
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
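The deleted script's core is a brute-force sweep over the `(alpha, beta)` grid built with `np.linspace`. A minimal standalone sketch of the grid construction and best-cell selection; the error values here are random placeholders, not real WER/CER:

```python
import numpy as np

# Build the same Cartesian grid tune() uses: num_alphas x num_betas cells.
cand_alphas = np.linspace(1.0, 3.2, 45)   # alpha_from, alpha_to, num_alphas
cand_betas = np.linspace(0.1, 0.45, 8)    # beta_from, beta_to, num_betas
params_grid = [(a, b) for a in cand_alphas for b in cand_betas]

# Pretend we already accumulated an average error per grid cell.
err_ave = np.random.rand(len(params_grid))  # placeholder, stands in for CER/WER

min_index = int(np.argmin(err_ave))
alpha_opt, beta_opt = params_grid[min_index]
print(f"opt (alpha, beta) = ({alpha_opt:.3f}, {beta_opt:.3f})")
```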

@@ -35,12 +35,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
 from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.training.reporter import report
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -66,7 +68,9 @@ class DeepSpeech2Trainer(Trainer):
         super().__init__(config, args)

     def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config.training
+        batch_size = self.config.collator.batch_size
+        accum_grad = self.config.training.accum_grad
         start = time.time()

         # forward
@@ -77,7 +81,7 @@ class DeepSpeech2Trainer(Trainer):
         }

         # loss backward
-        if (batch_index + 1) % train_conf.accum_grad != 0:
+        if (batch_index + 1) % accum_grad != 0:
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
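The `accum_grad` logic defers the optimizer step and, under DDP, wraps the accumulation-only backward passes in `no_sync()` so gradients are all-reduced only on the updating batch. A runnable sketch of the control flow with a stand-in model; `FakeDDPModel` is hypothetical, real code would use `paddle.DataParallel`:

```python
from contextlib import nullcontext

class FakeDDPModel:
    """Stand-in for a DDP-wrapped model; a real no_sync() suppresses
    gradient all-reduce so grads only accumulate locally."""
    def no_sync(self):
        return nullcontext()

accum_grad = 4  # mirror train_conf.accum_grad
model, parallel = FakeDDPModel(), True

for batch_index in range(8):
    updating = (batch_index + 1) % accum_grad == 0
    # Accumulation-only batches run under no_sync (when DDP is active);
    # single-process models lack `no_sync`, hence the nullcontext fallback.
    context = model.no_sync if (parallel and not updating) else nullcontext
    with context():
        pass  # forward + loss.backward() would go here
    if updating:
        pass  # optimizer.step(); optimizer.clear_grad(); iteration += 1
```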
@@ -92,19 +96,18 @@ class DeepSpeech2Trainer(Trainer):
             layer_tools.print_grads(self.model, print_func=None)

         # optimizer step
-        if (batch_index + 1) % train_conf.accum_grad == 0:
+        if (batch_index + 1) % accum_grad == 0:
             self.optimizer.step()
             self.optimizer.clear_grad()
             self.iteration += 1

         iteration_time = time.time() - start

-        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
-        msg += "accum: {}, ".format(train_conf.accum_grad)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", batch_size)
+        report("accum", accum_grad)
+        report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             for k, v in losses_np.items():
@@ -147,10 +150,9 @@ class DeepSpeech2Trainer(Trainer):
     def setup_model(self):
         config = self.config.clone()
-        config.defrost()
-        config.model.feat_size = self.train_loader.collate_fn.feature_size
-        config.model.dict_size = self.train_loader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.model.feat_size = self.train_loader.collate_fn.feature_size
+            config.model.dict_size = self.train_loader.collate_fn.vocab_size

         if self.args.model_type == 'offline':
             model = DeepSpeech2Model.from_config(config.model)

@@ -17,6 +17,7 @@ import os
 import sys
 import time
 from collections import defaultdict
+from collections import OrderedDict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
@@ -33,6 +34,8 @@ from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
 from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
@@ -43,6 +46,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -100,7 +104,8 @@ class U2Trainer(Trainer):
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
-            context = self.model.no_sync
+            # When using cpu w/o DDP, model does not have `no_sync`
+            context = self.model.no_sync if self.parallel else nullcontext
         else:
             # Used for single gpu training and DDP gradient synchronization
             # processes.
@@ -119,12 +124,11 @@ class U2Trainer(Trainer):
         iteration_time = time.time() - start

         if (batch_index + 1) % train_conf.log_interval == 0:
-            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.collator.batch_size)
-            msg += "accum: {}, ".format(train_conf.accum_grad)
-            msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                             for k, v in losses_np.items())
-            logger.info(msg)
+            for k, v in losses_np.items():
+                report(k, v)
+            report("batch_size", self.config.collator.batch_size)
+            report("accum", train_conf.accum_grad)
+            report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             losses_np_v = losses_np.copy()
@@ -197,15 +201,29 @@ class U2Trainer(Trainer):
             data_start_time = time.time()
             for batch_index, batch in enumerate(self.train_loader):
                 dataload_time = time.time() - data_start_time
-                msg = "Train: Rank: {}, ".format(dist.get_rank())
-                msg += "epoch: {}, ".format(self.epoch)
-                msg += "step: {}, ".format(self.iteration)
-                msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                len(self.train_loader))
-                msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                msg += "data time: {:>.3f}s, ".format(dataload_time)
-                self.train_batch(batch_index, batch, msg)
-                self.after_train_batch()
+                msg = "Train:"
+                observation = OrderedDict()
+                with ObsScope(observation):
+                    report("Rank", dist.get_rank())
+                    report("epoch", self.epoch)
+                    report('step', self.iteration)
+                    report('step/total',
+                           (batch_index + 1) / len(self.train_loader))
+                    report("lr", self.lr_scheduler())
+                    self.train_batch(batch_index, batch, msg)
+                    self.after_train_batch()
+                    report('reader_cost', dataload_time)
+                    observation['batch_cost'] = observation[
+                        'reader_cost'] + observation['step_cost']
+                    observation['samples'] = observation['batch_size']
+                    observation['ips[sent./sec]'] = observation[
+                        'batch_size'] / observation['batch_cost']
+                for k, v in observation.items():
+                    msg += f" {k}: "
+                    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                    msg += ","
+                logger.info(msg)
                 data_start_time = time.time()
         except Exception as e:
             logger.error(e)
@@ -314,10 +332,11 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
         model = U2Model.from_config(model_conf)

         if self.parallel:
@@ -560,7 +579,7 @@ class U2Tester(U2Trainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)
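These `.size(i)` to `.shape[i]` hunks (here and throughout the files below) migrate torch-style dimension reads to the paddle idiom, where `Tensor.shape` is a plain Python list. A tiny illustration, assuming paddle is installed:

```python
import paddle

encoder_out = paddle.zeros([2, 7, 256])  # (B, maxlen, encoder_dim)
maxlen = encoder_out.shape[1]            # list indexing, the paddle idiom
encoder_dim = encoder_out.shape[2]       # torch would write encoder_out.size(2)
assert (maxlen, encoder_dim) == (7, 256)
```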

@@ -32,6 +32,7 @@ from deepspeech.training.trainer import Trainer
 from deepspeech.training.updaters.trainer import Trainer as NewTrainer
 from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -121,10 +122,10 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
         model = U2Model.from_config(model_conf)

         if self.parallel:

@@ -41,6 +41,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -319,10 +320,10 @@ class U2Trainer(Trainer):
         # model
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.feat_dim
-        model_conf.output_dim = self.train_loader.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.feat_dim
+            model_conf.output_dim = self.train_loader.vocab_size
         model = U2Model.from_config(model_conf)

         if self.parallel:
             model = paddle.DataParallel(model)
@@ -555,7 +556,7 @@ class U2Tester(U2Trainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)

@@ -47,6 +47,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -345,10 +346,10 @@ class U2STTrainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
         model = U2STModel.from_config(model_conf)

         if self.parallel:
@@ -587,7 +588,7 @@ class U2STTester(U2STTrainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)

@@ -76,19 +76,19 @@ class ManifestDataset(Dataset):
         Args:
             manifest_path (str): manifest json file path
             max_input_len ([type], optional): maximum input seq length,
                 in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
             min_input_len (float, optional): minimum input seq length,
                 in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
             max_output_len (float, optional): maximum output seq length,
                 in modeling units. Defaults to 500.0.
             min_output_len (float, optional): minimum output seq length,
                 in modeling units. Defaults to 0.0.
             max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio.
                 Defaults to 10.0.
             min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio.
                 Defaults to 0.05.
         """
         super().__init__()

@@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2Model", "U2InferModel"]
@@ -297,8 +298,8 @@ class U2BaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@@ -403,7 +404,7 @@ class U2BaseModel(nn.Layer):
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         encoder_out_lens = encoder_mask.squeeze(1).sum(1)
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)
@@ -454,7 +455,7 @@ class U2BaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (1, maxlen, vocab_size)
         ctc_probs = ctc_probs.squeeze(0)
@@ -582,7 +583,7 @@ class U2BaseModel(nn.Layer):
         encoder_out = encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
-            (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool)
+            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
         decoder_out, _ = self.decoder(
             encoder_out, encoder_mask, hyps_pad,
             hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
@@ -689,13 +690,13 @@ class U2BaseModel(nn.Layer):
         Returns:
             paddle.Tensor: decoder output, (B, L)
         """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
         encoder_out = encoder_out.repeat(num_hyps, 1, 1)
         # (B, 1, T)
         encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                       hyps_lens)
@@ -750,7 +751,7 @@ class U2BaseModel(nn.Layer):
         Returns:
             List[List[int]]: transcripts.
         """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]
         if decoding_method in ['ctc_prefix_beam_search',
                                'attention_rescoring'] and batch_size > 1:
             logger.fatal(
@@ -778,7 +779,7 @@ class U2BaseModel(nn.Layer):
         # result in List[int], change it to List[List[int]] for compatible
         # with other batch decoding mode
         elif decoding_method == 'ctc_prefix_beam_search':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
             hyp = self.ctc_prefix_beam_search(
                 feats,
                 feats_lengths,
@@ -788,7 +789,7 @@ class U2BaseModel(nn.Layer):
                 simulate_streaming=simulate_streaming)
             hyps = [hyp]
         elif decoding_method == 'attention_rescoring':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
             hyp = self.attention_rescoring(
                 feats,
                 feats_lengths,
@@ -903,10 +904,10 @@ class U2Model(U2BaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
         model = cls.from_config(config)

         if checkpoint_path:

@@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
 from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import th_accuracy
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2STModel", "U2STInferModel"]
@@ -339,8 +340,8 @@ class U2STBaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@@ -495,13 +496,13 @@ class U2STBaseModel(nn.Layer):
         Returns:
             paddle.Tensor: decoder output, (B, L)
         """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
         encoder_out = encoder_out.repeat(num_hyps, 1, 1)
         # (B, 1, T)
         encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                       hyps_lens)
@@ -556,7 +557,7 @@ class U2STBaseModel(nn.Layer):
         Returns:
             List[List[int]]: transcripts.
         """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]

         if decoding_method == 'fullsentence':
             hyps = self.translate(
@@ -686,10 +687,10 @@ class U2STModel(U2STBaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
         model = cls.from_config(config)

         if checkpoint_path:

@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
             paddle.Tensor: Transformed value tensor, size
                 (#batch, n_head, time2, d_k).
         """
-        n_batch = query.size(0)
+        n_batch = query.shape[0]
         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
         k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
         v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
             paddle.Tensor: Transformed value weighted
                 by the attention score, (#batch, time1, d_model).
         """
-        n_batch = value.size(0)
+        n_batch = value.shape[0]
         if mask is not None:
             mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
             scores = scores.masked_fill(mask, -float('inf'))
@@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             paddle.Tensor: Output tensor. (batch, head, time1, time1)
         """
         zero_pad = paddle.zeros(
-            (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype)
+            (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
         x_padded = paddle.cat([zero_pad, x], dim=-1)

-        x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
+        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
+                                 x.shape[2])
         x = x_padded[:, :, 1:].view_as(x)  # [B, H, T1, T1]

         if zero_triu:
-            ones = paddle.ones((x.size(2), x.size(3)))
-            x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+            ones = paddle.ones((x.shape[2], x.shape[3]))
+            x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]

         return x
@@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         q, k, v = self.forward_qkv(query, key, value)
         q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

-        n_batch_pos = pos_emb.size(0)
+        n_batch_pos = pos_emb.shape[0]
         p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
         p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
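The `rel_shift` touched above is the Transformer-XL relative-position trick: zero-pad the score matrix on the last axis, reshape, and drop the first row to realign the columns. A numpy re-enactment of the same pad-reshape-slice; shapes follow the `(batch, head, time1, time1)` comment in the hunk, values are arbitrary:

```python
import numpy as np

B, H, T = 1, 1, 3
x = np.arange(B * H * T * T, dtype=np.float32).reshape(B, H, T, T)

zero_pad = np.zeros((B, H, T, 1), dtype=x.dtype)
x_padded = np.concatenate([zero_pad, x], axis=-1)   # (B, H, T, T+1)
x_padded = x_padded.reshape(B, H, T + 1, T)         # realign columns
x_shifted = x_padded[:, :, 1:].reshape(B, H, T, T)  # drop first row, restore shape

print(x[0, 0])          # original score matrix
print(x_shifted[0, 0])  # each row shifted one position further left
```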

@@ -122,7 +122,7 @@ class TransformerDecoder(nn.Layer):
         # tgt_mask: (B, 1, L)
         tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
         # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
         # tgt_mask: (B, L, L)
         tgt_mask = tgt_mask & m
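`subsequent_mask` supplies the causal half of `tgt_mask`. A numpy analogue, assuming it is the usual lower-triangular allow-matrix that gets broadcast as `(1, L, L)` against the padding mask:

```python
import numpy as np

def subsequent_mask_np(size: int) -> np.ndarray:
    """Lower-triangular boolean mask: position i may attend to j <= i."""
    return np.tril(np.ones((size, size), dtype=bool))

print(subsequent_mask_np(4).astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]
```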

@@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
             paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
         """
         T = x.shape[1]
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
         #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + T]
         x = x * self.xscale + pos_emb
@@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
             paddle.Tensor: Encoded tensor (batch, time, `*`).
             paddle.Tensor: Positional embedding tensor (1, time, `*`).
         """
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
         x = x * self.xscale
         #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + x.shape[1]]

@@ -159,7 +159,7 @@ class BaseEncoder(nn.Layer):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
         mask_pad = ~masks
@@ -206,11 +206,11 @@ class BaseEncoder(nn.Layer):
                 chunk computation
             List[paddle.Tensor]: conformer cnn cache
         """
-        assert xs.size(0) == 1  # batch size must be one
+        assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         # TODO(Hui Zhang): stride_slice not support bool tensor
         # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
-        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32)
+        tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
         tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]

         if self.global_cmvn is not None:
@@ -220,25 +220,25 @@ class BaseEncoder(nn.Layer):
             xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)

         if subsampling_cache is not None:
-            cache_size = subsampling_cache.size(1)  #T
+            cache_size = subsampling_cache.shape[1]  #T
             xs = paddle.cat((subsampling_cache, xs), dim=1)
         else:
             cache_size = 0

         # only used when using `RelPositionMultiHeadedAttention`
         pos_emb = self.embed.position_encoding(
-            offset=offset - cache_size, size=xs.size(1))
+            offset=offset - cache_size, size=xs.shape[1])

         if required_cache_size < 0:
             next_cache_start = 0
         elif required_cache_size == 0:
-            next_cache_start = xs.size(1)
+            next_cache_start = xs.shape[1]
         else:
-            next_cache_start = xs.size(1) - required_cache_size
+            next_cache_start = xs.shape[1] - required_cache_size
         r_subsampling_cache = xs[:, next_cache_start:, :]

         # Real mask for transformer/conformer layers
-        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
         masks = masks.unsqueeze(1)  #[B=1, L'=1, T]
         r_elayers_output_cache = []
         r_conformer_cnn_cache = []
@@ -302,7 +302,7 @@ class BaseEncoder(nn.Layer):
         stride = subsampling * decoding_chunk_size
         decoding_window = (decoding_chunk_size - 1) * subsampling + context
-        num_frames = xs.size(1)
+        num_frames = xs.shape[1]
         required_cache_size = decoding_chunk_size * num_decoding_left_chunks
         subsampling_cache: Optional[paddle.Tensor] = None
         elayers_output_cache: Optional[List[paddle.Tensor]] = None
@@ -318,10 +318,10 @@ class BaseEncoder(nn.Layer):
                 chunk_xs, offset, required_cache_size, subsampling_cache,
                 elayers_output_cache, conformer_cnn_cache)
             outputs.append(y)
-            offset += y.size(1)
+            offset += y.shape[1]
         ys = paddle.cat(outputs, 1)

         # fake mask, just for jit script and compatibility with `forward` api
-        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool)
         masks = masks.unsqueeze(1)
         return ys, masks
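The chunk-by-chunk forward above slides a window of `decoding_window` input frames with stride `subsampling * decoding_chunk_size`. A small arithmetic sketch of the resulting chunk boundaries; the values are illustrative and the loop bound mirrors the typical wenet-style chunk iteration rather than being copied from this hunk:

```python
subsampling, context = 4, 7   # assumed conv subsampling rate / receptive field
decoding_chunk_size, num_frames = 16, 200

stride = subsampling * decoding_chunk_size                           # 64
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67

for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    print(f"feed frames [{cur}, {end})")
# -> [0, 67) [64, 131) [128, 195) [192, 200)
```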

@@ -43,33 +43,57 @@ def default_argument_parser():
     """
     parser = argparse.ArgumentParser()
-    # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    #                     help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-
-    # random seed
-    parser.add_argument("--seed", type=int, default=None,
-                        help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-
-    # profiler
-    parser.add_argument('--profiler_options', type=str, default=None,
-                        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".')
-    # yapf: enable
+    train_group = parser.add_argument_group(
+        title='Train Options', description=None)
+    train_group.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="seed to use for paddle, np and random. None or 0 for random, else set seed."
+    )
+    train_group.add_argument(
+        "--device",
+        type=str,
+        default='gpu',
+        choices=["cpu", "gpu"],
+        help="device cpu and gpu are supported.")
+    train_group.add_argument(
+        "--nprocs",
+        type=int,
+        default=1,
+        help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument(
+        "--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument(
+        "--opts",
+        type=str,
+        default=[],
+        nargs='+',
+        help="overwrite --config file, passing in LIST[KEY VALUE] pairs")
+    train_group.add_argument(
+        "--dump-config", metavar="FILE", help="dump config to `this` file.")
+
+    profile_group = parser.add_argument_group(
+        title='Benchmark Options', description=None)
+    profile_group.add_argument(
+        '--profiler-options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    profile_group.add_argument(
+        '--benchmark-batch-size',
+        type=int,
+        default=None,
+        help='batch size for benchmark.')
+    profile_group.add_argument(
+        '--benchmark-max-step',
+        type=int,
+        default=None,
+        help='max iteration for benchmark.')

     return parser
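A usage sketch of the regrouped parser; it assumes the `deepspeech` package is importable, the config path is a made-up example, and argparse exposes the dashed flags as underscored attributes:

```python
from deepspeech.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args([
    "--config", "conf/deepspeech2.yaml",   # hypothetical example path
    "--nprocs", "1",
    "--benchmark-batch-size", "32",
    "--benchmark-max-step", "100",
])
# argparse maps --benchmark-batch-size -> args.benchmark_batch_size, etc.
print(args.benchmark_batch_size, args.benchmark_max_step)  # 32 100
```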

@@ -20,8 +20,8 @@ from paddle.nn import Layer
 from . import extension
 from ..reporter import DictSummary
+from ..reporter import ObsScope
 from ..reporter import report
-from ..reporter import scope
 from ..timer import Timer
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -78,7 +78,7 @@ class StandardEvaluator(extension.Extension):
         summary = DictSummary()
         for batch in self.dataloader:
             observation = {}
-            with scope(observation):
+            with ObsScope(observation):
                 # main evaluation computation here.
                 with paddle.no_grad():
                     self.evaluate_sync(self.evaluate_core(batch))

@@ -19,7 +19,7 @@ OBSERVATIONS = None

 @contextlib.contextmanager
-def scope(observations):
+def ObsScope(observations):
     # make `observation` the target to report to.
     # it is basically a dictionary that stores temporary observations
     global OBSERVATIONS
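`report()` writes into whichever dict the innermost `ObsScope` installed as the module-global `OBSERVATIONS` target. A self-contained re-implementation of the pattern for illustration, not the project's actual module:

```python
import contextlib

OBSERVATIONS = None

@contextlib.contextmanager
def ObsScope(observations):
    # Make `observations` the dict that report() writes into, then restore.
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield
    finally:
        OBSERVATIONS = old

def report(name, value):
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value

obs = {}
with ObsScope(obs):
    report("lr", 1e-3)
    report("step_cost", 0.25)
print(obs)  # {'lr': 0.001, 'step_cost': 0.25}
```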

@@ -11,19 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 import time
+from collections import OrderedDict
 from pathlib import Path

 import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter

+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
 from deepspeech.training.timer import Timer
 from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["Trainer"]
@@ -96,11 +101,21 @@ class Trainer():
         self.checkpoint_dir = None
         self.iteration = 0
         self.epoch = 0
+        self.rank = dist.get_rank()
+
+        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")

         if args.seed:
             seed_all(args.seed)
             logger.info(f"Set seed {args.seed}")

+        if self.args.benchmark_batch_size:
+            with UpdateConfig(self.config):
+                self.config.collator.batch_size = self.args.benchmark_batch_size
+                self.config.training.log_interval = 1
+            logger.info(
+                f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
+
     def setup(self):
         """Setup the experiment.
         """
@@ -188,6 +203,12 @@ class Trainer():
         if self.args.profiler_options:
             profiler.add_profiler_step(self.args.profiler_options)

+        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
+            logger.info(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+            sys.exit(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+
     def train(self):
         """The training process control by epoch."""
         from_scratch = self.resume_or_scratch()
@@ -208,15 +229,29 @@ class Trainer():
             data_start_time = time.time()
             for batch_index, batch in enumerate(self.train_loader):
                 dataload_time = time.time() - data_start_time
-                msg = "Train: Rank: {}, ".format(dist.get_rank())
-                msg += "epoch: {}, ".format(self.epoch)
-                msg += "step: {}, ".format(self.iteration)
-                msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                len(self.train_loader))
-                msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                msg += "data time: {:>.3f}s, ".format(dataload_time)
-                self.train_batch(batch_index, batch, msg)
-                self.after_train_batch()
+                msg = "Train:"
+                observation = OrderedDict()
+                with ObsScope(observation):
+                    report("Rank", dist.get_rank())
+                    report("epoch", self.epoch)
+                    report('step', self.iteration)
+                    report('step/total',
+                           (batch_index + 1) / len(self.train_loader))
+                    report("lr", self.lr_scheduler())
+                    self.train_batch(batch_index, batch, msg)
+                    self.after_train_batch()
+                    report('reader_cost', dataload_time)
+                    observation['batch_cost'] = observation[
+                        'reader_cost'] + observation['step_cost']
+                    observation['samples'] = observation['batch_size']
+                    observation['ips[sent./sec]'] = observation[
+                        'batch_size'] / observation['batch_cost']
+                for k, v in observation.items():
+                    msg += f" {k}: "
+                    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                    msg += ","
+                logger.info(msg)
                 data_start_time = time.time()
         except Exception as e:
             logger.error(e)

@@ -24,7 +24,7 @@ import tqdm
 from deepspeech.training.extensions.extension import Extension
 from deepspeech.training.extensions.extension import PRIORITY_READER
-from deepspeech.training.reporter import scope
+from deepspeech.training.reporter import ObsScope
 from deepspeech.training.triggers import get_trigger
 from deepspeech.training.triggers.limit_trigger import LimitTrigger
 from deepspeech.training.updaters.updater import UpdaterBase
@@ -144,7 +144,7 @@ class Trainer():
         # you can use `report` freely in Updater.update()

         # updating parameters and state
-        with scope(self.observation):
+        with ObsScope(self.observation):
             update()
             p.update()

@@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     y_insert_blank = insert_blank(y, blank_id)  #(2L+1)

     log_alpha = paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
+        (ctc_probs.shape[0], len(y_insert_blank)))  #(T, 2L+1)
     log_alpha = log_alpha - float('inf')  # log of zero

     # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
     state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
+        (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
                   )  # state path, Tuple((T, 2L+1))

     # init start state
     log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
     log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb

-    for t in range(1, ctc_probs.size(0)):  # T
+    for t in range(1, ctc_probs.shape[0]):  # T
         for s in range(len(y_insert_blank)):  # 2L+1
             if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
                     s] == y_insert_blank[s - 2]:
@@ -114,7 +114,7 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                     y_insert_blank[s]]
             state_path[t, s] = prev_state[paddle.argmax(candidates)]

     # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)
+    state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)

     candidates = paddle.to_tensor([
         log_alpha[-1, len(y_insert_blank) - 1],  # Sb
@@ -122,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     ])
     prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
     state_seq[-1] = prev_state[paddle.argmax(candidates)]
-    for t in range(ctc_probs.size(0) - 2, -1, -1):
+    for t in range(ctc_probs.shape[0] - 2, -1, -1):
         state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]

     output_alignment = []
-    for t in range(0, ctc_probs.size(0)):
+    for t in range(0, ctc_probs.shape[0]):
         output_alignment.append(y_insert_blank[state_seq[t, 0]])

     return output_alignment
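`forced_align` runs a Viterbi pass over CTC log-posteriors with blanks interleaved between labels (`insert_blank` builds the `2L+1` state chain). A toy call under assumed shapes; the invocation itself is commented out because only part of the signature is visible in this hunk:

```python
import paddle
import paddle.nn.functional as F

T, V = 6, 4                      # frames, vocab size; blank id assumed to be 0
ctc_probs = F.log_softmax(paddle.randn([T, V]), axis=-1)  # (T, V) log posteriors
y = paddle.to_tensor([1, 2], dtype=paddle.int32)          # target label sequence

# alignment = forced_align(ctc_probs, y, blank_id=0)
# -> a length-T label-per-frame list such as [0, 1, 1, 0, 2, 0]
```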

@@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
-    max_len = max([s.size(0) for s in sequences])
+    max_len = max([s.shape[0] for s in sequences])
     if batch_first:
         out_dims = (len(sequences), max_len) + trailing_dims
     else:
@@ -91,7 +91,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     out_tensor = sequences[0].new_full(out_dims, padding_value)
     for i, tensor in enumerate(sequences):
-        length = tensor.size(0)
+        length = tensor.shape[0]
         # use index notation to prevent duplicate references to the tensor
         if batch_first:
             out_tensor[i, :length, ...] = tensor
@@ -139,7 +139,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
     #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
     #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
     #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
-    B = ys_pad.size(0)
+    B = ys_pad.shape[0]
     _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
     _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
     ys_in = paddle.cat([_sos, ys_pad], dim=1)
@@ -165,8 +165,8 @@ def th_accuracy(pad_outputs: paddle.Tensor,
     Returns:
         float: Accuracy value (0.0 - 1.0).
     """
-    pad_pred = pad_outputs.view(
-        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2)
+    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
+                                pad_outputs.shape[1]).argmax(2)
     mask = pad_targets != ignore_label
     numerator = paddle.sum(
         pad_pred.masked_select(mask) == pad_targets.masked_select(mask))

@ -16,15 +16,27 @@ import distutils.util
import math import math
import os import os
import random import random
from contextlib import contextmanager
from typing import List from typing import List
import numpy as np import numpy as np
import paddle import paddle
__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] __all__ = [
"UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
]
@contextmanager
def UpdateConfig(config):
"""Update yacs config"""
config.defrost()
yield
config.freeze()
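# Example usage (a minimal sketch, assuming `config` is a yacs CfgNode):
#
#     with UpdateConfig(config):
#         config.data.batch_size = 32
#
# The config is defrosted on entry and frozen again on exit, so temporary
# edits cannot leave it mutable afterwards.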
def seed_all(seed: int=210329): def seed_all(seed: int=210329):
"""freeze random generator seed."""
np.random.seed(seed) np.random.seed(seed)
random.seed(seed) random.seed(seed)
paddle.seed(seed) paddle.seed(seed)

@ -4,7 +4,7 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Prerequisites ## Prerequisites
- Python >= 3.7 - Python >= 3.7
- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
## Setup (Important) ## Setup (Important)

@ -1,5 +1,7 @@
# Reference # Reference
We refer to these repos when building `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git) * [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git) * [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git) * [kaldi](https://github.com/kaldi-asr/kaldi.git)

@ -1,7 +1,8 @@
#!/bin/bash #!/bin/bash
profiler_options= profiler_options=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence # seed may break model convergence
seed=0 seed=0
@ -32,12 +33,15 @@ ckpt_name=$2
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--profiler_options ${profiler_options} \ --profiler-options "${profiler_options}" \
--seed ${seed} --benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic
@ -48,4 +52,4 @@ if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
exit 0 exit 0

@ -19,17 +19,17 @@
{ {
"type": "specaug", "type": "specaug",
"params": { "params": {
"W": 0,
"warp_mode": "PIL",
"F": 10, "F": 10,
"T": 50,
"n_freq_masks": 2, "n_freq_masks": 2,
"T": 50,
"n_time_masks": 2, "n_time_masks": 2,
"p": 1.0, "p": 1.0,
"W": 80,
"adaptive_number_ratio": 0, "adaptive_number_ratio": 0,
"adaptive_size_ratio": 0, "adaptive_size_ratio": 0,
"max_n_time_masks": 20, "max_n_time_masks": 20,
"replace_with_zero": true, "replace_with_zero": true
"warp_mode": "PIL"
}, },
"prob": 1.0 "prob": 1.0
} }

@ -33,7 +33,7 @@ collator:
keep_transcription_text: False keep_transcription_text: False
sortagrad: True sortagrad: True
shuffle_method: batch_shuffle shuffle_method: batch_shuffle
num_workers: 0 num_workers: 2
# network architecture # network architecture
@ -74,7 +74,7 @@ model:
training: training:
n_epoch: 120 n_epoch: 120
accum_grad: 2 accum_grad: 2
global_grad_clip: 5.0 global_grad_clip: 5.0
optim: adam optim: adam

@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--model_type ${model_type} \ --model_type ${model_type} \
--profiler_options "${profiler_options}" \ --profiler-options "${profiler_options}" \
--seed ${seed} --seed ${seed}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then

@ -1,37 +1,49 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then profiler_options=
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" benchmark_batch_size=0
exit -1 benchmark_max_step=0
fi
# seed may break model convergence
seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu device=gpu
if [ ${ngpu} == 0 ];then if [ ${ngpu} == 0 ];then
device=cpu device=cpu
fi fi
mkdir -p exp if [ ${seed} != 0 ]; then
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1
fi fi
config_path=$1
ckpt_name=$2
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --profiler-options "${profiler_options}" \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic
fi fi

@ -1,41 +1,46 @@
#!/bin/bash #!/bin/bash
CUR_DIR=${PWD}
ROOT_DIR=../../ ROOT_DIR=../../
# Script that provides stable, reproducible benchmark results; by default it runs in the standard docker environment with py37 # Script that provides stable, reproducible benchmark results; by default it runs in the standard docker environment with py37
# collect env info # collect env info
bash ${ROOT_DIR}/utils/pd_env_collect.sh bash ${ROOT_DIR}/utils/pd_env_collect.sh
cat pd_env.txt #cat pd_env.txt
# Working directory: needs to be documented
pushd ${ROOT_DIR}/examples/aishell/s1
# 1. Install the dependencies this model needs (note it here if optimization strategies must be enabled) # 1. Install the dependencies this model needs (note it here if optimization strategies must be enabled)
pushd ${ROOT_DIR}/tools; make; popd #pushd ${ROOT_DIR}/tools; make; popd
source ${ROOT_DIR}/tools/venv/bin/activate #source ${ROOT_DIR}/tools/venv/bin/activate
pushd ${ROOT_DIR}; bash setup.sh; popd #pushd ${ROOT_DIR}; bash setup.sh; popd
# 2. Copy the data and pretrained models this model needs # 2. Copy the data and pretrained models this model needs
# Working directory: needs to be documented
#pushd ${ROOT_DIR}/examples/aishell/s1
pushd ${ROOT_DIR}/examples/tiny/s1
mkdir -p exp/log mkdir -p exp/log
loca/data.sh &> exp/log/data.log . path.sh
#bash local/data.sh &> exp/log/data.log
# 3. Run in batch (if batch running is inconvenient, steps 1-2 need to go into each individual model) # 3. Run in batch (if batch running is inconvenient, steps 1-2 need to go into each individual model)
model_mode_list=(conformer) model_mode_list=(conformer transformer)
fp_item_list=(fp32) fp_item_list=(fp32)
bs_item=(32 64 96) bs_item_list=(32 64 96)
for model_mode in ${model_mode_list[@]}; do for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do for fp_item in ${fp_item_list[@]}; do
for bs_item in ${bs_list[@]} for bs_item in ${bs_item_list[@]}
do do
echo "index is speed, 1gpus, begin, ${model_name}" echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min)
sleep 60 sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp run_mode=mp
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
sleep 60 sleep 60
done done
done done

@ -23,19 +23,19 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs" echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="--model_name=${model_name} train_cmd="--benchmark-batch-size ${batch_size}
--batch_size=${batch_size} --benchmark-max-step ${max_iter}
--fp=${fp_item} \ conf/${model_name}.yaml ${model_name}"
--max_iter=${max_iter} "
case ${run_mode} in case ${run_mode} in
sp) train_cmd="python -u tools/train.py "${train_cmd}" ;; sp) train_cmd="bash local/train.sh "${train_cmd}"" ;;
mp) mp)
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}" train_cmd="bash local/train.sh "${train_cmd}"" ;;
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1; *) echo "choose run_mode(sp or mp)"; exit 1;
esac esac
# Do not modify anything below this line
timeout 15m ${train_cmd} > ${log_file} 2>&1 # Do not modify anything below this line
CUDA_VISIBLE_DEVICES=${device} timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL" echo -e "${model_name}, FAIL"
export job_fail_flag=1 export job_fail_flag=1
@ -43,7 +43,8 @@ function _train(){
echo -e "${model_name}, SUCCESS" echo -e "${model_name}, SUCCESS"
export job_fail_flag=0 export job_fail_flag=0
fi fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
if [ $run_mode = "mp" -a -d mylog ]; then if [ $run_mode = "mp" -a -d mylog ]; then
rm ${log_file} rm ${log_file}

@ -1,146 +0,0 @@
from typing import Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
def frame(x: Tensor,
num_samples: Tensor,
win_length: int,
hop_length: int,
clip: bool = True) -> Tuple[Tensor, Tensor]:
"""Extract frames from audio.
Parameters
----------
x : Tensor
Shape (N, T), batched waveform.
num_samples : Tensor
Shape (N, ), number of samples of each waveform.
win_length : int
Window length.
hop_length : int
Number of samples shifted between adjacent frames.
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
default True
Returns
-------
frames : Tensor
Shape (N, T', win_length).
num_frames : Tensor
Shape (N, ) number of valid frames
"""
assert hop_length <= win_length
num_frames = (num_samples - win_length) // hop_length
padding = (0, 0)
if not clip:
num_frames += 1
# NOTE: pad hop_length - 1 to the right to ensure that there is at most
# one frame dangling past the right edge
padding = (0, hop_length - 1)
weight = paddle.eye(win_length).unsqueeze(1)
frames = F.conv1d(x.unsqueeze(1),
weight,
padding=padding,
stride=(hop_length, ))
return frames, num_frames
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
clip: bool
Whether to clip audio that does not fit into the last frame.
"""
def __init__(self,
n_fft: int,
hop_length: int,
win_length: int,
window_type: str = None,
clip: bool = True):
super().__init__()
self.hop_length = hop_length
self.win_length = win_length
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
self.clip = clip
# calculate window
if window_type is None:
window = np.ones(win_length)
elif window_type == "hann":
window = np.hanning(win_length)
elif window_type == "hamming":
window = np.hamming(win_length)
else:
raise ValueError("Not supported yet!")
if win_length < n_fft:
window = F.pad(window, (0, n_fft - win_length))
elif win_length > n_fft:
window = window[:n_fft]
# (n_bins, n_fft) complex
kernel_size = min(n_fft, win_length)
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
w_real = weight.real
w_imag = weight.imag
# (2 * n_bins, kernel_size)
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
w = np.expand_dims(w, 1)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
num_samples : Tensor
Number of samples of each waveform.
Returns
------------
D : Tensor
Shape(N, T', n_bins, 2) Spectrogram.
num_frames: Tensor
Shape (N,) number of frames of each spectrogram
"""
num_frames = (num_samples - self.win_length) // self.hop_length
padding = (0, 0)
if not self.clip:
num_frames += 1
padding = (0, self.hop_length - 1)
batch_size, _ = paddle.shape(x)
x = x.unsqueeze(-1)
D = F.conv1d(x,
self.weight,
stride=(self.hop_length, ),
padding=padding,
data_format="NLC")
D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
return D, num_frames

@ -0,0 +1,201 @@
import math
import paddle
import numpy as np
from typing import List, Tuple, Optional, Union
# https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/src/feat/feature-window.cc#L109
def povey_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = (0.5 - 0.5 * np.cos(a * i) )**0.85
return win
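# Note: this is Kaldi's default analysis window, equal to the Hann window
# raised to the power 0.85 (see the feature-window.cc reference above).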
def hann_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = 0.5 - 0.5 * np.cos(a * i)
return win
def sine_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = np.sin(0.5 * a * i)
return win
def hamm_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = 0.54 - 0.46 * np.cos(a * i)
return win
def get_window(wintype:Optional[str], winlen:int) -> np.ndarray:
"""get window function
Args:
wintype (Optional[str]): window type.
winlen (int): window length in samples.
Raises:
ValueError: not support window.
Returns:
np.ndarray: window coeffs.
"""
# calculate window
if not wintype or wintype == 'rectangular':
window = np.ones(winlen)
elif wintype == "hann":
window = hann_window(winlen)
elif wintype == "hamm":
window = hamm_window(winlen)
elif wintype == "povey":
window = povey_window(winlen)
else:
msg = f"{wintype} Not supported yet!"
raise ValueError(msg)
return window
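# Example (assumed values): a 25 ms window at 16 kHz spans 400 samples, so
# get_window("povey", 400) yields the 400-point povey window, while
# get_window(None, 400) falls back to a rectangular window of ones.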
def dft_matrix(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
# https://en.wikipedia.org/wiki/Discrete_Fourier_transform
# (n_bins, n_fft) complex
if n_bin is None:
n_bin = 1 + n_fft // 2
if winlen is None:
winlen = n_bin
# https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
kernel_size = min(n_fft, winlen)
n = np.arange(0, n_fft, 1.)
wsin = np.empty((n_bin, kernel_size)) #[Cout, kernel_size]
wcos = np.empty((n_bin, kernel_size)) #[Cout, kernel_size]
for k in range(n_bin): # Only half of the bins contain useful info
wsin[k,:] = -np.sin(2*np.pi*k*n/n_fft)[:kernel_size]
wcos[k,:] = np.cos(2*np.pi*k*n/n_fft)[:kernel_size]
w_real = wcos
w_imag = wsin
return w_real, w_imag, kernel_size
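# Sanity-check sketch (hypothetical sizes): the rows of w_real + 1j * w_imag
# are the first n_bin rows of the DFT matrix, so for n_fft == winlen the
# matrix products reproduce np.fft.rfft:
#
#     w_real, w_imag, k = dft_matrix(512, 512)
#     x = np.random.randn(512)
#     ref = np.fft.rfft(x)
#     assert np.allclose(w_real @ x, ref.real)  # up to float tolerance
#     assert np.allclose(w_imag @ x, ref.imag)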
def dft_matrix_fast(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
# (n_bins, n_fft) complex
if n_bin is None:
n_bin = 1 + n_fft // 2
if winlen is None:
winlen = n_bin
# https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
kernel_size = min(n_fft, winlen)
# https://en.wikipedia.org/wiki/DFT_matrix
# https://ccrma.stanford.edu/~jos/st/Matrix_Formulation_DFT.html
weight = np.fft.fft(np.eye(n_fft))[:n_bin, :kernel_size]
w_real = weight.real
w_imag = weight.imag
return w_real, w_imag, kernel_size
def bin2hz(bin:Union[List[int], np.ndarray], N:int, sr:int)->List[float]:
"""FFT bins to Hz.
http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
Args:
bins (List[int] or np.ndarray): bin index.
N (int): the number of samples, or FFT points.
sr (int): sampling rate.
Returns:
List[float]: Hz's.
"""
hz = bin * float(sr) / N
return hz
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * np.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (np.exp(mel/1127.0)-1)
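# Quick check: the two maps are inverses, e.g. mel2hz(hz2mel(1000.0))
# returns 1000.0 up to float rounding, and hz2mel(700.0) = 1127 * ln(2)
# ≈ 781.2 mel.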
def rms_to_db(rms: float):
"""Root Mean Square to dB.
Args:
rms ([float]): root mean square
Returns:
float: dB
"""
return 20.0 * math.log10(max(1e-16, rms))
def rms_to_dbfs(rms: float):
"""Root Mean Square to dBFS.
https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB.
dB = dBFS + 3.0103
dBFS = db - 3.0103
e.g. 0 dB = -3.0103 dBFS
Args:
rms ([float]): root mean square
Returns:
float: dBFS
"""
return rms_to_db(rms) - 3.0103
def max_dbfs(sample_data: np.ndarray):
"""Peak dBFS based on the maximum energy sample.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
# Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
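# Example: for sample_data peaking at 1.0 (full scale), max_dbfs returns
# rms_to_dbfs(1.0) = 20 * log10(1.0) - 3.0103 = -3.0103 dBFS.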
def mean_dbfs(sample_data):
"""Peak dBFS based on the RMS energy.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
return rms_to_dbfs(
math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
def gain_db_to_ratio(gain_db: float):
"""dB to ratio
Args:
gain_db (float): gain in dB
Returns:
float: scale in amp
"""
return math.pow(10.0, gain_db / 20.0)
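# Example: gain_db_to_ratio(6.0) ≈ 1.995 (roughly doubling the amplitude),
# and gain_db_to_ratio(-20.0) == 0.1.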

Binary file not shown.

@ -0,0 +1,266 @@
from typing import Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
import soundfile as sf
from .common import get_window
from .common import dft_matrix
def read(wavpath:str, sr:int = None, start=0, stop=None, dtype='int16', always_2d=True)->Tuple[int, np.ndarray]:
"""load wav file.
Args:
wavpath (str): wav path.
sr (int, optional): expect sample rate. Defaults to None.
dtype (str, optional): wav data bits. Defaults to 'int16'.
Returns:
Tuple[int, np.ndarray]: sr (int), wav (int16) [T, C].
"""
wav, r_sr = sf.read(wavpath, start=start, stop=stop, dtype=dtype, always_2d=always_2d)
if sr:
assert sr == r_sr
return r_sr, wav
def write(wavpath:str, wav:np.ndarray, sr:int, dtype='PCM_16'):
"""write wav file.
Args:
wavpath (str): file path to save.
wav (np.ndarray): wav data.
sr (int): data samplerate.
dtype (str, optional): wav bit format. Defaults to 'PCM_16'.
"""
sf.write(wavpath, wav, sr, subtype=dtype)
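# Usage sketch (hypothetical paths): round-trip a 16 kHz file unchanged
#
#     sr, wav = read("in.wav", sr=16000, dtype='int16')
#     write("out.wav", wav, sr, dtype='PCM_16')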
def frames(x: Tensor,
num_samples: Tensor,
sr: int,
win_length: float,
stride_length: float,
clip: bool = False) -> Tuple[Tensor, Tensor]:
"""Extract frames from audio.
Parameters
----------
x : Tensor
Shape (B, T), batched waveform.
num_samples : Tensor
Shape (B, ), number of samples of each waveform.
sr: int
Sampling Rate.
win_length : float
Window length in ms.
stride_length : float
Stride length in ms.
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
default False
Returns
-------
frames : Tensor
Shape (B, T', win_length).
num_frames : Tensor
Shape (B, ) number of valid frames
"""
assert stride_length <= win_length
stride_length = int(stride_length * sr)
win_length = int(win_length * sr)
num_frames = (num_samples - win_length) // stride_length
padding = (0, 0)
if not clip:
num_frames += 1
need_samples = num_frames * stride_length + win_length
padding = (0, need_samples - num_samples - 1)
weight = paddle.eye(win_length).unsqueeze(1) #[win_length, 1, win_length]
frames = F.conv1d(x.unsqueeze(-1),
weight,
padding=padding,
stride=(stride_length, ),
data_format='NLC')
return frames, num_frames
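# Usage sketch (assumed 16 kHz input): with win_length=0.025 and
# stride_length=0.01 the window spans 400 samples and the hop is 160, so a
# batched waveform x of shape (B, T) yields frames of shape (B, T', 400)
# plus the per-utterance frame counts.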
def dither(signal:Tensor, dither_value=1.0)->Tensor:
"""dither frames for log compute.
Args:
signal (Tensor): [B, T, D]
dither_value (float, optional): [scalar]. Defaults to 1.0.
Returns:
Tensor: [B, T, D]
"""
D = paddle.shape(signal)[-1]
signal += paddle.normal(shape=[1, 1, D]) * dither_value
return signal
def remove_dc_offset(signal:Tensor)->Tensor:
"""remove dc.
Args:
signal (Tensor): [B, T, D]
Returns:
Tensor: [B, T, D]
"""
signal -= paddle.mean(signal, axis=-1, keepdim=True)
return signal
def preemphasis(signal:Tensor, coeff=0.97)->Tensor:
"""perform preemphasis on the input signal.
Args:
signal (Tensor): [B, T, D], The signal to filter.
coeff (float, optional): [scalar].The preemphasis coefficient. 0 is no filter, Defaults to 0.97.
Returns:
Tensor: [B, T, D]
"""
return paddle.concat([
(1-coeff)*signal[:, :, 0:1],
signal[:, :, 1:] - coeff * signal[:, :, :-1]
], axis=-1)
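# Example: with coeff=0.97 each sample within a frame becomes
# y[n] = x[n] - 0.97 * x[n-1], and the first sample is scaled by
# (1 - coeff) instead, mirroring Kaldi's per-frame preemphasis.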
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
Parameters
------------
n_fft : int
Number of samples in a frame.
sr: int
Sampling rate.
stride_length : float
Number of samples shifted between adjacent frames.
win_length : float
Length of the window.
clip: bool
Whether to clip audio that does not fit into the last frame.
"""
def __init__(self,
n_fft: int,
sr: int,
win_length: float,
stride_length: float,
dither:float=0.0,
preemph_coeff:float=0.97,
remove_dc_offset:bool=True,
window_type: str = 'povey',
clip: bool = False):
super().__init__()
self.sr = sr
self.win_length = win_length
self.stride_length = stride_length
self.dither = dither
self.preemph_coeff = preemph_coeff
self.remove_dc_offset = remove_dc_offset
self.window_type = window_type
self.clip = clip
self.n_fft = n_fft
self.n_bin = 1 + n_fft // 2
w_real, w_imag, kernel_size = dft_matrix(
self.n_fft, int(self.win_length * self.sr), self.n_bin
)
# calculate window
window = get_window(window_type, kernel_size)
# (2 * n_bins, kernel_size)
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
# (kernel_size, 2 * n_bins)
w = np.transpose(w)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
num_samples : Tensor [shape=(B,)]
Number of samples of each waveform.
Returns
------------
C : Tensor
Shape(B, T', n_bins, 2) Spectrogram.
num_frames: Tensor
Shape (B,) number of frames of each spectrogram
"""
batch_size = paddle.shape(num_samples)
F, nframe = frames(x, num_samples, self.sr, self.win_length, self.stride_length, clip=self.clip)
if self.dither:
F = dither(F, self.dither)
if self.remove_dc_offset:
F = remove_dc_offset(F)
if self.preemph_coeff:
F = preemphasis(F)
C = paddle.matmul(F, self.weight) # [B, T, K] [K, 2 * n_bins]
C = paddle.reshape(C, [batch_size, -1, 2, self.n_bin])
C = C.transpose([0, 1, 3, 2])
return C, nframe
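# Usage sketch (assumed parameters): a 512-point FFT over 16 kHz audio
#
#     stft = STFT(n_fft=512, sr=16000, win_length=0.025, stride_length=0.01)
#     C, num_frames = stft(x, num_samples)  # C: (B, T', 257, 2)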
def powspec(C:Tensor) -> Tensor:
"""Compute the power spectrum |X_k|^2.
Args:
C (Tensor): [B, T, C, 2]
Returns:
Tensor: [B, T, C]
"""
real, imag = paddle.chunk(C, 2, axis=-1)
return paddle.square(real.squeeze(-1)) + paddle.square(imag.squeeze(-1))
def magspec(C: Tensor, eps=1e-10) -> Tensor:
"""Compute the magnitude spectrum |X_k|.
Args:
C (Tensor): [B, T, C, 2]
eps (float): epsilon.
Returns:
Tensor: [B, T, C]
"""
pspec = powspec(C)
return paddle.sqrt(pspec + eps)
def logspec(C: Tensor, eps=1e-10) -> Tensor:
"""Compute log-spectrum 20log10X_k.
Args:
C (Tensor): [description]
eps ([type], optional): [description]. Defaults to 1e-10.
Returns:
Tensor: [description]
"""
spec = magspec(C)
return 20 * paddle.log10(spec + eps)
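# End-to-end sketch: waveform -> log magnitude spectrogram, chaining the
# helpers above (shapes follow the 512-point example):
#
#     C, num_frames = stft(x, num_samples)
#     lspec = logspec(C)  # (B, T', 257), roughly 20 * log10(|X_k|)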

@ -0,0 +1,533 @@
from typing import Tuple
import numpy as np
import paddle
import unittest
import decimal
import numpy
import math
import logging
from pathlib import Path
from scipy.fftpack import dct
from third_party.paddle_audio.frontend import kaldi
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def do_dither(signal, dither_value=1.0):
signal += numpy.random.normal(size=signal.shape) * dither_value
return signal
def do_remove_dc_offset(signal):
signal -= numpy.mean(signal)
return signal
def do_preemphasis(signal, coeff=0.97):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype == 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = numpy.zeros(frames.shape)
for frm in range(frames.shape[0]):
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
raw_frames[frm,:] = frames[frm,:]
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
return frames * win, raw_frames
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warning(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return numpy.square(magspec(frames, NFFT))
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
ceplifter=22,useEnergy=True,wintype='povey'):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
winfunc=lambda x:numpy.ones((x,))
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
frames,raw_frames = framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
pspec = powspec(frames,nfft) # nearly the same until this part
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
return numpy.log(feat)
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * numpy.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (numpy.exp(mel/1127.0)-1)
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
# check kaldi/src/feat/Mel-computations.h
fbank = numpy.zeros([nfilt,nfft//2+1])
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
for j in range(0,nfilt):
leftmel = lowmel+j*mel_freq_delta
centermel = lowmel+(j+1)*mel_freq_delta
rightmel = lowmel+(j+2)*mel_freq_delta
for i in range(0,nfft//2):
mel=hz2mel(i*samplerate/nfft)
if mel>leftmel and mel<rightmel:
if mel<centermel:
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
else:
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
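# Example: with L=22 the lifter weights are 1 + 11 * sin(pi * n / 22) for
# coefficient index n, boosting the higher-order cepstral coefficients.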
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
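# Example: delta(feat, 2) applies the standard regression formula
# d_t = sum_{n=1..2} n * (c_{t+n} - c_{t-n}) / (2 * (1^2 + 2^2)),
# with edge frames padded by repetition.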
##### modified for testing ######
def framesig_without_dither_dc_preemphasize(sig, frame_len, frame_step, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype == 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
elif wintype == '':
win = numpy.ones(frame_len)
elif wintype == 'hann':
win = numpy.hanning(frame_len)
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = frames
return frames * win, raw_frames
def frames(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None, wintype='hamming'):
frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype)
return frames_with_win, raw_frames
def complexspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warning(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return complex_spec
def stft_with_window(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype)
spec = magspec(frames_with_win, nfft) # nearly the same until this part
scomplex = complexspec(frames_with_win, nfft)
rspec = magspec(raw_frames, nfft)
rcomplex = complexspec(raw_frames, nfft)
return spec, scomplex, rspec, rcomplex
class TestKaldiFE(unittest.TestCase):
def setUp(self):
self.this_dir = Path(__file__).parent
self.wavpath = str(self.this_dir / 'english.wav')
self.winlen=0.025 # ms
self.winstep=0.01 # ms
self.nfft=512
self.lowfreq = 0
self.highfreq = None
self.wintype='hamm'
self.nfilt=40
paddle.set_device('cpu')
def test_read(self):
import scipy.io.wavfile as wav
rate, sig = wav.read(self.wavpath)
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
self.assertTrue(np.all(sig == wav))
self.assertEqual(rate, sr)
def test_frames(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
_, fs = frames(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
t_fs, t_nframe = kaldi.frames(t_wav, t_wavlen, sr, self.winlen, self.winstep, clip=False)
t_fs = t_fs.astype(fs.dtype)[0]
self.assertEqual(t_nframe.item(), fs.shape[0])
self.assertTrue(np.allclose(t_fs.numpy(), fs))
def test_stft(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
_, stft_c_win, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_c_win.real.dtype)[0]
t_real = t_stft[:, :, 0]
t_imag = t_stft[:, :, 1]
self.assertEqual(t_nframe.item(), stft_c_win.real.shape[0])
self.assertLess(np.sum(t_real.numpy()) - np.sum(stft_c_win.real), 1)
self.assertTrue(np.allclose(t_real.numpy(), stft_c_win.real, atol=1e-1))
self.assertLess(np.sum(t_imag.numpy()) - np.sum(stft_c_win.imag), 1)
self.assertTrue(np.allclose(t_imag.numpy(), stft_c_win.imag, atol=1e-1))
def test_magspec(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_win.dtype)
t_spec = kaldi.magspec(t_stft)[0]
self.assertEqual(t_nframe.item(), stft_win.shape[0])
self.assertLess(np.sum(t_spec.numpy()) - np.sum(stft_win), 1)
self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e-1))
def test_magsepc_winprocess(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
fs, _= framesig(wav, self.winlen*sr, self.winstep*sr,
dither=0.0, preemph=0.97, remove_dc_offset=True, wintype='povey', stride_trick=True)
spec = magspec(fs, self.nfft) # nearly the same until this part
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(
self.nfft, sr, self.winlen, self.winstep,
window_type='povey', dither=0.0, preemph_coeff=0.97, remove_dc_offset=True, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(spec.dtype)
t_spec = kaldi.magspec(t_stft)[0]
self.assertEqual(t_nframe.item(), fs.shape[0])
self.assertLess(np.sum(t_spec.numpy()) - np.sum(spec), 1)
self.assertTrue(np.allclose(t_spec.numpy(), spec, atol=1e-1))
def test_powspec(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
stft_win = np.square(stft_win)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_win.dtype)
t_spec = kaldi.powspec(t_stft)[0]
self.assertEqual(t_nframe.item(), stft_win.shape[0])
self.assertLess(np.sum(t_spec.numpy() - stft_win), 5e4)
self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e2))
# from python_speech_features import mfcc
# from python_speech_features import delta
# from python_speech_features import logfbank
# import scipy.io.wavfile as wav
# (rate,sig) = wav.read("english.wav")
# # note that generally nfilt=40 is used for speech recognition
# fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
# # the computed fbank coefficents of english.wav with dimension [110,23]
# # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
# # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
# # ...
# # ...
# # the same with that using kaldi commands: compute-fbank-feats --dither=0.0
# mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
# # the computed mfcc coefficents of english.wav with dimension [110,13]
# # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
# # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
# # ...
# # ...
# # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
if __name__ == '__main__':
unittest.main()