Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into release_model
commit
7e96942c58
@ -1,49 +0,0 @@
|
||||
[English](README.md)
|
||||
|
||||
# PaddlePaddle Speech to Any toolkit
|
||||
|
||||
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
|
||||
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
|
||||
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
|
||||
|
||||
*DeepSpeech*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别引擎的开源项目,
|
||||
我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效、小型化和可扩展的工具,包括训练,推理,以及 部署。
|
||||
|
||||
## 特性
|
||||
|
||||
参看 [特性列表](doc/src/feature_list.md)。
|
||||
|
||||
|
||||
## 安装
|
||||
|
||||
在以下环境测试验证过:
|
||||
|
||||
* Ubuntu 16.04
|
||||
* python>=3.7
|
||||
* paddlepaddle>=2.2.0rc
|
||||
|
||||
参看 [安装](doc/src/install.md)。
|
||||
|
||||
## 开始
|
||||
|
||||
请查看 [开始](doc/src/getting_started.md) 和 [tiny egs](examples/tiny/s0/README.md)。
|
||||
|
||||
## 更多信息
|
||||
|
||||
* [数据处理](doc/src/data_preparation.md)
|
||||
* [数据增强](doc/src/augmentation.md)
|
||||
* [语言模型](doc/src/ngram_lm.md)
|
||||
* [Benchmark](doc/src/benchmark.md)
|
||||
* [Relased Model](doc/src/released_model.md)
|
||||
|
||||
## 问题和帮助
|
||||
|
||||
欢迎您在[Github讨论](https://github.com/PaddlePaddle/DeepSpeech/discussions)提交问题,[Github问题](https://github.com/PaddlePaddle/models/issues)中反馈bug。也欢迎您为这个项目做出贡献。
|
||||
|
||||
## License
|
||||
|
||||
DeepSpeech 遵循[Apache-2.0开源协议](./LICENSE)。
|
||||
|
||||
## 感谢
|
||||
|
||||
开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。
|
@ -1,191 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Beam search parameters tuning for DeepSpeech2 model."""
|
||||
import functools
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import DataLoader
|
||||
|
||||
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
|
||||
from deepspeech.io.collator import SpeechCollator
|
||||
from deepspeech.io.dataset import ManifestDataset
|
||||
from deepspeech.models.ds2 import DeepSpeech2Model
|
||||
from deepspeech.training.cli import default_argument_parser
|
||||
from deepspeech.utils import error_rate
|
||||
from deepspeech.utils.utility import add_arguments
|
||||
from deepspeech.utils.utility import print_arguments
|
||||
|
||||
|
||||
def tune(config, args):
|
||||
"""Tune parameters alpha and beta incrementally."""
|
||||
if not args.num_alphas >= 0:
|
||||
raise ValueError("num_alphas must be non-negative!")
|
||||
if not args.num_betas >= 0:
|
||||
raise ValueError("num_betas must be non-negative!")
|
||||
config.defrost()
|
||||
config.data.manfiest = config.data.dev_manifest
|
||||
config.data.augmentation_config = ""
|
||||
config.data.keep_transcription_text = True
|
||||
dev_dataset = ManifestDataset.from_config(config)
|
||||
|
||||
valid_loader = DataLoader(
|
||||
dev_dataset,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=SpeechCollator(keep_transcription_text=True))
|
||||
|
||||
model = DeepSpeech2Model.from_pretrained(valid_loader, config,
|
||||
args.checkpoint_path)
|
||||
model.eval()
|
||||
|
||||
# decoders only accept string encoded in utf-8
|
||||
vocab_list = valid_loader.dataset.vocab_list
|
||||
errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors
|
||||
|
||||
# create grid for search
|
||||
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
|
||||
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
|
||||
params_grid = [(alpha, beta) for alpha in cand_alphas
|
||||
for beta in cand_betas]
|
||||
|
||||
err_sum = [0.0 for i in range(len(params_grid))]
|
||||
err_ave = [0.0 for i in range(len(params_grid))]
|
||||
|
||||
num_ins, len_refs, cur_batch = 0, 0, 0
|
||||
# initialize external scorer
|
||||
model.decoder.init_decode(args.alpha_from, args.beta_from,
|
||||
config.decoding.lang_model_path, vocab_list,
|
||||
config.decoding.decoding_method)
|
||||
## incremental tuning parameters over multiple batches
|
||||
print("start tuning ...")
|
||||
for infer_data in valid_loader():
|
||||
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
|
||||
break
|
||||
|
||||
def ordid2token(texts, texts_len):
|
||||
""" ord() id to chr() chr """
|
||||
trans = []
|
||||
for text, n in zip(texts, texts_len):
|
||||
n = n.numpy().item()
|
||||
ids = text[:n]
|
||||
trans.append(''.join([chr(i) for i in ids]))
|
||||
return trans
|
||||
|
||||
audio, audio_len, text, text_len = infer_data
|
||||
target_transcripts = ordid2token(text, text_len)
|
||||
num_ins += audio.shape[0]
|
||||
|
||||
# model infer
|
||||
eouts, eouts_len = model.encoder(audio, audio_len)
|
||||
probs = model.decoder.softmax(eouts)
|
||||
|
||||
# grid search
|
||||
for index, (alpha, beta) in enumerate(params_grid):
|
||||
print(f"tuneing: alpha={alpha} beta={beta}")
|
||||
result_transcripts = model.decoder.decode_probs(
|
||||
probs.numpy(), eouts_len, vocab_list,
|
||||
config.decoding.decoding_method,
|
||||
config.decoding.lang_model_path, alpha, beta,
|
||||
config.decoding.beam_size, config.decoding.cutoff_prob,
|
||||
config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
|
||||
|
||||
for target, result in zip(target_transcripts, result_transcripts):
|
||||
errors, len_ref = errors_func(target, result)
|
||||
err_sum[index] += errors
|
||||
|
||||
# accumulate the length of references of every batchπ
|
||||
# in the first iteration
|
||||
if args.alpha_from == alpha and args.beta_from == beta:
|
||||
len_refs += len_ref
|
||||
|
||||
err_ave[index] = err_sum[index] / len_refs
|
||||
if index % 2 == 0:
|
||||
sys.stdout.write('.')
|
||||
sys.stdout.flush()
|
||||
print("tuneing: one grid done!")
|
||||
|
||||
# output on-line tuning result at the end of current batch
|
||||
err_ave_min = min(err_ave)
|
||||
min_index = err_ave.index(err_ave_min)
|
||||
print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
|
||||
" min [%s] = %f" %
|
||||
(cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
|
||||
"%.3f" % params_grid[min_index][1],
|
||||
config.decoding.error_rate_type, err_ave_min))
|
||||
cur_batch += 1
|
||||
|
||||
# output WER/CER at every (alpha, beta)
|
||||
print("\nFinal %s:\n" % config.decoding.error_rate_type)
|
||||
for index in range(len(params_grid)):
|
||||
print("(alpha, beta) = (%s, %s), [%s] = %f" %
|
||||
("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
|
||||
config.decoding.error_rate_type, err_ave[index]))
|
||||
|
||||
err_ave_min = min(err_ave)
|
||||
min_index = err_ave.index(err_ave_min)
|
||||
print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
|
||||
(cur_batch, "%.3f" % params_grid[min_index][0],
|
||||
"%.3f" % params_grid[min_index][1]))
|
||||
|
||||
print("finish tuning")
|
||||
|
||||
|
||||
def main(config, args):
|
||||
tune(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = default_argument_parser()
|
||||
add_arg = functools.partial(add_arguments, argparser=parser)
|
||||
add_arg('num_batches', int, -1, "# of batches tuning on. "
|
||||
"Default -1, on whole dev set.")
|
||||
add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
|
||||
add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
|
||||
add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
|
||||
add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
|
||||
add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
|
||||
add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
|
||||
|
||||
add_arg('batch_size', int, 256, "# of samples per batch.")
|
||||
add_arg('beam_size', int, 500, "Beam search width.")
|
||||
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
|
||||
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
|
||||
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
|
||||
|
||||
args = parser.parse_args()
|
||||
print_arguments(args, globals())
|
||||
|
||||
# https://yaml.org/type/float.html
|
||||
config = get_cfg_defaults()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
|
||||
config.data.batch_size = args.batch_size
|
||||
config.decoding.beam_size = args.beam_size
|
||||
config.decoding.num_proc_bsearch = args.num_proc_bsearch
|
||||
config.decoding.cutoff_prob = args.cutoff_prob
|
||||
config.decoding.cutoff_top_n = args.cutoff_top_n
|
||||
|
||||
config.freeze()
|
||||
print(config)
|
||||
|
||||
if args.dump_config:
|
||||
with open(args.dump_config, 'w') as f:
|
||||
print(config, file=f)
|
||||
|
||||
main(config, args)
|
@ -0,0 +1,119 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
import paddle
|
||||
|
||||
from deepspeech.utils.log import Log
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
# A global variable to record the number of calling times for profiler
|
||||
# functions. It is used to specify the tracing range of training steps.
|
||||
_profiler_step_id = 0
|
||||
|
||||
# A global variable to avoid parsing from string every time.
|
||||
_profiler_options = None
|
||||
|
||||
|
||||
class ProfilerOptions(object):
|
||||
'''
|
||||
Use a string to initialize a ProfilerOptions.
|
||||
The string should be in the format: "key1=value1;key2=value;key3=value3".
|
||||
For example:
|
||||
"profile_path=model.profile"
|
||||
"batch_range=[50, 60]; profile_path=model.profile"
|
||||
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
|
||||
ProfilerOptions supports following key-value pair:
|
||||
batch_range - a integer list, e.g. [100, 110].
|
||||
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
|
||||
sorted_key - a string, the optional values are 'calls', 'total',
|
||||
'max', 'min' or 'ave.
|
||||
tracer_option - a string, the optional values are 'Default', 'OpDetail',
|
||||
'AllOpDetail'.
|
||||
profile_path - a string, the path to save the serialized profile data,
|
||||
which can be used to generate a timeline.
|
||||
exit_on_finished - a boolean.
|
||||
'''
|
||||
|
||||
def __init__(self, options_str):
|
||||
assert isinstance(options_str, str)
|
||||
|
||||
self._options = {
|
||||
'batch_range': [10, 20],
|
||||
'state': 'All',
|
||||
'sorted_key': 'total',
|
||||
'tracer_option': 'Default',
|
||||
'profile_path': '/tmp/profile',
|
||||
'exit_on_finished': True
|
||||
}
|
||||
self._parse_from_string(options_str)
|
||||
|
||||
def _parse_from_string(self, options_str):
|
||||
if not options_str:
|
||||
return
|
||||
|
||||
for kv in options_str.replace(' ', '').split(';'):
|
||||
key, value = kv.split('=')
|
||||
if key == 'batch_range':
|
||||
value_list = value.replace('[', '').replace(']', '').split(',')
|
||||
value_list = list(map(int, value_list))
|
||||
if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
|
||||
1] > value_list[0]:
|
||||
self._options[key] = value_list
|
||||
elif key == 'exit_on_finished':
|
||||
self._options[key] = value.lower() in ("yes", "true", "t", "1")
|
||||
elif key in [
|
||||
'state', 'sorted_key', 'tracer_option', 'profile_path'
|
||||
]:
|
||||
self._options[key] = value
|
||||
|
||||
def __getitem__(self, name):
|
||||
if self._options.get(name, None) is None:
|
||||
raise ValueError(
|
||||
"ProfilerOptions does not have an option named %s." % name)
|
||||
return self._options[name]
|
||||
|
||||
|
||||
def add_profiler_step(options_str=None):
|
||||
'''
|
||||
Enable the operator-level timing using PaddlePaddle's profiler.
|
||||
The profiler uses a independent variable to count the profiler steps.
|
||||
One call of this function is treated as a profiler step.
|
||||
|
||||
Args:
|
||||
profiler_options - a string to initialize the ProfilerOptions.
|
||||
Default is None, and the profiler is disabled.
|
||||
'''
|
||||
if options_str is None:
|
||||
return
|
||||
|
||||
global _profiler_step_id
|
||||
global _profiler_options
|
||||
|
||||
if _profiler_options is None:
|
||||
_profiler_options = ProfilerOptions(options_str)
|
||||
logger.info(f"Profiler: {options_str}")
|
||||
logger.info(f"Profiler: {_profiler_options._options}")
|
||||
|
||||
if _profiler_step_id == _profiler_options['batch_range'][0]:
|
||||
paddle.utils.profiler.start_profiler(_profiler_options['state'],
|
||||
_profiler_options['tracer_option'])
|
||||
elif _profiler_step_id == _profiler_options['batch_range'][1]:
|
||||
paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
|
||||
_profiler_options['profile_path'])
|
||||
if _profiler_options['exit_on_finished']:
|
||||
sys.exit(0)
|
||||
|
||||
_profiler_step_id += 1
|
Before Width: | Height: | Size: 206 KiB |
Before Width: | Height: | Size: 108 KiB |
@ -1,16 +0,0 @@
|
||||
# Benchmarks
|
||||
|
||||
## Acceleration with Multi-GPUs
|
||||
|
||||
We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars.
|
||||
|
||||
<img src="../images/multi_gpu_speedup.png" width=450>
|
||||
|
||||
| # of GPU | Acceleration Rate |
|
||||
| -------- | --------------: |
|
||||
| 1 | 1.00 X |
|
||||
| 2 | 1.98 X |
|
||||
| 4 | 3.73 X |
|
||||
| 8 | 6.95 X |
|
||||
|
||||
`utils/profile.sh` provides such a demo profiling tool, you can change it as need.
|
Before Width: | Height: | Size: 93 KiB After Width: | Height: | Size: 93 KiB |
Before Width: | Height: | Size: 93 KiB After Width: | Height: | Size: 93 KiB |
@ -1,5 +1,7 @@
|
||||
# Reference
|
||||
|
||||
We refer these repos to build `model` and `engine`:
|
||||
|
||||
* [delta](https://github.com/Delta-ML/delta.git)
|
||||
* [espnet](https://github.com/espnet/espnet.git)
|
||||
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
|
@ -0,0 +1,58 @@
|
||||
# [CC-CEDICT](https://cc-cedict.org/wiki/)
|
||||
|
||||
What is CC-CEDICT?
|
||||
CC-CEDICT is a continuation of the CEDICT project.
|
||||
The objective of the CEDICT project was to create an online, downloadable (as opposed to searchable-only) public-domain Chinese-English dictionary.
|
||||
CEDICT was started by Paul Andrew Denisowski in October 1997.
|
||||
For the most part, the project is modeled on Jim Breen's highly successful EDICT (Japanese-English dictionary) project and is intended to be a collaborative effort,
|
||||
with users providing entries and corrections to the main file.
|
||||
|
||||
|
||||
## Parse CC-CEDICT to Json format
|
||||
|
||||
1. Parse to Json
|
||||
|
||||
```
|
||||
run.sh
|
||||
```
|
||||
|
||||
2. Result
|
||||
|
||||
```
|
||||
exp/
|
||||
|-- cedict
|
||||
`-- cedict.json
|
||||
|
||||
0 directories, 2 files
|
||||
```
|
||||
|
||||
```
|
||||
4c4bffc84e24467fe1b2ea9ba37ed6b6 exp/cedict
|
||||
3adf504dacd13886f88cc9fe3b37c75d exp/cedict.json
|
||||
```
|
||||
|
||||
```
|
||||
==> exp/cedict <==
|
||||
# CC-CEDICT
|
||||
# Community maintained free Chinese-English dictionary.
|
||||
#
|
||||
# Published by MDBG
|
||||
#
|
||||
# License:
|
||||
# Creative Commons Attribution-ShareAlike 4.0 International License
|
||||
# https://creativecommons.org/licenses/by-sa/4.0/
|
||||
#
|
||||
# Referenced works:
|
||||
|
||||
==> exp/cedict.json <==
|
||||
{"traditional": "2019\u51a0\u72c0\u75c5\u6bd2\u75c5", "simplified": "2019\u51a0\u72b6\u75c5\u6bd2\u75c5", "pinyin": "er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4", "english": "COVID-19, the coronavirus disease identified in 2019"}
|
||||
{"traditional": "21\u4e09\u9ad4\u7d9c\u5408\u75c7", "simplified": "21\u4e09\u4f53\u7efc\u5408\u75c7", "pinyin": "er4 shi2 yi1 san1 ti3 zong1 he2 zheng4", "english": "trisomy"}
|
||||
{"traditional": "3C", "simplified": "3C", "pinyin": "san1 C", "english": "abbr. for computers, communications, and consumer electronics"}
|
||||
{"traditional": "3P", "simplified": "3P", "pinyin": "san1 P", "english": "(slang) threesome"}
|
||||
{"traditional": "3Q", "simplified": "3Q", "pinyin": "san1 Q", "english": "(Internet slang) thank you (loanword)"}
|
||||
{"traditional": "421", "simplified": "421", "pinyin": "si4 er4 yi1", "english": "four grandparents, two parents and an only child"}
|
||||
{"traditional": "502\u81a0", "simplified": "502\u80f6", "pinyin": "wu3 ling2 er4 jiao1", "english": "cyanoacrylate glue"}
|
||||
{"traditional": "88", "simplified": "88", "pinyin": "ba1 ba1", "english": "(Internet slang) bye-bye (alternative for \u62dc\u62dc[bai2 bai2])"}
|
||||
{"traditional": "996", "simplified": "996", "pinyin": "jiu3 jiu3 liu4", "english": "9am-9pm, six days a week (work schedule)"}
|
||||
{"traditional": "A", "simplified": "A", "pinyin": "A", "english": "(slang) (Tw) to steal"}
|
||||
```
|
@ -1,5 +0,0 @@
|
||||
# Download Baker dataset
|
||||
|
||||
Baker dataset has to be downloaded mannually and moved to 'data/', because you will have to pass the CATTCHA from a browswe to download the dataset.
|
||||
|
||||
Download URL https://test.data-baker.com/#/data/index/source.
|
@ -0,0 +1,3 @@
|
||||
# G2P
|
||||
|
||||
* zh - Chinese G2P
|
@ -1,4 +1,4 @@
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../`
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
@ -0,0 +1,3 @@
|
||||
# Ngram LM
|
||||
|
||||
* s0 - kenlm ngram lm
|
@ -0,0 +1 @@
|
||||
data/lm
|
@ -1,7 +1,96 @@
|
||||
# [SentencePiece Model](https://github.com/google/sentencepiece)
|
||||
|
||||
## Run
|
||||
Train a `spm` model for English tokenizer.
|
||||
|
||||
```
|
||||
. path.sh
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
## Results
|
||||
|
||||
```
|
||||
data/
|
||||
└── lang_char
|
||||
├── input.bpe
|
||||
├── input.decode
|
||||
├── input.txt
|
||||
├── train_unigram100.model
|
||||
├── train_unigram100_units.txt
|
||||
└── train_unigram100.vocab
|
||||
|
||||
1 directory, 6 files
|
||||
```
|
||||
|
||||
```
|
||||
b5a230c26c61db5c36f34e503102f936 data/lang_char/input.bpe
|
||||
ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.decode
|
||||
ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.txt
|
||||
124bf3fe7ce3b73b1994234c15268577 data/lang_char/train_unigram100.model
|
||||
0df2488cc8eaace95eb12713facb5cf0 data/lang_char/train_unigram100_units.txt
|
||||
46360cac35c751310e8e8ffd3a034cb5 data/lang_char/train_unigram100.vocab
|
||||
```
|
||||
|
||||
```
|
||||
==> data/lang_char/input.bpe <==
|
||||
▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l
|
||||
▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter
|
||||
▁h e ▁ t e ll s ▁us ▁tha t ▁ at ▁ t h is ▁f es t ive ▁ s e ason ▁o f ▁the ▁ y e ar ▁w ith ▁ ch r is t m a s ▁ and ▁ro a s t ▁be e f ▁ l o om ing ▁be fore ▁us ▁ s i mile s ▁d r a w n ▁f r om ▁ e at ing ▁ and ▁it s ▁re s u l t s ▁o c c ur ▁m ost ▁re a di l y ▁ to ▁the ▁ mind
|
||||
▁h e ▁ ha s ▁g r a v e ▁d o u b t s ▁w h e t h er ▁ s i r ▁f r e d er ic k ▁ l eig h to n ' s ▁w or k ▁ is ▁re all y ▁gre e k ▁a f ter ▁ all ▁ and ▁c a n ▁di s c o v er ▁in ▁it ▁b u t ▁li t t le ▁o f ▁ro ck y ▁it ha c a
|
||||
▁li nne ll ' s ▁ p ic tur es ▁ar e ▁a ▁ s or t ▁o f ▁ u p ▁g u ar d s ▁ and ▁ at ▁ em ▁painting s ▁ and ▁m ason ' s ▁ e x q u is i t e ▁ i d y ll s ▁ar e ▁a s ▁ n at ion a l ▁a s ▁a ▁ j ing o ▁ p o em ▁mi ster ▁b i r k e t ▁f o ster ' s ▁ l and s c a pe s ▁ s mile ▁ at ▁on e ▁m u ch ▁in ▁the ▁ s a m e ▁w a y ▁tha t ▁mi ster ▁c ar k er ▁us e d ▁ to ▁f las h ▁h is ▁ t e e t h ▁ and ▁mi ster ▁ j o h n ▁c o ll i er ▁g ive s ▁h is ▁ s i t ter ▁a ▁ ch e er f u l ▁ s l a p ▁on ▁the ▁b a ck ▁be fore ▁h
|
||||
e ▁ s a y s ▁li k e ▁a ▁ s ha m p o o er ▁in ▁a ▁ tur k is h ▁b at h ▁ n e x t ▁ma n
|
||||
▁it ▁ is ▁o b v i o u s l y ▁ u nne c ess ar y ▁for ▁us ▁ to ▁ p o i n t ▁o u t ▁h o w ▁ l u m i n o u s ▁the s e ▁c rit ic is m s ▁ar e ▁h o w ▁d e l ic at e ▁in ▁ e x p r ess ion
|
||||
▁on ▁the ▁g e n er a l ▁ p r i n c i p l es ▁o f ▁ar t ▁mi ster ▁quilter ▁w rit es ▁w ith ▁ e qual ▁ l u c i di t y
|
||||
▁painting ▁h e ▁ t e ll s ▁us ▁ is ▁o f ▁a ▁di f f er e n t ▁ qual i t y ▁ to ▁ma t h em at ic s ▁ and ▁f i nish ▁in ▁ar t ▁ is ▁a d d ing ▁m or e ▁f a c t
|
||||
▁a s ▁for ▁ e t ch ing s ▁the y ▁ar e ▁o f ▁ t w o ▁ k i n d s ▁b rit is h ▁ and ▁for eig n
|
||||
▁h e ▁ l a ment s ▁m ost ▁b i t ter l y ▁the ▁di v or c e ▁tha t ▁ ha s ▁be e n ▁ma d e ▁be t w e e n ▁d e c or at ive ▁ar t ▁ and ▁w ha t ▁we ▁us u all y ▁c all ▁ p ic tur es ▁ma k es ▁the ▁c u s t om ar y ▁a p pe a l ▁ to ▁the ▁ las t ▁ j u d g ment ▁ and ▁re mind s ▁us ▁tha t ▁in ▁the ▁gre at ▁d a y s ▁o f ▁ar t ▁mi c ha e l ▁a n g e l o ▁w a s ▁the ▁f ur nish ing ▁ u p h o l ster er
|
||||
|
||||
==> data/lang_char/input.decode <==
|
||||
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
|
||||
nor is mister quilter's manner less interesting than his matter
|
||||
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
|
||||
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
|
||||
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
|
||||
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
|
||||
on the general principles of art mister quilter writes with equal lucidity
|
||||
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
|
||||
as for etchings they are of two kinds british and foreign
|
||||
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
|
||||
|
||||
==> data/lang_char/input.txt <==
|
||||
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
|
||||
nor is mister quilter's manner less interesting than his matter
|
||||
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
|
||||
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
|
||||
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
|
||||
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
|
||||
on the general principles of art mister quilter writes with equal lucidity
|
||||
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
|
||||
as for etchings they are of two kinds british and foreign
|
||||
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
|
||||
|
||||
==> data/lang_char/train_unigram100_units.txt <==
|
||||
<blank> 0
|
||||
<unk> 1
|
||||
' 2
|
||||
a 3
|
||||
all 4
|
||||
and 5
|
||||
ar 6
|
||||
ason 7
|
||||
at 8
|
||||
b 9
|
||||
|
||||
==> data/lang_char/train_unigram100.vocab <==
|
||||
<unk> 0
|
||||
<s> 0
|
||||
</s> 0
|
||||
▁ -2.01742
|
||||
e -2.7203
|
||||
s -2.82989
|
||||
t -2.99689
|
||||
l -3.53267
|
||||
n -3.84935
|
||||
o -3.88229
|
||||
```
|
||||
|
@ -1,3 +0,0 @@
|
||||
# Regular expression based text normalization for Chinese
|
||||
|
||||
For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
|
@ -1,33 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: tune ckpt_path"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# grid-search for hyper-parameters in language model
|
||||
python3 -u ${BIN_DIR}/tune.py \
|
||||
--device 'gpu' \
|
||||
--nproc 1 \
|
||||
--config conf/deepspeech2.yaml \
|
||||
--num_batches=-1 \
|
||||
--batch_size=128 \
|
||||
--beam_size=500 \
|
||||
--num_proc_bsearch=12 \
|
||||
--num_alphas=45 \
|
||||
--num_betas=8 \
|
||||
--alpha_from=1.0 \
|
||||
--alpha_to=3.2 \
|
||||
--beta_from=0.1 \
|
||||
--beta_to=0.45 \
|
||||
--cutoff_prob=1.0 \
|
||||
--cutoff_top_n=40 \
|
||||
--checkpoint_path ${1}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in tuning!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1 @@
|
||||
exp
|
@ -0,0 +1,2 @@
|
||||
old-pd_env.txt
|
||||
pd_env.txt
|
@ -0,0 +1,11 @@
|
||||
# Benchmark Test
|
||||
|
||||
## Data
|
||||
|
||||
* Aishell
|
||||
|
||||
## Docker
|
||||
|
||||
```
|
||||
registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486
|
||||
```
|
@ -1,146 +0,0 @@
|
||||
from typing import Tuple
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
from paddle import nn
|
||||
from paddle.nn import functional as F
|
||||
|
||||
|
||||
def frame(x: Tensor,
|
||||
num_samples: Tensor,
|
||||
win_length: int,
|
||||
hop_length: int,
|
||||
clip: bool = True) -> Tuple[Tensor, Tensor]:
|
||||
"""Extract frames from audio.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : Tensor
|
||||
Shape (N, T), batched waveform.
|
||||
num_samples : Tensor
|
||||
Shape (N, ), number of samples of each waveform.
|
||||
win_length : int
|
||||
Window length.
|
||||
hop_length : int
|
||||
Number of samples shifted between ajancent frames.
|
||||
clip : bool, optional
|
||||
Whether to clip audio that does not fit into the last frame, by
|
||||
default True
|
||||
|
||||
Returns
|
||||
-------
|
||||
frames : Tensor
|
||||
Shape (N, T', win_length).
|
||||
num_frames : Tensor
|
||||
Shape (N, ) number of valid frames
|
||||
"""
|
||||
assert hop_length <= win_length
|
||||
num_frames = (num_samples - win_length) // hop_length
|
||||
padding = (0, 0)
|
||||
if not clip:
|
||||
num_frames += 1
|
||||
# NOTE: pad hop_length - 1 to the right to ensure that there is at most
|
||||
# one frame dangling to the righe edge
|
||||
padding = (0, hop_length - 1)
|
||||
|
||||
weight = paddle.eye(win_length).unsqueeze(1)
|
||||
|
||||
frames = F.conv1d(x.unsqueeze(1),
|
||||
weight,
|
||||
padding=padding,
|
||||
stride=(hop_length, ))
|
||||
return frames, num_frames
|
||||
|
||||
|
||||
class STFT(nn.Layer):
|
||||
"""A module for computing stft transformation in a differentiable way.
|
||||
|
||||
Parameters
|
||||
------------
|
||||
n_fft : int
|
||||
Number of samples in a frame.
|
||||
|
||||
hop_length : int
|
||||
Number of samples shifted between adjacent frames.
|
||||
|
||||
win_length : int
|
||||
Length of the window.
|
||||
|
||||
clip: bool
|
||||
Whether to clip audio is necesaary.
|
||||
"""
|
||||
def __init__(self,
|
||||
n_fft: int,
|
||||
hop_length: int,
|
||||
win_length: int,
|
||||
window_type: str = None,
|
||||
clip: bool = True):
|
||||
super().__init__()
|
||||
|
||||
self.hop_length = hop_length
|
||||
self.n_bin = 1 + n_fft // 2
|
||||
self.n_fft = n_fft
|
||||
self.clip = clip
|
||||
|
||||
# calculate window
|
||||
if window_type is None:
|
||||
window = np.ones(win_length)
|
||||
elif window_type == "hann":
|
||||
window = np.hanning(win_length)
|
||||
elif window_type == "hamming":
|
||||
window = np.hamming(win_length)
|
||||
else:
|
||||
raise ValueError("Not supported yet!")
|
||||
|
||||
if win_length < n_fft:
|
||||
window = F.pad(window, (0, n_fft - win_length))
|
||||
elif win_length > n_fft:
|
||||
window = window[:n_fft]
|
||||
|
||||
# (n_bins, n_fft) complex
|
||||
kernel_size = min(n_fft, win_length)
|
||||
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
|
||||
w_real = weight.real
|
||||
w_imag = weight.imag
|
||||
|
||||
# (2 * n_bins, kernel_size)
|
||||
w = np.concatenate([w_real, w_imag], axis=0)
|
||||
w = w * window
|
||||
|
||||
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
|
||||
w = np.expand_dims(w, 1)
|
||||
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
|
||||
self.register_buffer("weight", weight)
|
||||
|
||||
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
|
||||
"""Compute the stft transform.
|
||||
Parameters
|
||||
------------
|
||||
x : Tensor [shape=(B, T)]
|
||||
The input waveform.
|
||||
num_samples : Tensor
|
||||
Number of samples of each waveform.
|
||||
Returns
|
||||
------------
|
||||
D : Tensor
|
||||
Shape(N, T', n_bins, 2) Spectrogram.
|
||||
|
||||
num_frames: Tensor
|
||||
Shape (N,) number of samples of each spectrogram
|
||||
"""
|
||||
num_frames = (num_samples - self.win_length) // self.hop_length
|
||||
padding = (0, 0)
|
||||
if not self.clip:
|
||||
num_frames += 1
|
||||
padding = (0, self.hop_length - 1)
|
||||
|
||||
batch_size, _, _ = paddle.shape(x)
|
||||
x = x.unsqueeze(-1)
|
||||
D = F.conv1d(self.weight,
|
||||
x,
|
||||
stride=(self.hop_length, ),
|
||||
padding=padding,
|
||||
data_format="NLC")
|
||||
D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
|
||||
return D, num_frames
|
||||
|
@ -0,0 +1,201 @@
|
||||
import paddle
|
||||
import numpy as np
|
||||
from typing import Tuple, Optional, Union
|
||||
|
||||
|
||||
# https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/src/feat/feature-window.cc#L109
|
||||
def povey_window(frame_len:int) -> np.ndarray:
|
||||
win = np.empty(frame_len)
|
||||
a = 2 * np.pi / (frame_len -1)
|
||||
for i in range(frame_len):
|
||||
win[i] = (0.5 - 0.5 * np.cos(a * i) )**0.85
|
||||
return win
|
||||
|
||||
def hann_window(frame_len:int) -> np.ndarray:
    """Hann (raised-cosine) window of length `frame_len`."""
    step = 2 * np.pi / (frame_len - 1)
    return 0.5 - 0.5 * np.cos(step * np.arange(frame_len))
|
||||
|
||||
def sine_window(frame_len:int) -> np.ndarray:
    """Sine (half-wave) window of length `frame_len`."""
    step = 2 * np.pi / (frame_len - 1)
    return np.sin(0.5 * step * np.arange(frame_len))
|
||||
|
||||
def hamm_window(frame_len:int) -> np.ndarray:
    """Hamming window (0.54/0.46 coefficients) of length `frame_len`."""
    step = 2 * np.pi / (frame_len - 1)
    return 0.54 - 0.46 * np.cos(step * np.arange(frame_len))
|
||||
|
||||
def get_window(wintype:Optional[str], winlen:int) -> np.ndarray:
    """get window function

    Args:
        wintype (Optional[str]): window type; None/'' and 'rectangular'
            give an all-ones window.
        winlen (int): window length in samples.

    Raises:
        ValueError: not support window.

    Returns:
        np.ndarray: window coeffs.
    """
    if not wintype or wintype == 'rectangular':
        return np.ones(winlen)
    if wintype == "hann":
        return hann_window(winlen)
    if wintype == "hamm":
        return hamm_window(winlen)
    if wintype == "povey":
        return povey_window(winlen)
    msg = f"{wintype} Not supported yet!"
    raise ValueError(msg)
|
||||
|
||||
|
||||
def dft_matrix(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
    """Real/imaginary DFT basis rows truncated to kernel_size columns.

    https://en.wikipedia.org/wiki/Discrete_Fourier_transform

    Returns (w_real, w_imag, kernel_size), each matrix of shape
    (n_bin, kernel_size) with w_real[k, n] = cos(2*pi*k*n/n_fft) and
    w_imag[k, n] = -sin(2*pi*k*n/n_fft).
    """
    # (n_bins, n_fft) complex; only half of the bins contain useful info
    if n_bin is None:
        n_bin = 1 + n_fft // 2
    if winlen is None:
        # NOTE(review): default mirrors the original (winlen = n_bin), which
        # looks unusual — confirm it was not meant to be n_fft.
        winlen = n_bin
    # https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
    kernel_size = min(n_fft, winlen)

    n = np.arange(kernel_size, dtype=np.float64)
    k = np.arange(n_bin, dtype=np.float64).reshape(-1, 1)
    angles = 2 * np.pi * k * n / n_fft  # (n_bin, kernel_size)
    w_real = np.cos(angles)
    w_imag = -np.sin(angles)
    return w_real, w_imag, kernel_size
|
||||
|
||||
|
||||
def dft_matrix_fast(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
    """Real/imaginary DFT basis via numpy's FFT of the identity matrix.

    Same contract as `dft_matrix`, computed with np.fft instead of loops.
    Returns (w_real, w_imag, kernel_size), each matrix (n_bin, kernel_size).
    """
    # (n_bins, n_fft) complex
    if n_bin is None:
        n_bin = 1 + n_fft // 2
    if winlen is None:
        winlen = n_bin
    # https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
    kernel_size = min(n_fft, winlen)

    # https://en.wikipedia.org/wiki/DFT_matrix
    # https://ccrma.stanford.edu/~jos/st/Matrix_Formulation_DFT.html
    # BUG FIX: the original referenced `self.n_bin` inside a free function
    # (NameError); use the local `n_bin` computed above.
    weight = np.fft.fft(np.eye(n_fft))[:n_bin, :kernel_size]
    w_real = weight.real
    w_imag = weight.imag
    return w_real, w_imag, kernel_size
|
||||
|
||||
|
||||
def bin2hz(bin:Union[List[int], np.ndarray], N:int, sr:int)->List[float]:
    """FFT bins to Hz.

    http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/

    Args:
        bin (List[int] or np.ndarray): bin index.
        N (int): the number of samples, or FFT points.
        sr (int): sampling rate.

    Returns:
        List[float]: Hz's.
    """
    hz = bin * float(sr) / N
    # BUG FIX: the original computed `hz` but fell through and returned None.
    return hz
|
||||
|
||||
|
||||
def hz2mel(hz):
    """Convert a value in Hertz to Mels (Kaldi/HTK natural-log variant).

    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    scaled = hz / 700.0
    return 1127 * np.log(1 + scaled)
|
||||
|
||||
|
||||
def mel2hz(mel):
    """Convert a value in Mels to Hertz (inverse of `hz2mel`).

    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    growth = np.exp(mel / 1127.0)
    return 700 * (growth - 1)
|
||||
|
||||
|
||||
|
||||
def rms_to_db(rms: float):
    """Root Mean Square to dB.

    Args:
        rms ([float]): root mean square

    Returns:
        float: dB
    """
    # floor at 1e-16 so log10 never sees zero
    floored = rms if rms > 1e-16 else 1e-16
    return 20.0 * math.log10(floored)
|
||||
|
||||
|
||||
def rms_to_dbfs(rms: float):
    """Root Mean Square to dBFS.
    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
    Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB.

    dB = dBFS + 3.0103
    dBFS = db - 3.0103
    e.g. 0 dB = -3.0103 dBFS

    Args:
        rms ([float]): root mean square

    Returns:
        float: dBFS
    """
    db = rms_to_db(rms)
    # full-scale sine offset: -3.0103 dB
    return db - 3.0103
|
||||
|
||||
|
||||
def max_dbfs(sample_data: np.ndarray):
    """Peak dBFS based on the maximum energy sample.

    Will prevent overdrive if used for normalization.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    peak = max(abs(np.min(sample_data)), abs(np.max(sample_data)))
    return rms_to_dbfs(peak)
|
||||
|
||||
|
||||
def mean_dbfs(sample_data):
    """dBFS based on the RMS energy of the whole buffer.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
        float: dBFS
    """
    mean_square = np.mean(np.square(sample_data, dtype=np.float64))
    return rms_to_dbfs(math.sqrt(mean_square))
|
||||
|
||||
|
||||
def gain_db_to_ratio(gain_db: float):
    """dB to ratio

    Args:
        gain_db (float): gain in dB

    Returns:
        float: scale in amp
    """
    return 10.0**(gain_db / 20.0)
|
Binary file not shown.
@ -0,0 +1,533 @@
|
||||
from typing import Tuple
|
||||
import numpy as np
|
||||
import paddle
|
||||
import unittest
|
||||
|
||||
import decimal
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from scipy.fftpack import dct
|
||||
|
||||
from third_party.paddle_audio.frontend import kaldi
|
||||
|
||||
def round_half_up(number):
    """Round to the nearest integer with ties going away from zero."""
    quantized = decimal.Decimal(number).quantize(
        decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)
    return int(quantized)
|
||||
|
||||
def rolling_window(a, window, step=1):
    """Zero-copy view of overlapping windows over the last axis of `a`.

    http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    """
    n_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (n_windows, window)
    view_strides = a.strides + (a.strides[-1],)
    view = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
    return view[::step]
|
||||
|
||||
|
||||
def do_dither(signal, dither_value=1.0):
    """Add scaled Gaussian noise to `signal` in place and return it."""
    noise = numpy.random.normal(size=signal.shape)
    signal += noise * dither_value
    return signal
|
||||
|
||||
def do_remove_dc_offset(signal):
    """Subtract the mean (DC component) from `signal` in place and return it."""
    dc = numpy.mean(signal)
    signal -= dc
    return signal
|
||||
|
||||
def do_preemphasis(signal, coeff=0.97):
    """perform preemphasis on the input signal.

    :param signal: The signal to filter.
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
    :returns: the filtered signal.
    """
    first = (1 - coeff) * signal[0]
    rest = signal[1:] - coeff * signal[:-1]
    return numpy.append(first, rest)
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
    """Frame a signal into overlapping frames.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param dither: per-frame dither amount (0 disables).
    :param preemph: preemphasis coefficient (0 disables).
    :param remove_dc_offset: NOTE(review): accepted but never consulted below —
        the DC offset is always removed; confirm intended behavior.
    :param wintype: analysis window type ('povey', anything else -> hamming).
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: (windowed frames, raw frames); each is NUMFRAMES by frame_len.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + ((slen - frame_len) // frame_step)

    # check kaldi/src/feat/feature-window.h
    padsignal = sig[:(numframes - 1) * frame_step + frame_len]
    # BUG FIX: `wintype is 'povey'` compared object identity, which only works
    # by accident of string interning (SyntaxWarning in Python 3.8+).
    if wintype == 'povey':
        win = numpy.empty(frame_len)
        for i in range(frame_len):
            win[i] = (0.5 - 0.5 * numpy.cos(2 * numpy.pi / (frame_len - 1) * i))**0.85
    else:  # the hamming window
        win = numpy.hamming(frame_len)

    if stride_trick:
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(win, (numframes, 1))

    frames = frames.astype(numpy.float32)
    raw_frames = numpy.zeros(frames.shape)
    for frm in range(frames.shape[0]):
        frames[frm, :] = do_dither(frames[frm, :], dither)  # dither
        frames[frm, :] = do_remove_dc_offset(frames[frm, :])  # remove dc offset
        raw_frames[frm, :] = frames[frm, :]
        frames[frm, :] = do_preemphasis(frames[frm, :], preemph)  # preemphasize

    return frames * win, raw_frames
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """
    if numpy.shape(frames)[1] > NFFT:
        # FIX: logging.warn is a deprecated alias; use logging.warning
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            numpy.shape(frames)[1], NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
    """
    mag = magspec(frames, NFFT)
    return numpy.square(mag)
|
||||
|
||||
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
         ceplifter=22,useEnergy=True,wintype='povey'):
    """Compute MFCC features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 23.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :param wintype: analysis window type, default 'povey'.
    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    """
    feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                         lowfreq, highfreq, dither, remove_dc_offset, preemph,
                         wintype)
    log_feat = numpy.log(feat)
    cepstra = dct(log_feat, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = lifter(cepstra, ceplifter)
    if useEnergy:
        # replace first cepstral coefficient with log of frame energy
        cepstra[:, 0] = numpy.log(energy)
    return cepstra
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
          wintype='hamming'):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 40.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param wintype: analysis window type, default 'hamming'.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    # BUG FIX: framesig/powspec live in this module; there is no `sigproc`
    # import here, so `sigproc.framesig(...)` raised NameError.
    frames, raw_frames = framesig(signal, winlen * samplerate, winstep * samplerate,
                                  dither, preemph, remove_dc_offset, wintype)
    pspec = powspec(frames, nfft)  # nearly the same until this part
    energy = numpy.sum(raw_frames**2, 1)  # this stores the raw energy in each frame
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with log

    return feat, energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
             nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
    """Compute log Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 40.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    feat, _energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                          lowfreq, highfreq, dither, remove_dc_offset, preemph,
                          wintype)
    return numpy.log(feat)
|
||||
|
||||
def hz2mel(hz):
    """Convert a value in Hertz to Mels (Kaldi/HTK natural-log variant).

    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    scaled = hz / 700.0
    return 1127 * numpy.log(1 + scaled)
|
||||
|
||||
def mel2hz(mel):
    """Convert a value in Mels to Hertz (inverse of `hz2mel`).

    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    growth = numpy.exp(mel / 1127.0)
    return 700 * (growth - 1)
|
||||
|
||||
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)

    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq)
    highmel = hz2mel(highfreq)

    # check kaldi/src/feat/Mel-computations.h
    fbank = numpy.zeros([nfilt, nfft // 2 + 1])
    mel_freq_delta = (highmel - lowmel) / (nfilt + 1)
    for j in range(nfilt):
        # triangular filter j spans [leftmel, rightmel] peaking at centermel
        leftmel = lowmel + j * mel_freq_delta
        centermel = lowmel + (j + 1) * mel_freq_delta
        rightmel = lowmel + (j + 2) * mel_freq_delta
        for i in range(nfft // 2):
            mel = hz2mel(i * samplerate / nfft)
            if leftmel < mel < rightmel:
                if mel < centermel:
                    fbank[j, i] = (mel - leftmel) / (centermel - leftmel)
                else:
                    fbank[j, i] = (rightmel - mel) / (rightmel - centermel)
    return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
    """Apply a cepstral lifter to the matrix of cepstra. This has the effect of increasing the
    magnitude of the high frequency DCT coeffs.

    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    """
    if L <= 0:
        # values of L <= 0, do nothing
        return cepstra
    _nframes, ncoeff = numpy.shape(cepstra)
    n = numpy.arange(ncoeff)
    lift = 1 + (L / 2.) * numpy.sin(numpy.pi * n / L)
    return lift * cepstra
|
||||
|
||||
def delta(feat, N):
    """Compute delta features from a feature vector sequence.

    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    num_frames = len(feat)
    denominator = 2 * sum(i**2 for i in range(1, N + 1))
    kernel = numpy.arange(-N, N + 1)
    # edge-pad so the regression window is full even at the boundaries
    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')
    delta_feat = numpy.empty_like(feat)
    for t in range(num_frames):
        window = padded[t:t + 2 * N + 1]  # == padded[(N+t)-N : (N+t)+N+1]
        delta_feat[t] = numpy.dot(kernel, window) / denominator
    return delta_feat
|
||||
|
||||
##### modify for test ######
|
||||
|
||||
def framesig_without_dither_dc_preemphasize(sig, frame_len, frame_step, wintype='hamming', stride_trick=True):
    """Frame a signal into overlapping frames, skipping dither/DC/preemphasis.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param wintype: analysis window type ('povey', '', 'hann', anything else -> hamming).
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: (windowed frames, raw frames); each is NUMFRAMES by frame_len.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + ((slen - frame_len) // frame_step)

    # check kaldi/src/feat/feature-window.h
    padsignal = sig[:(numframes - 1) * frame_step + frame_len]

    # BUG FIX: `wintype is 'povey'` compared object identity, which only works
    # by accident of string interning (SyntaxWarning in Python 3.8+).
    if wintype == 'povey':
        win = numpy.empty(frame_len)
        for i in range(frame_len):
            win[i] = (0.5 - 0.5 * numpy.cos(2 * numpy.pi / (frame_len - 1) * i))**0.85
    elif wintype == '':
        win = numpy.ones(frame_len)
    elif wintype == 'hann':
        win = numpy.hanning(frame_len)
    else:  # the hamming window
        win = numpy.hamming(frame_len)

    if stride_trick:
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(win, (numframes, 1))

    frames = frames.astype(numpy.float32)
    raw_frames = frames
    return frames * win, raw_frames
|
||||
|
||||
|
||||
def frames(signal,samplerate=16000,winlen=0.025,winstep=0.01,
           nfilt=40,nfft=512,lowfreq=0,highfreq=None, wintype='hamming'):
    """Thin wrapper: frame `signal` without dither/DC-removal/preemphasis.

    Returns (windowed frames, raw frames). The nfilt/nfft/lowfreq/highfreq
    parameters are accepted for signature parity with `fbank` but unused here.
    """
    return framesig_without_dither_dc_preemphasize(
        signal, winlen * samplerate, winstep * samplerate, wintype)
|
||||
|
||||
|
||||
def complexspec(frames, NFFT):
    """Compute the complex spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the complex spectrum of the corresponding frame.
    """
    if numpy.shape(frames)[1] > NFFT:
        # FIX: logging.warn is a deprecated alias; use logging.warning
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            numpy.shape(frames)[1], NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return complex_spec
|
||||
|
||||
|
||||
def stft_with_window(signal,samplerate=16000,winlen=0.025,winstep=0.01,
                     nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
                     wintype='hamming'):
    """STFT helper for the tests.

    Returns (windowed magnitude spec, windowed complex spec,
             raw magnitude spec, raw complex spec).
    """
    frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(
        signal, winlen * samplerate, winstep * samplerate, wintype)

    spec = magspec(frames_with_win, nfft)  # nearly the same until this part
    scomplex = complexspec(frames_with_win, nfft)
    rspec = magspec(raw_frames, nfft)
    rcomplex = complexspec(raw_frames, nfft)
    return spec, scomplex, rspec, rcomplex
|
||||
|
||||
|
||||
class TestKaldiFE(unittest.TestCase):
    """Compare the paddle kaldi frontend against the numpy reference
    implementations defined above, using the fixture wav `english.wav`
    located next to this test file."""

    def setUp(self):
        # Fixture path and common analysis parameters shared by all tests.
        self. this_dir = Path(__file__).parent

        self.wavpath = str(self.this_dir / 'english.wav')
        self.winlen=0.025 # ms
        self.winstep=0.01 # ms
        self.nfft=512
        self.lowfreq = 0
        self.highfreq = None
        # NOTE(review): 'hamm' maps to numpy.hamming in the reference helpers
        # above — confirm kaldi.STFT uses the same spelling.
        self.wintype='hamm'
        self.nfilt=40

        # keep results deterministic across machines; no GPU needed
        paddle.set_device('cpu')

    def test_read(self):
        """kaldi.read must agree with scipy.io.wavfile.read (data and rate)."""
        import scipy.io.wavfile as wav
        rate, sig = wav.read(self.wavpath)
        # NOTE(review): `wav` rebinds the module name imported above — works,
        # but consider renaming the variable.
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]
        self.assertTrue(np.all(sig == wav))
        self.assertEqual(rate, sr)

    def test_frames(self):
        """kaldi.frames must match the numpy framing (raw frames, no window)."""
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]
        _, fs = frames(wav, samplerate=sr,
                       winlen=self.winlen, winstep=self.winstep,
                       nfilt=self.nfilt, nfft=self.nfft,
                       lowfreq=self.lowfreq, highfreq=self.highfreq,
                       wintype=self.wintype)

        t_wav = paddle.to_tensor([wav], dtype='float32')
        t_wavlen = paddle.to_tensor([len(wav)])
        t_fs, t_nframe = kaldi.frames(t_wav, t_wavlen, sr, self.winlen, self.winstep, clip=False)
        t_fs = t_fs.astype(fs.dtype)[0]

        self.assertEqual(t_nframe.item(), fs.shape[0])
        self.assertTrue(np.allclose(t_fs.numpy(), fs))


    def test_stft(self):
        """kaldi.STFT real/imag parts vs numpy complexspec, per window type."""
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]

        for wintype in ['', 'hamm', 'hann', 'povey']:
            self.wintype=wintype
            _, stft_c_win, _, _ = stft_with_window(wav, samplerate=sr,
                                                   winlen=self.winlen, winstep=self.winstep,
                                                   nfilt=self.nfilt, nfft=self.nfft,
                                                   lowfreq=self.lowfreq, highfreq=self.highfreq,
                                                   wintype=self.wintype)

            t_wav = paddle.to_tensor([wav], dtype='float32')
            t_wavlen = paddle.to_tensor([len(wav)])

            # dither/preemph/DC-removal disabled to match the reference path
            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
            t_stft = t_stft.astype(stft_c_win.real.dtype)[0]
            t_real = t_stft[:, :, 0]
            t_imag = t_stft[:, :, 1]

            self.assertEqual(t_nframe.item(), stft_c_win.real.shape[0])

            # loose tolerances: float32 conv vs float64 FFT reference
            self.assertLess(np.sum(t_real.numpy()) - np.sum(stft_c_win.real), 1)
            self.assertTrue(np.allclose(t_real.numpy(), stft_c_win.real, atol=1e-1))

            self.assertLess(np.sum(t_imag.numpy()) - np.sum(stft_c_win.imag), 1)
            self.assertTrue(np.allclose(t_imag.numpy(), stft_c_win.imag, atol=1e-1))


    def test_magspec(self):
        """kaldi.magspec over kaldi.STFT vs numpy magnitude spectrum."""
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]
        for wintype in ['', 'hamm', 'hann', 'povey']:
            self.wintype=wintype
            stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
                                                 winlen=self.winlen, winstep=self.winstep,
                                                 nfilt=self.nfilt, nfft=self.nfft,
                                                 lowfreq=self.lowfreq, highfreq=self.highfreq,
                                                 wintype=self.wintype)

            t_wav = paddle.to_tensor([wav], dtype='float32')
            t_wavlen = paddle.to_tensor([len(wav)])

            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
            t_stft = t_stft.astype(stft_win.dtype)
            t_spec = kaldi.magspec(t_stft)[0]

            self.assertEqual(t_nframe.item(), stft_win.shape[0])

            self.assertLess(np.sum(t_spec.numpy()) - np.sum(stft_win), 1)
            self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e-1))


    def test_magsepc_winprocess(self):
        """Magnitude spectrum with the full window pipeline (dither off,
        preemphasis and DC-removal on, 'povey' window) vs numpy framesig."""
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]
        fs, _= framesig(wav, self.winlen*sr, self.winstep*sr,
                        dither=0.0, preemph=0.97, remove_dc_offset=True, wintype='povey', stride_trick=True)
        spec = magspec(fs, self.nfft) # nearly the same until this part

        t_wav = paddle.to_tensor([wav], dtype='float32')
        t_wavlen = paddle.to_tensor([len(wav)])
        stft_class = kaldi.STFT(
            self.nfft, sr, self.winlen, self.winstep,
            window_type='povey', dither=0.0, preemph_coeff=0.97, remove_dc_offset=True, clip=False)
        t_stft, t_nframe = stft_class(t_wav, t_wavlen)
        t_stft = t_stft.astype(spec.dtype)
        t_spec = kaldi.magspec(t_stft)[0]

        self.assertEqual(t_nframe.item(), fs.shape[0])

        self.assertLess(np.sum(t_spec.numpy()) - np.sum(spec), 1)
        self.assertTrue(np.allclose(t_spec.numpy(), spec, atol=1e-1))


    def test_powspec(self):
        """kaldi.powspec vs squared numpy magnitude spectrum, per window type."""
        sr, wav = kaldi.read(self.wavpath)
        wav = wav[:, 0]
        for wintype in ['', 'hamm', 'hann', 'povey']:
            self.wintype=wintype
            stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
                                                 winlen=self.winlen, winstep=self.winstep,
                                                 nfilt=self.nfilt, nfft=self.nfft,
                                                 lowfreq=self.lowfreq, highfreq=self.highfreq,
                                                 wintype=self.wintype)
            stft_win = np.square(stft_win)

            t_wav = paddle.to_tensor([wav], dtype='float32')
            t_wavlen = paddle.to_tensor([len(wav)])

            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
            t_stft = t_stft.astype(stft_win.dtype)
            t_spec = kaldi.powspec(t_stft)[0]

            self.assertEqual(t_nframe.item(), stft_win.shape[0])

            # power values are large, hence the wide absolute tolerances
            self.assertLess(np.sum(t_spec.numpy() - stft_win), 5e4)
            self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e2))
|
||||
|
||||
|
||||
# from python_speech_features import mfcc
|
||||
# from python_speech_features import delta
|
||||
# from python_speech_features import logfbank
|
||||
# import scipy.io.wavfile as wav
|
||||
|
||||
# (rate,sig) = wav.read("english.wav")
|
||||
|
||||
# # note that generally nfilt=40 is used for speech recognition
|
||||
# fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
|
||||
|
||||
# # the computed fbank coefficents of english.wav with dimension [110,23]
|
||||
# # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
|
||||
# # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
|
||||
# # ...
|
||||
# # ...
|
||||
# # the same with that using kaldi commands: compute-fbank-feats --dither=0.0
|
||||
|
||||
|
||||
# mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
|
||||
|
||||
# # the computed mfcc coefficents of english.wav with dimension [110,13]
|
||||
# # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
|
||||
# # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
|
||||
# # ...
|
||||
# # ...
|
||||
# # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
|
||||
|
||||
|
||||
|
||||
# Allow running this test module directly: `python <this_file>.py`.
if __name__ == '__main__':
    unittest.main()
|
@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
unset GREP_OPTIONS
|
||||
|
||||
set -u # Check for undefined variables
|
||||
|
||||
die() {
  # Print a message and exit with code 1.
  #
  # Usage: die <error_message>
  # e.g., die "Something bad happened."

  # BUG FIX: quote "$@" so the message is not word-split or glob-expanded.
  echo "$@"
  exit 1
}
|
||||
|
||||
echo "Collecting system information..."
|
||||
|
||||
OUTPUT_FILE=pd_env.txt
|
||||
python_bin_path=$(which python || which python3 || die "Cannot find Python binary")
|
||||
|
||||
{
  echo
  echo '== check python ==================================================='
} >> ${OUTPUT_FILE}

# Ask the interpreter itself for its details -- more reliable than parsing
# `python --version` output.
cat <<EOF > /tmp/check_python.py
import platform
print("""python version: %s
python branch: %s
python build version: %s
python compiler version: %s
python implementation: %s
""" % (
    platform.python_version(),
    platform.python_branch(),
    platform.python_build(),
    platform.python_compiler(),
    platform.python_implementation(),
))
EOF
# Redirect stdout to the report first, then point stderr at the same place.
# The original order (`2>&1 >> file`) sent stderr to the terminal instead of
# the report, losing any error output.
${python_bin_path} /tmp/check_python.py >> ${OUTPUT_FILE} 2>&1
|
||||
|
||||
{
  echo
  echo '== check os platform ==============================================='
} >> ${OUTPUT_FILE}

cat <<EOF > /tmp/check_os.py
import platform

# platform.linux_distribution() (deprecated in 3.5) and platform.dist()
# were removed in Python 3.8; fall back to "N/A" so this probe works on
# any interpreter instead of crashing with AttributeError.
def _probe(name):
    fn = getattr(platform, name, None)
    if fn is None:
        return "N/A"
    try:
        return fn()
    except Exception:
        return "N/A"

print("""os: %s
os kernel version: %s
os release version: %s
os platform: %s
linux distribution: %s
linux os distribution: %s
mac version: %s
uname: %s
architecture: %s
machine: %s
""" % (
    platform.system(),
    platform.version(),
    platform.release(),
    platform.platform(),
    _probe("linux_distribution"),
    _probe("dist"),
    platform.mac_ver(),
    platform.uname(),
    platform.architecture(),
    platform.machine(),
))
EOF
# stdout first, then stderr, so both land in the report file.
${python_bin_path} /tmp/check_os.py >> ${OUTPUT_FILE} 2>&1
|
||||
|
||||
{
  echo
  echo '== are we in docker ============================================='
  # /proc/1/cgroup mentions "docker" when PID 1 runs inside a container.
  # `grep -q` replaces the old `cat | grep | wc -l` pipeline and, unlike it,
  # also behaves sanely when /proc/1/cgroup does not exist (e.g. on macOS,
  # where the unquoted `[ $num -ge 1 ]` test would have errored out).
  if grep -q docker /proc/1/cgroup 2>/dev/null; then
    echo "Yes"
  else
    echo "No"
  fi

  echo
  echo '== compiler ====================================================='
  c++ --version 2>&1

  echo
  echo '== check pips ==================================================='
  pip list 2>&1 | grep "proto\|numpy\|paddlepaddle"


  echo
  echo '== check for virtualenv ========================================='
  # sys.real_prefix only exists under classic virtualenv (<20); PEP 405
  # venvs are detected by base_prefix differing from prefix.
  ${python_bin_path} -c "import sys; print(hasattr(sys, 'real_prefix') or sys.base_prefix != sys.prefix)"

  echo
  echo '== paddlepaddle import ============================================'
} >> ${OUTPUT_FILE}
|
||||
|
||||
# Import paddle on CPU and print its version/commit plus a tiny tensor op as
# a sanity check that the install actually works.
cat <<EOF > /tmp/check_pd.py
import paddle as pd;
pd.set_device('cpu')
print("pd.version.full_version = %s" % pd.version.full_version)
print("pd.version.commit = %s" % pd.version.commit)
print("pd.__version__ = %s" % pd.__version__)
print("Sanity check: %r" % pd.zeros([1,2,3])[:1])
EOF
# stdout first, then stderr, so import errors are captured in the report.
# The original order (`2>&1 >> file`) leaked stderr to the terminal.
${python_bin_path} /tmp/check_pd.py >> ${OUTPUT_FILE} 2>&1

# Record the shared libraries pulled in by `import paddle`; the cuDNN grep
# below reads this file.
# NOTE(review): LD_DEBUG output normally goes to stderr, which here is routed
# into ${OUTPUT_FILE}, so /tmp/loadedlibs may end up empty -- confirm intent.
LD_DEBUG=libs ${python_bin_path} -c "import paddle" 2>>${OUTPUT_FILE} > /tmp/loadedlibs
|
||||
|
||||
{
  # Which cuDNN shared object (if any) the paddle import actually loaded.
  grep libcudnn.so /tmp/loadedlibs
  echo
  echo '== env =========================================================='
  # ${VAR+x} distinguishes "unset" from "set but empty".
  if [ -z ${LD_LIBRARY_PATH+x} ]; then
    echo "LD_LIBRARY_PATH is unset";
  else
    echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
  fi
  if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
    echo "DYLD_LIBRARY_PATH is unset";
  else
    echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
  fi


  echo
  echo '== nvidia-smi ==================================================='
  # 2>&1 keeps the "command not found" message in the report on CPU-only hosts.
  nvidia-smi 2>&1

  echo
  echo '== cuda libs ==================================================='
} >> ${OUTPUT_FILE}
|
||||
|
||||
# List the CUDA runtime libraries installed under /usr/local.
find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
# Fixed typo: the original pattern 'libudnn*' never matched anything --
# cuDNN shared objects are named libcudnn*.
find /usr/local -type f -name 'libcudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
|
||||
|
||||
{
  echo
  echo '== paddlepaddle installed from info =================='
  pip show paddlepaddle-gpu

  echo
  echo '== python version =============================================='
  echo '(major, minor, micro, releaselevel, serial)'
  # Use the interpreter located at the top of the script rather than a bare
  # `python`, which may resolve to a different installation or not exist.
  ${python_bin_path} -c 'import sys; print(sys.version_info[:])'

  echo
  echo '== bazel version ==============================================='
  bazel version
  echo '== cmake version ==============================================='
  cmake --version
} >> ${OUTPUT_FILE}
|
||||
|
||||
# Scrub the report: drop any lines containing "google" (case-insensitive)
# before telling the user where to find it.
mv "${OUTPUT_FILE}" "old-${OUTPUT_FILE}"
grep -v -i google "old-${OUTPUT_FILE}" > "${OUTPUT_FILE}"

echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file."
echo "and use it to populate the fields in the github issue template."
echo
echo "cat ${OUTPUT_FILE}"
echo
|
Loading…
Reference in new issue