From 9da38a2db60749e686095ad107a3dc17c409d6b2 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Fri, 16 Dec 2022 11:39:47 +0800 Subject: [PATCH 001/159] fix AttributeError module 'distutils' has no attribute 'ccompiler' (#2745) --- third_party/ctc_decoders/setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index 9a8b292a0..c13f3df99 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -19,7 +19,8 @@ import os import platform import sys -from setuptools import distutils +import setuptools +from distutils import ccompiler from setuptools import Extension from setuptools import setup @@ -75,7 +76,7 @@ def compile_test(header, library): # hack compile to support parallel compiling -distutils.ccompiler.CCompiler.compile = parallelCCompile +ccompiler.CCompiler.compile = parallelCCompile FILES = glob.glob('kenlm/util/*.cc') \ + glob.glob('kenlm/lm/*.cc') \ From b358eb5c9964747d5e5db31e151cf5a64a71292a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Dec 2022 17:31:44 +0800 Subject: [PATCH 002/159] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 15b842d55..d20c6a211 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,12 +3,8 @@ repos: rev: v0.16.0 hooks: - id: yapf - name: yapf - language: python - entry: yapf - args: [-i, -vv] - types: [python] - exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ + files: \.py$ + exclude: (?=third_party).*(\.py)$ - repo: https://github.com/pre-commit/pre-commit-hooks rev: a11d9314b22d8f8c7556443875b731ef05965464 From 979bbd9dcb6b2214692f4751d2e5cf8dc476c806 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Dec 2022 21:51:15 +0800 Subject: [PATCH 003/159] add mkldnn and trt config for paddleInference (#2748) --- paddlespeech/t2s/exps/inference.py | 27 +++++++-- paddlespeech/t2s/exps/syn_utils.py | 91 ++++++++++++++++++++++++++++-- 2 files changed, 106 insertions(+), 12 deletions(-) diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index e0ae20bb1..56dd7838a 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -88,20 +88,27 @@ def parse_args(): parser.add_argument("--output_dir", type=str, help="output dir") # inference parser.add_argument( - "--int8", + "--use_trt", type=str2bool, default=False, - help="Whether to use int8 inference.", ) + help="whether to use TensorRT or not in GPU", ) parser.add_argument( - "--fp16", + "--use_mkldnn", type=str2bool, default=False, - help="Whether to use float16 inference.", ) + help="whether to use MKLDNN or not in CPU.", ) + parser.add_argument( + "--precision", + type=str, + default='fp32', + choices=['fp32', 'fp16', 'bf16', 'int8'], + help="mode of running") parser.add_argument( "--device", default="gpu", choices=["gpu", "cpu"], help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) args, _ = parser.parse_known_args() return args @@ -124,7 +131,11 @@ def main(): model_dir=args.inference_dir, model_file=args.am + ".pdmodel", params_file=args.am + ".pdiparams", - device=args.device) + device=args.device, + use_trt=args.use_trt, + use_mkldnn=args.use_mkldnn, + cpu_threads=args.cpu_threads, + precision=args.precision) # model: {model_name}_{dataset} am_dataset = 
args.am[args.am.rindex('_') + 1:]
@@ -133,7 +144,11 @@ def main():
         model_dir=args.inference_dir,
         model_file=args.voc + ".pdmodel",
         params_file=args.voc + ".pdiparams",
-        device=args.device)
+        device=args.device,
+        use_trt=args.use_trt,
+        use_mkldnn=args.use_mkldnn,
+        cpu_threads=args.cpu_threads,
+        precision=args.precision)
 
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 0ac799813..6b693440c 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -464,18 +464,97 @@ def voc_to_static(voc_inference,
 
 
 # inference
-def get_predictor(model_dir: Optional[os.PathLike]=None,
-                  model_file: Optional[os.PathLike]=None,
-                  params_file: Optional[os.PathLike]=None,
-                  device: str='cpu'):
+def get_predictor(
+        model_dir: Optional[os.PathLike]=None,
+        model_file: Optional[os.PathLike]=None,
+        params_file: Optional[os.PathLike]=None,
+        device: str='cpu',
+        # for gpu
+        use_trt: bool=False,
+        # for trt
+        use_dynamic_shape: bool=True,
+        min_subgraph_size: int=5,
+        # for cpu
+        cpu_threads: int=1,
+        use_mkldnn: bool=False,
+        # for trt or mkldnn
+        precision: str="fp32"):
+    """
+    Args:
+        model_dir (os.PathLike): root path of model.pdmodel and model.pdiparams.
+        model_file (os.PathLike): name of model_file.
+        params_file (os.PathLike): name of params_file.
+        device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu.
+        use_trt (bool): whether to use TensorRT or not in GPU.
+        use_dynamic_shape (bool): use dynamic shape or not in TensorRT.
+        min_subgraph_size (int): min subgraph size in TensorRT.
+        use_mkldnn (bool): whether to use MKLDNN or not in CPU.
+        cpu_threads (int): number of threads when using CPU.
+        precision (str): mode of running (fp32/fp16/bf16/int8).
+    """
+    rerun_flag = False
+    if device != "gpu" and use_trt:
+        raise ValueError(
+            "Predict by TensorRT mode: {}, expect device=='gpu', but device == {}".
+            format(precision, device))
     config = inference.Config(
         str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
+    config.enable_memory_optim()
+    config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, 0)
-    elif device == "cpu":
+    else:
         config.disable_gpu()
-    config.enable_memory_optim()
+        config.set_cpu_math_library_num_threads(cpu_threads)
+        if use_mkldnn:
+            # fp32
+            config.enable_mkldnn()
+            if precision == "int8":
+                config.enable_mkldnn_int8({
+                    "conv2d_transpose", "conv2d", "depthwise_conv2d", "pool2d",
+                    "transpose2", "elementwise_mul"
+                })
+                # config.enable_mkldnn_int8()
+            elif precision in {"fp16", "bf16"}:
+                config.enable_mkldnn_bfloat16()
+            print("MKLDNN with {}".format(precision))
+    if use_trt:
+        if precision == "bf16":
+            print("paddle trt does not support bf16, switching to fp16.")
+            precision = "fp16"
+        precision_map = {
+            "int8": inference.Config.Precision.Int8,
+            "fp32": inference.Config.Precision.Float32,
+            "fp16": inference.Config.Precision.Half,
+        }
+        assert precision in precision_map.keys()
+        pdtxt_name = model_file.split(".")[0] + "_" + precision + ".txt"
+        if use_dynamic_shape:
+            dynamic_shape_file = os.path.join(model_dir, pdtxt_name)
+            if os.path.exists(dynamic_shape_file):
+                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
+                                                           True)
+                # for fastspeech2
+                config.exp_disable_tensorrt_ops(["reshape2"])
+                print("trt set dynamic shape done!")
+            else:
+                # In order to avoid memory overflow when collecting dynamic shapes, it is changed to use CPU.
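+                # First pass: run once on CPU so the predictor can record the
+                # min/max/opt tensor shapes seen at runtime into
+                # dynamic_shape_file; rerun_flag tells the caller to invoke
+                # get_predictor() again, at which point the
+                # enable_tuned_tensorrt_dynamic_shape() branch above loads
+                # the tuned file.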
+ config.disable_gpu() + config.set_cpu_math_library_num_threads(10) + config.collect_shape_range_info(dynamic_shape_file) + print("Start collect dynamic shape...") + rerun_flag = True + + if not rerun_flag: + print("Tensor RT with {}".format(precision)) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + max_batch_size=1, + min_subgraph_size=min_subgraph_size, + precision_mode=precision_map[precision], + use_static=True, + use_calib_mode=False, ) + predictor = inference.create_predictor(config) return predictor From 59d4b8a2a817c0487c7a275afd4453a3d725419f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 22 Dec 2022 10:21:06 +0800 Subject: [PATCH 004/159] Update run_mix.sh --- examples/other/tts_finetune/tts3/run_mix.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/other/tts_finetune/tts3/run_mix.sh b/examples/other/tts_finetune/tts3/run_mix.sh index 960278a53..7630022b3 100755 --- a/examples/other/tts_finetune/tts3/run_mix.sh +++ b/examples/other/tts_finetune/tts3/run_mix.sh @@ -8,7 +8,6 @@ input_dir=./input/SSB0005_mini newdir_name="newdir" new_dir=${input_dir}/${newdir_name} pretrained_model_dir=./pretrained_models/fastspeech2_mix_ckpt_1.2.0 -mfa_tools=./tools mfa_dir=./mfa_result dump_dir=./dump output_dir=./exp/default From 553fefff9b00d204cdd2d47441826f7f76fffdae Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 22 Dec 2022 10:21:30 +0800 Subject: [PATCH 005/159] Update run_en.sh --- examples/other/tts_finetune/tts3/run_en.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/other/tts_finetune/tts3/run_en.sh b/examples/other/tts_finetune/tts3/run_en.sh index 765274e85..537214863 100755 --- a/examples/other/tts_finetune/tts3/run_en.sh +++ b/examples/other/tts_finetune/tts3/run_en.sh @@ -7,7 +7,6 @@ input_dir=./input/ljspeech_mini newdir_name="newdir" new_dir=${input_dir}/${newdir_name} pretrained_model_dir=./pretrained_models/fastspeech2_vctk_ckpt_1.2.0 -mfa_tools=./tools mfa_dir=./mfa_result dump_dir=./dump output_dir=./exp/default From 355517d1ca131078e492b2de4117deca82fa7ff4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 22 Dec 2022 10:21:49 +0800 Subject: [PATCH 006/159] Update run.sh --- examples/other/tts_finetune/tts3/run.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh index ed1705f8c..cc25d8f67 100755 --- a/examples/other/tts_finetune/tts3/run.sh +++ b/examples/other/tts_finetune/tts3/run.sh @@ -8,7 +8,6 @@ input_dir=./input/csmsc_mini newdir_name="newdir" new_dir=${input_dir}/${newdir_name} pretrained_model_dir=./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0 -mfa_tools=./tools mfa_dir=./mfa_result dump_dir=./dump output_dir=./exp/default From f96fbc891d6cf856215f77339214deb3d9368810 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 26 Dec 2022 13:57:57 +0800 Subject: [PATCH 007/159] fix paddlenlp's version (#2764) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f3e5e7288..3b58adf33 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ base = [ "onnxruntime==1.11.0", "opencc", "pandas", - "paddlenlp>=2.4.3", + "paddlenlp>=2.4.3, <=2.4.5", "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", From 9dd1602c7cfe4bbca3a4e947e6c70b3998027c38 Mon Sep 17 00:00:00 2001 From: Link Date: Mon, 26 Dec 2022 14:34:16 +0800 Subject: [PATCH 008/159] Fix libsndfile.so not found (#2763) Fix libsndfile.so not found --- docker/ubuntu18-cpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/docker/ubuntu18-cpu/Dockerfile b/docker/ubuntu18-cpu/Dockerfile index 35f45f2e4..3ae48cb65 100644 --- a/docker/ubuntu18-cpu/Dockerfile +++ b/docker/ubuntu18-cpu/Dockerfile @@ -2,7 +2,7 @@ FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2 LABEL maintainer="paddlesl@baidu.com" RUN apt-get update \ - && apt-get install libsndfile-dev \ + && apt-get install libsndfile-dev libsndfile1 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From 6725bcd823cf8164520b0495fa88e207b7842b27 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 27 Dec 2022 15:21:16 +0800 Subject: [PATCH 009/159] revise paddlenlp's version (#2767) --- paddlespeech/text/models/ernie_crf/model.py | 2 +- paddlespeech/text/models/ernie_linear/ernie_linear.py | 4 ++-- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/text/models/ernie_crf/model.py b/paddlespeech/text/models/ernie_crf/model.py index d1ce8099e..19f591b8e 100644 --- a/paddlespeech/text/models/ernie_crf/model.py +++ b/paddlespeech/text/models/ernie_crf/model.py @@ -27,7 +27,7 @@ class ErnieCrf(nn.Layer): **kwargs): super().__init__() self.ernie = ErnieForTokenClassification.from_pretrained( - pretrained_token, num_classes=num_classes, **kwargs) + pretrained_token, num_labels=num_classes, **kwargs) self.num_classes = num_classes self.crf = LinearChainCrf( self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False) diff --git a/paddlespeech/text/models/ernie_linear/ernie_linear.py b/paddlespeech/text/models/ernie_linear/ernie_linear.py index c450a904c..48a261ab3 100644 --- a/paddlespeech/text/models/ernie_linear/ernie_linear.py +++ b/paddlespeech/text/models/ernie_linear/ernie_linear.py @@ -43,9 +43,9 @@ class ErnieLinear(nn.Layer): num_classes, int ) and num_classes > 0, 'Argument `num_classes` must be an integer.' 
self.ernie = ErnieForTokenClassification.from_pretrained( - pretrained_token, num_classes=num_classes, **kwargs) + pretrained_token, num_labels=num_classes, **kwargs) - self.num_classes = self.ernie.num_classes + self.num_classes = self.ernie.num_labels self.softmax = nn.Softmax() def forward(self, diff --git a/setup.py b/setup.py index 3b58adf33..e55f8454d 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ base = [ "onnxruntime==1.11.0", "opencc", "pandas", - "paddlenlp>=2.4.3, <=2.4.5", + "paddlenlp>=2.4.8", "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", From 2e51e0da9033c96741eff421d439807e97f04525 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Tue, 27 Dec 2022 18:21:24 +0800 Subject: [PATCH 010/159] [TTS]Fix attention bugs and sort VITS data with feats_lengths (#2770) --- paddlespeech/t2s/exps/vits/normalize.py | 2 +- paddlespeech/t2s/exps/vits/preprocess.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 6 +++--- paddlespeech/t2s/modules/transformer/attention.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 5881ae95c..514cbef8e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('utt_id')) + output_metadata.sort(key=itemgetter('feats_lengths')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index f89ab356f..2b1a40834 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("utt_id")) + results.sort(key=itemgetter("feats_lengths")) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c994faa5a..c0238a98a 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -24,13 +24,13 @@ import yaml from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.sampler import ErnieSATSampler from paddlespeech.t2s.models.vits import VITS from paddlespeech.t2s.models.vits import VITSEvaluator from paddlespeech.t2s.models.vits import VITSUpdater @@ -107,12 +107,12 @@ def train_sp(args, config): converters=converters, ) # collate function and dataloader - train_sampler = DistributedBatchSampler( + train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, shuffle=True, drop_last=True) - dev_sampler = DistributedBatchSampler( + dev_sampler = ErnieSATSampler( dev_dataset, batch_size=config.batch_size, shuffle=False, diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index d7a032445..e3c9a992a 
100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -196,7 +196,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x @@ -299,7 +299,7 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention): if self.zero_triu: ones = paddle.ones((t1, t2)) - x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + x = x * paddle.tril(ones, t2 - t1)[None, None, :, :] return x From 1e8394202c38882db82c07fabfcb49ee559dbaac Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 28 Dec 2022 14:51:55 +0800 Subject: [PATCH 011/159] fix onnxruntime's version (#2771) --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e55f8454d..4f6d10d7c 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,9 @@ base = [ "loguru", "matplotlib", "nara_wpe", - "onnxruntime==1.11.0", + "onnxruntime>=1.11.0", "opencc", + "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", "paddlespeech_feat", From 96d76c83ade6dad94738ef4ae1a095dec3f858b3 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Wed, 28 Dec 2022 21:58:44 +0800 Subject: [PATCH 012/159] multi-spk tts static model (#2779) * updata readme, test=doc * update yaml and readme, test=tts * fix batch_size, test=tts * update readme, test=doc * chmod, test=tts * add multi-spk tts static model infer on server, test=tts --- .../server/engine/tts/paddleinference/tts_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 2878c852b..20b98fae6 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -247,11 +247,13 @@ class TTSServerExecutor(TTSExecutor): else: # multi speaker do not have static model if am_dataset in {"aishell3", "vctk"}: - pass + am_result = run_model( + self.am_predictor, + [part_phone_ids.numpy(), np.array([spk_id])]) else: am_result = run_model(self.am_predictor, [part_phone_ids.numpy()]) - mel = am_result[0] + mel = am_result[0] self.am_time += (time.time() - am_st) # voc From 964211a81bb59d9f5da9a78cca95787e5bb219b8 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Tue, 3 Jan 2023 19:31:27 +0800 Subject: [PATCH 013/159] Change optimizer for vits, test=tts (#2791) --- paddlespeech/t2s/exps/vits/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c0238a98a..07301db56 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -24,7 +24,7 @@ import yaml from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader -from paddle.optimizer import Adam +from paddle.optimizer import AdamW from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn @@ -164,14 +164,14 @@ def train_sp(args, config): lr_schedule_g = scheduler_classes[config["generator_scheduler"]]( **config["generator_scheduler_params"]) - optimizer_g = Adam( + optimizer_g = AdamW( learning_rate=lr_schedule_g, parameters=gen_parameters, **config["generator_optimizer_params"]) lr_schedule_d = 
scheduler_classes[config["discriminator_scheduler"]]( **config["discriminator_scheduler_params"]) - optimizer_d = Adam( + optimizer_d = AdamW( learning_rate=lr_schedule_d, parameters=dis_parameters, **config["discriminator_optimizer_params"]) From d7a6268bcce1094e29549a507bc405d110e2761f Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Fri, 6 Jan 2023 10:30:04 +0800 Subject: [PATCH 014/159] [audio]replace kaldi fbank with kaldi-native-fbank in paddleaudio (#2799) * replace kaldi_fbank with kaldi-native-fbank in paddleaudio * fix mac --- .pre-commit-config.yaml | 6 +- audio/CMakeLists.txt | 10 +- audio/paddleaudio/CMakeLists.txt | 16 - audio/paddleaudio/kaldi/__init__.py | 2 +- audio/paddleaudio/kaldi/kaldi.py | 95 +- audio/paddleaudio/src/CMakeLists.txt | 18 +- .../src/pybind/kaldi/feature_common.h | 16 +- .../src/pybind/kaldi/feature_common_inl.h | 89 +- .../src/pybind/kaldi/kaldi_feature.cc | 40 +- .../src/pybind/kaldi/kaldi_feature.h | 16 +- .../src/pybind/kaldi/kaldi_feature_wrapper.cc | 21 +- .../src/pybind/kaldi/kaldi_feature_wrapper.h | 8 +- audio/paddleaudio/src/pybind/pybind.cpp | 90 +- audio/paddleaudio/third_party/CMakeLists.txt | 5 +- .../kaldi-native-fbank/csrc/CMakeLists.txt | 22 + .../kaldi-native-fbank/csrc/feature-fbank.cc | 117 + .../kaldi-native-fbank/csrc/feature-fbank.h | 132 + .../csrc/feature-functions.cc | 49 + .../csrc/feature-functions.h | 38 + .../kaldi-native-fbank/csrc/feature-window.cc | 236 ++ .../kaldi-native-fbank/csrc/feature-window.h | 178 + .../kaldi-native-fbank/csrc/fftsg.c | 3271 +++++++++++++++++ .../kaldi-native-fbank/csrc/log.cc | 143 + .../third_party/kaldi-native-fbank/csrc/log.h | 347 ++ .../csrc/mel-computations.cc | 256 ++ .../csrc/mel-computations.h | 115 + .../kaldi-native-fbank/csrc/rfft.cc | 66 + .../kaldi-native-fbank/csrc/rfft.h | 56 + .../third_party/kaldi/CMakeLists.txt | 111 - audio/setup.py | 24 +- 30 files changed, 5234 insertions(+), 359 deletions(-) create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc create mode 100644 audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h delete mode 100644 audio/paddleaudio/third_party/kaldi/CMakeLists.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d20c6a211..53fc6ba0e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 - --builtins=G,request - --jobs=1 - exclude: 
(?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ + exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks rev: v1.0.1 @@ -53,13 +53,13 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$ - exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ + exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ - id: cpplint name: cpplint description: Static code analysis of C/C++ files language: python files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$ - exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ + exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 diff --git a/audio/CMakeLists.txt b/audio/CMakeLists.txt index d9ae63cd2..021e24477 100644 --- a/audio/CMakeLists.txt +++ b/audio/CMakeLists.txt @@ -41,24 +41,18 @@ option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON) # cmake set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external") -if (NOT MSVC) - find_package(GFortranLibs REQUIRED) - include(FortranCInterface) - include(FindGFortranLibs REQUIRED) -endif() - # fc_patch dir set(FETCHCONTENT_QUIET off) get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") set(FETCHCONTENT_BASE_DIR ${fc_patch}) set(THIRD_PARTY_PATH ${fc_patch}) -include(openblas) - set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) include(cmake/pybind.cmake) include_directories(${PYTHON_INCLUDE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/) + # packages find_package(Python3 COMPONENTS Interpreter Development) diff --git a/audio/paddleaudio/CMakeLists.txt b/audio/paddleaudio/CMakeLists.txt index dbf2bd3eb..c6b43c780 100644 --- a/audio/paddleaudio/CMakeLists.txt +++ b/audio/paddleaudio/CMakeLists.txt @@ -1,19 +1,3 @@ add_subdirectory(third_party) add_subdirectory(src) - -if (APPLE) - file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib) -endif(APPLE) - -if (UNIX AND NOT APPLE) - file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgfortran.so.5 - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN) - - file(COPY ${GFORTRAN_LIBRARIES_DIR}/libquadmath.so.0 - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN) - - file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.so.1 - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN) -endif() diff --git a/audio/paddleaudio/kaldi/__init__.py b/audio/paddleaudio/kaldi/__init__.py index f951e280a..a0ae644d1 100644 --- a/audio/paddleaudio/kaldi/__init__.py +++ b/audio/paddleaudio/kaldi/__init__.py @@ -12,4 +12,4 @@ # See the License for the 
specific language governing permissions and # limitations under the License. from .kaldi import fbank -from .kaldi import pitch +#from .kaldi import pitch diff --git a/audio/paddleaudio/kaldi/kaldi.py b/audio/paddleaudio/kaldi/kaldi.py index 16969d772..0f080de04 100644 --- a/audio/paddleaudio/kaldi/kaldi.py +++ b/audio/paddleaudio/kaldi/kaldi.py @@ -16,7 +16,6 @@ from paddleaudio._internal import module_utils __all__ = [ 'fbank', - 'pitch', ] @@ -33,8 +32,6 @@ def fbank( round_to_power_of_two: bool=True, blackman_coeff: float=0.42, snip_edges: bool=True, - allow_downsample: bool=False, - allow_upsample: bool=False, max_feature_vectors: int=-1, num_bins: int=23, low_freq: float=20, @@ -62,8 +59,6 @@ def fbank( frame_opts.round_to_power_of_two = round_to_power_of_two frame_opts.blackman_coeff = blackman_coeff frame_opts.snip_edges = snip_edges - frame_opts.allow_downsample = allow_downsample - frame_opts.allow_upsample = allow_upsample frame_opts.max_feature_vectors = max_feature_vectors mel_opts.num_bins = num_bins @@ -85,48 +80,48 @@ def fbank( return feat -@module_utils.requires_kaldi() -def pitch(wav, - samp_freq: int=16000, - frame_shift_ms: float=10.0, - frame_length_ms: float=25.0, - preemph_coeff: float=0.0, - min_f0: int=50, - max_f0: int=400, - soft_min_f0: float=10.0, - penalty_factor: float=0.1, - lowpass_cutoff: int=1000, - resample_freq: int=4000, - delta_pitch: float=0.005, - nccf_ballast: int=7000, - lowpass_filter_width: int=1, - upsample_filter_width: int=5, - max_frames_latency: int=0, - frames_per_chunk: int=0, - simulate_first_pass_online: bool=False, - recompute_frame: int=500, - nccf_ballast_online: bool=False, - snip_edges: bool=True): - pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions() - pitch_opts.samp_freq = samp_freq - pitch_opts.frame_shift_ms = frame_shift_ms - pitch_opts.frame_length_ms = frame_length_ms - pitch_opts.preemph_coeff = preemph_coeff - pitch_opts.min_f0 = min_f0 - pitch_opts.max_f0 = max_f0 - pitch_opts.soft_min_f0 = soft_min_f0 - pitch_opts.penalty_factor = penalty_factor - pitch_opts.lowpass_cutoff = lowpass_cutoff - pitch_opts.resample_freq = resample_freq - pitch_opts.delta_pitch = delta_pitch - pitch_opts.nccf_ballast = nccf_ballast - pitch_opts.lowpass_filter_width = lowpass_filter_width - pitch_opts.upsample_filter_width = upsample_filter_width - pitch_opts.max_frames_latency = max_frames_latency - pitch_opts.frames_per_chunk = frames_per_chunk - pitch_opts.simulate_first_pass_online = simulate_first_pass_online - pitch_opts.recompute_frame = recompute_frame - pitch_opts.nccf_ballast_online = nccf_ballast_online - pitch_opts.snip_edges = snip_edges - pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav) - return pitch +#@module_utils.requires_kaldi() +#def pitch(wav, +#samp_freq: int=16000, +#frame_shift_ms: float=10.0, +#frame_length_ms: float=25.0, +#preemph_coeff: float=0.0, +#min_f0: int=50, +#max_f0: int=400, +#soft_min_f0: float=10.0, +#penalty_factor: float=0.1, +#lowpass_cutoff: int=1000, +#resample_freq: int=4000, +#delta_pitch: float=0.005, +#nccf_ballast: int=7000, +#lowpass_filter_width: int=1, +#upsample_filter_width: int=5, +#max_frames_latency: int=0, +#frames_per_chunk: int=0, +#simulate_first_pass_online: bool=False, +#recompute_frame: int=500, +#nccf_ballast_online: bool=False, +#snip_edges: bool=True): +#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions() +#pitch_opts.samp_freq = samp_freq +#pitch_opts.frame_shift_ms = frame_shift_ms +#pitch_opts.frame_length_ms = frame_length_ms 
+#pitch_opts.preemph_coeff = preemph_coeff
+#pitch_opts.min_f0 = min_f0
+#pitch_opts.max_f0 = max_f0
+#pitch_opts.soft_min_f0 = soft_min_f0
+#pitch_opts.penalty_factor = penalty_factor
+#pitch_opts.lowpass_cutoff = lowpass_cutoff
+#pitch_opts.resample_freq = resample_freq
+#pitch_opts.delta_pitch = delta_pitch
+#pitch_opts.nccf_ballast = nccf_ballast
+#pitch_opts.lowpass_filter_width = lowpass_filter_width
+#pitch_opts.upsample_filter_width = upsample_filter_width
+#pitch_opts.max_frames_latency = max_frames_latency
+#pitch_opts.frames_per_chunk = frames_per_chunk
+#pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+#pitch_opts.recompute_frame = recompute_frame
+#pitch_opts.nccf_ballast_online = nccf_ballast_online
+#pitch_opts.snip_edges = snip_edges
+#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+#return pitch
diff --git a/audio/paddleaudio/src/CMakeLists.txt b/audio/paddleaudio/src/CMakeLists.txt
index fb6f32092..21e0f170d 100644
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@@ -52,7 +52,7 @@ if(BUILD_KALDI)
   list(
     APPEND
     LIBPADDLEAUDIO_LINK_LIBRARIES
-    libkaldi
+    kaldi-native-fbank-core
   )
   list(
     APPEND
@@ -92,14 +92,6 @@ define_library(
   "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
 )
 
-if (APPLE)
-  add_custom_command(TARGET libpaddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/libgcc_s.1.1.dylib" libpaddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
-  set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
-endif()
-
 if (APPLE)
   set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
 else()
@@ -207,11 +199,3 @@ define_extension(
 #   )
 # endif()
 endif()
-
-if (APPLE)
-  add_custom_command(TARGET _paddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/lib/libgcc_s.1.1.dylib" _paddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
-  set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
-endif()
diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common.h b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
index 05522bb7e..6571fa3eb 100644
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@@ -16,7 +16,7 @@
 
 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h"
-#include "feat/feature-window.h"
+#include "kaldi-native-fbank/csrc/feature-window.h"
 
 namespace paddleaudio {
 namespace kaldi {
@@ -28,18 +28,18 @@ class StreamingFeatureTpl {
   public:
     typedef typename F::Options Options;
    StreamingFeatureTpl(const Options& opts);
-    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
-    void Reset() { remained_wav_.Resize(0); }
+    bool ComputeFeature(const std::vector<float>& wav,
+                        std::vector<float>* feats);
+    void Reset() { remained_wav_.resize(0); }
 
     int Dim() { return computer_.Dim(); }
 
   private:
-    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    bool Compute(const std::vector<float>& waves,
+                 std::vector<float>* feats);
    Options opts_;
-    ::kaldi::FeatureWindowFunction window_function_;
-    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    knf::FeatureWindowFunction window_function_;
+    std::vector<float> remained_wav_;
    F computer_;
 };
 
diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
index c894b9775..985d586fe 100644
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "base/kaldi-common.h"
 
 namespace paddleaudio {
 namespace kaldi {
@@ -25,24 +24,29 @@ StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
 
 template <class F>
 bool StreamingFeatureTpl<F>::ComputeFeature(
-    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    const std::vector<float>& wav,
+    std::vector<float>* feats) {
     // append remained waves
-    ::kaldi::int32 wav_len = wav.Dim();
+    int wav_len = wav.size();
     if (wav_len == 0) return false;
-    ::kaldi::int32 left_len = remained_wav_.Dim();
-    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
+    int left_len = remained_wav_.size();
+    std::vector<float> waves(left_len + wav_len);
+    std::memcpy(waves.data(),
+                remained_wav_.data(),
+                left_len * sizeof(float));
+    std::memcpy(waves.data() + left_len,
+                wav.data(),
+                wav_len * sizeof(float));
 
     // cache remained waves
-    ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
-    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
-    ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
+    knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    int num_frames = knf::NumFrames(waves.size(), frame_opts);
+    int frame_shift = frame_opts.WindowShift();
+    int left_samples = waves.size() - frame_shift * num_frames;
+    remained_wav_.resize(left_samples);
+    std::memcpy(remained_wav_.data(),
+                waves.data() + frame_shift * num_frames,
+                left_samples * sizeof(float));
 
     // compute speech feature
     Compute(waves, feats);
@@ -51,40 +55,39 @@ bool StreamingFeatureTpl<F>::ComputeFeature(
 
 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(
-    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
-    ::kaldi::BaseFloat vtln_warp = 1.0;
-    const ::kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    ::kaldi::int32 num_samples = waves.Dim();
-    ::kaldi::int32 frame_length = frame_opts.WindowSize();
-    ::kaldi::int32 sample_rate = frame_opts.samp_freq;
+bool StreamingFeatureTpl<F>::Compute(const std::vector<float>& waves,
+                                     std::vector<float>* feats) {
+    const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
+    int num_samples = waves.size();
+    int frame_length = frame_opts.WindowSize();
+    int sample_rate = frame_opts.samp_freq;
     if (num_samples < frame_length) {
-        return false;
+        return true;
     }
 
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
+    int num_frames = knf::NumFrames(num_samples, frame_opts);
+    feats->resize(num_frames * Dim());
 
-    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    std::vector<float> window;
     bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
-        ::kaldi::BaseFloat raw_log_energy = 0.0;
-        ::kaldi::ExtractWindow(0,
-                               waves,
-                               frame,
-                               frame_opts,
-                               window_function_,
-                               &window,
-                               need_raw_log_energy ? &raw_log_energy : NULL);
+    for (int frame = 0; frame < num_frames; frame++) {
+        std::fill(window.begin(), window.end(), 0);
+        float raw_log_energy = 0.0;
+        float vtln_warp = 1.0;
+        knf::ExtractWindow(0,
+                           waves,
+                           frame,
+                           frame_opts,
+                           window_function_,
+                           &window,
+                           need_raw_log_energy ? &raw_log_energy : NULL);
 
-        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
-                                                         ::kaldi::kUndefined);
-        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
-        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
-            feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
+        std::vector<float> this_feature(computer_.Dim());
+        computer_.Compute(
+            raw_log_energy, vtln_warp, &window, this_feature.data());
+        std::memcpy(feats->data() + frame * Dim(),
+                    this_feature.data(),
+                    sizeof(float) * Dim());
     }
     return true;
 }
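A rough sketch of the chunking arithmetic that ComputeFeature() above implements: the frame overlap plus any sub-frame tail is cached in remained_wav_ for the next call. Illustration only, assuming snip_edges-style framing; split_chunk is a hypothetical helper, not part of the patch:

    # How many frames a buffer yields and how many samples get cached,
    # mirroring knf::NumFrames() and the remained_wav_ bookkeeping above.
    def split_chunk(num_samples, window=400, shift=160):
        if num_samples < window:
            return 0, num_samples          # too short: emit nothing, cache all
        num_frames = 1 + (num_samples - window) // shift
        left = num_samples - shift * num_frames   # carried to the next call
        return num_frames, left

    print(split_chunk(1000))  # -> (4, 360): 4 frames, 360 samples carried over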
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
index 40e3786e8..83df454c5 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@@ -13,16 +13,16 @@
 // limitations under the License.
 
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
 
 namespace paddleaudio {
 namespace kaldi {
 
 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
     FbankOptions fbank_opts) {
-    ::kaldi::FbankOptions opts;
+    knf::FbankOptions opts;
     opts.frame_opts = frame_opts;
     opts.mel_opts = mel_opts;
     opts.use_energy = fbank_opts.use_energy;
@@ -41,8 +41,8 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
 }
 
 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
     FbankOptions fbank_opts,
     const py::array_t<float>& wav) {
     InitFbank(frame_opts, mel_opts, fbank_opts);
@@ -55,21 +55,21 @@ void ResetFbank() {
     paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
 }
 
-py::array_t<float> ComputeKaldiPitch(
-    const ::kaldi::PitchExtractionOptions& opts,
-    const py::array_t<float>& wav) {
-    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+//py::array_t<float> ComputeKaldiPitch(
+    //const ::kaldi::PitchExtractionOptions& opts,
+    //const py::array_t<float>& wav) {
+    //py::buffer_info info = wav.request();
+    //::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
 
-    ::kaldi::Matrix<::kaldi::BaseFloat> features;
-    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
-    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
-    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
-        std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
-                    sizeof(float)*features.NumCols());
-    }
-    return result;
-}
+    //::kaldi::Matrix<::kaldi::BaseFloat> features;
+    //::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+    //auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
+    //for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
+    //std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
+    //sizeof(float)*features.NumCols());
+    //}
+    //return result;
+//}
 
 }  // namespace kaldi
 }  // namespace paddleaudio
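End to end, the high-level wrapper from kaldi.py earlier in this patch drives this path roughly as follows (a sketch, assuming a paddleaudio build with BUILD_KALDI enabled and a 16 kHz mono float32 input):

    import numpy as np
    from paddleaudio.kaldi import fbank

    wav = np.random.randn(16000).astype(np.float32)   # 1 s at 16 kHz
    feat = fbank(wav, samp_freq=16000, num_bins=23)
    # With the default 25 ms window / 10 ms shift and snip_edges,
    # 16000 samples yield 98 frames, so feat.shape == (98, 23).
    print(feat.shape)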
b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
index e059c52c1..031ec863b 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@@ -19,7 +19,7 @@
 #include
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
 
 namespace py = pybind11;
 
@@ -42,13 +42,13 @@ struct FbankOptions{
 };
 
 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
     FbankOptions fbank_opts);
 
 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
     FbankOptions fbank_opts,
     const py::array_t<float>& wav);
 
@@ -56,9 +56,9 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
 
 void ResetFbank();
 
-py::array_t<float> ComputeKaldiPitch(
-    const ::kaldi::PitchExtractionOptions& opts,
-    const py::array_t<float>& wav);
+//py::array_t<float> ComputeKaldiPitch(
+    //const ::kaldi::PitchExtractionOptions& opts,
+    //const py::array_t<float>& wav);
 
 }  // namespace kaldi
 }  // namespace paddleaudio
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
index 79558046b..8b8ff18be 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@@ -22,7 +22,7 @@ KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
     return &instance;
 }
 
-bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+bool KaldiFeatureWrapper::InitFbank(knf::FbankOptions opts) {
     fbank_.reset(new Fbank(opts));
     return true;
 }
@@ -30,21 +30,18 @@ bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
 py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
     const py::array_t<float> wav) {
     py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+    std::vector<float> input_wav((float*)info.ptr, (float*)info.ptr + info.size);
 
-    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    std::vector<float> feats;
     bool flag = fbank_->ComputeFeature(input_wav, &feats);
-    if (flag == false || feats.Dim() == 0) return py::array_t<float>();
-    auto result = py::array_t<float>(feats.Dim());
+    if (flag == false || feats.size() == 0) return py::array_t<float>();
+    auto result = py::array_t<float>(feats.size());
     py::buffer_info xs = result.request();
-    std::cout << std::endl;
     float* res_ptr = (float*)xs.ptr;
-    for (int idx = 0; idx < feats.Dim(); ++idx) {
-        *res_ptr = feats(idx);
-        res_ptr++;
-    }
-
-    return result.reshape({feats.Dim() / Dim(), Dim()});
+    std::memcpy(res_ptr, feats.data(), sizeof(float)*feats.size());
+    std::vector<int> shape{static_cast<int>(feats.size() / Dim()),
+                           static_cast<int>(Dim())};
+    return result.reshape(shape);
 }
 
 }  // namespace kaldi
diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
index bee1eee02..daad2d587 100644
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@@ -14,20 +14,18 @@
 
 #pragma once
 
-#include "base/kaldi-common.h"
-#include "feat/feature-fbank.h"
-
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #include "paddleaudio/src/pybind/kaldi/feature_common.h"
 
 namespace paddleaudio {
 namespace kaldi {
 
-typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+typedef StreamingFeatureTpl<knf::FbankComputer> Fbank;
 
 class KaldiFeatureWrapper {
   public:
     static KaldiFeatureWrapper* GetInstance();
-    bool InitFbank(::kaldi::FbankOptions opts);
+    bool InitFbank(knf::FbankOptions opts);
     py::array_t<float> ComputeFbank(const py::array_t<float> wav);
     int Dim() { return fbank_->Dim(); }
     void ResetFbank() { fbank_->Reset(); }
diff --git a/audio/paddleaudio/src/pybind/pybind.cpp b/audio/paddleaudio/src/pybind/pybind.cpp
index 692e80995..510712034 100644
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@@ -2,7 +2,7 @@
 
 #ifdef INCLUDE_KALDI
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #endif
 
@@ -89,53 +89,51 @@ PYBIND11_MODULE(_paddleaudio, m) {
 #ifdef INCLUDE_KALDI
     m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
 
-    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
-        .def(py::init<>())
-        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
-        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
-        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
-        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
-        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
-        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
-        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
-        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
-        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
-        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
-        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
-        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
-        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
-        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
-        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
-        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
-        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
-        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
-        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
-    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
-    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
+    //py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
+    //.def(py::init<>())
+    //.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+    //.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+    //.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+    //.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+    //.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+    //.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+    //.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+    //.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+    //.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+    //.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+    //.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+    //.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+    //.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
+    //.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
+    //.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
+    //.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
+    //.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
+    //.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
+    //.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
+    //.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
+    //m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
+    py::class_<knf::FrameExtractionOptions>(m, "FrameExtractionOptions")
         .def(py::init<>())
-        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)
-        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
-        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)
-        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)
-        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)
-        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
-        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)
-        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)
-        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
-        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
-        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
-        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
-    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
+        .def_readwrite("samp_freq", &knf::FrameExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &knf::FrameExtractionOptions::frame_shift_ms)
+        .def_readwrite("frame_length_ms", &knf::FrameExtractionOptions::frame_length_ms)
+        .def_readwrite("dither", &knf::FrameExtractionOptions::dither)
+        .def_readwrite("preemph_coeff", &knf::FrameExtractionOptions::preemph_coeff)
+        .def_readwrite("remove_dc_offset", &knf::FrameExtractionOptions::remove_dc_offset)
+        .def_readwrite("window_type", &knf::FrameExtractionOptions::window_type)
+        .def_readwrite("round_to_power_of_two", &knf::FrameExtractionOptions::round_to_power_of_two)
+        .def_readwrite("blackman_coeff", &knf::FrameExtractionOptions::blackman_coeff)
+        .def_readwrite("snip_edges", &knf::FrameExtractionOptions::snip_edges)
+        .def_readwrite("max_feature_vectors", &knf::FrameExtractionOptions::max_feature_vectors);
-        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
-        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
-        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
-        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
-        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
-        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
-        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
+    py::class_<knf::MelBanksOptions>(m, "MelBanksOptions")
+        .def(py::init<>())
+        .def_readwrite("num_bins", &knf::MelBanksOptions::num_bins)
+        .def_readwrite("low_freq", &knf::MelBanksOptions::low_freq)
+        .def_readwrite("high_freq", &knf::MelBanksOptions::high_freq)
+        .def_readwrite("vtln_low", &knf::MelBanksOptions::vtln_low)
+        .def_readwrite("vtln_high", &knf::MelBanksOptions::vtln_high)
+        .def_readwrite("debug_mel", &knf::MelBanksOptions::debug_mel)
+        .def_readwrite("htk_mode", &knf::MelBanksOptions::htk_mode);
 
     py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
         .def(py::init<>())
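A sketch of how these bindings are driven from Python, mirroring the kaldi.py wrapper earlier in this patch (assumes a BUILD_KALDI build; the FbankOptions fields are bound below the visible hunk):

    import numpy as np
    from paddleaudio import _paddleaudio as pa

    # The bound option structs mirror the knf C++ structs field for field.
    frame_opts = pa.FrameExtractionOptions()
    frame_opts.samp_freq = 16000
    frame_opts.dither = 0.0
    mel_opts = pa.MelBanksOptions()
    mel_opts.num_bins = 80
    fbank_opts = pa.FbankOptions()

    wav = np.random.randn(16000).astype(np.float32)
    feat = pa.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)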
diff --git a/audio/paddleaudio/third_party/CMakeLists.txt b/audio/paddleaudio/third_party/CMakeLists.txt
index 43288f39b..4b85bada0 100644
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@@ -11,5 +11,6 @@ endif()
 # kaldi
 ################################################################################
 if (BUILD_KALDI)
-  add_subdirectory(kaldi)
-endif()
\ No newline at end of file
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+  add_subdirectory(kaldi-native-fbank/csrc)
+endif()
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
new file mode 100644
index 000000000..176607fc0
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
@@ -0,0 +1,22 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../)
+add_library(kaldi-native-fbank-core
+  feature-fbank.cc
+  feature-functions.cc
+  feature-window.cc
+  fftsg.c
+  log.cc
+  mel-computations.cc
+  rfft.cc
+)
+# We are using std::call_once() in log.h, which requires us to link with -pthread
+if(NOT WIN32)
+  target_link_libraries(kaldi-native-fbank-core -pthread)
+endif()
+
+if(KNF_HAVE_EXECINFO_H)
+  target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
+endif()
+
+if(KNF_HAVE_CXXABI_H)
+  target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
+endif()
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
new file mode 100644
index 000000000..740ee17e9
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
+//
+#include "kaldi-native-fbank/csrc/feature-fbank.h"
+
+#include <cmath>
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+namespace knf {
+
+static void Sqrt(float *in_out, int32_t n) {
+  for (int32_t i = 0; i != n; ++i) {
+    in_out[i] = std::sqrt(in_out[i]);
+  }
+}
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+FbankComputer::FbankComputer(const FbankOptions &opts)
+    : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) {
+  if (opts.energy_floor > 0.0f) {
+    log_energy_floor_ = logf(opts.energy_floor);
+  }
+
+  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
+  // [note: this call caches it.]
+  GetMelBanks(1.0f);
+}
+
+FbankComputer::~FbankComputer() {
+  for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter)
+    delete iter->second;
+}
+
+const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
+  MelBanks *this_mel_banks = nullptr;
+
+  // std::map<float, MelBanks *>::iterator iter = mel_banks_.find(vtln_warp);
+  auto iter = mel_banks_.find(vtln_warp);
+  if (iter == mel_banks_.end()) {
+    this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
+    mel_banks_[vtln_warp] = this_mel_banks;
+  } else {
+    this_mel_banks = iter->second;
+  }
+  return this_mel_banks;
+}
+
+void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
+                            std::vector<float> *signal_frame, float *feature) {
+  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
+
+  KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
+
+  // Compute energy after window function (not the raw one).
+  if (opts_.use_energy && !opts_.raw_energy) {
+    signal_raw_log_energy = std::log(
+        std::max(InnerProduct(signal_frame->data(), signal_frame->data(),
+                              signal_frame->size()),
+                 std::numeric_limits<float>::epsilon()));
+  }
+  rfft_.Compute(signal_frame->data());  // signal_frame is modified in-place
+  ComputePowerSpectrum(signal_frame);
+
+  // Use magnitude instead of power if requested.
+  if (!opts_.use_power) {
+    Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
+  }
+
+  int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+
+  // Its length is opts_.mel_opts.num_bins
+  float *mel_energies = feature + mel_offset;
+
+  // Sum with mel filter banks over the power spectrum
+  mel_banks.Compute(signal_frame->data(), mel_energies);
+
+  if (opts_.use_log_fbank) {
+    // Avoid log of zero (which should be prevented anyway by dithering).
+    for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
+      auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
+      mel_energies[i] = std::log(t);
+    }
+  }
+
+  // Copy energy as first value (or the last, if htk_compat == true).
+  if (opts_.use_energy) {
+    if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
+      signal_raw_log_energy = log_energy_floor_;
+    }
+    int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
+    feature[energy_index] = signal_raw_log_energy;
+  }
+}
+
+}  // namespace knf
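A rough numpy mirror of the per-frame pipeline in FbankComputer::Compute() above: window -> real FFT -> power spectrum -> mel filterbank -> log. Illustration only; mel_banks stands in for the dense (num_bins, n/2 + 1) filter matrix that MelBanks::Compute() applies:

    import numpy as np

    def fbank_frame(frame, mel_banks, use_log=True, use_power=True):
        # rfft_.Compute() + ComputePowerSpectrum(): half spectrum, squared
        spec = np.fft.rfft(frame)
        power = spec.real ** 2 + spec.imag ** 2
        if not use_power:
            power = np.sqrt(power)      # the Sqrt() branch (magnitude fbank)
        mel = mel_banks @ power         # mel_banks.Compute(...)
        if use_log:
            # "Avoid log of zero": clamp at float epsilon, as above
            mel = np.log(np.maximum(mel, np.finfo(np.float32).eps))
        return mel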
+    feature[energy_index] = signal_raw_log_energy;
+  }
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
new file mode 100644
index 000000000..0ef3fac0d
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
@@ -0,0 +1,132 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.h
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+
+#include <map>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+#include "kaldi-native-fbank/csrc/mel-computations.h"
+#include "kaldi-native-fbank/csrc/rfft.h"
+
+namespace knf {
+
+struct FbankOptions {
+  FrameExtractionOptions frame_opts;
+  MelBanksOptions mel_opts;
+  // append an extra dimension with energy to the filter banks
+  bool use_energy = false;
+  float energy_floor = 0.0f;  // active iff use_energy==true
+
+  // If true, compute log_energy before preemphasis and windowing
+  // If false, compute log_energy after preemphasis and windowing
+  bool raw_energy = true;  // active iff use_energy==true
+
+  // If true, put energy last (if using energy)
+  // If false, put energy first
+  bool htk_compat = false;  // active iff use_energy==true
+
+  // if true (default), produce log-filterbank, else linear
+  bool use_log_fbank = true;
+
+  // if true (default), use power in filterbank
+  // analysis, else magnitude.
+  bool use_power = true;
+
+  FbankOptions() { mel_opts.num_bins = 23; }
+
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "frame_opts: \n";
+    os << frame_opts << "\n";
+    os << "\n";
+
+    os << "mel_opts: \n";
+    os << mel_opts << "\n";
+
+    os << "use_energy: " << use_energy << "\n";
+    os << "energy_floor: " << energy_floor << "\n";
+    os << "raw_energy: " << raw_energy << "\n";
+    os << "htk_compat: " << htk_compat << "\n";
+    os << "use_log_fbank: " << use_log_fbank << "\n";
+    os << "use_power: " << use_power << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
+
+class FbankComputer {
+ public:
+  using Options = FbankOptions;
+
+  explicit FbankComputer(const FbankOptions &opts);
+  ~FbankComputer();
+
+  int32_t Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
+  }
+
+  // if true, compute log_energy_pre_window but after dithering and dc removal
+  bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
+
+  const FrameExtractionOptions &GetFrameOptions() const {
+    return opts_.frame_opts;
+  }
+
+  const FbankOptions &GetOptions() const { return opts_; }
+
+  /**
+     Function that computes one frame of features from
+     one frame of signal.
+
+     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
+         prior to windowing and pre-emphasis, or
+         log(numeric_limits<float>::min()), whichever is greater. Must be
+         ignored by this function if this class returns false from
+         this->NeedRawLogEnergy().
+     @param [in] vtln_warp The VTLN warping factor that the user wants
+         to be applied when computing features for this utterance. Will
+         normally be 1.0, meaning no warping is to be done. The value will
+         be ignored for feature types that don't support VTLN, such as
+         spectrogram features.
+     @param [in] signal_frame One frame of the signal,
+         as extracted using the function ExtractWindow() using the options
+         returned by this->GetFrameOptions(). The function will use the
+         vector as a workspace, which is why it's a non-const pointer.
+     @param [out] feature Pointer to a vector of size this->Dim(), to which
+         the computed feature will be written. It should be pre-allocated.
+  */
+  void Compute(float signal_raw_log_energy, float vtln_warp,
+               std::vector<float> *signal_frame, float *feature);
+
+ private:
+  const MelBanks *GetMelBanks(float vtln_warp);
+
+  FbankOptions opts_;
+  float log_energy_floor_;
+  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
+  Rfft rfft_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
new file mode 100644
index 000000000..00ae4c798
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.cc
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+#include <cmath>
+#include <cstdint>
+
+namespace knf {
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft) {
+  int32_t dim = complex_fft->size();
+
+  // now we have in complex_fft, first half of complex spectrum
+  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
+
+  float *p = complex_fft->data();
+  int32_t half_dim = dim / 2;
+  float first_energy = p[0] * p[0];
+  float last_energy = p[1] * p[1];  // handle this special case
+
+  for (int32_t i = 1; i < half_dim; ++i) {
+    float real = p[i * 2];
+    float im = p[i * 2 + 1];
+    p[i] = real * real + im * im;
+  }
+  p[0] = first_energy;
+  p[half_dim] = last_energy;  // Will actually never be used, and anyway
+  // if the signal has been bandlimited sensibly this should be zero.
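+
+  // Added illustration (not in the upstream source): for n == 8 the input
+  // packing is [r0, r4, r1, i1, r2, i2, r3, i3], so on return p[0..4] holds
+  // [r0*r0, r1*r1 + i1*i1, r2*r2 + i2*i2, r3*r3 + i3*i3, r4*r4], i.e. the
+  // bin energies from DC (p[0]) up to the Nyquist bin (p[4]).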
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
new file mode 100644
index 000000000..852d0612c
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
@@ -0,0 +1,38 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+
+#include <vector>
+namespace knf {
+
+// ComputePowerSpectrum takes a complex FFT (as produced by the FFT
+// functions in csrc/rfft.h) and converts it into
+// a power spectrum. If the complex FFT is a vector of size n (representing
+// half of the complex FFT of a real signal of size n, as described there),
+// this function computes in the first (n/2) + 1 elements of it, the
+// energies of the fft bins from zero to the Nyquist frequency. Contents of the
+// remaining (n/2) - 1 elements are undefined at output.
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
new file mode 100644
index 000000000..b86a2c3d2
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
@@ -0,0 +1,236 @@
+// kaldi-native-fbank/csrc/feature-window.cc
+//
+// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)

+// This file is copied/modified from kaldi/src/feat/feature-window.cc
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+#include <algorithm>
+#include <cmath>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+namespace knf {
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
+    : window_(opts.WindowSize()) {
+  int32_t frame_length = opts.WindowSize();
+  KNF_CHECK_GT(frame_length, 0);
+
+  float *window_data = window_.data();
+
+  double a = M_2PI / (frame_length - 1);
+  for (int32_t i = 0; i < frame_length; i++) {
+    double i_fl = static_cast<double>(i);
+    if (opts.window_type == "hanning") {
+      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
+    } else if (opts.window_type == "sine") {
+      // when you are checking with wikipedia, please
+      // note that 0.5 * a = M_PI/(frame_length-1)
+      window_data[i] = sin(0.5 * a * i_fl);
+    } else if (opts.window_type == "hamming") {
+      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
+    } else if (opts.window_type ==
+               "povey") {  // like hamming but goes to zero at edges.
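+      // Added note: this is equivalent to a Hann window raised to the
+      // power 0.85, i.e. window[i] = pow(0.5 - 0.5 * cos(2*pi*i/(N-1)), 0.85).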
+ window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85); + } else if (opts.window_type == "rectangular") { + window_data[i] = 1.0; + } else if (opts.window_type == "blackman") { + window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) + + (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl); + } else { + KNF_LOG(FATAL) << "Invalid window type " << opts.window_type; + } + } +} + +void FeatureWindowFunction::Apply(float *wave) const { + int32_t window_size = window_.size(); + const float *p = window_.data(); + for (int32_t k = 0; k != window_size; ++k) { + wave[k] *= p[k]; + } +} + +int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) { + int64_t frame_shift = opts.WindowShift(); + if (opts.snip_edges) { + return frame * frame_shift; + } else { + int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; + } +} + +int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts, + bool flush /*= true*/) { + int64_t frame_shift = opts.WindowShift(); + int64_t frame_length = opts.WindowSize(); + if (opts.snip_edges) { + // with --snip-edges=true (the default), we use a HTK-like approach to + // determining the number of frames-- all frames have to fit completely into + // the waveform, and the first frame begins at sample zero. + if (num_samples < frame_length) + return 0; + else + return (1 + ((num_samples - frame_length) / frame_shift)); + // You can understand the expression above as follows: 'num_samples - + // frame_length' is how much room we have to shift the frame within the + // waveform; 'frame_shift' is how much we shift it each time; and the ratio + // is how many times we can shift it (integer arithmetic rounds down). + } else { + // if --snip-edges=false, the number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. + int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) return num_frames; + + // note: 'end' always means the last plus one, i.e. one past the last. + int64_t end_sample_of_last_frame = + FirstSampleOfFrame(num_frames - 1, opts) + frame_length; + + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. 
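+    // Added worked example: with num_samples == 1000, frame_shift == 160,
+    // frame_length == 400 and snip_edges == false, the rounding above gives
+    // num_frames == 6, but the last frame would end at sample 1080; with
+    // flush == false the loop below trims it and 5 frames are returned.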
+    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
+      num_frames--;
+      end_sample_of_last_frame -= frame_shift;
+    }
+    return num_frames;
+  }
+}
+
+void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  KNF_CHECK(sample_offset >= 0 && wave.size() != 0);
+
+  int32_t frame_length = opts.WindowSize();
+  int32_t frame_length_padded = opts.PaddedWindowSize();
+
+  int64_t num_samples = sample_offset + wave.size();
+  int64_t start_sample = FirstSampleOfFrame(f, opts);
+  int64_t end_sample = start_sample + frame_length;
+
+  if (opts.snip_edges) {
+    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
+  } else {
+    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
+  }
+
+  if (window->size() != frame_length_padded) {
+    window->resize(frame_length_padded);
+  }
+
+  // wave_start and wave_end are start and end indexes into 'wave', for the
+  // piece of wave that we're trying to extract.
+  int32_t wave_start = int32_t(start_sample - sample_offset);
+  int32_t wave_end = wave_start + frame_length;
+
+  if (wave_start >= 0 && wave_end <= wave.size()) {
+    // the normal case-- no edge effects to consider.
+    std::copy(wave.begin() + wave_start,
+              wave.begin() + wave_start + frame_length, window->data());
+  } else {
+    // Deal with any end effects by reflection, if needed. This code will only
+    // be reached for about two frames per utterance, so we don't concern
+    // ourselves excessively with efficiency.
+    int32_t wave_dim = wave.size();
+    for (int32_t s = 0; s < frame_length; ++s) {
+      int32_t s_in_wave = s + wave_start;
+      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
+        // reflect around the beginning or end of the wave.
+        // e.g. -1 -> 0, -2 -> 1.
+        // dim -> dim - 1, dim + 1 -> dim - 2.
+        // the code supports repeated reflections, although this
+        // would only be needed in pathological cases.
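+        // Added numeric example: with wave_dim == 100, s_in_wave == -3
+        // reflects to 2, and s_in_wave == 103 reflects to 96.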
+        if (s_in_wave < 0)
+          s_in_wave = -s_in_wave - 1;
+        else
+          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
+      }
+      (*window)[s] = wave[s_in_wave];
+    }
+  }
+
+  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
+}
+
+static void RemoveDcOffset(float *d, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += d[i];
+  }
+
+  float mean = sum / n;
+
+  for (int32_t i = 0; i != n; ++i) {
+    d[i] -= mean;
+  }
+}
+
+float InnerProduct(const float *a, const float *b, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
+  if (preemph_coeff == 0.0) {
+    return;
+  }
+
+  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
+
+  for (int32_t i = n - 1; i > 0; --i) {
+    d[i] -= preemph_coeff * d[i - 1];
+  }
+  d[0] -= preemph_coeff * d[0];
+}
+
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  int32_t frame_length = opts.WindowSize();
+
+  // TODO(fangjun): Remove dither
+  KNF_CHECK_EQ(opts.dither, 0);
+
+  if (opts.remove_dc_offset) {
+    RemoveDcOffset(window, frame_length);
+  }
+
+  if (log_energy_pre_window != NULL) {
+    float energy = std::max(InnerProduct(window, window, frame_length),
+                            std::numeric_limits<float>::epsilon());
+    *log_energy_pre_window = std::log(energy);
+  }
+
+  if (opts.preemph_coeff != 0.0) {
+    Preemphasize(window, frame_length, opts.preemph_coeff);
+  }
+
+  window_function.Apply(window);
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
new file mode 100644
index 000000000..a33844f4c
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
@@ -0,0 +1,178 @@
+// kaldi-native-fbank/csrc/feature-window.h
+//
+// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)

+// This file is copied/modified from kaldi/src/feat/feature-window.h
+
+#ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+#define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+namespace knf {
+
+inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
+  // copied from kaldi/src/base/kaldi-math.cc
+  KNF_CHECK_GT(n, 0);
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  return n + 1;
+}
+
+struct FrameExtractionOptions {
+  float samp_freq = 16000;
+  float frame_shift_ms = 10.0f;   // in milliseconds.
+  float frame_length_ms = 25.0f;  // in milliseconds.
+  float dither = 1.0f;  // Amount of dithering, 0.0 means no dither.
+  float preemph_coeff = 0.97f;   // Preemphasis coefficient.
+  bool remove_dc_offset = true;  // Subtract mean of wave before FFT.
+  std::string window_type = "povey";  // e.g. Hamming window
+  // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
+  // "povey" is a window I made to be similar to Hamming but to go to zero at
+  // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
+  // Hamming window makes sense as a windowing function.
+  bool round_to_power_of_two = true;
+  float blackman_coeff = 0.42f;
+  bool snip_edges = true;
+  // bool allow_downsample = false;
+  // bool allow_upsample = false;
+
+  // Used for streaming feature extraction. It indicates the number
+  // of feature frames to keep in the recycling vector.
-1 means to + // keep all feature frames. + int32_t max_feature_vectors = -1; + + int32_t WindowShift() const { + return static_cast(samp_freq * 0.001f * frame_shift_ms); + } + int32_t WindowSize() const { + return static_cast(samp_freq * 0.001f * frame_length_ms); + } + int32_t PaddedWindowSize() const { + return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) + : WindowSize()); + } + std::string ToString() const { + std::ostringstream os; +#define KNF_PRINT(x) os << #x << ": " << x << "\n" + KNF_PRINT(samp_freq); + KNF_PRINT(frame_shift_ms); + KNF_PRINT(frame_length_ms); + KNF_PRINT(dither); + KNF_PRINT(preemph_coeff); + KNF_PRINT(remove_dc_offset); + KNF_PRINT(window_type); + KNF_PRINT(round_to_power_of_two); + KNF_PRINT(blackman_coeff); + KNF_PRINT(snip_edges); + // KNF_PRINT(allow_downsample); + // KNF_PRINT(allow_upsample); + KNF_PRINT(max_feature_vectors); +#undef KNF_PRINT + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts); + +class FeatureWindowFunction { + public: + FeatureWindowFunction() = default; + explicit FeatureWindowFunction(const FrameExtractionOptions &opts); + /** + * @param wave Pointer to a 1-D array of shape [window_size]. + * It is modified in-place: wave[i] = wave[i] * window_[i]. + * @param + */ + void Apply(float *wave) const; + + private: + std::vector window_; // of size opts.WindowSize() +}; + +int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts); + +/** + This function returns the number of frames that we can extract from a wave + file with the given number of samples in it (assumed to have the same + sampling rate as specified in 'opts'). + + @param [in] num_samples The number of samples in the wave file. + @param [in] opts The frame-extraction options class + + @param [in] flush True if we are asserting that this number of samples + is 'all there is', false if we expecting more data to possibly come in. This + only makes a difference to the answer + if opts.snips_edges== false. For offline feature extraction you always want + flush == true. In an online-decoding context, once you know (or decide) that + no more data is coming in, you'd call it with flush == true at the end to + flush out any remaining data. +*/ +int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts, + bool flush = true); + +/* + ExtractWindow() extracts a windowed frame of waveform (possibly with a + power-of-two, padded size, depending on the config), including all the + processing done by ProcessWindow(). + + @param [in] sample_offset If 'wave' is not the entire waveform, but + part of it to the left has been discarded, then the + number of samples prior to 'wave' that we have + already discarded. Set this to zero if you are + processing the entire waveform in one piece, or + if you get 'no matching function' compilation + errors when updating the code. + @param [in] wave The waveform + @param [in] f The frame index to be extracted, with + 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true) + @param [in] opts The options class to be used + @param [in] window_function The windowing function, as derived from the + options class. + @param [out] window The windowed, possibly-padded waveform to be + extracted. Will be resized as needed. + @param [out] log_energy_pre_window If non-NULL, the log-energy of + the signal prior to pre-emphasis and multiplying by + the windowing function will be written to here. 
+*/ +void ExtractWindow(int64_t sample_offset, const std::vector &wave, + int32_t f, const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + std::vector *window, + float *log_energy_pre_window = nullptr); + +/** + This function does all the windowing steps after actually + extracting the windowed signal: depending on the + configuration, it does dithering, dc offset removal, + preemphasis, and multiplication by the windowing function. + @param [in] opts The options class to be used + @param [in] window_function The windowing function-- should have + been initialized using 'opts'. + @param [in,out] window A vector of size opts.WindowSize(). Note: + it will typically be a sub-vector of a larger vector of size + opts.PaddedWindowSize(), with the remaining samples zero, + as the FFT code is more efficient if it operates on data with + power-of-two size. + @param [out] log_energy_pre_window If non-NULL, then after dithering and + DC offset removal, this function will write to this pointer the log of + the total energy (i.e. sum-squared) of the frame. + */ +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, float *window, + float *log_energy_pre_window = nullptr); + +// Compute the inner product of two vectors +float InnerProduct(const float *a, const float *b, int32_t n); + +} // namespace knf + +#endif // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_ diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c new file mode 100644 index 000000000..ec8217a2b --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c @@ -0,0 +1,3271 @@ +/* This file is copied from + * https://www.kurims.kyoto-u.ac.jp/~ooura/fft.html + */ +/* +Fast Fourier/Cosine/Sine Transform + dimension :one + data length :power of 2 + decimation :frequency + radix :split-radix + data :inplace + table :use +functions + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes + void cdft(int, int, double *, int *, double *); + void rdft(int, int, double *, int *, double *); + void ddct(int, int, double *, int *, double *); + void ddst(int, int, double *, int *, double *); + void dfct(int, double *, double *, int *, double *); + void dfst(int, double *, double *, int *, double *); +macro definitions + USE_CDFT_PTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=8192 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 + USE_CDFT_WINTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=32768 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + + X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k + X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k + ip[0] = 0; // first time only + cdft(2*n, 1, a, ip, w); + + ip[0] = 0; // first time only + cdft(2*n, -1, a, ip, w); + [parameters] + 2*n :data length (int) + n >= 1, n = power of 2 + a[0...2*n-1] :input/output data (double *) + input data + a[2*j] = Re(x[j]), + a[2*j+1] = Im(x[j]), 0<=j= 2+sqrt(n) + strictly, + length of ip >= + 2+(1<<(int)(log(n+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. 
+ w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft(2*n, -1, a, ip, w); + is + cdft(2*n, 1, a, ip, w); + for (j = 0; j <= 2 * n - 1; j++) { + a[j] *= 1.0 / n; + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + RDFT + R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 + I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0 IRDFT (excluding scale) + a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + + sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + + sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k + ip[0] = 0; // first time only + rdft(n, 1, a, ip, w); + + ip[0] = 0; // first time only + rdft(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + + output data + a[2*k] = R[k], 0<=k + input data + a[2*j] = R[j], 0<=j= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft(n, 1, a, ip, w); + is + rdft(n, -1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + IDCT (excluding scale) + C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k DCT + C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k + ip[0] = 0; // first time only + ddct(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddct(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = C[k], 0<=k= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddct(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + IDST (excluding scale) + S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k DST + S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0 + ip[0] = 0; // first time only + ddst(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddst(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + + input data + a[j] = A[j], 0 + output data + a[k] = S[k], 0= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddst(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddst(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- + [definition] + C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n + [usage] + ip[0] = 0; // first time only + dfct(n, a, t, ip, w); + [parameters] + n :data length - 1 (int) + n >= 2, n = power of 2 + a[0...n] :input/output data (double *) + output data + a[k] = C[k], 0<=k<=n + t[0...n/2] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. 
+ w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + is + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + for (j = 0; j <= n; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- + [definition] + S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = S[k], 0= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + dfst(n, a, t, ip, w); + is + dfst(n, a, t, ip, w); + for (j = 1; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +Appendix : + The cos/sin table is recalculated when the larger table required. + w[] and ip[] are compatible with all routines. +*/ + + +void cdft(int n, int isgn, double *a, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + int nw; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + if (isgn >= 0) { + cftfsub(n, a, ip, nw, w); + } else { + cftbsub(n, a, ip, nw, w); + } +} + + +void rdft(int n, int isgn, double *a, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + int nw, nc; + double xi; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 2)) { + nc = n >> 2; + makect(nc, ip, w + nw); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } +} + + +void ddct(int n, int isgn, double *a, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = a[j] - a[j - 1]; + a[j] += a[j - 1]; + } + a[1] = a[0] - xr; + a[0] += xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dctsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = a[j] - a[j + 1]; + a[j] += a[j + 1]; + 
} + a[n - 1] = xr; + } +} + + +void ddst(int n, int isgn, double *a, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dstsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= a[j + 1]; + } + a[n - 1] = -xr; + } +} + + +void dfct(int n, double *a, double *t, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + m = n >> 1; + yi = a[m]; + xi = a[0] + a[n]; + a[0] -= a[n]; + t[0] = xi - yi; + t[m] = xi + yi; + if (n > 2) { + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] - a[n - j]; + xi = a[j] + a[n - j]; + yr = a[k] - a[n - k]; + yi = a[k] + a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi - yi; + t[k] = xi + yi; + } + t[mh] = a[mh] + a[n - mh]; + a[mh] -= a[n - mh]; + dctsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[0] - a[1]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] + a[j + 1]; + a[2 * j - 1] = a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dctsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[0] - t[1]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = t[j] - t[j + 1]; + a[k + l] = t[j] + t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 0; j < mh; j++) { + k = m - j; + t[j] = t[m + k] - t[m + j]; + t[k] = t[m + k] + t[m + j]; + } + t[mh] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + a[n] = t[2] - t[1]; + a[0] = t[2] + t[1]; + } else { + a[1] = a[0]; + a[2] = t[0]; + a[0] = t[1]; + } +} + + +void dfst(int n, double *a, double *t, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 
3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + if (n > 2) { + m = n >> 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] + a[n - j]; + xi = a[j] - a[n - j]; + yr = a[k] + a[n - k]; + yi = a[k] - a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi + yi; + t[k] = xi - yi; + } + t[0] = a[mh] - a[n - mh]; + a[mh] += a[n - mh]; + a[0] = a[m]; + dstsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[1] - a[0]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] - a[j + 1]; + a[2 * j - 1] = -a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dstsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[1] - t[0]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = -t[j] - t[j + 1]; + a[k + l] = t[j] - t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + t[j] = t[m + k] + t[m + j]; + t[k] = t[m + k] - t[m + j]; + } + t[0] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + } + a[0] = 0; +} + + +/* -------- initializing routines -------- */ + + +#include + +void makewt(int nw, int *ip, double *w) { + void makeipt(int nw, int *ip); + int j, nwh, nw0, nw1; + double delta, wn4r, wk1r, wk1i, wk3r, wk3i; + + ip[0] = nw; + ip[1] = 1; + if (nw > 2) { + nwh = nw >> 1; + delta = atan(1.0) / nwh; + wn4r = cos(delta * nwh); + w[0] = 1; + w[1] = wn4r; + if (nwh == 4) { + w[2] = cos(delta * 2); + w[3] = sin(delta * 2); + } else if (nwh > 4) { + makeipt(nw, ip); + w[2] = 0.5 / cos(delta * 2); + w[3] = 0.5 / cos(delta * 6); + for (j = 4; j < nwh; j += 4) { + w[j] = cos(delta * j); + w[j + 1] = sin(delta * j); + w[j + 2] = cos(3 * delta * j); + w[j + 3] = -sin(3 * delta * j); + } + } + nw0 = 0; + while (nwh > 2) { + nw1 = nw0 + nwh; + nwh >>= 1; + w[nw1] = 1; + w[nw1 + 1] = wn4r; + if (nwh == 4) { + wk1r = w[nw0 + 4]; + wk1i = w[nw0 + 5]; + w[nw1 + 2] = wk1r; + w[nw1 + 3] = wk1i; + } else if (nwh > 4) { + wk1r = w[nw0 + 4]; + wk3r = w[nw0 + 6]; + w[nw1 + 2] = 0.5 / wk1r; + w[nw1 + 3] = 0.5 / wk3r; + for (j = 4; j < nwh; j += 4) { + wk1r = w[nw0 + 2 * j]; + wk1i = w[nw0 + 2 * j + 1]; + wk3r = w[nw0 + 2 * j + 2]; + wk3i = w[nw0 + 2 * j + 3]; + w[nw1 + j] = wk1r; + w[nw1 + j + 1] = wk1i; + w[nw1 + j + 2] = wk3r; + w[nw1 + j + 3] = wk3i; + } + } + nw0 = nw1; + } + } +} + + +void makeipt(int nw, int *ip) { + int j, l, m, m2, p, q; + + ip[2] = 0; + ip[3] = 16; + m = 2; + for (l = nw; l > 32; l >>= 2) { + m2 = m << 1; + q = m2 << 3; + for (j = m; j < m2; j++) { + p = ip[j] << 2; + ip[m + j] = p; + ip[m2 + j] = p + q; + } + m = m2; + } +} + + +void makect(int nc, int *ip, double *c) { + int j, nch; + double delta; + + ip[1] = nc; + if (nc > 1) { + nch = nc >> 1; + delta = atan(1.0) / nch; + c[0] = cos(delta * nch); + c[nch] = 0.5 * c[0]; + for (j = 1; j < nch; j++) { + c[j] = 0.5 * cos(delta * j); + c[nc - j] = 0.5 * sin(delta * j); + } + } +} + + +/* -------- child routines -------- */ + + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include +#include +#include +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp, func, argp) \ + { \ + if (pthread_create(thp, 
NULL, func, (void *)argp) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + if (pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#endif /* USE_CDFT_PTHREADS */ + + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include +#include +#include +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp, func, argp) \ + { \ + DWORD thid; \ + *(thp) = CreateThread( \ + NULL, 0, (LPTHREAD_START_ROUTINE)func, (LPVOID)argp, 0, &thid); \ + if (*(thp) == 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ + } +#endif /* USE_CDFT_WINTHREADS */ + + +void cftfsub(int n, double *a, int *ip, int nw, double *w) { + void bitrv2(int n, int *ip, double *a); + void bitrv216(double *a); + void bitrv208(double *a); + void cftf1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftf040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216(a); + } else { + cftf081(a, w); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void cftbsub(int n, double *a, int *ip, int nw, double *w) { + void bitrv2conj(int n, int *ip, double *a); + void bitrv216neg(double *a); + void bitrv208neg(double *a); + void cftb1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftb040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2conj(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216neg(a); + } else { + cftf081(a, w); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void bitrv2(int n, int *ip, double *a) { + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 
4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m 
+ k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } +} + + +void bitrv2conj(int n, int *ip, double *a) { + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = 
yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 
1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } +} + + +void bitrv216(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x7r, x7i, x8r, x8i, + x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + + +void bitrv216neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i, + x8r, x8i, x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, + x14r, x14i, x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + + +void bitrv208(double *a) { + double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + a[12] = x3r; + a[13] = x3i; +} + + +void bitrv208neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + + +void cftf1st(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = 
x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 
1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +void cftb1st(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = -a[j + 3] - a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] 
- a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st { + int n0; + int n; + double *a; + int nw; + double *w; +}; +typedef struct cdft_arg_st cdft_arg_t; + + +void cftrec4_th(int n, double *a, int nw, double *w) { + void *cftrec1_th(void *p); + void *cftrec2_th(void *p); + int i, idiv4, m, nthread; + cdft_thread_t th[4]; + cdft_arg_t ag[4]; + + nthread = 2; + idiv4 = 0; + m = n >> 1; + if (n > CDFT_4THREADS_BEGIN_N) { + nthread = 4; + idiv4 = 1; + m >>= 1; + } + for (i = 0; i < nthread; i++) { + ag[i].n0 = n; + ag[i].n = m; + ag[i].a = &a[i * m]; + ag[i].nw = nw; + ag[i].w = w; + if (i != idiv4) { + cdft_thread_create(&th[i], cftrec1_th, &ag[i]); + } else { + cdft_thread_create(&th[i], cftrec2_th, &ag[i]); + } + } + for (i = 
0; i < nthread; i++) { + cdft_thread_wait(th[i]); + } +} + + +void *cftrec1_th(void *p) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *)p)->n0; + n = ((cdft_arg_t *)p)->n; + a = ((cdft_arg_t *)p)->a; + nw = ((cdft_arg_t *)p)->nw; + w = ((cdft_arg_t *)p)->w; + m = n0; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *)0; +} + + +void *cftrec2_th(void *p) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl2(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *)p)->n0; + n = ((cdft_arg_t *)p)->n; + a = ((cdft_arg_t *)p)->a; + nw = ((cdft_arg_t *)p)->nw; + w = ((cdft_arg_t *)p)->w; + k = 1; + m = n0; + while (m > 512) { + m >>= 2; + k <<= 2; + cftmdl2(m, &a[n - m], &w[nw - m]); + } + cftleaf(m, 0, &a[n - m], nw, w); + k >>= 1; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *)0; +} +#endif /* USE_CDFT_THREADS */ + + +void cftrec4(int n, double *a, int nw, double *w) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m; + + m = n; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + + +int cfttree(int n, int j, int k, double *a, int nw, double *w) { + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + + +void cftleaf(int n, int isplt, double *a, int nw, double *w) { + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, 
&a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + + +void cftmdl1(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +void cftmdl2(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, kr, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, 
wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + + +void cftfx41(int n, double *a, int nw, double *w) { + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 
8]); + } +} + + +void cftf161(double *a, double *w) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, + y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, + y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftf162(double *a, double *w) { + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, 
wk3i, x0r, x0i, x1r, x1i, x2r, + x2i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, + y6i, y7r, y7i, y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, + y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + 
a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + + +void cftf081(double *a, double *w) { + double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, + y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + + +void cftf082(double *a, double *w) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + + +void cftf040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] 
- a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftb040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + + +void cftx020(double *a) { + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + + +void rftfsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void rftbsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void dctsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[j] - wkr * a[k]; + a[j] = wkr * a[j] + wki * a[k]; + a[k] = xr; + } + a[m] *= c[0]; +} + + +void dstsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[k] - wkr * a[j]; + a[k] = wkr * a[k] + wki * a[j]; + a[j] = xr; + } + a[m] *= c[0]; +} diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc new file mode 100644 index 000000000..6922808ab --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc @@ -0,0 +1,143 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Stack trace related stuff is from kaldi. 
+ * Refer to
+ * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
+ */
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+#ifdef KNF_HAVE_EXECINFO_H
+#include <execinfo.h>  // To get stack trace in error messages.
+#ifdef KNF_HAVE_CXXABI_H
+#include <cxxabi.h>  // For name demangling.
+// Useful to decode the stack trace, but only used if we have execinfo.h
+#endif  // KNF_HAVE_CXXABI_H
+#endif  // KNF_HAVE_EXECINFO_H
+
+#include <stdlib.h>
+
+#include <ctime>
+#include <iomanip>
+#include <string>
+
+namespace knf {
+
+std::string GetDateTimeStr() {
+  std::ostringstream os;
+  std::time_t t = std::time(nullptr);
+  std::tm tm = *std::localtime(&t);
+  os << std::put_time(&tm, "%F %T");  // yyyy-mm-dd hh:mm:ss
+  return os.str();
+}
+
+static bool LocateSymbolRange(const std::string &trace_name,
+                              std::size_t *begin, std::size_t *end) {
+  // Find the first '_' with leading ' ' or '('.
+  *begin = std::string::npos;
+  for (std::size_t i = 1; i < trace_name.size(); ++i) {
+    if (trace_name[i] != '_') {
+      continue;
+    }
+    if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') {
+      *begin = i;
+      break;
+    }
+  }
+  if (*begin == std::string::npos) {
+    return false;
+  }
+  *end = trace_name.find_first_of(" +", *begin);
+  return *end != std::string::npos;
+}
+
+#ifdef KNF_HAVE_EXECINFO_H
+static std::string Demangle(const std::string &trace_name) {
+#ifndef KNF_HAVE_CXXABI_H
+  return trace_name;
+#else   // KNF_HAVE_CXXABI_H
+  // Try to demangle the symbol. We are trying to support the following
+  // formats produced by different platforms:
+  //
+  // Linux:
+  //   ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
+  //
+  // Mac:
+  //   0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
+  //
+  // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and
+  // demangle it into a readable name like kaldi::UnitTestError.
+  std::size_t begin, end;
+  if (!LocateSymbolRange(trace_name, &begin, &end)) {
+    return trace_name;
+  }
+  std::string symbol = trace_name.substr(begin, end - begin);
+  int status;
+  char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status);
+  if (status == 0 && demangled_name != nullptr) {
+    symbol = demangled_name;
+    free(demangled_name);
+  }
+  return trace_name.substr(0, begin) + symbol +
+         trace_name.substr(end, std::string::npos);
+#endif  // KNF_HAVE_CXXABI_H
+}
+#endif  // KNF_HAVE_EXECINFO_H
+
+std::string GetStackTrace() {
+  std::string ans;
+#ifdef KNF_HAVE_EXECINFO_H
+  constexpr const std::size_t kMaxTraceSize = 50;
+  constexpr const std::size_t kMaxTracePrint = 50;  // Must be even.
+  // Buffer for the trace.
+  void *trace[kMaxTraceSize];
+  // Get the trace.
+  std::size_t size = backtrace(trace, kMaxTraceSize);
+  // Get the trace symbols.
+  char **trace_symbol = backtrace_symbols(trace, size);
+  if (trace_symbol == nullptr)
+    return ans;
+
+  // Compose a human-readable backtrace string.
+  ans += "[ Stack-Trace: ]\n";
+  if (size <= kMaxTracePrint) {
+    for (std::size_t i = 0; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+  } else {  // Print out first+last (e.g.) 5.
+    for (std::size_t i = 0; i < kMaxTracePrint / 2; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    ans += ".\n.\n.\n";
+    for (std::size_t i = size - kMaxTracePrint / 2; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    if (size == kMaxTraceSize)
+      ans += ".\n.\n.\n";  // Stack was too long, probably a bug.
+  }
+
+  // We must free the array of pointers allocated by backtrace_symbols(),
+  // but not the strings themselves.
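+  // (In glibc, backtrace_symbols() returns one malloc'ed block that holds
+  // both the pointer array and the symbol strings, so a single free() of
+  // the array releases everything; the strings must not be freed
+  // individually.)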
+  free(trace_symbol);
+#endif  // KNF_HAVE_EXECINFO_H
+  return ans;
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
new file mode 100644
index 000000000..feb38db1f
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
@@ -0,0 +1,347 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The content in this file is copied/modified from
+// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
+#define KALDI_NATIVE_FBANK_CSRC_LOG_H_
+
+#include <stdio.h>
+
+#include <mutex>  // NOLINT
+#include <sstream>
+#include <string>
+
+namespace knf {
+
+#if defined(NDEBUG)
+constexpr bool kDisableDebug = true;
+#else
+constexpr bool kDisableDebug = false;
+#endif
+
+enum class LogLevel {
+  kTrace = 0,
+  kDebug = 1,
+  kInfo = 2,
+  kWarning = 3,
+  kError = 4,
+  kFatal = 5,  // print message and abort the program
+};
+
+// They are used in KNF_LOG(xxx), so their names
+// do not follow the google c++ code style
+//
+// You can use them in the following way:
+//
+//  KNF_LOG(TRACE) << "some message";
+//  KNF_LOG(DEBUG) << "some message";
+#ifndef _MSC_VER
+constexpr LogLevel TRACE = LogLevel::kTrace;
+constexpr LogLevel DEBUG = LogLevel::kDebug;
+constexpr LogLevel INFO = LogLevel::kInfo;
+constexpr LogLevel WARNING = LogLevel::kWarning;
+constexpr LogLevel ERROR = LogLevel::kError;
+constexpr LogLevel FATAL = LogLevel::kFatal;
+#else
+#define TRACE LogLevel::kTrace
+#define DEBUG LogLevel::kDebug
+#define INFO LogLevel::kInfo
+#define WARNING LogLevel::kWarning
+#define ERROR LogLevel::kError
+#define FATAL LogLevel::kFatal
+#endif
+
+std::string GetStackTrace();
+
+/* Return the current log level.
+
+   If the current log level is TRACE, then all logged messages are printed out.
+
+   If the current log level is DEBUG, log messages with "TRACE" level are not
+   shown and all other levels are printed out.
+
+   Similarly, if the current log level is INFO, log messages with "TRACE" and
+   "DEBUG" are not shown and all other levels are printed out.
+
+   If it is FATAL, then only FATAL messages are shown.
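+
+   The level is read once from the environment variable KNF_LOG_LEVEL (see
+   GetCurrentLogLevel() below). For example, the following invocation
+   (illustrative) enables DEBUG and all higher levels:
+
+     KNF_LOG_LEVEL=DEBUG ./your_program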
+ */
+inline LogLevel GetCurrentLogLevel() {
+  static LogLevel log_level = INFO;
+  static std::once_flag init_flag;
+  std::call_once(init_flag, []() {
+    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
+    if (env_log_level == nullptr) return;
+
+    std::string s = env_log_level;
+    if (s == "TRACE")
+      log_level = TRACE;
+    else if (s == "DEBUG")
+      log_level = DEBUG;
+    else if (s == "INFO")
+      log_level = INFO;
+    else if (s == "WARNING")
+      log_level = WARNING;
+    else if (s == "ERROR")
+      log_level = ERROR;
+    else if (s == "FATAL")
+      log_level = FATAL;
+    else
+      fprintf(stderr,
+              "Unknown KNF_LOG_LEVEL: %s"
+              "\nSupported values are: "
+              "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL",
+              s.c_str());
+  });
+  return log_level;
+}
+
+inline bool EnableAbort() {
+  static std::once_flag init_flag;
+  static bool enable_abort = false;
+  std::call_once(init_flag, []() {
+    enable_abort = (std::getenv("KNF_ABORT") != nullptr);
+  });
+  return enable_abort;
+}
+
+class Logger {
+ public:
+  Logger(const char *filename, const char *func_name, uint32_t line_num,
+         LogLevel level)
+      : filename_(filename),
+        func_name_(func_name),
+        line_num_(line_num),
+        level_(level) {
+    cur_level_ = GetCurrentLogLevel();
+    switch (level) {
+      case TRACE:
+        if (cur_level_ <= TRACE) fprintf(stderr, "[T] ");
+        break;
+      case DEBUG:
+        if (cur_level_ <= DEBUG) fprintf(stderr, "[D] ");
+        break;
+      case INFO:
+        if (cur_level_ <= INFO) fprintf(stderr, "[I] ");
+        break;
+      case WARNING:
+        if (cur_level_ <= WARNING) fprintf(stderr, "[W] ");
+        break;
+      case ERROR:
+        if (cur_level_ <= ERROR) fprintf(stderr, "[E] ");
+        break;
+      case FATAL:
+        if (cur_level_ <= FATAL) fprintf(stderr, "[F] ");
+        break;
+    }
+
+    if (cur_level_ <= level_) {
+      fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name);
+    }
+  }
+
+  ~Logger() noexcept(false) {
+    static constexpr const char *kErrMsg = R"(
+    Some bad things happened. Please read the above error messages and stack
+    trace. If you are using Python, the following command may be helpful:
+
+      gdb --args python /path/to/your/code.py
+
+    (You can use `gdb` to debug the code. Please consider compiling
+    a debug version of KNF.).
+
+    If you are unable to fix it, please open an issue at:
+
+      https://github.com/csukuangfj/kaldi-native-fbank/issues/new
+    )";
+    fprintf(stderr, "\n");
+    if (level_ == FATAL) {
+      std::string stack_trace = GetStackTrace();
+      if (!stack_trace.empty()) {
+        fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
+      }
+
+      fflush(nullptr);
+
+#ifndef __ANDROID_API__
+      if (EnableAbort()) {
+        // NOTE: abort() will terminate the program immediately without
+        // printing the Python stack backtrace.
+        abort();
+      }
+
+      throw std::runtime_error(kErrMsg);
+#else
+      abort();
+#endif
+    }
+  }
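+
+  // Streaming works like std::ostream: each operator<< below writes its
+  // argument to stderr immediately, but only when this message's level
+  // passes the current log level, e.g. (illustrative):
+  //
+  //   KNF_LOG(INFO) << "num_bins = " << 80 << ", done";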
"true" : "false"); + } + return *this; + } + + const Logger &operator<<(int8_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(const char *s) const { + if (cur_level_ <= level_) fprintf(stderr, "%s", s); + return *this; + } + + const Logger &operator<<(int32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(uint32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%u", i); + return *this; + } + + const Logger &operator<<(uint64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%llu", (long long unsigned int)i); // NOLINT + return *this; + } + + const Logger &operator<<(int64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%lli", (long long int)i); // NOLINT + return *this; + } + + const Logger &operator<<(float f) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", f); + return *this; + } + + const Logger &operator<<(double d) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", d); + return *this; + } + + template + const Logger &operator<<(const T &t) const { + // require T overloads operator<< + std::ostringstream os; + os << t; + return *this << os.str().c_str(); + } + + // specialization to fix compile error: `stringstream << nullptr` is ambiguous + const Logger &operator<<(const std::nullptr_t &null) const { + if (cur_level_ <= level_) *this << "(null)"; + return *this; + } + + private: + const char *filename_; + const char *func_name_; + uint32_t line_num_; + LogLevel level_; + LogLevel cur_level_; +}; + +class Voidifier { + public: + void operator&(const Logger &)const {} +}; + +} // namespace knf + +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \ + defined(__PRETTY_FUNCTION__) +// for clang and GCC +#define KNF_FUNC __PRETTY_FUNCTION__ +#else +// for other compilers +#define KNF_FUNC __func__ +#endif + +#define KNF_STATIC_ASSERT(x) static_assert(x, "") + +#define KNF_CHECK(x) \ + (x) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " + +// WARNING: x and y may be evaluated multiple times, but this happens only +// when the check fails. Since the program aborts if it fails, we don't think +// the extra evaluation of x and y matters. +// +// CAUTION: we recommend the following use case: +// +// auto x = Foo(); +// auto y = Bar(); +// KNF_CHECK_EQ(x, y) << "Some message"; +// +// And please avoid +// +// KNF_CHECK_EQ(Foo(), Bar()); +// +// if `Foo()` or `Bar()` causes some side effects, e.g., changing some +// local static variables or global variables. +#define _KNF_CHECK_OP(x, y, op) \ + ((x)op(y)) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " << #op << " " << #y \ + << " (" << (x) << " vs. 
+
+// WARNING: x and y may be evaluated multiple times, but this happens only
+// when the check fails. Since the program aborts if it fails, we don't think
+// the extra evaluation of x and y matters.
+//
+// CAUTION: we recommend the following use case:
+//
+//  auto x = Foo();
+//  auto y = Bar();
+//  KNF_CHECK_EQ(x, y) << "Some message";
+//
+// And please avoid
+//
+//  KNF_CHECK_EQ(Foo(), Bar());
+//
+// if `Foo()` or `Bar()` causes some side effects, e.g., changing some
+// local static variables or global variables.
+#define _KNF_CHECK_OP(x, y, op)                                              \
+  ((x)op(y)) ? (void)0                                                       \
+             : ::knf::Voidifier() &                                          \
+                   ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
+                       << "Check failed: " << #x << " " << #op << " " << #y  \
+                       << " (" << (x) << " vs. " << (y) << ") "
+
+#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
+#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
+#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
+#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
+#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
+#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)
+
+#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)
+
+// ------------------------------------------------------------
+// For debug check
+// ------------------------------------------------------------
+// If you define the macro NDEBUG (e.g., pass "-D NDEBUG") while compiling
+// kaldi-native-fbank, the following macros are in fact empty and do nothing.
+
+#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)
+
+#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)
+
+#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)
+
+#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)
+
+#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)
+
+#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)
+
+#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)
+
+#define KNF_DLOG(x) \
+  ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_LOG_H_
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
new file mode 100644
index 000000000..dade576b0
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
@@ -0,0 +1,256 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/mel-computations.cc
+
+#include "kaldi-native-fbank/csrc/mel-computations.h"
+
+#include <algorithm>
+#include <sstream>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+namespace knf {
+
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+float MelBanks::VtlnWarpFreq(
+    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
+    float vtln_high_cutoff,
+    float low_freq,  // upper+lower frequency cutoffs in mel computation
+    float high_freq, float vtln_warp_factor, float freq) {
+  /// This computes a VTLN warping function that is not the same as HTK's one,
+  /// but has similar inputs (this function has the advantage of never
+  /// producing empty bins).
+
+  /// This function computes a warp function F(freq), defined between low_freq
+  /// and high_freq inclusive, with the following properties:
+  ///  F(low_freq) == low_freq
+  ///  F(high_freq) == high_freq
+  /// The function is continuous and piecewise linear with two inflection
+  /// points.
+ /// The lower inflection point (measured in terms of the unwarped + /// frequency) is at frequency l, determined as described below. + /// The higher inflection point is at a frequency h, determined as + /// described below. + /// If l <= f <= h, then F(f) = f/vtln_warp_factor. + /// If the higher inflection point (measured in terms of the unwarped + /// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + /// Since (by the last point) F(h) == h/vtln_warp_factor, then + /// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + /// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + /// = vtln_high_cutoff * min(1, vtln_warp_factor). + /// If the lower inflection point (measured in terms of the unwarped + /// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + /// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + /// = vtln_low_cutoff * max(1, vtln_warp_factor) + + if (freq < low_freq || freq > high_freq) + return freq; // in case this gets called + // for out-of-range frequencies, just return the freq. + + KNF_CHECK_GT(vtln_low_cutoff, low_freq); + KNF_CHECK_LT(vtln_high_cutoff, high_freq); + + float one = 1.0f; + float l = vtln_low_cutoff * std::max(one, vtln_warp_factor); + float h = vtln_high_cutoff * std::min(one, vtln_warp_factor); + float scale = 1.0f / vtln_warp_factor; + float Fl = scale * l; // F(l); + float Fh = scale * h; // F(h); + KNF_CHECK(l > low_freq && h < high_freq); + // slope of left part of the 3-piece linear function + float scale_left = (Fl - low_freq) / (l - low_freq); + // [slope of center part is just "scale"] + + // slope of right part of the 3-piece linear function + float scale_right = (high_freq - Fh) / (high_freq - h); + + if (freq < l) { + return low_freq + scale_left * (freq - low_freq); + } else if (freq < h) { + return scale * freq; + } else { // freq >= h + return high_freq + scale_right * (freq - high_freq); + } +} + +float MelBanks::VtlnWarpMelFreq( + float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + float vtln_high_cutoff, + float low_freq, // upper+lower frequency cutoffs in mel computation + float high_freq, float vtln_warp_factor, float mel_freq) { + return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq, + high_freq, vtln_warp_factor, + InverseMelScale(mel_freq))); +} + +MelBanks::MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + float vtln_warp_factor) + : htk_mode_(opts.htk_mode) { + int32_t num_bins = opts.num_bins; + if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins"; + + float sample_freq = frame_opts.samp_freq; + int32_t window_length_padded = frame_opts.PaddedWindowSize(); + KNF_CHECK_EQ(window_length_padded % 2, 0); + + int32_t num_fft_bins = window_length_padded / 2; + float nyquist = 0.5f * sample_freq; + + float low_freq = opts.low_freq, high_freq; + if (opts.high_freq > 0.0f) + high_freq = opts.high_freq; + else + high_freq = nyquist + opts.high_freq; + + if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f || + high_freq > nyquist || high_freq <= low_freq) { + KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq + << " and high-freq " << high_freq << " vs. 
nyquist "
+                   << nyquist;
+  }
+
+  float fft_bin_width = sample_freq / window_length_padded;
+  // fft-bin width [think of it as Nyquist-freq / half-window-length]
+
+  float mel_low_freq = MelScale(low_freq);
+  float mel_high_freq = MelScale(high_freq);
+
+  debug_ = opts.debug_mel;
+
+  // divide by num_bins+1 in next line because of end-effects where the bins
+  // spread out to the sides.
+  float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);
+
+  float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high;
+  if (vtln_high < 0.0f) {
+    vtln_high += nyquist;
+  }
+
+  if (vtln_warp_factor != 1.0f &&
+      (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
+       vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) {
+    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
+                   << " and vtln-high " << vtln_high << ", versus "
+                   << "low-freq " << low_freq << " and high-freq " << high_freq;
+  }
+
+  bins_.resize(num_bins);
+  center_freqs_.resize(num_bins);
+
+  for (int32_t bin = 0; bin < num_bins; ++bin) {
+    float left_mel = mel_low_freq + bin * mel_freq_delta,
+          center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
+          right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
+
+    if (vtln_warp_factor != 1.0f) {
+      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                 vtln_warp_factor, left_mel);
+      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                   vtln_warp_factor, center_mel);
+      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                  vtln_warp_factor, right_mel);
+    }
+    center_freqs_[bin] = InverseMelScale(center_mel);
+
+    // this_bin will be a vector of coefficients that is only
+    // nonzero where this mel bin is active.
+    std::vector<float> this_bin(num_fft_bins);
+
+    int32_t first_index = -1, last_index = -1;
+    for (int32_t i = 0; i < num_fft_bins; ++i) {
+      float freq = (fft_bin_width * i);  // Center frequency of this fft
+                                         // bin.
+      float mel = MelScale(freq);
+      if (mel > left_mel && mel < right_mel) {
+        float weight;
+        if (mel <= center_mel)
+          weight = (mel - left_mel) / (center_mel - left_mel);
+        else
+          weight = (right_mel - mel) / (right_mel - center_mel);
+        this_bin[i] = weight;
+        if (first_index == -1) first_index = i;
+        last_index = i;
+      }
+    }
+    KNF_CHECK(first_index != -1 && last_index >= first_index &&
+              "You may have set num_mel_bins too large.");
+
+    bins_[bin].first = first_index;
+    int32_t size = last_index + 1 - first_index;
+    bins_[bin].second.insert(bins_[bin].second.end(),
+                             this_bin.begin() + first_index,
+                             this_bin.begin() + first_index + size);
+
+    // Replicate a bug in HTK, for testing purposes.
+    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
+      bins_[bin].second[0] = 0.0;
+    }
+  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {
+
+  if (debug_) {
+    std::ostringstream os;
+    for (size_t i = 0; i < bins_.size(); i++) {
+      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
+      for (auto k : bins_[i].second) os << k << ", ";
+      os << "\n";
+    }
+    KNF_LOG(INFO) << os.str();
+  }
+}
+
+// "power_spectrum" contains fft energies.
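+// Each output mel energy is the dot product of one triangular filter with
+// the matching slice of the power spectrum; a filter is stored sparsely in
+// bins_ as (index of its first nonzero fft bin, vector of weights).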
+// "power_spectrum" contains fft energies.
+void MelBanks::Compute(const float *power_spectrum,
+                       float *mel_energies_out) const {
+  int32_t num_bins = bins_.size();
+
+  for (int32_t i = 0; i < num_bins; i++) {
+    int32_t offset = bins_[i].first;
+    const auto &v = bins_[i].second;
+    float energy = 0;
+    for (int32_t k = 0; k != v.size(); ++k) {
+      energy += v[k] * power_spectrum[k + offset];
+    }
+
+    // HTK-like flooring - for testing purposes (we prefer dither)
+    if (htk_mode_ && energy < 1.0) {
+      energy = 1.0;
+    }
+
+    mel_energies_out[i] = energy;
+
+    // The following assert was added due to a problem with OpenBlas that
+    // we had at one point (it was a bug in that library). Just to detect
+    // it early.
+    KNF_CHECK_EQ(energy, energy);  // check that energy is not nan
+  }
+
+  if (debug_) {
+    fprintf(stderr, "MEL BANKS:\n");
+    for (int32_t i = 0; i < num_bins; i++)
+      fprintf(stderr, " %f", mel_energies_out[i]);
+    fprintf(stderr, "\n");
+  }
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
new file mode 100644
index 000000000..e743243a1
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
@@ -0,0 +1,115 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// This file is copied/modified from kaldi/src/feat/mel-computations.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+
+#include <sstream>
+#include <string>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+namespace knf {
+
+struct MelBanksOptions {
+  int32_t num_bins = 25;  // e.g. 25; number of triangular bins
+  float low_freq = 20;    // e.g. 20; lower frequency cutoff
+
+  // an upper frequency cutoff; 0 -> no cutoff, negative
+  // ->added to the Nyquist frequency to get the cutoff.
+  float high_freq = 0;
+
+  float vtln_low = 100;  // vtln lower cutoff of warping function.
+
+  // vtln upper cutoff of warping function: if negative, added
+  // to the Nyquist frequency to get the cutoff.
+  float vtln_high = -500;
+
+  bool debug_mel = false;
+  // htk_mode is a "hidden" config, it does not show up on command line.
+  // Enables more exact compatibility with HTK, for testing purposes. Affects
+  // mel-energy flooring and reproduces a bug in HTK.
+  bool htk_mode = false;
+
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "num_bins: " << num_bins << "\n";
+    os << "low_freq: " << low_freq << "\n";
+    os << "high_freq: " << high_freq << "\n";
+    os << "vtln_low: " << vtln_low << "\n";
+    os << "vtln_high: " << vtln_high << "\n";
+    os << "debug_mel: " << debug_mel << "\n";
+    os << "htk_mode: " << htk_mode << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
+
+class MelBanks {
+ public:
+  static inline float InverseMelScale(float mel_freq) {
+    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
+  }
+
+  static inline float MelScale(float freq) {
+    return 1127.0f * logf(1.0f + freq / 700.0f);
+  }
+
+  static float VtlnWarpFreq(
+      float vtln_low_cutoff,
+      float vtln_high_cutoff,  // discontinuities in warp func
+      float low_freq,
+      float high_freq,  // upper+lower frequency cutoffs in
+                        // the mel computation
+      float vtln_warp_factor, float freq);
+
+  static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
+                               float low_freq, float high_freq,
+                               float vtln_warp_factor, float mel_freq);
+
+  // TODO(fangjun): Remove vtln_warp_factor
+  MelBanks(const MelBanksOptions &opts,
+           const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
+
+  /// Compute Mel energies (note: not log energies).
+  /// At input, "fft_energies" contains the FFT energies (not log).
+  ///
+  /// @param fft_energies 1-D array of size num_fft_bins/2+1
+  /// @param mel_energies_out 1-D array of size num_mel_bins
+  void Compute(const float *fft_energies, float *mel_energies_out) const;
+
+  int32_t NumBins() const { return bins_.size(); }
+
+ private:
+  // center frequencies of bins, numbered from 0 ... num_bins-1.
+  // Needed by GetCenterFreqs().
+  std::vector<float> center_freqs_;
+
+  // the "bins_" vector is a vector, one for each bin, of a pair:
+  // (the first nonzero fft-bin), (the vector of weights).
+  std::vector<std::pair<int32_t, std::vector<float>>> bins_;
+
+  // TODO(fangjun): Remove debug_ and htk_mode_
+  bool debug_;
+  bool htk_mode_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
new file mode 100644
index 000000000..69bfde5f8
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
@@ -0,0 +1,66 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/csrc/rfft.h"
+
+#include <cmath>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+// see fftsg.c
+#ifdef __cplusplus
+extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
+#else
+void rdft(int n, int isgn, double *a, int *ip, double *w);
+#endif
+
+namespace knf {
+class Rfft::RfftImpl {
+ public:
+  explicit RfftImpl(int32_t n) : n_(n), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {
+    KNF_CHECK_EQ(n & (n - 1), 0);
+  }
+
+  void Compute(float *in_out) {
+    std::vector<double> d(in_out, in_out + n_);
+
+    Compute(d.data());
+
+    std::copy(d.begin(), d.end(), in_out);
+  }
+
+  void Compute(double *in_out) {
+    // 1 means forward fft
+    rdft(n_, 1, in_out, ip_.data(), w_.data());
+  }
+
+ private:
+  int32_t n_;
+  std::vector<int> ip_;
+  std::vector<double> w_;
+};
+
+Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
+
+Rfft::~Rfft() = default;
+
+void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
+void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
+
+}  // namespace knf
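The wrapped `rdft` routine transforms in place, in a packed layout: index 0 holds R[0], index 1 holds R[n/2], and the remaining entries hold interleaved real/imaginary pairs for 1 <= k < n/2. A standalone Python sketch (illustrative only; it assumes that packing, and since only magnitudes are used the imaginary-part sign convention does not matter) of how downstream feature code turns such a buffer into a power spectrum:

```python
import numpy as np

def power_spectrum_from_packed(a):
    # a: length-n output of an in-place real FFT in the packed layout:
    # a[0] = R[0], a[1] = R[n/2], then (real, imag) pairs for 1 <= k < n/2.
    n = len(a)
    ps = np.empty(n // 2 + 1)
    ps[0] = a[0] * a[0]
    ps[n // 2] = a[1] * a[1]
    re, im = a[2::2], a[3::2]
    ps[1:n // 2] = re * re + im * im
    return ps

# Cross-check against numpy's rfft on a packed version of its own output.
x = np.random.randn(512)
X = np.fft.rfft(x)
packed = np.empty_like(x)
packed[0], packed[1] = X[0].real, X[-1].real
packed[2::2], packed[3::2] = X[1:-1].real, X[1:-1].imag
np.testing.assert_allclose(power_spectrum_from_packed(packed), np.abs(X) ** 2)
```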
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
new file mode 100644
index 000000000..c8cb9f8c1
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+
+#include <memory>
+
+namespace knf {
+
+// n-point Real discrete Fourier transform
+// where n is a power of 2. n >= 2
+//
+// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
+class Rfft {
+ public:
+  // @param n Number of fft bins. It should be a power of 2.
+  explicit Rfft(int32_t n);
+  ~Rfft();
+
+  // @param in_out A 1-D array of size n. The transform is computed
+  // in place, in the packed layout produced by rdft() (see rfft.cc).
+  void Compute(float *in_out);
+  void Compute(double *in_out);
+
+ private:
+  class RfftImpl;
+  std::unique_ptr<RfftImpl> impl_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
diff --git a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
deleted file mode 100644
index e63fb5788..000000000
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-# checkout the thirdparty/kaldi/base/kaldi-types.h
-# compile kaldi without openfst
-add_definitions("-DCOMPILE_WITHOUT_OPENFST")
-
-if ((NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base))
-    file(COPY ../../../../speechx/speechx/kaldi/base DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/feat DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/matrix DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/util DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-endif()
-
-# kaldi-base
-add_library(kaldi-base STATIC
-  base/io-funcs.cc
-  base/kaldi-error.cc
-  base/kaldi-math.cc
-  base/kaldi-utils.cc
-  base/timer.cc
-)
-target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-# kaldi-matrix
-add_library(kaldi-matrix STATIC
-  matrix/compressed-matrix.cc
-  matrix/matrix-functions.cc
-  matrix/kaldi-matrix.cc
-  matrix/kaldi-vector.cc
-  matrix/optimization.cc
-  matrix/packed-matrix.cc
-  matrix/qr.cc
-  matrix/sparse-matrix.cc
-  matrix/sp-matrix.cc
-  matrix/srfft.cc
-  matrix/tp-matrix.cc
-)
-target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (NOT MSVC)
-  target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
-else()
-  target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
-endif()
-
-# kaldi-util
-add_library(kaldi-util STATIC
-  util/kaldi-holder.cc
-  util/kaldi-io.cc
-  util/kaldi-semaphore.cc
-  util/kaldi-table.cc
-  util/kaldi-thread.cc
-  util/parse-options.cc
-  util/simple-io-funcs.cc
-  util/simple-options.cc
-  util/text-utils.cc
-)
-target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
-
-# kaldi-feat-common
-add_library(kaldi-feat-common STATIC
-  feat/cmvn.cc
-  feat/feature-functions.cc
-  feat/feature-window.cc
-  feat/mel-computations.cc
-  feat/pitch-functions.cc
-  feat/resample.cc
-  feat/signal.cc
-  feat/wave-reader.cc
-)
-target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
-
-
-# kaldi-mfcc
-add_library(kaldi-mfcc STATIC
-  feat/feature-mfcc.cc
-)
-target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
-
-
-# kaldi-fbank
-add_library(kaldi-fbank STATIC
-  feat/feature-fbank.cc
-)
-target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
-
-
-set(KALDI_LIBRARIES
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
-)
-
-add_library(libkaldi INTERFACE)
-add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
-target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
- -if (APPLE) - target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib) -elseif (MSVC) - target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas) -else() - target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group) -endif() - -target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST") diff --git a/audio/setup.py b/audio/setup.py index d7208a431..82e9a55a5 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -51,8 +51,7 @@ base = [ ] requirements = { - "install": - base, + "install": base, "develop": [ "sox", "soxbindings", @@ -60,6 +59,7 @@ requirements = { ], } + def check_call(cmd: str, shell=False, executable=None): try: sp.check_call( @@ -92,6 +92,7 @@ def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False): file=sys.stderr) return out_bytes.strip().decode('utf8') + def _run_cmd(cmd): try: return subprocess.check_output( @@ -100,6 +101,7 @@ def _run_cmd(cmd): except Exception: return None + @contextlib.contextmanager def pushd(new_dir): old_dir = os.getcwd() @@ -109,22 +111,26 @@ def pushd(new_dir): os.chdir(old_dir) print(old_dir) + def read(*names, **kwargs): with io.open( os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")) as fp: return fp.read() + def _remove(files: str): for f in files: f.unlink() + ################################# Install ################################## def _post_install(install_lib_dir): pass + class DevelopCommand(develop): def run(self): develop.run(self) @@ -142,7 +148,7 @@ class TestCommand(test): # Run nose ensuring that argv simulates running nosetests directly import nose nose.run_exit(argv=['nosetests', '-w', 'tests']) - + def run_benchmark(self): for benchmark_item in glob.glob('tests/benchmark/*py'): os.system(f'pytest {benchmark_item}') @@ -188,6 +194,7 @@ def _make_version_file(version, sha): with open(version_path, "a") as f: f.write(f"__version__ = '{version}'\n") + def _rm_version(): file_ = ROOT_DIR / "paddleaudio" / "__init__.py" with open(file_, "r") as f: @@ -235,8 +242,8 @@ def main(): if platform.system() != 'Windows' and platform.system() != 'Linux': lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']} - if platform.system() == 'Linux': - lib_package_data = {'paddleaudio': ['lib/lib*']} + #if platform.system() == 'Linux': + # lib_package_data = {'paddleaudio': ['lib/lib*']} setup_info = dict( # Metadata @@ -254,8 +261,7 @@ def main(): python_requires='>=3.7', install_requires=requirements["install"], extras_require={ - 'develop': - requirements["develop"], + 'develop': requirements["develop"], #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"], }, cmdclass={ @@ -284,11 +290,11 @@ def main(): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', - ], - ) + ], ) setup(**setup_info) _rm_version() + if __name__ == '__main__': main() From 4a11302e3519a576e89dba190d043b3f82581015 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 6 Jan 2023 10:39:46 +0800 Subject: [PATCH 015/159] Update setup.py --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 4f6d10d7c..3bde2b205 100644 
--- a/setup.py
+++ b/setup.py
@@ -281,10 +281,8 @@ setup_info = dict(
         "deepspeech2",
         "transformer",
         "conformer",
-        "fastspeech",
-        "vocoder",
-        "pwgan",
-        "gan",
+        "fastspeech2",
+        "gan vocoders",
     ],
     python_requires='>=3.7',
     install_requires=requirements["install"],

From 25dcad3de74556d7a1d15d09047b21f0a1a134e7 Mon Sep 17 00:00:00 2001
From: YangZhou <56786796+SmileGoat@users.noreply.github.com>
Date: Fri, 6 Jan 2023 11:13:24 +0800
Subject: [PATCH 016/159] update paddleaudio readme, test=doc (#2801)

---
 audio/README.md | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/audio/README.md b/audio/README.md
index bfd8625f0..d42d41229 100644
--- a/audio/README.md
+++ b/audio/README.md
@@ -2,33 +2,22 @@
 安装方式: pip install paddleaudio

-目前支持的平台:Linux:
+目前支持的平台:Linux, Mac, Windows

 ## Environment

 ## Build wheel
+cmd: python setup.py bdist_wheel

 Linux test build whl environment:
-* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
 * os - Ubuntu 16.04.7 LTS
-* gcc/g++/gfortran - 8.2.0
+* gcc/g++ - 8.2.0
 * cmake - 3.18.0 (need install)

-* [How to Install Docker](https://docs.docker.com/engine/install/)
-* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
-
-1. First to launch docker container.
-
-```
-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
-```
-2. python setup.py bdist_wheel
-
 MAC: test build whl environment:
 * os
-* gcc/g++/gfortran 12.2.0
+* gcc/g++ 12.2.0
 * cpu Intel Xeon E5 x86_64

 Windows:
-not support: paddleaudio C++ extension lib (sox io, kaldi native fbank)
-python setup.py bdist_wheel
+the paddleaudio C++ extension lib (sox io, kaldi native fbank) is not supported

From e793d267d9fa7003c95430a07ca2de6dd92fb6fb Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 6 Jan 2023 16:24:51 +0800
Subject: [PATCH 017/159] [ASR] add code-switch asr tal_cs recipe (#2796)

* add tal_cs asr recipe.
* add readme and result, and fix some bug.
* add commit id and date.
---
 dataset/tal_cs/README.md                      |  13 ++
 dataset/tal_cs/tal_cs.py                      | 116 +++++++++++
 docs/source/released_model.md                 |   3 +-
 examples/tal_cs/asr1/README.md                | 190 ++++++++++++++++++
 examples/tal_cs/asr1/RESULTS.md               |  12 ++
 examples/tal_cs/asr1/conf/conformer.yaml      |  91 +++++++++
 examples/tal_cs/asr1/conf/preprocess.yaml     |  29 +++
 .../tal_cs/asr1/conf/tuning/chunk_decode.yaml |  12 ++
 examples/tal_cs/asr1/conf/tuning/decode.yaml  |  12 ++
 examples/tal_cs/asr1/local/data.sh            |  88 ++++++++
 examples/tal_cs/asr1/local/test.sh            |  72 +++++++
 examples/tal_cs/asr1/local/test_wav.sh        |  58 ++++++
 examples/tal_cs/asr1/local/train.sh           |  72 +++++++
 examples/tal_cs/asr1/path.sh                  |  15 ++
 examples/tal_cs/asr1/run.sh                   |  51 +++++
 examples/tal_cs/asr1/utils                    |   1 +
 16 files changed, 834 insertions(+), 1 deletion(-)
 create mode 100644 dataset/tal_cs/README.md
 create mode 100644 dataset/tal_cs/tal_cs.py
 create mode 100644 examples/tal_cs/asr1/README.md
 create mode 100644 examples/tal_cs/asr1/RESULTS.md
 create mode 100644 examples/tal_cs/asr1/conf/conformer.yaml
 create mode 100644 examples/tal_cs/asr1/conf/preprocess.yaml
 create mode 100644 examples/tal_cs/asr1/conf/tuning/chunk_decode.yaml
 create mode 100644 examples/tal_cs/asr1/conf/tuning/decode.yaml
 create mode 100644 examples/tal_cs/asr1/local/data.sh
 create mode 100755 examples/tal_cs/asr1/local/test.sh
 create mode 100755 examples/tal_cs/asr1/local/test_wav.sh
 create mode 100755 examples/tal_cs/asr1/local/train.sh
 create mode 100755 examples/tal_cs/asr1/path.sh
 create mode 100644 examples/tal_cs/asr1/run.sh
 create mode 120000 examples/tal_cs/asr1/utils

diff --git a/dataset/tal_cs/README.md b/dataset/tal_cs/README.md
new file mode 100644
index 000000000..633056360
--- /dev/null
+++ b/dataset/tal_cs/README.md
@@ -0,0 +1,13 @@
+# [TAL_CSASR](https://ai.100tal.com/dataset/)
+
+This dataset consists of TAL English-class audio with mixed Chinese and English speech. Each audio file contains a single speaker, and the dataset covers more than 100 speakers in total (63.36 GB of files). The data includes samples of both intra-sentence and inter-sentence code-switching. The ratio of Chinese characters to English words in the data is 13:1.
+
+- Total data: 587H (train_set: 555.9H, dev_set: 8H, test_set: 23.6H)
+- Sample rate: 16000
+- Sample bit: 16
+- Recording device: microphone
+- Speaker number: 200+
+- Recording time: 2019
+- Data format: audio: .wav; text: .txt
+- Audio duration: 1-60s
+- Data type: audio of English teachers' teaching
diff --git a/dataset/tal_cs/tal_cs.py b/dataset/tal_cs/tal_cs.py
new file mode 100644
index 000000000..2024b21e3
--- /dev/null
+++ b/dataset/tal_cs/tal_cs.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare TALCS ASR datasets.
+
+create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+""" +import argparse +import codecs +import io +import json +import os + +import soundfile + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + +TRAIN_SET = os.path.join(args.target_dir, "train_set") +DEV_SET = os.path.join(args.target_dir, "dev_set") +TEST_SET = os.path.join(args.target_dir, "test_set") + +manifest_train_path = os.path.join(args.manifest_prefix, "manifest.train.raw") +manifest_dev_path = os.path.join(args.manifest_prefix, "manifest.dev.raw") +manifest_test_path = os.path.join(args.manifest_prefix, "manifest.test.raw") + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + total_sec = 0.0 + total_char = 0.0 + total_num = 0 + wav_dir = os.path.join(data_dir, 'wav') + text_filepath = os.path.join(data_dir, 'label.txt') + for subfolder, _, filelist in sorted(os.walk(wav_dir)): + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + nchars = len(segments[1:]) + text = ' '.join(segments[1:]).lower() + + audio_filepath = os.path.abspath( + os.path.join(subfolder, segments[0] + '.wav')) + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + + json_lines.append( + json.dumps({ + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, + })) + + total_sec += duration + total_char += nchars + total_num += 1 + + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + subset = os.path.splitext(manifest_path)[1][1:] + manifest_dir = os.path.dirname(manifest_path) + data_dir_name = os.path.split(data_dir)[-1] + meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta' + with open(meta_path, 'w') as f: + print(f"{subset}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + create_manifest(TRAIN_SET, manifest_train_path) + create_manifest(DEV_SET, manifest_dev_path) + create_manifest(TEST_SET, manifest_test_path) + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 87c58b787..10a39e239 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -17,6 +17,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention 
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 87c58b787..10a39e239 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -17,6 +17,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) | python |
 [Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1) | python |
 [Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2) | python |
+[Conformer TALCS ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz) | TALCS Dataset | subword-based | 470 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0844 | 587 h | [Conformer TALCS ASR1](../../examples/tal_cs/asr1) | python |

 ### Self-Supervised Pre-trained Model
 Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions | CER | WER | Example Link |
@@ -29,7 +30,7 @@ Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions
 ### Whisper Model
 Demo Link | Training Data | Size | Descriptions | CER | Model
 :-----------: | :-----:| :-------: | :-----: | :-----: |:---------:|
-[Whisper](../../demos/whisper) | 680kh from internet | large: 5.8G,<br>medium: 2.9G,<br>small: 923M,<br>base: 277M,<br>tiny: 145M | Encoder:Transformer,<br>Decoder:Transformer,<br>Decoding method:<br>Greedy search | 2.7<br>(large, Librispeech) | [whisper-large](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-large-model.tar.gz)<br>[whisper-medium](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-model.tar.gz)<br>[whisper-medium-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-en-model.tar.gz)<br>[whisper-small](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-model.tar.gz)<br>[whisper-small-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-en-model.tar.gz)<br>[whisper-base](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-model.tar.gz)<br>[whisper-base-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-en-model.tar.gz)<br>[whisper-tiny](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-model.tar.gz)<br>[whisper-tiny-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-en-model.tar.gz)
+[Whisper](../../demos/whisper) | 680kh from internet | large: 5.8G,<br>medium: 2.9G,<br>small: 923M,<br>base: 277M,<br>tiny: 145M | Encoder:Transformer,<br>Decoder:Transformer,<br>Decoding method:<br>Greedy search | 0.027<br>(large, Librispeech) | [whisper-large](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-large-model.tar.gz)<br>[whisper-medium](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-model.tar.gz)<br>[whisper-medium-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-en-model.tar.gz)<br>[whisper-small](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-model.tar.gz)<br>[whisper-small-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-en-model.tar.gz)<br>[whisper-base](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-model.tar.gz)<br>[whisper-base-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-en-model.tar.gz)<br>[whisper-tiny](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-model.tar.gz)<br>[whisper-tiny-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-en-model.tar.gz)

 ### Language Model based on NGram
 |Language Model | Training Data | Token-based | Size | Descriptions|
diff --git a/examples/tal_cs/asr1/README.md b/examples/tal_cs/asr1/README.md
new file mode 100644
index 000000000..83a27ac1e
--- /dev/null
+++ b/examples/tal_cs/asr1/README.md
@@ -0,0 +1,190 @@
+# Transformer/Conformer ASR with TALCS
+This example contains code used to train a [u2](https://arxiv.org/pdf/2012.05481.pdf) model (Transformer or [Conformer](https://arxiv.org/pdf/2005.08100.pdf) model) with the [TALCS dataset](https://ai.100tal.com/dataset).
+## Overview
+All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
+| Stage | Function |
+|:---- |:----------------------------------------------------------- |
+| 0 | Process data. It includes:<br>(1) Download the dataset<br>(2) Calculate the CMVN of the train dataset<br>(3) Get the vocabulary file<br>(4) Get the manifest files of the train, development and test dataset<br>(5) Get the sentencepiece model |
+| 1 | Train the model |
+| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
+| 3 | Test the final model performance |
+| 4 | Get ctc alignment of test data using the final model |
+| 5 | Infer the single audio file |
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to only run one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+The path.sh contains the environment variables.
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. And another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It adds support for passing options as `--variable value` in the shell scripts.
+## The Local Variables
+Some local variables are set in `run.sh`.
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
+`stage` denotes the stage you want to start from in the experiments.
+`stop_stage` denotes the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single audio file you want to infer in stage 5.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "conformer"
+
+You can set the local variables (except `ckpt`) when you use `run.sh`
+
+For example, you can set the `gpus` and `avg_num` when you use the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 10
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line.
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev_set.meta
+|-- lang_char
+|   `-- bpe_bpe_11297.model
+|   `-- bpe_bpe_11297.vocab
+|   `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test_set.meta
+`-- train_set.meta
+```
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` is in the `../../../utils/` which is defined in the `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU).

+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+avg.sh best exp/conformer/checkpoints 10
+```
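Conceptually, top-k averaging is just an element-wise mean over the saved parameter tensors. A minimal sketch of the idea (illustrative only — the actual logic lives in the `avg.sh` helper under `utils/`, and the file names here are assumptions):

```python
import paddle

def average_checkpoints(ckpt_paths, out_path):
    """Element-wise mean over the parameter tensors of several checkpoints."""
    avg = None
    for path in ckpt_paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype("float64") for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype("float64")
    avg = {k: (v / len(ckpt_paths)).astype("float32") for k, v in avg.items()}
    paddle.save(avg, out_path)

# e.g. average the 10 best checkpoints into one final model:
# average_checkpoints(
#     [f"exp/conformer/checkpoints/{n}.pdparams" for n in best10],
#     "exp/conformer/checkpoints/avg_10.pdparams")
```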
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of the test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
+avg.sh best exp/conformer/checkpoints 10
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
+```
+## Pretrained Model
+You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the script to test the model.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz
+tar xzvf asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz
+source path.sh
+# If you have processed the data and obtained the manifest file, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
+```
+The performance of the released models is shown [here](./RESULTS.md).
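The WER/CER numbers reported in RESULTS.md are edit-distance based. A minimal reference implementation of word error rate (illustrative only; the recipe itself uses the project's built-in scorer):

```python
def edit_distance(ref, hyp):
    # Levenshtein distance with a rolling 1-D dynamic-programming table.
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1,         # deletion
                                     dp[j - 1] + 1,     # insertion
                                     prev + (r != h))   # substitution
    return dp[-1]

def wer(ref_sentence, hyp_sentence):
    ref, hyp = ref_sentence.split(), hyp_sentence.split()
    return edit_distance(ref, hyp) / max(1, len(ref))
```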
+## Stage 5: Single Audio File Inference
+In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 5. The code is shown below
+```bash
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test a single .wav file
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz
+tar xzvf asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
+```
diff --git a/examples/tal_cs/asr1/RESULTS.md b/examples/tal_cs/asr1/RESULTS.md
new file mode 100644
index 000000000..98607b7b3
--- /dev/null
+++ b/examples/tal_cs/asr1/RESULTS.md
@@ -0,0 +1,12 @@
+# TALCS
+2023.1.6, commit id: fa724285f3b799b97b4348ad3b1084afc0764f9b
+
+## Conformer
+train: Epoch 100, 3 V100-32G, best avg: 10
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention | 9.85091028213501 | 0.102786 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | ctc_greedy_search | 9.85091028213501 | 0.103538 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | ctc_prefix_beam_search | 9.85091028213501 | 0.103317 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention_rescoring | 9.85091028213501 | 0.084374 |
diff --git a/examples/tal_cs/asr1/conf/conformer.yaml b/examples/tal_cs/asr1/conf/conformer.yaml
new file mode 100644
index 000000000..25148d1ba
--- /dev/null
+++ b/examples/tal_cs/asr1/conf/conformer.yaml
@@ -0,0 +1,91 @@
+############################################
+#        Network Architecture              #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: need to convergence
+
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: 'data/lang_char/bpe_bpe_11297'
+unit_type: 'spm'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 20.0
+window_ms: 30.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 5
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+#             Training                    #
+###########################################
+n_epoch: 100
+accum_grad: 4
+global_grad_clip: 5.0
+dist_sampler: False
+optim: adam
+optim_conf:
+    lr: 0.002
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+log_interval: 100
+checkpoint:
+    kbest_n: 50
+    latest_n: 5
diff --git a/examples/tal_cs/asr1/conf/preprocess.yaml b/examples/tal_cs/asr1/conf/preprocess.yaml
new file mode 100644
index 000000000..c7ccc522d
--- /dev/null
+++ b/examples/tal_cs/asr1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: 1.0
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
diff --git a/examples/tal_cs/asr1/conf/tuning/chunk_decode.yaml b/examples/tal_cs/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 000000000..6945ed6eb
--- /dev/null
+++ b/examples/tal_cs/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,12 @@
+beam_size: 10
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
+decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
+    # <0: for decoding, use full chunk.
+    # >0: for decoding, use fixed chunk size as set.
+    # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: True # simulate streaming inference. Defaults to False.
+decode_batch_size: 128
+error_rate_type: cer
diff --git a/examples/tal_cs/asr1/conf/tuning/decode.yaml b/examples/tal_cs/asr1/conf/tuning/decode.yaml
new file mode 100644
index 000000000..22611176d
--- /dev/null
+++ b/examples/tal_cs/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,12 @@
+beam_size: 10
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+#reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+    # <0: for decoding, use full chunk.
+    # >0: for decoding, use fixed chunk size as set.
+    # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
+decode_batch_size: 1
+error_rate_type: cer
diff --git a/examples/tal_cs/asr1/local/data.sh b/examples/tal_cs/asr1/local/data.sh
new file mode 100644
index 000000000..7ea12809f
--- /dev/null
+++ b/examples/tal_cs/asr1/local/data.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+stage=-1
+stop_stage=100
+dict_dir=data/lang_char
+
+# bpemode (unigram or bpe)
+nbpe=11297
+bpemode=bpe
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
+
+stride_ms=20
+window_ms=30
+sample_rate=16000
+feat_dim=80
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+
+mkdir -p data
+mkdir -p ${dict_dir}
+TARGET_DIR=${MAIN_ROOT}/dataset
+mkdir -p ${TARGET_DIR}
+
+# prepare data
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    if [ ! -d "${MAIN_ROOT}/dataset/tal_cs/TALCS_corpus" ]; then
+        echo "${MAIN_ROOT}/dataset/tal_cs/TALCS_corpus does not exist. Please download tal_cs data and unpack it from https://ai.100tal.com/dataset first."
+        echo "data md5 reference: 4c879b3c9c05365fc9dee1fc68713afe"
+        exit
+    fi
+    # create manifest json file from TALCS_corpus
+    python ${MAIN_ROOT}/dataset/tal_cs/tal_cs.py --target_dir ${MAIN_ROOT}/dataset/tal_cs/TALCS_corpus/ --manifest_prefix data/
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --num_samples=-1 \
+    --spectrum_type="fbank" \
+    --feat_dim=${feat_dim} \
+    --delta_delta=false \
+    --sample_rate=${sample_rate} \
+    --stride_ms=${stride_ms} \
+    --window_ms=${window_ms} \
+    --use_dB_normalization=False \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+    echo "compute mean and stddev done."
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # use train_set to build the dict
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type 'spm' \
+    --count_threshold=0 \
+    --vocab_path="${dict_dir}/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw" \
+    --spm_mode=${bpemode} \
+    --spm_vocab_size=${nbpe} \
+    --spm_model_prefix=${bpeprefix} \
+    --spm_character_coverage=1
+    echo "build dict done."
+fi
+
+# use the new dict to format data
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for sub in train dev test ; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "spm" \
+        --spm_model_prefix ${bpeprefix} \
+        --vocab_path="${dict_dir}/vocab.txt" \
+        --manifest_path="data/manifest.${sub}.raw" \
+        --output_path="data/manifest.${sub}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    }&
+    done
+    wait
+    echo "format data done."
+fi
diff --git a/examples/tal_cs/asr1/local/test.sh b/examples/tal_cs/asr1/local/test.sh
new file mode 100755
index 000000000..65b884e51
--- /dev/null
+++ b/examples/tal_cs/asr1/local/test.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+
+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    if [ ${chunk_mode} == true ];then
+        # stream decoding only support batchsize=1
+        batch_size=1
+    else
+        batch_size=64
+    fi
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+exit 0
diff --git a/examples/tal_cs/asr1/local/test_wav.sh b/examples/tal_cs/asr1/local/test_wav.sh
new file mode 100755
index 000000000..d029f2fde
--- /dev/null
+++ b/examples/tal_cs/asr1/local/test_wav.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+if [ ! -f ${audio_file} ]; then
+    echo "Please input the right audio_file path"
+    exit 1
+fi
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+for type in attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test_wav.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
+    --audio_file ${audio_file}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+exit 0
diff --git a/examples/tal_cs/asr1/local/train.sh b/examples/tal_cs/asr1/local/train.sh
new file mode 100755
index 000000000..bfa8dd97d
--- /dev/null
+++ b/examples/tal_cs/asr1/local/train.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+profiler_options=
+benchmark_batch_size=0
+benchmark_max_step=0
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
+fi
+
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
+    exit -1
+fi
+
+config_path=$1
+ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+  ips_config=
+else
+  ips_config="--ips="${ips}
+fi
+echo ${ips_config}
+
+mkdir -p exp
+
+# the default memory allocator strategy may cause gpu training to hang,
+# because no OOM is raised when memory is exhausted
+export FLAGS_allocator_strategy=naive_best_fit
+
+if [ ${ngpu} == 0 ]; then
+python3 -u ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--seed ${seed} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--profiler-options "${profiler_options}" \
+--benchmark-batch-size ${benchmark_batch_size} \
+--benchmark-max-step ${benchmark_max_step}
+else
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--seed ${seed} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--profiler-options "${profiler_options}" \
+--benchmark-batch-size ${benchmark_batch_size} \
+--benchmark-max-step ${benchmark_max_step}
+fi
+
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/tal_cs/asr1/path.sh b/examples/tal_cs/asr1/path.sh
new file mode 100755
index 000000000..666b29bce
--- /dev/null
+++ b/examples/tal_cs/asr1/path.sh
@@ -0,0 +1,15 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+# model exp
+MODEL=u2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
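Among the decoding methods exercised by `local/test.sh` above, `ctc_greedy_search` is the simplest: pick the best token per frame, collapse repeats, and drop blanks. A minimal sketch (illustrative only, not the project's implementation):

```python
import numpy as np

def ctc_greedy_search(log_probs, blank_id=0):
    # log_probs: (T, vocab_size) per-frame log-probabilities from the CTC head.
    best = log_probs.argmax(axis=-1)      # best token id per frame
    hyp, prev = [], blank_id
    for t in best:
        if t != blank_id and t != prev:   # collapse repeats, drop blanks
            hyp.append(int(t))
        prev = t
    return hyp
```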
diff --git a/examples/tal_cs/asr1/run.sh b/examples/tal_cs/asr1/run.sh
new file mode 100644
index 000000000..120d69fce
--- /dev/null
+++ b/examples/tal_cs/asr1/run.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+source path.sh || exit 1;
+set -e
+
+gpus=0,1,2,3
+stage=0
+stop_stage=50
+conf_path=conf/conformer.yaml
+ips=            #xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx
+decode_conf_path=conf/tuning/decode.yaml
+average_checkpoint=true
+avg_num=10
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+audio_file="data/demo_01_03.wav"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # test a single .wav file
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+fi
+
+# Not supported yet!!!
+if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
\ No newline at end of file
diff --git a/examples/tal_cs/asr1/utils b/examples/tal_cs/asr1/utils
new file mode 120000
index 000000000..973afe674
--- /dev/null
+++ b/examples/tal_cs/asr1/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file

From 2c4c141de5b8bf7335370f955559d1492adf42c2 Mon Sep 17 00:00:00 2001
From: YangZhou <56786796+SmileGoat@users.noreply.github.com>
Date: Tue, 10 Jan 2023 14:35:41 +0800
Subject: [PATCH 018/159] [audio] fix load paddleaudio fail (#2815)

* fix paddleaudio import fail
---
 audio/paddleaudio/_internal/module_utils.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py
index 7b3230de9..becd23cd8 100644
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None):

 def is_kaldi_available():
-    return is_module_available("paddleaudio._paddleaudio")
-
+    try:
+        from paddleaudio import _paddleaudio
+        return True
+    except Exception:
+        return False

 def requires_kaldi():
     if is_kaldi_available():
@@ -128,9 +131,11 @@ def requires_soundfile():

 def is_sox_available():
-    if platform.system() == "Windows":  # not support sox in windows
+    try:
+        from paddleaudio import _paddleaudio
+        return True
+    except Exception:
         return False
-    return is_module_available("paddleaudio._paddleaudio")

 def requires_sox():

From 88fe26f17ca4a35d007bb934cbe96550b1592508 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Tue, 10 Jan 2023 19:46:39 +0800
Subject: [PATCH 019/159] [ASR] add asr code-switch cli and demo, test='asr'
 (#2816)

* add asr code-switch cli and demo.
* fix some model named problem.
---
 README.md                                  |  2 ++
 README_cn.md                               |  2 ++
 demos/speech_recognition/README.md         | 28 +++++++++--------
 demos/speech_recognition/README_cn.md      | 29 ++++++++++--------
 demos/speech_recognition/run.sh            |  6 ++++
 paddlespeech/cli/asr/infer.py              | 28 ++++++++++++-----
 paddlespeech/cli/base_commands.py          | 19 ++++++++++--
 paddlespeech/resource/pretrained_models.py | 13 ++++++++
 .../server/bin/paddlespeech_server.py      | 30 ++++++++++++++-----
 tests/unit/cli/test_cli.sh                 |  3 +-
 10 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index dbdf6a4f8..2fb773634 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).

 ### Recent Update
+- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
 - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
 - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
diff --git a/README_cn.md b/README_cn.md
index 5cc156c9f..53f6a66e4 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -164,6 +164,8 @@

 ### 近期更新
+- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
+- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index c815a88af..ee2acd6fd 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```

 ### 3. Usage
 ```bash
   # Chinese
   paddlespeech asr --input ./zh.wav -v
   # English
   paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+  # Code-Switch
+  paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
   # Chinese ASR + Punctuation Restoration
   paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
 ```
@@ -40,6 +42,7 @@
   - `input`(required): Audio file to recognize.
   - `model`: Model type of asr task. Default: `conformer_wenetspeech`.
   - `lang`: Model language. Default: `zh`.
+  - `codeswitch`: Whether to use the code-switch model. Default: `False`
   - `sample_rate`: Sample rate of the model. Default: `16000`.
   - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
   - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| Model | Code Switch | Language | Sample Rate +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 13aa9f277..62dce3bc9 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -1,4 +1,5 @@ (简体中文|[English](./README.md)) + (简体中文|[English](./README.md)) # 语音识别 ## 介绍 @@ -16,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e96..8ba6e4c3e 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 004143361..7a7aef8b0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,6 +25,9 @@ import librosa import numpy as np import paddle import soundfile +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -34,9 +37,6 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ 
-62,8 +62,13 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]' ) + self.parser.add_argument( + '--codeswitch', + type=bool, + default=False, + help='Choose whether use code-switch. True or False.') self.parser.add_argument( "--sample_rate", type=int, @@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor): def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(tag, version=None) self.res_path = self.task_resource.res_dir @@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang + codeswitch = parser_args.codeswitch sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path @@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor): audio_file=input_, model=model, lang=lang, + codeswitch=codeswitch, sample_rate=sample_rate, config=config, ckpt_path=ckpt_path, @@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor): audio_file: os.PathLike, model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, @@ -485,8 +499,8 @@ class ASRExecutor(BaseExecutor): """ audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, - num_decoding_left_chunks, ckpt_path) + self._init_from_path(model, lang, codeswitch, sample_rate, config, + decode_method, num_decoding_left_chunks, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 767d0df78..dfeb5cae5 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -14,6 +14,7 @@ import argparse from typing import List +import numpy from prettytable import PrettyTable from ..resource import CommonTaskResource @@ -78,7 +79,7 @@ class VersionCommand: model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', @@ -111,7 +112,21 @@ class StatsCommand: fields = model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif 
"multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) + print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 3c5aa1f90..ff0b30f6d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -30,6 +30,7 @@ __all__ = [ ] # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". +# Add code-switch and multilingual tag, "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]". # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" @@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = { '099a601759d467cd0a8523ff939819c5' }, }, + "conformer_talcs-codeswitch_zh_en-16k": { + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz', + 'md5': + '01962c5d0a70878fe41cacd4f61e14d1', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_10' + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 1b1792bd1..299a8c3d4 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,14 +16,9 @@ import sys import warnings from typing import List +import numpy import uvicorn from fastapi import FastAPI -from prettytable import PrettyTable -from starlette.middleware.cors import CORSMiddleware - -from ..executor import BaseExecutor -from ..util import cli_server_register -from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.server.engine.engine_pool import init_engine_pool @@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up from paddlespeech.server.restful.api import setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] @@ -134,7 +135,7 @@ class ServerStatsExecutor(): required=True) self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'tts': 'Model-Language', 'cls': 'Model-Sample Rate', 'text': 'Model-Task-Language', @@ -145,7 +146,20 @@ class ServerStatsExecutor(): fields = self.model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + 
"_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) print(table) def execute(self, argv: List[str]) -> bool: diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3a58626d2..5d3b76f6c 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav @@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav +paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav # Support editing num_decoding_left_chunks paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav From faa2f866516e1e1afb40b25df907ebe3078bd078 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Wed, 11 Jan 2023 12:59:41 +0800 Subject: [PATCH 020/159] [TTS]update VITS init method (#2809) --- paddlespeech/t2s/models/vits/text_encoder.py | 13 +- paddlespeech/t2s/models/vits/vits.py | 55 +++- paddlespeech/utils/initialize.py | 321 +++++++++++++++++++ 3 files changed, 375 insertions(+), 14 deletions(-) create mode 100644 paddlespeech/utils/initialize.py diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 799e0c759..015ed76c6 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -24,6 +24,7 @@ from paddle import nn from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder +from paddlespeech.utils.initialize import normal_ class TextEncoder(nn.Layer): @@ -105,10 +106,6 @@ class TextEncoder(nn.Layer): # define modules self.emb = nn.Embedding(vocabs, attention_dim) - dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5) - w = dist.sample(self.emb.weight.shape) - self.emb.weight.set_value(w) - self.encoder = Encoder( idim=-1, input_layer=None, @@ -130,6 +127,8 @@ class TextEncoder(nn.Layer): cnn_module_kernel=conformer_kernel_size, ) self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1) + self.reset_parameters() + def forward( self, x: paddle.Tensor, @@ -166,3 +165,9 @@ class TextEncoder(nn.Layer): m, logs = paddle.split(stats, 2, axis=1) return x, m, logs, x_mask + + def reset_parameters(self): + normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5) + if self.emb._padding_idx is not None: + with paddle.no_grad(): + self.emb.weight[self.emb._padding_idx] = 0 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 0ff3a546d..e68ed5643 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -13,6 +13,7 @@ # limitations under 
the License. # Modified from espnet(https://github.com/espnet/espnet) """VITS module""" +import math from typing import Any from typing import Dict from typing import Optional @@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator from paddlespeech.t2s.models.vits.generator import VITSGenerator -from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import normal_ +from paddlespeech.utils.initialize import ones_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ AVAILABLE_GENERATERS = { "vits_generator": VITSGenerator, @@ -152,8 +158,7 @@ class VITS(nn.Layer): "use_spectral_norm": False, }, }, - cache_generator_outputs: bool=True, - init_type: str="xavier_uniform", ): + cache_generator_outputs: bool=True, ): """Initialize VITS module. Args: idim (int): @@ -179,9 +184,6 @@ class VITS(nn.Layer): assert check_argument_types() super().__init__() - # initialize parameters - initialize(self, init_type) - # define modules generator_class = AVAILABLE_GENERATERS[generator_type] if generator_type == "vits_generator": @@ -196,8 +198,6 @@ class VITS(nn.Layer): self.discriminator = discriminator_class( **discriminator_params, ) - nn.initializer.set_global_initializer(None) - # cache self.cache_generator_outputs = cache_generator_outputs self._cache = None @@ -214,6 +214,10 @@ class VITS(nn.Layer): self.reuse_cache_gen = True self.reuse_cache_dis = True + self.reset_parameters() + self.generator.decoder.reset_parameters() + self.generator.text_encoder.reset_parameters() + def forward( self, text: paddle.Tensor, @@ -243,7 +247,7 @@ class VITS(nn.Layer): forward_generator (bool): Whether to forward generator. Returns: - + """ if forward_generator: return self._forward_generator( @@ -290,7 +294,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). 
Returns: - + """ # setup feats = feats.transpose([0, 2, 1]) @@ -497,3 +501,34 @@ class VITS(nn.Layer): lids, ) return dict(wav=paddle.reshape(wav, [-1])) + + def reset_parameters(self): + def _reset_parameters(module): + if isinstance(module, + (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(module.bias, -bound, bound) + + if isinstance(module, + (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): + ones_(module.weight) + zeros_(module.bias) + + if isinstance(module, nn.Linear): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(module.bias, -bound, bound) + + if isinstance(module, nn.Embedding): + normal_(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + + self.apply(_reset_parameters) diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py new file mode 100644 index 000000000..8ebe6845e --- /dev/null +++ b/paddlespeech/utils/initialize.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", + "_calculate_fan_in_and_fan_out", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. 
+ Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. + Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", + "conv_transpose2d", "conv_transpose3d" + ] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) From ad40dafa856b9c4539e7b9f82bad2d9ff8c317f4 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 12 Jan 2023 10:23:56 +0800 Subject: [PATCH 021/159] fix some bug. 
(#2825) --- paddlespeech/s2t/models/whisper/tokenizer.py | 4 ++++ paddlespeech/s2t/models/whisper/whipser.py | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 8bd85c914..1e1aea044 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -155,6 +155,10 @@ class Tokenizer: if ids < len(self.tokenizer): ids_list.append(ids) token_ids = ids_list + elif len(token_ids) == 1: + token_ids = token_ids[0] + else: + raise ValueError(f"token_ids {token_ids} load error.") return self.tokenizer.decode(token_ids, **kwargs) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 63cafbdb7..81692f37a 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -17,12 +17,11 @@ from typing import Union import numpy as np import paddle import paddle.nn.functional as F +import paddlespeech.s2t.modules.align as paddlespeech_nn import soundfile import tqdm from paddle import nn from paddle.distribution import Categorical - -import paddlespeech.s2t.modules.align as paddlespeech_nn from paddlespeech.s2t.models.whisper import utils from paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES @@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder): if temperature == 0: next_tokens = paddle.argmax(logits, axis=-1) else: - next_tokens = Categorical(logits=logits / temperature).sample( - shape=logits.shape) + next_tokens = Categorical(logits=logits / temperature).sample([1]) + next_tokens = paddle.reshape(next_tokens, [ + next_tokens.shape[0] * next_tokens.shape[1], + ]) logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), @@ -1205,9 +1206,8 @@ class DecodingTask: DecodingResult( audio_features=features, language=language, - language_probs=probs) - for features, language, probs in zip(audio_features, languages, - language_probs) + language_probs=probs) for features, language, probs in + zip(audio_features, languages, language_probs) ] # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling From a99244d86e56a0d796f04919b4e6493b6d4d22a6 Mon Sep 17 00:00:00 2001 From: cxumol Date: Wed, 11 Jan 2023 22:04:10 -0800 Subject: [PATCH 022/159] fix: whisper language choice, test=asr (#2828) --- paddlespeech/s2t/models/whisper/whipser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 81692f37a..9cf9a9eca 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -476,7 +476,7 @@ def transcribe( decode_options["fp16"] = False if decode_options.get( - "language", 'None') or decode_options.get("language", None) is None: + "language") == 'None' or decode_options.get("language", None) is None: if not model.is_multilingual: decode_options["language"] = "en" else: From 742523fb38f521aaa93431658a7eb2042b2bad81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=8B=E4=B8=9C=E6=AF=85?= Date: Fri, 13 Jan 2023 15:45:49 +0800 Subject: [PATCH 023/159] [tts]For mixed Chinese and English speech synthesis, add SSML support for Chinese (#2830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
添加.history * [tts]添加中英混合语音合成时对中文SSML的支持 --- .gitignore | 1 + paddlespeech/t2s/frontend/mix_frontend.py | 57 ++++++++++++++++++++--- paddlespeech/t2s/ssml/xml_processor.py | 34 ++++++++++++++ 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 75f56b604..4a0c43312 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 19c98d53f..c13a5ab62 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Dict from typing import List @@ -18,6 +19,7 @@ import paddle from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor class MixFrontend(): @@ -107,7 +109,40 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - segments = self.get_segment(sentence) + ''' 1. 添加SSML支持,先列出 文字 和 标签内容, + 然后添加到tmpSegments数组里 + ''' + d_inputs = MixTextProcessor.get_dom_split(sentence) + tmpSegments = [] + for instr in d_inputs: + ''' 暂时只支持 say-as ''' + if instr.lower().startswith("" + segments.append(tuple(currentSeg)) + segments.append(seg) + currentSeg = ["", ""] + else: + if currentSeg[0] == '': + currentSeg[0] = seg[0] + currentSeg[1] = seg[1] + else: + currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + currentSeg[0] = "" + currentSeg[0] + "" + segments.append(tuple(currentSeg)) phones_list = [] result = {} @@ -120,11 +155,21 @@ class MixFrontend(): input_ids = self.en_frontend.get_input_ids( content, merge_sentences=False, to_tensor=to_tensor) else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + ''' 3. 
把带speak tag的中文和普通文字分开处理 + ''' + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) if add_sp: input_ids["phone_ids"][-1] = paddle.concat( [input_ids["phone_ids"][-1], self.sp_id_tensor]) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index b39121347..892ca371e 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -74,6 +74,28 @@ class MixTextProcessor(): ctlist.append([mixstr, []]) return ctlist + @classmethod + def get_dom_split(self, mixstr): + ''' 文本分解,顺序加了列表中,返回文本和say-as标签 + ''' + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + dom = DomXml(in_xml) + tags = dom.get_text_and_sayas_tags() + ctlist.extend(tags) + + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist class DomXml(): def __init__(self, xmlstr): @@ -156,3 +178,15 @@ class DomXml(): if x.hasAttribute('pinyin'): # pinyin print(x.tagName, 'pinyin', x.getAttribute('pinyin'), x.firstChild.data) + + def get_text_and_sayas_tags(self): + '''返回 xml 内容的列表,包括所有文本内容和 tag''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + res.append(x2.toxml()) + return res From 1fd38c0e8b5937a5e9a1fd576e35c610b7b181a0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 13 Jan 2023 17:40:47 +0800 Subject: [PATCH 024/159] fix o (#2831) --- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 47c26a610..3ce3d246d 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听' + '肖', '瘙', '誒', '泊', '听', '噢' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ From 57b9d4bca4c897835a52a8f6a2f9ee04ddc4b402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Fri, 13 Jan 2023 20:42:23 +0800 Subject: [PATCH 025/159] add diffusion module for training diffsinger (#2832) --- docs/requirements.txt | 1 + paddlespeech/t2s/modules/diffusion.py | 467 ++++++++++++++++++++++++++ setup.py | 1 + 3 files changed, 469 insertions(+) create mode 100644 paddlespeech/t2s/modules/diffusion.py diff --git a/docs/requirements.txt b/docs/requirements.txt index bd7f40ec3..c6228d917 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -27,6 +27,7 @@ pandas pathos==0.2.8 pattern_singleton Pillow>=9.0.0 +ppdiffusers>=0.9.0 praatio==5.0.0 prettytable pypinyin-dict diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py new file mode 100644 index 000000000..52fe84ceb --- /dev/null +++ b/paddlespeech/t2s/modules/diffusion.py @@ -0,0 +1,467 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Diffusion denoising related modules for paddle""" +import math +from typing import Callable +from typing import Optional +from typing import Tuple + +import paddle +import ppdiffusers +from paddle import nn +from ppdiffusers.models.embeddings import Timesteps +from ppdiffusers.schedulers import DDPMScheduler + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock + + +class WaveNetDenoiser(nn.Layer): + """A Mel-Spectrogram Denoiser modified from WaveNet + + Args: + in_channels (int, optional): + Number of channels of the input mel-spectrogram, by default 80 + out_channels (int, optional): + Number of channels of the output mel-spectrogram, by default 80 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 20 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 4 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): + Residual channel of the residual blocks, by default 256 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 512 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 256 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 256 + dropout (float, optional): + Dropout of the residual blocks, by default 0. 
+ bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default False + """ + + def __init__( + self, + in_channels: int=80, + out_channels: int=80, + kernel_size: int=3, + layers: int=20, + stacks: int=4, + residual_channels: int=256, + gate_channels: int=512, + skip_channels: int=256, + aux_channels: int=256, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=False, + init_type: str="kaiming_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_t_emb = nn.Sequential( + Timesteps( + residual_channels, + flip_sin_to_cos=False, + downscale_freq_shift=1), + nn.Linear(residual_channels, residual_channels * 4), + nn.Mish(), nn.Linear(residual_channels * 4, residual_channels)) + self.t_emb_layers = nn.LayerList([ + nn.Linear(residual_channels, residual_channels) + for _ in range(layers) + ]) + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.first_act = nn.ReLU() + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = WaveNetResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, + out_channels, + 1, + bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, t, c): + """Denoise mel-spectrogram. + + Args: + x(Tensor): + Shape (N, C_in, T), The input mel-spectrogram. + t(Tensor): + Shape (N), The timestep input. + c(Tensor): + Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output). + + Returns: + Tensor: Shape (N, C_out, T), the denoised mel-spectrogram. + """ + assert c.shape[-1] == x.shape[-1] + + if t.shape[0] != x.shape[0]: + t = t.tile([x.shape[0]]) + t_emb = self.first_t_emb(t) + t_embs = [ + t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers + ] + + x = self.first_conv(x) + x = self.first_act(x) + skips = 0 + for f, t in zip(self.conv_layers, t_embs): + x = x + t + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class GaussianDiffusion(nn.Layer): + """Common Gaussian Diffusion Denoising Model Module + + Args: + denoiser (Layer, optional): + The model used for denoising noises. 
+ In fact, the denoiser model performs the operation + of producing a output with more noises from the noisy input. + Then we use the diffusion algorithm to calculate + the input with the output to get the denoised result. + num_train_timesteps (int, optional): + The number of timesteps between the noise and the real during training, by default 1000. + beta_start (float, optional): + beta start parameter for the scheduler, by default 0.0001. + beta_end (float, optional): + beta end parameter for the scheduler, by default 0.0001. + beta_schedule (str, optional): + beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule). + num_max_timesteps (int, optional): + The max timestep transition from real to noise, by default None. + + Examples: + >>> import paddle + >>> import paddle.nn.functional as F + >>> from tqdm import tqdm + >>> + >>> denoiser = WaveNetDenoiser() + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=1000, num_max_timesteps=100) + >>> x = paddle.ones([4, 80, 192]) # [B, mel_ch, T] # real mel input + >>> c = paddle.randn([4, 256, 192]) # [B, fs2_encoder_out_ch, T] # fastspeech2 encoder output + >>> loss = F.mse_loss(*diffusion(x, c)) + >>> loss.backward() + >>> print('MSE Loss:', loss.item()) + MSE Loss: 1.6669728755950928 + >>> def create_progress_callback(): + >>> pbar = None + >>> def callback(index, timestep, num_timesteps, sample): + >>> nonlocal pbar + >>> if pbar is None: + >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar.update() + >>> + >>> return callback + >>> + >>> # ds=1000, K_step=60, scheduler=ddpm, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 1000 + >>> K_step = 60 + >>> scheduler_type = 'ddpm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 60/60 [00:03<00:00, 18.36it/s] + >>> + >>> # ds=100, K_step=100, scheduler=ddpm, from gaussian noise + >>> ds = 100 + >>> infer_steps = 100 + >>> K_step = 100 + >>> scheduler_type = 'ddpm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x_in, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 100/100 [00:05<00:00, 18.29it/s] + >>> + >>> # ds=1000, K_step=1000, scheduler=pndm, infer_step=25, from gaussian noise + >>> ds = 1000 + >>> infer_steps = 25 + >>> K_step = 1000 + >>> scheduler_type = 'pndm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, None, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + >>> + >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 50 + >>> K_step = 100 + >>> scheduler_type = 'pndm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> 
paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + + """ + + def __init__(self, + denoiser: nn.Layer, + num_train_timesteps: Optional[int]=1000, + beta_start: Optional[float]=0.0001, + beta_end: Optional[float]=0.02, + beta_schedule: Optional[str]="squaredcos_cap_v2", + num_max_timesteps: Optional[int]=None): + super().__init__() + + self.num_train_timesteps = num_train_timesteps + self.beta_start = beta_start + self.beta_end = beta_end + self.beta_schedule = beta_schedule + + self.denoiser = denoiser + self.noise_scheduler = DDPMScheduler( + num_train_timesteps=num_train_timesteps, + beta_start=beta_start, + beta_end=beta_end, + beta_schedule=beta_schedule) + self.num_max_timesteps = num_max_timesteps + + def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Generate random timesteps noised x. + + Args: + x (Tensor): + The input for adding noises. + cond (Tensor, optional): + Conditional input for compute noises. + + Returns: + y (Tensor): + The output with noises added in. + target (Tensor): + The noises which is added to the input. + + """ + noise_scheduler = self.noise_scheduler + + # Sample noise that we'll add to the mel-spectrograms + target = noise = paddle.randn(x.shape) + + # Sample a random timestep for each mel-spectrogram + num_timesteps = self.num_train_timesteps + if self.num_max_timesteps is not None: + num_timesteps = self.num_max_timesteps + timesteps = paddle.randint(0, num_timesteps, (x.shape[0], )) + + # Add noise to the clean mel-spectrograms according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.add_noise(x, noise, timesteps) + + y = self.denoiser(noisy_images, timesteps, cond) + + # then compute loss use output y and noisy target for prediction_type == "epsilon" + return y, target + + def inference(self, + noise: paddle.Tensor, + cond: Optional[paddle.Tensor]=None, + ref_x: Optional[paddle.Tensor]=None, + num_inference_steps: Optional[int]=1000, + strength: Optional[float]=None, + scheduler_type: Optional[str]="ddpm", + callback: Optional[Callable[[int, int, int, paddle.Tensor], + None]]=None, + callback_steps: Optional[int]=1): + """Denoising input from noises. Refer to ppdiffusers img2img pipeline. + + Args: + noise (Tensor): + The input tensor as a starting point for denoising. + cond (Tensor, optional): + Conditional input for compute noises. + ref_x (Tensor, optional): + The real output for the denoising process to refer. + num_inference_steps (int, optional): + The number of timesteps between the noise and the real during inference, by default 1000. + strength (float, optional): + Mixing strength of ref_x with noise. The larger the value, the stronger the noise. + Range [0,1], by default None. + scheduler_type (str, optional): + Noise scheduler for generate noises. + Choose a great scheduler can skip many denoising step, by default 'ddpm'. + callback (Callable[[int,int,int,Tensor], None], optional): + Callback function during denoising steps. + + Args: + index (int): + Current denoising index. + timestep (int): + Current denoising timestep. + num_timesteps (int): + Number of the denoising timesteps. + denoised_output (Tensor): + Current intermediate result produced during denoising. + + callback_steps (int, optional): + The step to call the callback function. 
+ + Returns: + denoised_output (Tensor): + The denoised output tensor. + + """ + scheduler_cls = None + for clsname in dir(ppdiffusers.schedulers): + if clsname.lower() == scheduler_type + "scheduler": + scheduler_cls = getattr(ppdiffusers.schedulers, clsname) + break + + if scheduler_cls is None: + raise ValueError(f"No such scheduler type named {scheduler_type}") + + scheduler = scheduler_cls( + num_train_timesteps=self.num_train_timesteps, + beta_start=self.beta_start, + beta_end=self.beta_end, + beta_schedule=self.beta_schedule) + + # set timesteps + scheduler.set_timesteps(num_inference_steps) + + # prepare first noise variables + noisy_input = noise + timesteps = scheduler.timesteps + if ref_x is not None: + init_timestep = None + if strength is None or strength < 0. or strength > 1.: + strength = None + if self.num_max_timesteps is not None: + strength = self.num_max_timesteps / self.num_train_timesteps + if strength is not None: + # get the original timestep using init_timestep + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + num_inference_steps = num_inference_steps - t_start + noisy_input = scheduler.add_noise( + ref_x, noise, timesteps[:1].tile([noise.shape[0]])) + + # denoising loop + denoised_output = noisy_input + num_warmup_steps = len( + timesteps) - num_inference_steps * scheduler.order + for i, t in enumerate(timesteps): + denoised_output = scheduler.scale_model_input(denoised_output, t) + + # predict the noise residual + noise_pred = self.denoiser(denoised_output, t, cond) + + # compute the previous noisy sample x_t -> x_t-1 + denoised_output = scheduler.step(noise_pred, t, + denoised_output).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and + (i + 1) % scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, len(timesteps), denoised_output) + + return denoised_output diff --git a/setup.py b/setup.py index 3bde2b205..212d3b109 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ base = [ "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", + "ppdiffusers>=0.9.0", "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", From 2f3ca4ac4809767008f89b0ab24846b2f5e0b983 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 17 Jan 2023 13:55:18 +0800 Subject: [PATCH 026/159] Update README.md (#2840) * Update README.md * Update README_cn.md --- README.md | 4 ++-- README_cn.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2fb773634..40064f5d2 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). 
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website diff --git a/README_cn.md b/README_cn.md index 53f6a66e4..d2e5f63d7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,8 +164,8 @@ ### 近期更新 -- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验! From 478fd2593e215a65ec641bc1ba831e53d9da7d4b Mon Sep 17 00:00:00 2001 From: Ming Date: Tue, 17 Jan 2023 17:40:15 +0800 Subject: [PATCH 027/159] update QR Code in README, test=doc (#2841) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40064f5d2..afc4e4d09 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
- +
## Installation diff --git a/README_cn.md b/README_cn.md index d2e5f63d7..ecc4644aa 100644 --- a/README_cn.md +++ b/README_cn.md @@ -202,7 +202,7 @@ 微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
From 140aed4b545885cdb9a13117e9d1a009466c44ac Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Thu, 19 Jan 2023 16:04:03 +0800 Subject: [PATCH 028/159] [TTS]VITS init sampler reverse, test=tts (#2843) --- paddlespeech/t2s/exps/vits/normalize.py | 2 +- paddlespeech/t2s/exps/vits/preprocess.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 514cbef8e..24e15765e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('feats_lengths')) + output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 2b1a40834..d6b226a20 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("feats_lengths")) + results.sort(key=itemgetter("feats_lengths"), reverse=True) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index 07301db56..f6a31ced2 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -110,7 +110,7 @@ def train_sp(args, config): train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, - shuffle=True, + shuffle=False, drop_last=True) dev_sampler = ErnieSATSampler( dev_dataset, From 2b01e4052559b5c0e1a7d47f4eb1e340a5a1bf1d Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 13:33:38 +0800 Subject: [PATCH 029/159] =?UTF-8?q?[TTS]soft=20link=20for=20shell=20in=20e?= =?UTF-8?q?xample,=20add=20skip=5Fcopy=5Fwave=20in=20norm=20stage=20of=20G?= =?UTF-8?q?=E2=80=A6=20(#2851)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit soft link for shell in example, add skip_copy_wave in norm stage of GANVocoders to save disk --- examples/aishell3/tts3/path.sh | 14 +--- examples/aishell3/vc0/path.sh | 14 +--- examples/aishell3/vc1/local/train.sh | 14 +--- examples/aishell3/vc1/path.sh | 14 +--- examples/aishell3/vc2/local/synthesize.sh | 21 +----- examples/aishell3/vc2/local/train.sh | 14 +--- examples/aishell3/vc2/path.sh | 14 +--- examples/aishell3/voc1/local/preprocess.sh | 10 ++- examples/aishell3/voc1/local/synthesize.sh | 15 +---- examples/aishell3/voc1/local/train.sh | 14 +--- examples/aishell3/voc1/path.sh | 14 +--- examples/aishell3/voc5/local/preprocess.sh | 56 +--------------- examples/aishell3/voc5/local/synthesize.sh | 15 +---- examples/aishell3/voc5/local/train.sh | 14 +--- examples/aishell3/voc5/path.sh | 14 +--- .../ernie_sat/local/synthesize.sh | 26 +------- .../aishell3_vctk/ernie_sat/local/train.sh | 13 +--- examples/aishell3_vctk/ernie_sat/path.sh | 14 +--- examples/csmsc/voc1/local/preprocess.sh | 10 ++- examples/csmsc/voc3/finetune.sh | 65 +------------------ examples/csmsc/voc3/local/preprocess.sh | 56 +--------------- examples/csmsc/voc3/local/train.sh | 14 +--- 
examples/csmsc/voc4/local/preprocess.sh | 56 +--------------- examples/csmsc/voc4/local/train.sh | 14 +--- examples/csmsc/voc5/finetune.sh | 9 ++- examples/csmsc/voc5/local/preprocess.sh | 56 +--------------- examples/csmsc/voc5/local/train.sh | 14 +--- examples/csmsc/voc6/local/preprocess.sh | 10 ++- examples/csmsc/voc6/local/train.sh | 14 +--- examples/ljspeech/tts0/local/train.sh | 13 +--- examples/ljspeech/tts0/path.sh | 14 +--- examples/ljspeech/tts3/local/train.sh | 13 +--- examples/ljspeech/tts3/path.sh | 14 +--- examples/ljspeech/voc1/local/preprocess.sh | 10 ++- examples/ljspeech/voc1/local/synthesize.sh | 15 +---- examples/ljspeech/voc1/local/train.sh | 14 +--- examples/ljspeech/voc1/path.sh | 14 +--- examples/ljspeech/voc5/local/preprocess.sh | 56 +--------------- examples/ljspeech/voc5/local/synthesize.sh | 15 +---- examples/ljspeech/voc5/local/train.sh | 14 +--- examples/ljspeech/voc5/path.sh | 14 +--- examples/vctk/ernie_sat/local/train.sh | 13 +--- examples/vctk/ernie_sat/path.sh | 14 +--- examples/vctk/tts3/local/train.sh | 14 +--- examples/vctk/tts3/path.sh | 14 +--- examples/vctk/voc1/local/preprocess.sh | 10 ++- examples/vctk/voc1/local/synthesize.sh | 15 +---- examples/vctk/voc1/local/train.sh | 14 +--- examples/vctk/voc1/path.sh | 14 +--- examples/vctk/voc5/local/preprocess.sh | 56 +--------------- examples/vctk/voc5/local/synthesize.sh | 15 +---- examples/vctk/voc5/local/train.sh | 14 +--- examples/vctk/voc5/path.sh | 14 +--- examples/zh_en_tts/tts3/local/train.sh | 14 +--- examples/zh_en_tts/tts3/path.sh | 14 +--- 55 files changed, 90 insertions(+), 979 deletions(-) mode change 100755 => 120000 examples/aishell3/tts3/path.sh mode change 100755 => 120000 examples/aishell3/vc0/path.sh mode change 100755 => 120000 examples/aishell3/vc1/local/train.sh mode change 100755 => 120000 examples/aishell3/vc1/path.sh mode change 100755 => 120000 examples/aishell3/vc2/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/vc2/local/train.sh mode change 100755 => 120000 examples/aishell3/vc2/path.sh mode change 100755 => 120000 examples/aishell3/voc1/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc1/local/train.sh mode change 100755 => 120000 examples/aishell3/voc1/path.sh mode change 100755 => 120000 examples/aishell3/voc5/local/preprocess.sh mode change 100755 => 120000 examples/aishell3/voc5/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc5/local/train.sh mode change 100755 => 120000 examples/aishell3/voc5/path.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/synthesize.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/csmsc/voc3/finetune.sh mode change 100755 => 120000 examples/csmsc/voc3/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc3/local/train.sh mode change 100755 => 120000 examples/csmsc/voc4/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc4/local/train.sh mode change 100755 => 120000 examples/csmsc/voc5/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc5/local/train.sh mode change 100755 => 120000 examples/csmsc/voc6/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/path.sh mode change 100755 => 120000 examples/ljspeech/tts3/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts3/path.sh mode change 100755 => 
120000 examples/ljspeech/voc1/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc1/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc1/path.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/preprocess.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc5/path.sh mode change 100755 => 120000 examples/vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/vctk/tts3/local/train.sh mode change 100755 => 120000 examples/vctk/tts3/path.sh mode change 100755 => 120000 examples/vctk/voc1/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc1/local/train.sh mode change 100755 => 120000 examples/vctk/voc1/path.sh mode change 100755 => 120000 examples/vctk/voc5/local/preprocess.sh mode change 100755 => 120000 examples/vctk/voc5/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc5/local/train.sh mode change 100755 => 120000 examples/vctk/voc5/path.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/local/train.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/path.sh diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh 
b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - 
-MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export 
BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - 
-config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 
000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - --rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - 
--config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 
@@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 
@@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - 
--output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 
--- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file From 31c2c226cacf88281332e61bd03bb863b1c1e9cf Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 30 Jan 2023 19:11:02 +0800 Subject: [PATCH 030/159] clean fluid elementwise_max and square api. (#2852) --- paddlespeech/s2t/training/gradclip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..b2c0500d3 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,7 +43,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) sum_square_list.append(sum_square) @@ -66,7 +66,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue From b5764e9f74665babfdd922189560ba269c072635 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 19:17:37 +0800 Subject: [PATCH 031/159] [Install]rm protobuf in setup.py (#2853) * rm protobuf in setup.py && rm audio's dependances in setup.py --- audio/setup.py | 2 +- docs/requirements.txt | 4 +--- setup.py | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog 
editdistance fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] From 64aeb6dccc73a262bab9f9ed2a1b8c7b15a30582 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 31 Jan 2023 19:52:45 +0800 Subject: [PATCH 032/159] remove some fluid api (elementwise_div elementwise_mul sqrt reduce_sum). (#2859) --- paddlespeech/s2t/training/gradclip.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index b2c0500d3..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -44,7 +44,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) - sum_square = layers.reduce_sum(square) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, not dump all since slow down train process @@ -57,14 +57,15 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): @@ -73,7 +74,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process From 2f526c093cac230493f1ae399fa7182f73d588d3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 14:06:46 +0800 Subject: [PATCH 033/159] fix data for slim (#2862) --- examples/csmsc/tts3/local/PTQ_static.sh | 2 +- examples/csmsc/voc1/local/PTQ_static.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl 
\ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file From ac3ed3c5a8a4e81ad662b8c41efa562f415dad7b Mon Sep 17 00:00:00 2001 From: QuanZ9 <31169290+QuanZ9@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:55:52 +0800 Subject: [PATCH 034/159] Update zh_frontend.py (#2863) --- paddlespeech/t2s/frontend/zh_frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} From 896da6dcd152b6241f606343dfa5ee6ec4932df5 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 18:25:00 +0800 Subject: [PATCH 035/159] remove utils and third_party in paddlespeech's site-packages (#2867) --- audio/setup.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index d36b2c440..6e358346c 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -273,7 +273,7 @@ def main(): }, # Package info - packages=find_packages(include=('paddleaudio*')), + packages=find_packages(include=['paddleaudio*']), package_data=lib_package_data, ext_modules=setup_helpers.get_ext_modules(), zip_safe=True, diff --git a/setup.py b/setup.py index be6cf63a9..2c97ce783 100644 --- a/setup.py +++ b/setup.py @@ -300,7 +300,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', From a55fd2e55685236c34330e0ba01e98878fc5b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 2 Feb 2023 13:03:41 +0800 Subject: [PATCH 036/159] [TTS]Fix diffusion wavenet denoiser final conv init param (#2868) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer --- paddlespeech/t2s/modules/diffusion.py | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index 52fe84ceb..eb67ffb0d 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer): layers (int, optional): Number of residual blocks inside, by default 20 stacks (int, optional): - The number of groups to split the residual blocks into, by default 4 + The number of groups to split the residual blocks into, by default 5 Within each group, the 
dilation of the residual block grows exponentially. residual_channels (int, optional): Residual channel of the residual blocks, by default 256 @@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer): out_channels: int=80, kernel_size: int=3, layers: int=20, - stacks: int=4, + stacks: int=5, residual_channels: int=256, gate_channels: int=512, skip_channels: int=256, @@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer): dropout: float=0., bias: bool=True, use_weight_norm: bool=False, - init_type: str="kaiming_uniform", ): + init_type: str="kaiming_normal", ): super().__init__() # initialize parameters @@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer): bias=bias) self.conv_layers.append(conv) + final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) + nn.initializer.Constant(0.0)(final_conv.weight) self.last_conv_layers = nn.Sequential(nn.ReLU(), nn.Conv1D( skip_channels, skip_channels, 1, bias_attr=True), - nn.ReLU(), - nn.Conv1D( - skip_channels, - out_channels, - 1, - bias_attr=True)) + nn.ReLU(), final_conv) if use_weight_norm: self.apply_weight_norm() @@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer): Args: denoiser (Layer, optional): The model used for denoising noises. - In fact, the denoiser model performs the operation - of producing a output with more noises from the noisy input. - Then we use the diffusion algorithm to calculate - the input with the output to get the denoised result. num_train_timesteps (int, optional): The number of timesteps between the noise and the real during training, by default 1000. beta_start (float, optional): @@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer): >>> def callback(index, timestep, num_timesteps, sample): >>> nonlocal pbar >>> if pbar is None: - >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar = tqdm(total=num_timesteps) + >>> pbar.update(index) >>> pbar.update() >>> >>> return callback @@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x_in, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, None, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + 100%|█████| 34/34 [00:01<00:00, 19.75it/s] >>> >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output >>> ds = 1000 @@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> 
paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + 100%|█████| 14/14 [00:00<00:00, 23.80it/s] """ From a283f8a57e8bbc411bd36f2e0d8df3e0780a1c0e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 2 Feb 2023 13:04:20 +0800 Subject: [PATCH 037/159] [TTS]fix open encoding (#2865) --- paddlespeech/cli/tts/infer.py | 6 +++--- paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/ernie_sat/train.py | 2 +- paddlespeech/t2s/exps/fastspeech2/train.py | 4 ++-- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/train.py | 6 +++--- paddlespeech/t2s/exps/syn_utils.py | 8 ++++---- paddlespeech/t2s/exps/tacotron2/train.py | 2 +- paddlespeech/t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 4 ++-- paddlespeech/t2s/frontend/phonectic.py | 2 +- paddlespeech/t2s/frontend/zh_frontend.py | 4 ++-- 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 707518c05..5515ade26 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor): with open(self.voc_config) as f: self.voc_config = CfgNode(yaml.safe_load(f)) - with open(self.phones_dict, "r") as f: + with open(self.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) tone_size = None if self.tones_dict: - with open(self.tones_dict, "r") as f: + with open(self.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if self.speaker_dict: - with open(self.speaker_dict, 'rt') as f: + with open(self.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index e450aa1a0..c43dafb3c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -437,7 +437,7 @@ if __name__ == '__main__': vocab_phones = {} - with open(args.phones_dict, 'rt') as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..c98d691be 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -109,7 +109,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index d31e62a82..97626db0b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -67,7 +67,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + 
with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -123,7 +123,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 644ec250d..d05dfafcf 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config): # construct dataset for evaluation sentences = [] - with open(args.text, 'rt') as f: + with open(args.text, 'rt', encoding='utf-8') as f: for line in f: items = line.strip().split() utt_id = items[0] sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 7b422e64f..c90090daa 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -70,7 +70,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker speedyspeech!") collate_fn = speedyspeech_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -133,11 +133,11 @@ def train_sp(args, config): collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 6b693440c..491edda30 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int): def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): # construct dataset for evaluation sentences = [] - with open(text_file, 'rt') as f: + with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": items = re.split(r"\s+", line.strip(), 1) @@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc', tones_dict: Optional[os.PathLike]=None, speaker_dict: Optional[os.PathLike]=None, return_am: bool=False): - with open(phones_dict, "r") as f: + with open(phones_dict, 'rt', encoding='utf-8') as f: phn_id = 
[line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) tone_size = None if tones_dict is not None: - with open(tones_dict, "r") as f: + with open(tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if speaker_dict is not None: - with open(speaker_dict, 'rt') as f: + with open(speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) odim = am_config.n_mels diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index 69ff80e46..db88009a8 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -119,7 +119,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index da48b6b99..d49baad99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -114,7 +114,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index f6a31ced2..0e74bf631 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -78,7 +78,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" 
self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) From c764710aa12a2f0db23475b15e1f6cafd5f05e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=AE=8F=E5=BD=AC?= <57510731+hopingZ@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:05:35 +0800 Subject: [PATCH 038/159] [TTS]Avoid using variable "attn_loss" before assignment (#2860) * Avoid using variable "attn_loss" before assignment * Update tacotron2_updater.py --------- Co-authored-by: TianYuan --- .../t2s/models/tacotron2/tacotron2_updater.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) From 6b00ad6064a390525bd992dc747e1e5681b49db4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 3 Feb 2023 09:57:51 +0800 Subject: [PATCH 039/159] [Install]clean dependencies (#2871) * 
clean dependencies * update paddleaudio's version * rm dependency in librosa and paddlenlp * rm dependency in paddlepaddle * rm dependency in speech_web --- audio/setup.py | 7 +------ .../speech_web/speech_server/requirements.txt | 4 +--- docs/requirements.txt | 15 +++------------ setup.py | 18 +++++------------- 4 files changed, 10 insertions(+), 34 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 6e358346c..823e5dfad 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -40,14 +40,9 @@ COMMITID = 'none' base = [ "kaldiio", "librosa==0.8.1", - "scipy>=1.0.0", - "soundfile~=0.10", - "colorlog", - "pathos==0.2.8", + "pathos", "pybind11", "parameterized", - "tqdm", - "scikit-learn" ] requirements = { diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt index cdc654656..8425a1fee 100644 --- a/demos/speech_web/speech_server/requirements.txt +++ b/demos/speech_web/speech_server/requirements.txt @@ -1,8 +1,6 @@ aiofiles faiss-cpu -praatio==5.0.0 +praatio>=5.0.0 pydantic python-multipart -scikit_learn starlette -uvicorn diff --git a/docs/requirements.txt b/docs/requirements.txt index 5422c26f9..609f27925 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,9 @@ braceexpand editdistance -fastapi g2p_en g2pM h5py inflect -jieba jsonlines kaldiio keyboard @@ -24,30 +22,23 @@ paddlespeech_ctcdecoders paddlespeech_feat pandas pattern_singleton -Pillow>=9.0.0 ppdiffusers>=0.9.0 -praatio==5.0.0 +praatio>=5.0.0 prettytable pypinyin-dict pypinyin<=0.44.0 python-dateutil -pyworld==0.2.12 +pyworld>=0.2.12 recommonmark>=0.5.0 -resampy==0.2.2 +resampy sacrebleu -scipy -sentencepiece~=0.1.96 -soundfile~=0.10 sphinx sphinx-autobuild sphinx-markdown-tables sphinx_rtd_theme textgrid timer -tqdm typeguard -uvicorn -visualdl webrtcvad websockets yacs~=0.1.8 diff --git a/setup.py b/setup.py index 2c97ce783..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": From 089c060756c9fe5494ad9e13a57e61451103fee1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 6 Feb 2023 19:59:02 +0800 Subject: [PATCH 040/159] fix pwgan tipc (#2882) --- tests/test_tipc/prepare.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # 避免网络问题导致的 nltk_data 无法下载使程序 hang 住 + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 
数据预处理 python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" From 16d84367c6c7452deb0cc9955aa40298271637b0 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 7 Feb 2023 10:10:53 +0800 Subject: [PATCH 041/159] fix Tensor.numpy()[0] to float(Tensor) to adapt 0D (#2884) --- examples/tess/cls0/local/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index 25382d8c3..f023a37b7 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -121,7 +121,7 @@ if __name__ == "__main__": optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics preds = paddle.argmax(logits, axis=1) From 3a8ba2e24246f1283bfcfd149bd803e03987d4d3 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Tue, 7 Feb 2023 13:14:54 +0800 Subject: [PATCH 042/159] [TTS]Update VITS checkpoint , test=tts (#2887) --- examples/csmsc/vits/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index 8f223e07b..50d703b2d 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -147,14 +147,14 @@ optional arguments: The pretrained model can be downloaded here: -- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true) +- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true) VITS checkpoint contains files listed below. 
```text -vits_csmsc_ckpt_1.1.0 -├── default.yaml # default config used to train vitx -├── phone_id_map.txt # phone vocabulary file when training vits -└── snapshot_iter_333000.pdz # model parameters and optimizer states +vits_csmsc_ckpt_1.4.0 +├── default.yaml # default config used to train vitx +├── phone_id_map.txt # phone vocabulary file when training vits +└── snapshot_iter_150000.pdz # model parameters and optimizer states ``` ps: This ckpt is not good enough, a better result is training @@ -168,9 +168,9 @@ add_blank=true FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/synthesize_e2e.py \ - --config=vits_csmsc_ckpt_1.1.0/default.yaml \ - --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \ - --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \ + --config=vits_csmsc_ckpt_1.4.0/default.yaml \ + --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \ + --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \ --output_dir=exp/default/test_e2e \ --text=${BIN_DIR}/../sentences.txt \ --add-blank=${add_blank} From b1d0658ef8d265a2781ce712982edfca4f0d4c52 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 8 Feb 2023 15:47:30 +0800 Subject: [PATCH 043/159] Update stale.yml --- .github/stale.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/stale.yml b/.github/stale.yml index da19b6606..6b0da9b98 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -6,7 +6,8 @@ daysUntilClose: 30 exemptLabels: - Roadmap - Bug - - New Feature + - feature request + - Tips # Label to use when marking an issue as stale staleLabel: Stale # Comment to post when marking an issue as stale. Set to `false` to disable @@ -17,4 +18,4 @@ markComment: > unmarkComment: false # Comment to post when closing a stale issue. Set to `false` to disable closeComment: > - This issue is closed. Please re-open if needed. \ No newline at end of file + This issue is closed. Please re-open if needed. From f6b624ddc8d4bc869b0a5fbe36d3e183a6adf01c Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 8 Feb 2023 16:29:58 +0800 Subject: [PATCH 044/159] add encoding=utf8 for text cli. 
(#2896) --- paddlespeech/cli/text/infer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index ff822f674..bd76a13d0 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor): if self.task == 'punc': # punc list self._punc_list = [] - with open(self.vocab_file, 'r') as f: + with open(self.vocab_file, 'r', encoding='utf-8') as f: for line in f: self._punc_list.append(line.strip()) @@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor): if self.task == 'punc': # punc list self._punc_list = [] - with open(self.vocab_file, 'r') as f: + with open(self.vocab_file, 'r', encoding='utf-8') as f: for line in f: self._punc_list.append(line.strip()) # model - with open(self.cfg_path) as f: + with open(self.cfg_path, 'r', encoding='utf-8') as f: config = CfgNode(yaml.safe_load(f)) self.model = ErnieLinear(**config["model"]) From 8cbf6a2c9aa0dd1099bcb7654f1a984a8ea541b5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 9 Feb 2023 10:51:03 +0800 Subject: [PATCH 045/159] Update setup.py (#2879) --- third_party/ctc_decoders/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index c13f3df99..5ae5b3bf6 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -129,7 +129,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.2.0', + version='0.2.2', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", From 6728db5b59d4aa26424400a6c420036b98cca726 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 9 Feb 2023 10:51:23 +0800 Subject: [PATCH 046/159] [ASR]Whisper remove audio duration limit, test=asr (#2900) --- paddlespeech/cli/whisper/infer.py | 9 +-------- paddlespeech/s2t/models/whisper/__init__.py | 2 +- paddlespeech/s2t/models/whisper/tokenizer.py | 2 +- paddlespeech/s2t/models/whisper/utils.py | 2 +- paddlespeech/s2t/models/whisper/whipser.py | 2 +- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py index c016b453a..ebcca890b 100644 --- a/paddlespeech/cli/whisper/infer.py +++ b/paddlespeech/cli/whisper/infer.py @@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor): Init model and other resources from a specific path. """ logger.debug("start to init the model") - # default max_len: unit:second - self.max_len = 50 + if hasattr(self, 'model'): logger.debug('Model had been initialized.') return @@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor): try: audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) - audio_duration = audio.shape[0] / audio_sample_rate - if audio_duration > self.max_len: - logger.error( - f"Please input audio file less then {self.max_len} seconds.\n" - ) - return False except Exception as e: logger.exception(e) logger.error( diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py index 98ab23610..b78dece8a 100644 --- a/paddlespeech/s2t/models/whisper/__init__.py +++ b/paddlespeech/s2t/models/whisper/__init__.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py) from paddlespeech.s2t.models.whisper.whipser import decode diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 1e1aea044..e8b201bcc 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py) import os diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py index d067af7d2..5528f9604 100644 --- a/paddlespeech/s2t/models/whisper/utils.py +++ b/paddlespeech/s2t/models/whisper/utils.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py) import zlib diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 9cf9a9eca..a28013e4b 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -1,5 +1,5 @@ # MIT License, Copyright (c) 2022 OpenAI. -# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper) import os From bcd8e309ec3fade62971067de6d5607027c254e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 9 Feb 2023 14:58:34 +0800 Subject: [PATCH 047/159] [TTS]Add diffusion noise clip to optimize sample result (#2902) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer * add diffusion noise clip to optimize sample result --- paddlespeech/t2s/modules/diffusion.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index eb67ffb0d..be684ce38 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -360,6 +360,8 @@ class GaussianDiffusion(nn.Layer): num_inference_steps: Optional[int]=1000, strength: Optional[float]=None, scheduler_type: Optional[str]="ddpm", + clip_noise: Optional[bool]=True, + clip_noise_range: Optional[Tuple[float, float]]=(-1, 1), callback: Optional[Callable[[int, int, int, paddle.Tensor], None]]=None, callback_steps: Optional[int]=1): @@ -380,6 +382,10 @@ class GaussianDiffusion(nn.Layer): scheduler_type (str, optional): Noise scheduler for generate noises. Choose a great scheduler can skip many denoising step, by default 'ddpm'. + clip_noise (bool, optional): + Whether to clip each denoised output, by default True. + clip_noise_range (tuple, optional): + denoised output min and max value range after clip, by default (-1, 1). callback (Callable[[int,int,int,Tensor], None], optional): Callback function during denoising steps. 
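For intuition, the new `clip_noise` option bounds every intermediate estimate of the reverse process inside the normalized output range, so early, very noisy steps cannot drift far outside it. A runnable toy sketch of the idea follows; the "denoiser" and "scheduler" updates are stand-ins for illustration, not the real PaddleSpeech objects:

```python
import paddle

# Toy sketch of per-step clipping in a reverse-diffusion loop. The update
# rules below are stand-ins, not the actual denoiser/scheduler APIs.
n_min, n_max = -1.0, 1.0
sample = paddle.clip(paddle.randn([1, 80, 100]), min=n_min, max=n_max)
for t in range(10, 0, -1):
    noise_pred = 0.1 * paddle.randn(sample.shape)  # stand-in for the denoiser
    sample = sample - noise_pred                   # stand-in for scheduler.step
    # keep every intermediate estimate inside the normalized range
    sample = paddle.clip(sample, min=n_min, max=n_max)
print(float(sample.min()), float(sample.max()))  # always within [-1, 1]
```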
@@ -440,6 +446,9 @@ class GaussianDiffusion(nn.Layer): # denoising loop denoised_output = noisy_input + if clip_noise: + n_min, n_max = clip_noise_range + denoised_output = paddle.clip(denoised_output, n_min, n_max) num_warmup_steps = len( timesteps) - num_inference_steps * scheduler.order for i, t in enumerate(timesteps): @@ -451,6 +460,8 @@ class GaussianDiffusion(nn.Layer): # compute the previous noisy sample x_t -> x_t-1 denoised_output = scheduler.step(noise_pred, t, denoised_output).prev_sample + if clip_noise: + denoised_output = paddle.clip(denoised_output, n_min, n_max) # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and From 66a9cf8ebc0be8deb68c6fa9b53f961aa647b045 Mon Sep 17 00:00:00 2001 From: ZhengZZZ <107376080+EscaticZheng@users.noreply.github.com> Date: Mon, 13 Feb 2023 18:01:00 +0800 Subject: [PATCH 048/159] modify readme (#2915) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afc4e4d09..1678e9e04 100644 --- a/README.md +++ b/README.md @@ -987,7 +987,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot,which is able to speak and listen thanks to PaddleSpeech. - Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR. - Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services. - +- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the python3.9 prebuilt wheel for PaddleSpeech installation in Windows without Viusal Studio. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. 
diff --git a/README_cn.md b/README_cn.md index ecc4644aa..572f9dee6 100644 --- a/README_cn.md +++ b/README_cn.md @@ -988,10 +988,10 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 - 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。 - 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 - - 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。 - 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。 - 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。 +- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对PaddleSpeech在Windows下的安装提供了无需Visua Studio,基于python3.9的预编译依赖安装包。 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 From 047092de8ed344ec391e5492c897395837773765 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 13 Feb 2023 22:16:05 +0800 Subject: [PATCH 049/159] add wav2vev2_zh aishell recipe, and speechbrain dataloader. (#2916) --- examples/aishell/asr3/README.md | 198 ++++ examples/aishell/asr3/cmd.sh | 89 ++ examples/aishell/asr3/conf/preprocess.yaml | 3 + .../aishell/asr3/conf/train_with_wav2vec.yaml | 101 +++ examples/aishell/asr3/conf/tuning/decode.yaml | 4 + examples/aishell/asr3/conf/wav2vec2ASR.yaml | 167 ++++ .../aishell/asr3/local/aishell_prepare.py | 129 +++ examples/aishell/asr3/local/data.sh | 101 +++ examples/aishell/asr3/local/test.sh | 84 ++ examples/aishell/asr3/local/test_wav.sh | 58 ++ examples/aishell/asr3/local/train.sh | 59 ++ examples/aishell/asr3/path.sh | 15 + examples/aishell/asr3/run.sh | 48 + examples/aishell/asr3/utils | 1 + paddlespeech/s2t/exps/wav2vec2/model.py | 412 +++++++-- paddlespeech/s2t/io/speechbrain/__init__.py | 13 + paddlespeech/s2t/io/speechbrain/batch.py | 107 +++ .../s2t/io/speechbrain/data_pipeline.py | 488 ++++++++++ paddlespeech/s2t/io/speechbrain/data_utils.py | 177 ++++ paddlespeech/s2t/io/speechbrain/dataio.py | 845 ++++++++++++++++++ paddlespeech/s2t/io/speechbrain/dataloader.py | 172 ++++ paddlespeech/s2t/io/speechbrain/dataset.py | 371 ++++++++ paddlespeech/s2t/io/speechbrain/depgraph.py | 237 +++++ .../s2t/io/speechbrain/make_dataloader.py | 118 +++ paddlespeech/s2t/io/speechbrain/sampler.py | 503 +++++++++++ .../s2t/io/speechbrain/sb_pipeline.py | 156 ++++ paddlespeech/s2t/models/wav2vec2/__init__.py | 2 +- .../wav2vec2/processing/signal_processing.py | 20 +- .../processing/speech_augmentation.py | 44 +- .../s2t/models/wav2vec2/wav2vec2_ASR.py | 54 +- setup.py | 2 + 31 files changed, 4672 insertions(+), 106 deletions(-) create mode 100644 examples/aishell/asr3/README.md create mode 100755 examples/aishell/asr3/cmd.sh create mode 100755 examples/aishell/asr3/conf/preprocess.yaml create mode 100755 examples/aishell/asr3/conf/train_with_wav2vec.yaml create mode 100755 examples/aishell/asr3/conf/tuning/decode.yaml create mode 100755 examples/aishell/asr3/conf/wav2vec2ASR.yaml create mode 100644 examples/aishell/asr3/local/aishell_prepare.py create mode 100755 examples/aishell/asr3/local/data.sh create mode 
100755 examples/aishell/asr3/local/test.sh create mode 100755 examples/aishell/asr3/local/test_wav.sh create mode 100755 examples/aishell/asr3/local/train.sh create mode 100755 examples/aishell/asr3/path.sh create mode 100755 examples/aishell/asr3/run.sh create mode 120000 examples/aishell/asr3/utils create mode 100644 paddlespeech/s2t/io/speechbrain/__init__.py create mode 100755 paddlespeech/s2t/io/speechbrain/batch.py create mode 100755 paddlespeech/s2t/io/speechbrain/data_pipeline.py create mode 100755 paddlespeech/s2t/io/speechbrain/data_utils.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataio.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataloader.py create mode 100755 paddlespeech/s2t/io/speechbrain/dataset.py create mode 100755 paddlespeech/s2t/io/speechbrain/depgraph.py create mode 100755 paddlespeech/s2t/io/speechbrain/make_dataloader.py create mode 100755 paddlespeech/s2t/io/speechbrain/sampler.py create mode 100755 paddlespeech/s2t/io/speechbrain/sb_pipeline.py mode change 100644 => 100755 paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md new file mode 100644 index 000000000..e5806d621 --- /dev/null +++ b/examples/aishell/asr3/README.md @@ -0,0 +1,198 @@ +# Wav2vec2ASR with Aishell +This example contains code used to fine-tune the [wav2vec2.0](https://arxiv.org/pdf/2006.11477.pdf) model with the [Aishell dataset](http://www.openslr.org/resources/33). +## Overview +All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function. +| Stage | Function | +|:---- |:----------------------------------------------------------- | +| 0 | Process data. It includes:
<br> (1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br> (4) Get the manifest files of the train, development and test dataset <br>
(5) Download the pretrained wav2vec2 model | +| 1 | Train the model | +| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model | +| 3 | Test the final model performance | +| 4 | Infer the single audio file | + + +You can choose to run a range of stages by setting `stage` and `stop_stage`. + +For example, if you want to execute the code in stage 2 and stage 3, you can run this script: +```bash +bash run.sh --stage 2 --stop_stage 3 +``` +Or you can set `stage` equal to `stop_stage` to only run one stage. +For example, if you only want to run `stage 0`, you can use the script below: +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +The document below will describe the scripts in `run.sh` in detail. +## The Environment Variables +The path.sh contains the environment variables. +```bash +. ./path.sh +. ./cmd.sh +``` +This script needs to be run first. And another script is also needed: +```bash +source ${MAIN_ROOT}/utils/parse_options.sh +``` +It will support the way of using `--variable value` in the shell scripts. +## The Local Variables +Some local variables are set in `run.sh`. +`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU. +`stage` denotes the number of the stage you want to start from in the experiments. +`stop_stage` denotes the number of the stage you want to end at in the experiments. +`conf_path` denotes the config path of the model. +`avg_num` denotes the number K of top-K models you want to average to get the final model. +`audio_file` denotes the file path of the single audio file you want to infer in stage 4. +`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR". + +You can set the local variables (except `ckpt`) when you use `run.sh`. + +For example, you can set the `gpus` and `avg_num` when you use the command line: +```bash +bash run.sh --gpus 0,1 --avg_num 20 +``` +## Stage 0: Data Processing +To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below: +```bash + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 + fi +``` +Stage 0 is for processing the data. + +If you only want to process the data, you can run: +```bash +bash run.sh --stage 0 --stop_stage 0 +``` +You can also just run these scripts in your command line. +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +``` +After processing the data, the `data` directory will look like this: +```bash +data/ +|-- dev.meta +|-- lang_char +| `-- vocab.txt +|-- manifest.dev +|-- manifest.dev.raw +|-- manifest.test +|-- manifest.test.raw +|-- manifest.train +|-- manifest.train.raw +|-- mean_std.json +|-- test.meta +|-- train.meta +|-- train.csv +|-- dev.csv +|-- test.csv +``` + +Stage 0 also downloads the Chinese pre-trained [wav2vec2](https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams) model. +```bash +mkdir -p exp/wav2vec2 +wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams +```
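The `train.csv`, `dev.csv`, and `test.csv` files in the tree above are the manifests consumed by the speechbrain-style dataloader this patch adds under `paddlespeech/s2t/io/speechbrain/`. A quick sanity check is a plain CSV read; note that the field layout mentioned in the comment is an assumption for illustration, so inspect the real header of your file:

```python
import csv

# Peek at the first few rows of the generated manifest. The field names in
# the comment are assumed for illustration; check the real CSV header.
with open("data/train.csv", newline="", encoding="utf-8") as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row)  # e.g. utterance id, duration, wav path, transcript
        if i == 2:
            break
```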
+## Stage 1: Model Training +If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below. +```bash +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + fi +``` +If you want to train the model, you can use the script below to execute stage 0 and stage 1: +```bash +bash run.sh --stage 0 --stop_stage 1 +``` +or you can run these scripts in the command line (only use CPU). +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +``` +## Stage 2: Top-k Models Averaging +After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: We only train one epoch for wav2vec2ASR, thus the `avg_num` is set to 1. +```bash + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} + fi +``` +The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`. +If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: +```bash +bash run.sh --stage 0 --stop_stage 2 +``` +or you can run these scripts in the command line (only use CPU). + +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +avg.sh best exp/wav2vec2ASR/checkpoints 1 +``` +## Stage 3: Model Testing +The test stage is to evaluate the model performance. The code of the test stage is shown below: +```bash + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + fi +``` +If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3: +```bash +bash run.sh --stage 0 --stop_stage 3 +``` +or you can run these scripts in the command line (only use CPU). +```bash +. ./path.sh +. ./cmd.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR +avg.sh best exp/wav2vec2ASR/checkpoints 1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 +``` +## Pretrained Model +You can get the pretrained wav2vec2ASR from [this](../../../docs/source/released_model.md). + +Use the `tar` command to unpack the model, and then you can use the script below to test the model. + +For example: +```bash +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +source path.sh +# If you have processed the data and got the manifest file, you can skip the following 2 steps +bash local/data.sh --stage -1 --stop_stage -1 +bash local/data.sh --stage 2 --stop_stage 2 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 +``` +The performance of the released models is shown in [RESULTS.md](./RESULTS.md). + +
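The metric reported for Aishell is the character error rate (CER): the character-level edit distance between hypothesis and reference, normalized by the reference length. As a rough illustration (not code from this recipe; the sentences are made-up examples), using the `editdistance` package that already appears among the project's dependencies:

```python
import editdistance

# Rough illustration of CER: character-level edit distance divided by the
# reference length. The sentences are made-up examples, not real outputs.
ref = "今天天气很好"
hyp = "今天天汽很好"
cer = editdistance.eval(list(ref), list(hyp)) / len(ref)
print(f"CER: {cer:.3f}")  # one substitution over six characters -> 0.167
```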
+## Stage 4: Single Audio File Inference +In some situations, you want to use the trained model to do inference on a single audio file. You can use stage 4. The code is shown below: +```bash + if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + fi +``` +You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below: +```bash +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz +``` +You can download the audio demo: +```bash +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +``` +You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below: +```bash +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav +``` diff --git a/examples/aishell/asr3/cmd.sh b/examples/aishell/asr3/cmd.sh new file mode 100755 index 000000000..7b70ef5e0 --- /dev/null +++ b/examples/aishell/asr3/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time