From 571b13c53f8c50a522cbf72b122d2466041af020 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 4 Aug 2021 06:56:21 +0000
Subject: [PATCH 01/21] add dur utils

---
 examples/librispeech/s1/README.md |  7 +++++++
 examples/librispeech/s1/path.sh   |  4 ++--
 examples/librispeech/s1/utils     |  1 +
 setup.sh                          |  6 +++---
 utils/duration_from_manifest.sh   | 10 ++++++++++
 5 files changed, 23 insertions(+), 5 deletions(-)
 create mode 120000 examples/librispeech/s1/utils
 create mode 100644 utils/duration_from_manifest.sh

diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index 080b340e..7f89b808 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -1,5 +1,12 @@
 # LibriSpeech
 
+## Data
+
+| Data Subset | Duration in Seconds |
+| data/manifest.train | 0.83s ~ 29.735s |
+| data/manifest.dev | 1.065 ~ 35.155s |
+| data/manifest.test-clean | 1.285s ~ 34.955s |
+
 ## Conformer
 
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/s1/path.sh
index 30adb6ca..22fb1255 100644
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
@@ -1,10 +1,10 @@
 export MAIN_ROOT=${PWD}/../../../
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8 
+export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
diff --git a/examples/librispeech/s1/utils b/examples/librispeech/s1/utils
new file mode 120000
index 00000000..973afe67
--- /dev/null
+++ b/examples/librispeech/s1/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/setup.sh b/setup.sh
index 384d62d2..b340d47b 100644
--- a/setup.sh
+++ b/setup.sh
@@ -10,7 +10,7 @@ fi
 
 if [ -e /etc/lsb-release ];then
     #${SUDO} apt-get update
-    ${SUDO} apt-get install -y vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
+    ${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
     if [ $? != 0 ]; then
         error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user."
         exit -1
@@ -54,10 +54,10 @@ if [ $? != 0 ]; then
 fi
 cd AutoLog
 pip install -r requirements.txt
-python setup.py install 
+python setup.py install
 cd ..
 rm -rf AutoLog
-fi 
+fi
 
 # install decoders
 python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
diff --git a/utils/duration_from_manifest.sh b/utils/duration_from_manifest.sh
new file mode 100644
index 00000000..fae579c7
--- /dev/null
+++ b/utils/duration_from_manifest.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+if [ $# != 1 ];then
+    echo "usage: ${0} manifest_file"
+    exit -1
+fi
+
+manifest=$1
+
+jq -S '.feat_shape[0]' ${manifest} | sort -nu

From ccdfd5b342696744a5b74c0c69a3a2ab257757a7 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 4 Aug 2021 06:58:28 +0000
Subject: [PATCH 02/21] format

---
 deepspeech/decoders/swig/setup.py    |  5 ++---
 deepspeech/models/ds2/__init__.py    | 18 ++++++++++++++----
 deepspeech/models/ds2/deepspeech2.py | 10 +++++-----
 examples/callcenter/s1/README.md     |  2 +-
 examples/librispeech/s1/README.md    |  6 +++---
 5 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/deepspeech/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py
index 3da5ce8b..86af475a 100644
--- a/deepspeech/decoders/swig/setup.py
+++ b/deepspeech/decoders/swig/setup.py
@@ -84,9 +84,8 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
 
 FILES = [
-    fn for fn in FILES
-    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
-        'unittest.cc'))
+    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
+                               or fn.endswith('unittest.cc'))
 ]
 
 LIBS = ['stdc++']
diff --git a/deepspeech/models/ds2/__init__.py b/deepspeech/models/ds2/__init__.py
index 299f901c..39bea5bf 100644
--- a/deepspeech/models/ds2/__init__.py
+++ b/deepspeech/models/ds2/__init__.py
@@ -1,7 +1,17 @@
-from .deepspeech2 import DeepSpeech2Model
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .deepspeech2 import DeepSpeech2InferModel
+from .deepspeech2 import DeepSpeech2Model
 
 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
-
-
-
diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py
index 0bd5fb95..8d737e80 100644
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@@ -19,15 +19,15 @@ from paddle import nn
 from yacs.config import CfgNode
 
 from deepspeech.models.ds2.conv import ConvStack
-from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.models.ds2.rnn import RNNStack
+from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 
 logger = Log(__name__).getlog()
 
-__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode']
+__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
 
 
 class CRNNEncoder(nn.Layer):
@@ -117,7 +117,7 @@ class DeepSpeech2Model(nn.Layer):
     :type share_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
         before softmax) and a ctc cost layer.
- :rtype: tuple of LayerOutput + :rtype: tuple of LayerOutput """ @classmethod @@ -206,10 +206,10 @@ class DeepSpeech2Model(nn.Layer): config: yacs.config.CfgNode model configs - + checkpoint_path: Path or str the path of pretrained model checkpoint, without extension name - + Returns ------- DeepSpeech2Model diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/s1/README.md index a83a516b..b9fa1472 100644 --- a/examples/callcenter/s1/README.md +++ b/examples/callcenter/s1/README.md @@ -17,4 +17,4 @@ | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | 2.23287845 | 0.087982 | | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | 2.23287845 | 0.086962 | | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | 2.23287845 | 0.086741 | -| conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | 2.23287845 | 0.083495 | +| conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | 2.23287845 | 0.083495 | diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index 7f89b808..79b5b80e 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -2,10 +2,10 @@ ## Data -| Data Subset | Duration in Seconds | -| data/manifest.train | 0.83s ~ 29.735s | +| Data Subset | Duration in Seconds | +| data/manifest.train | 0.83s ~ 29.735s | | data/manifest.dev | 1.065 ~ 35.155s | -| data/manifest.test-clean | 1.285s ~ 34.955s | +| data/manifest.test-clean | 1.285s ~ 34.955s | ## Conformer From 74f987aa8f34705934c053b4b9caab865a264072 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 4 Aug 2021 08:05:37 +0000 Subject: [PATCH 03/21] setup.sh with tools/ make --- .gitignore | 1 + examples/librispeech/s1/README.md | 15 +++++++++++++-- setup.sh | 5 +++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 260c80bd..4ac2a36d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ tools/venv tools/kenlm tools/sox-14.4.2 tools/soxbindings +tools/montreal-forced-aligner/ tools/Montreal-Forced-Aligner/ *output/ diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index 79b5b80e..f27b474c 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -11,13 +11,18 @@ | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention | 6.35 | 0.057117 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 | +### Test w/o length filter + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 | + ## Chunk Conformer | Model | Params | Config | Augmentation| Test set | 
Decode method | Chunk Size & Left Chunks | Loss | WER | @@ -32,5 +37,11 @@ | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention | 6.98 | 0.066500 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 | + +### Test w/o length filter + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 | + diff --git a/setup.sh b/setup.sh index b340d47b..503bc6b5 100644 --- a/setup.sh +++ b/setup.sh @@ -17,6 +17,11 @@ if [ -e /etc/lsb-release ];then fi fi + +# tools/make +rm tools/*.done +pushd tools && make && popd + # install python dependencies if [ -f "requirements.txt" ]; then pip3 install -r requirements.txt From 032315191296fad713bd08f9fb37b88242e66c77 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 26 Jul 2021 18:28:21 -0700 Subject: [PATCH 04/21] add u2 st --- deepspeech/exps/u2_st/__init__.py | 13 + deepspeech/exps/u2_st/bin/export.py | 48 ++ deepspeech/exps/u2_st/bin/test.py | 55 +++ deepspeech/exps/u2_st/bin/train.py | 59 +++ deepspeech/exps/u2_st/config.py | 41 ++ deepspeech/exps/u2_st/model.py | 675 ++++++++++++++++++++++++++++ 6 files changed, 891 insertions(+) create mode 100644 deepspeech/exps/u2_st/__init__.py create mode 100644 deepspeech/exps/u2_st/bin/export.py create mode 100644 deepspeech/exps/u2_st/bin/test.py create mode 100644 deepspeech/exps/u2_st/bin/train.py create mode 100644 deepspeech/exps/u2_st/config.py create mode 100644 deepspeech/exps/u2_st/model.py diff --git a/deepspeech/exps/u2_st/__init__.py b/deepspeech/exps/u2_st/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/deepspeech/exps/u2_st/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/deepspeech/exps/u2_st/bin/export.py b/deepspeech/exps/u2_st/bin/export.py new file mode 100644 index 00000000..f566ba5b --- /dev/null +++ b/deepspeech/exps/u2_st/bin/export.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Export for U2 model.""" +from deepspeech.exps.u2_st.config import get_cfg_defaults +from deepspeech.exps.u2_st.model import U2STTester as Tester +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Tester(config, args) + exp.setup() + exp.run_export() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/deepspeech/exps/u2_st/bin/test.py b/deepspeech/exps/u2_st/bin/test.py new file mode 100644 index 00000000..d66c7a26 --- /dev/null +++ b/deepspeech/exps/u2_st/bin/test.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for U2 model.""" +import cProfile + +from deepspeech.exps.u2_st.config import get_cfg_defaults +from deepspeech.exps.u2_st.model import U2STTester as Tester +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments + +# TODO(hui zhang): dynamic load + + +def main_sp(config, args): + exp = Tester(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/deepspeech/exps/u2_st/bin/train.py b/deepspeech/exps/u2_st/bin/train.py new file mode 100644 index 00000000..86a0f000 --- /dev/null +++ b/deepspeech/exps/u2_st/bin/train.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Trainer for U2 model.""" +import cProfile +import os + +from paddle import distributed as dist + +from deepspeech.exps.u2_st.config import get_cfg_defaults +from deepspeech.exps.u2_st.model import U2STTrainer as Trainer +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + if args.device == "gpu" and args.nprocs > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + else: + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/deepspeech/exps/u2_st/config.py b/deepspeech/exps/u2_st/config.py new file mode 100644 index 00000000..b1b7b357 --- /dev/null +++ b/deepspeech/exps/u2_st/config.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from yacs.config import CfgNode + +from deepspeech.exps.u2_st.model import U2STTester +from deepspeech.exps.u2_st.model import U2STTrainer +from deepspeech.io.collator_st import SpeechCollator +from deepspeech.io.dataset import ManifestDataset +from deepspeech.models.u2_st import U2STModel + +_C = CfgNode() + +_C.data = ManifestDataset.params() + +_C.collator = SpeechCollator.params() + +_C.model = U2STModel.params() + +_C.training = U2STTrainer.params() + +_C.decoding = U2STTester.params() + + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for my_project.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + config = _C.clone() + config.set_new_allowed(True) + return config diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py new file mode 100644 index 00000000..21323fc9 --- /dev/null +++ b/deepspeech/exps/u2_st/model.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains U2 model.""" +import json +import os +import sys +import time +from collections import defaultdict +from pathlib import Path +from typing import List +from typing import Optional +from typing import Tuple + +import numpy as np +import paddle +import sacrebleu +from paddle import distributed as dist +from paddle.io import DataLoader +from yacs.config import CfgNode + +from deepspeech.io.collator_st import KaldiPrePorocessedCollator +from deepspeech.io.collator_st import SpeechCollator +from deepspeech.io.collator_st import TripletKaldiPrePorocessedCollator +from deepspeech.io.dataset import ManifestDataset +from deepspeech.io.dataset import TripletManifestDataset +from deepspeech.io.sampler import SortagradBatchSampler +from deepspeech.io.sampler import SortagradDistributedBatchSampler +from deepspeech.models.u2_st import U2STModel +from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog +from deepspeech.training.scheduler import WarmupLR +from deepspeech.training.trainer import Trainer +from deepspeech.utils import ctc_utils +from deepspeech.utils import error_rate +from deepspeech.utils import layer_tools +from deepspeech.utils import mp_tools +from deepspeech.utils import text_grid +from deepspeech.utils import utility +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + + +class U2STTrainer(Trainer): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + # training config + default = CfgNode( + dict( + n_epoch=50, # train epochs + log_interval=100, # steps + accum_grad=1, # accum grad by # steps + global_grad_clip=5.0, # the global norm clip + )) + default.optim = 'adam' + default.optim_conf = CfgNode( + dict( + lr=5e-4, # learning rate + weight_decay=1e-6, # the coeff of weight decay + )) + default.scheduler = 'warmuplr' + default.scheduler_conf = CfgNode( + dict( + warmup_steps=25000, + lr_decay=1.0, # learning rate decay + )) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch_data, msg): + train_conf = self.config.training + start = time.time() + utt, audio, audio_len, text, text_len = batch_data + if isinstance(text, list) and isinstance(text_len, list): + # joint training with ASR. 
Two decoding texts [translation, transcription] + text, text_transcript = text + text_len, text_transcript_len = text_len + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len, text_transcript, + text_transcript_len) + else: + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len) + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + losses_np = {'loss': float(loss) * train_conf.accum_grad} + losses_np['st_loss'] = float(st_loss) + if attention_loss: + losses_np['att_loss'] = float(attention_loss) + if ctc_loss: + losses_np['ctc_loss'] = float(ctc_loss) + + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + + iteration_time = time.time() - start + + if (batch_index + 1) % train_conf.log_interval == 0: + msg += "train time: {:>.3f}s, ".format(iteration_time) + msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "accum: {}, ".format(train_conf.accum_grad) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_np.items()) + logger.info(msg) + + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + self.visualizer.add_scalars("step", losses_np_v, + self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, audio, audio_len, text, text_len = batch + if isinstance(text, list) and isinstance(text_len, list): + text, text_transcript = text + text_len, text_transcript_len = text_len + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len, text_transcript, + text_transcript_len) + else: + loss, st_loss, attention_loss, ctc_loss = self.model( + audio, audio_len, text, text_len) + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(st_loss) * num_utts + valid_losses['val_loss'].append(float(st_loss)) + if attention_loss: + valid_losses['val_att_loss'].append(float(attention_loss)) + if ctc_loss: + valid_losses['val_ctc_loss'].append(float(ctc_loss)) + + if (i + 1) % self.config.training.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_st_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info st_val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! 
+ # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + from_scratch = self.resume_or_scratch() + if from_scratch: + # save init model, i.e. 0 epoch + self.save(tag='init') + + self.lr_scheduler.step(self.iteration) + if self.parallel: + self.train_loader.batch_sampler.set_epoch(self.epoch) + + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.training.n_epoch: + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train: Rank: {}, ".format(dist.get_rank()) + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "batch : {}/{}, ".format(batch_index + 1, + len(self.train_loader)) + msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) + msg += "data time: {:>.3f}s, ".format(dataload_time) + self.train_batch(batch_index, batch, msg) + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. + dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalars( + 'epoch', {'cv_loss': cv_loss, + 'lr': self.lr_scheduler()}, self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + config.defrost() + config.collator.keep_transcription_text = False + + # train/valid dataset, return token ids + Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset + config.data.manifest = config.data.train_manifest + train_dataset = Dataset.from_config(config) + + config.data.manifest = config.data.dev_manifest + dev_dataset = Dataset.from_config(config) + + if config.collator.raw_wav: + TestCollator = Collator = SpeechCollator + # Not yet implement the mtl loader for raw_wav. 
+ else: + if config.model.model_conf.asr_weight > 0.: + Collator = TripletKaldiPrePorocessedCollator + TestCollator = KaldiPrePorocessedCollator + else: + TestCollator = Collator = KaldiPrePorocessedCollator + + collate_fn_train = Collator.from_config(config) + + config.collator.augmentation_config = "" + collate_fn_dev = Collator.from_config(config) + + if self.parallel: + batch_sampler = SortagradDistributedBatchSampler( + train_dataset, + batch_size=config.collator.batch_size, + num_replicas=None, + rank=None, + shuffle=True, + drop_last=True, + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) + else: + batch_sampler = SortagradBatchSampler( + train_dataset, + shuffle=True, + batch_size=config.collator.batch_size, + drop_last=True, + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) + self.train_loader = DataLoader( + train_dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn_train, + num_workers=config.collator.num_workers, ) + self.valid_loader = DataLoader( + dev_dataset, + batch_size=config.collator.batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn_dev) + + # test dataset, return raw text + config.data.manifest = config.data.test_manifest + # filter test examples, will cause less examples, but no mismatch with training + # and can use large batch size , save training time, so filter test egs now. + # config.data.min_input_len = 0.0 # second + # config.data.max_input_len = float('inf') # second + # config.data.min_output_len = 0.0 # tokens + # config.data.max_output_len = float('inf') # tokens + # config.data.min_output_input_ratio = 0.00 + # config.data.max_output_input_ratio = float('inf') + test_dataset = ManifestDataset.from_config(config) + # return text ord id + config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" + self.test_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=TestCollator.from_config(config)) + # return text token id + config.collator.keep_transcription_text = False + self.align_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=TestCollator.from_config(config)) + logger.info("Setup train/valid/test/align Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config.model + model_conf.defrost() + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size + model_conf.freeze() + model = U2STModel.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + + train_config = config.training + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) + weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) + + if scheduler_type == 'expdecaylr': + lr_scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=optim_conf.lr, + gamma=scheduler_conf.lr_decay, + verbose=False) + elif scheduler_type == 'warmuplr': + lr_scheduler = WarmupLR( + learning_rate=optim_conf.lr, + warmup_steps=scheduler_conf.warmup_steps, + verbose=False) + elif scheduler_type == 'noam': + lr_scheduler = 
paddle.optimizer.lr.NoamDecay( + learning_rate=optim_conf.lr, + d_model=model_conf.encoder_conf.output_size, + warmup_steps=scheduler_conf.warmup_steps, + verbose=False) + else: + raise ValueError(f"Not support scheduler: {scheduler_type}") + + if optim_type == 'adam': + optimizer = paddle.optimizer.Adam( + learning_rate=lr_scheduler, + parameters=model.parameters(), + weight_decay=weight_decay, + grad_clip=grad_clip) + else: + raise ValueError(f"Not support optim: {optim_type}") + + self.model = model + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup model/optimizer/lr_scheduler!") + + +class U2STTester(U2STTrainer): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + # decoding config + default = CfgNode( + dict( + alpha=2.5, # Coef of LM for beam search. + beta=0.3, # Coef of WC for beam search. + cutoff_prob=1.0, # Cutoff probability for pruning. + cutoff_top_n=40, # Cutoff number for pruning. + lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. + decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', + # 'ctc_prefix_beam_search', 'attention_rescoring' + error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' + num_proc_bsearch=8, # # of CPUs for beam search. + beam_size=10, # Beam search width. + batch_size=16, # decoding batch size + ctc_weight=0.0, # ctc weight for attention rescoring decode mode. + decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. + simulate_streaming=False, # simulate streaming inference. Defaults to False. 
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    def __init__(self, config, args):
+        super().__init__(config, args)
+
+    def ordid2token(self, texts, texts_len):
+        """Convert ord() ids in `texts` back into characters with chr()."""
+        trans = []
+        for text, n in zip(texts, texts_len):
+            n = n.numpy().item()
+            ids = text[:n]
+            trans.append(''.join([chr(i) for i in ids]))
+        return trans
+
+    def compute_translation_metrics(self,
+                                    utts,
+                                    audio,
+                                    audio_len,
+                                    texts,
+                                    texts_len,
+                                    fout=None):
+        cfg = self.config.decoding
+        len_refs, num_ins = 0, 0
+        bleu_func = sacrebleu.corpus_bleu
+
+        start_time = time.time()
+        text_feature = self.test_loader.collate_fn.text_feature
+
+        refs = [
+            "".join(chr(t) for t in text[:text_len])
+            for text, text_len in zip(texts, texts_len)
+        ]
+        # from IPython import embed
+        # import os
+        # embed()
+        # os._exit(0)
+        hyps = self.model.decode(
+            audio,
+            audio_len,
+            text_feature=text_feature,
+            decoding_method=cfg.decoding_method,
+            lang_model_path=cfg.lang_model_path,
+            beam_alpha=cfg.alpha,
+            beam_beta=cfg.beta,
+            beam_size=cfg.beam_size,
+            cutoff_prob=cfg.cutoff_prob,
+            cutoff_top_n=cfg.cutoff_top_n,
+            num_processes=cfg.num_proc_bsearch,
+            ctc_weight=cfg.ctc_weight,
+            decoding_chunk_size=cfg.decoding_chunk_size,
+            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
+            simulate_streaming=cfg.simulate_streaming)
+        decode_time = time.time() - start_time
+
+        for utt, target, result in zip(utts, refs, hyps):
+            len_refs += len(target.split())
+            num_ins += 1
+            if fout:
+                fout.write(utt + " " + result + "\n")
+            logger.info("\nReference: %s\nHypothesis: %s" % (target, result))
+            logger.info("One example BLEU = %s" %
+                        (bleu_func([result], [[target]]).prec_str))
+
+        return dict(
+            hyps=hyps,
+            refs=refs,
+            bleu=bleu_func(hyps, [refs]).score,
+            len_refs=len_refs,
+            num_ins=num_ins,  # num examples
+            num_frames=audio_len.sum().numpy().item(),
+            decode_time=decode_time)
+
+    @mp_tools.rank_zero_only
+    @paddle.no_grad()
+    def test(self):
+        assert self.args.result_file
+        self.model.eval()
+        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+
+        stride_ms = self.test_loader.collate_fn.stride_ms
+        hyps, refs = [], []
+        len_refs, num_ins = 0, 0
+        num_frames = 0.0
+        num_time = 0.0
+        with open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                metrics = self.compute_translation_metrics(
+                    *batch, fout=fout)
+                hyps += metrics['hyps']
+                refs += metrics['refs']
+                bleu = metrics['bleu']
+                num_frames += metrics['num_frames']
+                num_time += metrics["decode_time"]
+                len_refs += metrics['len_refs']
+                num_ins += metrics['num_ins']
+                rtf = num_time / (num_frames * stride_ms)
+                logger.info("RTF: %f, BLEU (%d) = %f" %
+                            (rtf, num_ins, bleu))
+
+        rtf = num_time / (num_frames * stride_ms)
+        msg = "Test: "
+        msg += "epoch: {}, ".format(self.epoch)
+        msg += "step: {}, ".format(self.iteration)
+        msg += "RTF: {}, ".format(rtf)
+        msg += "Test set [%s]: %s" % (
+            len(hyps), str(sacrebleu.corpus_bleu(hyps, [refs])))
+        logger.info(msg)
+        bleu_meta_path = os.path.splitext(
+            self.args.result_file)[0] + '.bleu'
+        err_type_str = "BLEU"
+        with open(bleu_meta_path, 'w') as f:
+            data = json.dumps({
+                "epoch":
+                self.epoch,
+                "step":
+                self.iteration,
+                "rtf":
+                rtf,
+                err_type_str:
+                sacrebleu.corpus_bleu(hyps, [refs]).score,
+                "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0,
+                "process_hour":
+                num_time / 1000.0 / 3600.0,
+                "num_examples":
+                num_ins,
+                "decode_method":
+                self.config.decoding.decoding_method,
+            })
+            f.write(data + '\n')
+
def run_test(self): + self.resume_or_scratch() + try: + self.test() + except KeyboardInterrupt: + sys.exit(-1) + + @paddle.no_grad() + def align(self): + if self.config.decoding.batch_size > 1: + logger.fatal('alignment mode must be running with batch_size == 1') + sys.exit(1) + + # xxx.align + assert self.args.result_file and self.args.result_file.endswith( + '.align') + + self.model.eval() + logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") + + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list + with open(self.args.result_file, 'w') as fout: + # one example in batch + for i, batch in enumerate(self.align_loader): + key, feat, feats_length, target, target_length = batch + + # 1. Encoder + encoder_out, encoder_mask = self.model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + ctc_probs = self.model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + + # 2. alignment + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = ctc_utils.forced_align(ctc_probs, target) + logger.info("align ids", key[0], alignment) + fout.write('{} {}\n'.format(key[0], alignment)) + + # 3. gen praat + # segment alignment + align_segs = text_grid.segment_alignment(alignment) + logger.info("align tokens", key[0], align_segs) + # IntervalTier, List["start end token\n"] + subsample = utility.get_subsample(self.config) + tierformat = text_grid.align_to_tierformat( + align_segs, subsample, token_dict) + # write tier + align_output_path = os.path.join( + os.path.dirname(self.args.result_file), "align") + tier_path = os.path.join(align_output_path, key[0] + ".tier") + with open(tier_path, 'w') as f: + f.writelines(tierformat) + # write textgrid + textgrid_path = os.path.join(align_output_path, + key[0] + ".TextGrid") + second_per_frame = 1. / (1000. / + stride_ms) # 25ms window, 10ms stride + second_per_example = ( + len(alignment) + 1) * subsample * second_per_frame + text_grid.generate_textgrid( + maxtime=second_per_example, + intervals=tierformat, + output=textgrid_path) + + def run_align(self): + self.resume_or_scratch() + try: + self.align() + except KeyboardInterrupt: + sys.exit(-1) + + def load_inferspec(self): + """infer model and input spec. + + Returns: + nn.Layer: inference model + List[paddle.static.InputSpec]: input spec. + """ + from deepspeech.models.u2 import U2InferModel + infer_model = U2InferModel.from_pretrained(self.test_loader, + self.config.model.clone(), + self.args.checkpoint_path) + feat_dim = self.test_loader.collate_fn.feature_size + input_spec = [ + paddle.static.InputSpec(shape=[1, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[1], + dtype='int64'), # audio_length, [B] + ] + return infer_model, input_spec + + def export(self): + infer_model, input_spec = self.load_inferspec() + assert isinstance(input_spec, list), type(input_spec) + infer_model.eval() + static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) + + def run_export(self): + try: + self.export() + except KeyboardInterrupt: + sys.exit(-1) + + def setup(self): + """Setup the experiment. 
+ """ + paddle.set_device(self.args.device) + + self.setup_output_dir() + self.setup_checkpointer() + + self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + if self.args.output: + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + output_dir.mkdir(parents=True, exist_ok=True) + + self.output_dir = output_dir From 2fd43efb3bb291c7fba79ca932ce399328f2f0f0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 4 Aug 2021 11:12:43 +0000 Subject: [PATCH 05/21] librispeech s2 for kaldi feat --- examples/librispeech/README.md | 2 + examples/librispeech/s2/README.md | 47 +++++++ .../librispeech/s2/conf/augmentation.json | 34 +++++ .../librispeech/s2/conf/chunk_conformer.yaml | 120 ++++++++++++++++++ .../s2/conf/chunk_transformer.yaml | 113 +++++++++++++++++ examples/librispeech/s2/conf/conformer.yaml | 116 +++++++++++++++++ examples/librispeech/s2/conf/transformer.yaml | 111 ++++++++++++++++ examples/librispeech/s2/local/align.sh | 37 ++++++ examples/librispeech/s2/local/data.sh | 111 ++++++++++++++++ .../librispeech/s2/local/download_lm_en.sh | 20 +++ examples/librispeech/s2/local/export.sh | 34 +++++ examples/librispeech/s2/local/test.sh | 72 +++++++++++ examples/librispeech/s2/local/train.sh | 33 +++++ examples/librispeech/s2/path.sh | 14 ++ examples/librispeech/s2/run.sh | 43 +++++++ examples/librispeech/s2/utils | 1 + 16 files changed, 908 insertions(+) create mode 100644 examples/librispeech/s2/README.md create mode 100644 examples/librispeech/s2/conf/augmentation.json create mode 100644 examples/librispeech/s2/conf/chunk_conformer.yaml create mode 100644 examples/librispeech/s2/conf/chunk_transformer.yaml create mode 100644 examples/librispeech/s2/conf/conformer.yaml create mode 100644 examples/librispeech/s2/conf/transformer.yaml create mode 100755 examples/librispeech/s2/local/align.sh create mode 100755 examples/librispeech/s2/local/data.sh create mode 100755 examples/librispeech/s2/local/download_lm_en.sh create mode 100755 examples/librispeech/s2/local/export.sh create mode 100755 examples/librispeech/s2/local/test.sh create mode 100755 examples/librispeech/s2/local/train.sh create mode 100644 examples/librispeech/s2/path.sh create mode 100755 examples/librispeech/s2/run.sh create mode 120000 examples/librispeech/s2/utils diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index baa4f296..354baafa 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,3 +1,5 @@ # ASR + * s0 is for deepspeech2 offline * s1 is for transformer/conformer/U2 +* s2 is for transformer/conformer/U2 w/ kaldi feat diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md new file mode 100644 index 00000000..f27b474c --- /dev/null +++ b/examples/librispeech/s2/README.md @@ -0,0 +1,47 @@ +# LibriSpeech + +## Data + +| Data Subset | Duration in Seconds | +| data/manifest.train | 0.83s ~ 29.735s | +| data/manifest.dev | 1.065 ~ 35.155s | +| data/manifest.test-clean | 1.285s ~ 34.955s | + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 | +| conformer | 47.63 M | 
conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 | + + +### Test w/o length filter + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 | + +## Chunk Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.01250648 | 0.069548 | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.01250648 | 0.094753 | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 | - | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 | - | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 | + +### Test w/o length filter + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 | + diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/s2/conf/augmentation.json new file mode 100644 index 00000000..c1078393 --- /dev/null +++ b/examples/librispeech/s2/conf/augmentation.json @@ -0,0 +1,34 @@ +[ + { + "type": "shift", + "params": { + "min_shift_ms": -5, + "max_shift_ms": 5 + }, + "prob": 1.0 + }, + { + "type": "speed", + "params": { + "min_speed_rate": 0.9, + "max_speed_rate": 1.1, + "num_rates": 3 + }, + "prob": 0.0 + }, + { + "type": "specaug", + "params": { + "F": 10, + "T": 50, + "n_freq_masks": 2, + "n_time_masks": 2, + "p": 1.0, + "W": 80, + "adaptive_number_ratio": 0, + "adaptive_size_ratio": 0, + "max_n_time_masks": 20 + }, + "prob": 1.0 + } +] diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml new file mode 100644 index 00000000..0de1aefe --- /dev/null +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -0,0 +1,120 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 + max_input_len: 20.0 + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + 
stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +# network architecture +model: + cmvn_file: "data/mean_std.json" + cmvn_file_type: "json" + # encoder related + encoder: conformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 240 + accum_grad: 8 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-06 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 128 + error_rate_type: wer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: true # simulate streaming inference. Defaults to False. 
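
The `decoding` block above fixes the U2 chunk contract: a negative `decoding_chunk_size` means full-utterance (offline) decoding, a positive value means streaming decoding with that fixed chunk size, and 0 is reserved for training. A minimal sketch of reading and validating these options, assuming the same yacs `CfgNode`/`merge_from_file` pattern that `deepspeech/exps/u2_st/bin/test.py` uses (the config path below is illustrative):

```python
# Sketch: load a config such as conf/chunk_conformer.yaml with yacs and
# enforce the decoding_chunk_size contract described in the comments above.
from yacs.config import CfgNode

config = CfgNode()
config.set_new_allowed(True)
config.merge_from_file("examples/librispeech/s2/conf/chunk_conformer.yaml")

chunk = config.decoding.decoding_chunk_size
if chunk == 0:
    # 0 selects training-time dynamic chunks; prohibited at decode time.
    raise ValueError("decoding_chunk_size must not be 0 when decoding")
mode = "full chunk (offline)" if chunk < 0 else f"streaming, chunk size {chunk}"
print(mode, "| left chunks:", config.decoding.num_decoding_left_chunks)
```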
+ + diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml new file mode 100644 index 00000000..f782a037 --- /dev/null +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -0,0 +1,113 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 # second + max_input_len: 20.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +# network architecture +model: + cmvn_file: "data/mean_std.json" + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 1 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-06 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 64 + error_rate_type: wer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: true # simulate streaming inference. Defaults to False. 
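
`use_dynamic_chunk: true` in both chunk configs is what lets a single model serve the offline and streaming decode modes: during training, encoder self-attention is restricted to a chunk whose size is re-drawn per batch, so the network learns to work under any chunk at test time. A rough illustration of that masking idea (hypothetical helpers, not the repo's implementation, which also handles left-chunk limits):

```python
import random
import paddle

def sample_chunk_size(max_len, full_chunk_prob=0.5):
    # Sometimes train on the full utterance, otherwise on a small random
    # chunk -- a rough stand-in for the dynamic-chunk schedule.
    return max_len if random.random() < full_chunk_prob else random.randint(1, 25)

def chunk_attention_mask(size, chunk_size):
    # Boolean (size, size) mask: frame i may attend to any frame j whose
    # chunk index is <= its own, i.e. its chunk and all earlier chunks.
    chunk_id = paddle.arange(size) // chunk_size
    return chunk_id.unsqueeze(0) <= chunk_id.unsqueeze(1)

mask = chunk_attention_mask(8, sample_chunk_size(8))
```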
+ + diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml new file mode 100644 index 00000000..955b6108 --- /dev/null +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -0,0 +1,116 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + min_input_len: 0.5 # seconds + max_input_len: 20.0 # seconds + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 16 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +# network architecture +model: + cmvn_file: "data/mean_std.json" + cmvn_file_type: "json" + # encoder related + encoder: conformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 8 + global_grad_clip: 3.0 + optim: adam + optim_conf: + lr: 0.004 + weight_decay: 1e-06 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 64 + error_rate_type: wer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. 
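
The training sections above all pick `scheduler: warmuplr`, implemented by `deepspeech.training.scheduler.WarmupLR` (imported in patch 04's `model.py`). Assuming the usual inverse-square-root warmup form, the rate at a given step looks roughly like:

```python
def warmup_lr(step, base_lr=0.004, warmup_steps=25000):
    # Assumed WarmupLR shape: ramp up over warmup_steps, then decay as
    # 1/sqrt(step); the peak equals base_lr exactly at step == warmup_steps.
    step = max(step, 1)
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

print(warmup_lr(25000))  # ~0.004 with conformer.yaml's lr and warmup_steps
```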
+ + diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml new file mode 100644 index 00000000..8a769dca --- /dev/null +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -0,0 +1,111 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + min_input_len: 0.5 # second + max_input_len: 20.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/augmentation.json + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +# network architecture +model: + cmvn_file: "data/mean_std.json" + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 2 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.004 + weight_decay: 1e-06 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 64 + error_rate_type: wer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. 
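
The `min_input_len`/`max_input_len` bounds in each `data` section are the length filter that the README's "Test w/o length filter" rows refer to: manifest entries outside these limits are dropped before batching. A sketch of that rule over one manifest, assuming one JSON object per line with a `feat_shape` field (the field `utils/duration_from_manifest.sh` reads) and a hypothetical `token_ids` field for the output length:

```python
import json

LIMITS = dict(min_input_len=0.5, max_input_len=20.0,        # seconds
              min_output_len=0.0, max_output_len=400.0,     # tokens
              min_output_input_ratio=0.05, max_output_input_ratio=10.0)

def keep(ex):
    dur = ex["feat_shape"][0]          # duration in seconds
    n_tok = len(ex["token_ids"])       # assumed output-length field
    return (LIMITS["min_input_len"] <= dur <= LIMITS["max_input_len"]
            and LIMITS["min_output_len"] <= n_tok <= LIMITS["max_output_len"]
            and LIMITS["min_output_input_ratio"] <= n_tok / dur
                <= LIMITS["max_output_input_ratio"])

with open("data/manifest.test-clean") as f:
    kept = [ex for ex in map(json.loads, f) if keep(ex)]
print(len(kept), "examples kept")
```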
+ + diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/s2/local/align.sh new file mode 100755 index 00000000..ad6c84bc --- /dev/null +++ b/examples/librispeech/s2/local/align.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +device=gpu +if [ ${ngpu} == 0 ];then + device=cpu +fi +config_path=$1 +ckpt_prefix=$2 + +batch_size=1 +output_dir=${ckpt_prefix} +mkdir -p ${output_dir} + +# align dump in `result_file` +# .tier, .TextGrid dump in `dir of result_file` +python3 -u ${BIN_DIR}/alignment.py \ +--device ${device} \ +--nproc 1 \ +--config ${config_path} \ +--result_file ${output_dir}/${type}.align \ +--checkpoint_path ${ckpt_prefix} \ +--opts decoding.batch_size ${batch_size} + +if [ $? -ne 0 ]; then + echo "Failed in ctc alignment!" + exit 1 +fi + +exit 0 diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh new file mode 100755 index 00000000..4ad476d3 --- /dev/null +++ b/examples/librispeech/s2/local/data.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +stage=-1 +stop_stage=100 + +# bpemode (unigram or bpe) +nbpe=5000 +bpemode=unigram +bpeprefix="data/bpe_${bpemode}_${nbpe}" + +source ${MAIN_ROOT}/utils/parse_options.sh + + +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # download data, generate manifests + python3 ${TARGET_DIR}/librispeech/librispeech.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${TARGET_DIR}/librispeech" \ + --full_download="True" + + if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 + fi + + for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${set} data/manifest.${set}.raw + done + + rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw + for set in train-clean-100 train-clean-360 train-other-500; do + cat data/manifest.${set}.raw >> data/manifest.train.raw + done + + for set in dev-clean dev-other; do + cat data/manifest.${set}.raw >> data/manifest.dev.raw + done + + for set in test-clean test-other; do + cat data/manifest.${set}.raw >> data/manifest.test.raw + done +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type "spm" \ + --spm_vocab_size=${nbpe} \ + --spm_mode ${bpemode} \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # compute mean and stddev for normalizer + num_workers=$(nproc) + python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ + --manifest_path="data/manifest.train.raw" \ + --num_samples=-1 \ + --specgram_type="fbank" \ + --feat_dim=80 \ + --delta_delta=false \ + --sample_rate=16000 \ + --stride_ms=10.0 \ + --window_ms=25.0 \ + --use_dB_normalization=False \ + --num_workers=${num_workers} \ + --output_path="data/mean_std.json" + + if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." 
+        exit 1
+    fi
+fi
+
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with token ids and vocab size
+    for set in train dev test dev-clean dev-other test-clean test-other; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+            --feat_type "raw" \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "spm" \
+            --spm_model_prefix ${bpeprefix} \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${set}.raw" \
+            --output_path="data/manifest.${set}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    }&
+    done
+    wait
+fi
+
+echo "LibriSpeech Data preparation done."
+exit 0
diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/s2/local/download_lm_en.sh
new file mode 100755
index 00000000..dc1bdf66
--- /dev/null
+++ b/examples/librispeech/s2/local/download_lm_en.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+. ${MAIN_ROOT}/utils/utility.sh
+
+DIR=data/lm
+mkdir -p ${DIR}
+
+URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
+MD5="099a601759d467cd0a8523ff939819c5"
+TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
+
+echo "Download language model ..."
+download $URL $MD5 $TARGET
+if [ $? -ne 0 ]; then
+    echo "Failed to download the language model!"
+    exit 1
+fi
+
+
+exit 0
diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/s2/local/export.sh
new file mode 100755
index 00000000..f99a15ba
--- /dev/null
+++ b/examples/librispeech/s2/local/export.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_path_prefix=$2
+jit_model_export_path=$3
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
+python3 -u ${BIN_DIR}/export.py \
+--device ${device} \
+--nproc ${ngpu} \
+--config ${config_path} \
+--checkpoint_path ${ckpt_path_prefix} \
+--export_path ${jit_model_export_path}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in export!"
+    exit 1
+fi
+
+
+exit 0
diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh
new file mode 100755
index 00000000..3bd3f0bb
--- /dev/null
+++ b/examples/librispeech/s2/local/test.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
+config_path=$1
+ckpt_prefix=$2
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+echo "chunk mode ${chunk_mode}"
+
+
+# download language model
+#bash local/download_lm_en.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    if [ ${chunk_mode} == true ];then
+        # streaming decoding only supports batch_size=1
+        batch_size=1
+    else
+        batch_size=64
+    fi
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+ exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + python3 -u ${BIN_DIR}/test.py \ + --device ${device} \ + --nproc 1 \ + --config ${config_path} \ + --result_file ${ckpt_prefix}.${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + + +exit 0 diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/s2/local/train.sh new file mode 100755 index 00000000..f3eb98da --- /dev/null +++ b/examples/librispeech/s2/local/train.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_name=$2 + +device=gpu +if [ ${ngpu} == 0 ];then + device=cpu +fi +echo "using ${device}..." + +mkdir -p exp + +python3 -u ${BIN_DIR}/train.py \ +--device ${device} \ +--nproc ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh new file mode 100644 index 00000000..22fb1255 --- /dev/null +++ b/examples/librispeech/s2/path.sh @@ -0,0 +1,14 @@ +export MAIN_ROOT=${PWD}/../../../ + +export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH} +export LC_ALL=C + +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + + +MODEL=u2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh new file mode 100755 index 00000000..b81e8dcf --- /dev/null +++ b/examples/librispeech/s2/run.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e +source path.sh + +stage=0 +stop_stage=100 +conf_path=conf/transformer.yaml +avg_num=30 +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') +echo "checkpoint name ${ckpt}" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/librispeech/s2/utils b/examples/librispeech/s2/utils new file mode 120000 index 00000000..256f914a --- /dev/null +++ b/examples/librispeech/s2/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file From ac0ae57ef24412f971216acabbc14abfce0f65e2 Mon Sep 17 00:00:00 2001 From: Junkun Date: Wed, 4 Aug 2021 13:42:03 -0700 Subject: [PATCH 06/21] add collactor and evaluation code for ST --- deepspeech/exps/u2_st/model.py | 29 +- deepspeech/io/collator_st.py | 666 ++++++++++++++++++++++++++++++ deepspeech/io/dataset.py | 17 +- deepspeech/models/u2_st.py | 734 +++++++++++++++++++++++++++++++++ deepspeech/utils/bleu_score.py | 53 +++ 5 files changed, 1484 insertions(+), 15 deletions(-) create mode 100644 deepspeech/io/collator_st.py create mode 100644 deepspeech/models/u2_st.py create mode 100644 deepspeech/utils/bleu_score.py diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 21323fc9..867d1899 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -24,7 +24,6 @@ from typing import Tuple import numpy as np import paddle -import sacrebleu from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode @@ -32,6 +31,7 @@ from yacs.config import CfgNode from deepspeech.io.collator_st import KaldiPrePorocessedCollator from deepspeech.io.collator_st import SpeechCollator from deepspeech.io.collator_st import TripletKaldiPrePorocessedCollator +from deepspeech.io.collator_st import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler @@ -40,6 +40,7 @@ from deepspeech.models.u2_st import U2STModel from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.scheduler import WarmupLR from deepspeech.training.trainer import Trainer +from deepspeech.utils import bleu_score from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate from deepspeech.utils import layer_tools @@ -248,7 +249,11 @@ class U2STTrainer(Trainer): dev_dataset = Dataset.from_config(config) if config.collator.raw_wav: - TestCollator = Collator = SpeechCollator + if config.model.model_conf.asr_weight > 0.: + Collator = TripletSpeechCollator + TestCollator = SpeechCollator + else: + TestCollator = Collator = SpeechCollator # Not yet implement the mtl loader for raw_wav. 
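+        # Kaldi-feature path below mirrors the raw-wav branch above: joint
+        # ST+ASR training (asr_weight > 0) needs the Triplet* collator so each
+        # training batch also carries transcripts, while evaluation only needs
+        # translations.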
else: if config.model.model_conf.asr_weight > 0.: @@ -393,7 +398,7 @@ class U2STTester(U2STTrainer): lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' + error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' num_proc_bsearch=8, # # of CPUs for beam search. beam_size=10, # Beam search width. batch_size=16, # decoding batch size @@ -428,10 +433,10 @@ class U2STTester(U2STTrainer): audio_len, texts, texts_len, + bleu_func, fout=None): cfg = self.config.decoding len_refs, num_ins = 0, 0 - bleu_func = sacrebleu.corpus_bleu start_time = time.time() text_feature = self.test_loader.collate_fn.text_feature @@ -487,6 +492,9 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + cfg = self.config.decoding + bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + stride_ms = self.test_loader.collate_fn.stride_ms hyps, refs = [], [] len_refs, num_ins = 0, 0 @@ -495,7 +503,7 @@ class U2STTester(U2STTrainer): with open(self.args.result_file, 'w') as fout: for i, batch in enumerate(self.test_loader): metrics = self.compute_translation_metrics( - *batch, fout=fout) + *batch, bleu_func=bleu_func, fout=fout) hyps += metrics['hyps'] refs += metrics['refs'] bleu = metrics['bleu'] @@ -504,19 +512,16 @@ class U2STTester(U2STTrainer): len_refs += metrics['len_refs'] num_ins += metrics['num_ins'] rtf = num_time / (num_frames * stride_ms) - logger.info("RTF: %f, BELU (%d) = %f" % - (rtf, num_ins, bleu)) + logger.info("RTF: %f, BELU (%d) = %f" % (rtf, num_ins, bleu)) rtf = num_time / (num_frames * stride_ms) msg = "Test: " msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) msg += "RTF: {}, ".format(rtf) - msg += "Test set [%s]: %s" % ( - len(hyps), str(sacrebleu.corpus_bleu(hyps, [refs]))) + msg += "Test set [%s]: %s" % (len(hyps), str(bleu_func(hyps, [refs]))) logger.info(msg) - bleu_meta_path = os.path.splitext( - self.args.result_file)[0] + '.bleu' + bleu_meta_path = os.path.splitext(self.args.result_file)[0] + '.bleu' err_type_str = "BLEU" with open(bleu_meta_path, 'w') as f: data = json.dumps({ @@ -527,7 +532,7 @@ class U2STTester(U2STTrainer): "rtf": rtf, err_type_str: - sacrebleu.corpus_bleu(hyps, [refs]).score, + bleu_func(hyps, [refs]).score, "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0, "process_hour": num_time / 1000.0 / 3600.0, diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py new file mode 100644 index 00000000..34933312 --- /dev/null +++ b/deepspeech/io/collator_st.py @@ -0,0 +1,666 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
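+"""Collators for speech translation (ST) data.
+
+SpeechCollator batches (utt, audio, translation) examples from raw
+waveforms, while KaldiPrePorocessedCollator consumes pre-extracted Kaldi
+features; the Triplet* variants additionally return the source transcript
+for joint ST + ASR training.
+"""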
+import io
+import tarfile
+from collections import namedtuple
+from typing import Optional
+from typing import Tuple
+
+import kaldiio
+import numpy as np
+from yacs.config import CfgNode
+
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.io.utility import pad_sequence
+from deepspeech.utils.log import Log
+
+__all__ = [
+    "SpeechCollator", "TripletSpeechCollator", "KaldiPrePorocessedCollator",
+    "TripletKaldiPrePorocessedCollator"
+]
+
+logger = Log(__name__).getlog()
+
+# namedtuple must be module-level (global) so instances can be pickled.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+
+
+class SpeechCollator():
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        default = CfgNode(
+            dict(
+                augmentation_config="",
+                random_seed=0,
+                mean_std_filepath="",
+                unit_type="char",
+                vocab_filepath="",
+                spm_model_prefix="",
+                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+                feat_dim=0,  # 'mfcc', 'fbank'
+                delta_delta=False,  # 'mfcc', 'fbank'
+                stride_ms=10.0,  # ms
+                window_ms=20.0,  # ms
+                n_fft=None,  # fft points
+                max_freq=None,  # None for samplerate/2
+                target_sample_rate=16000,  # target sample rate
+                use_dB_normalization=True,
+                target_dB=-20,
+                dither=1.0,  # feature dither
+                keep_transcription_text=False))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a SpeechCollator object from a config.
+
+        Args:
+            config (yacs.config.CfgNode): configs object.
+
+        Returns:
+            SpeechCollator: collator object.
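+
+        Example (sketch; assumes ``config`` carries a ``collator`` section
+        like the yaml recipes above):
+
+            collator = SpeechCollator.from_config(config)
+            loader = DataLoader(dataset, collate_fn=collator)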
+ """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator + assert 'specgram_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + def __init__( + self, + aug_file, + mean_std_filepath, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, which suffix is *.npy + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. 
+ + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. + """ + self._keep_transcription_text = keep_transcription_text + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), random_seed=random_seed) + + self._normalizer = FeatureNormalizer( + mean_std_filepath) if mean_std_filepath else None + + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, translation): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), translation) + else: + speech_segment = SpeechSegment.from_file(audio_file, translation) + + # audio augment + self._augmentation_pipeline.transform_audio(speech_segment) + + specgram, translation_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + specgram = specgram.transpose([1, 0]) + return specgram, translation_part + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (D, T) + text (List[int] or str): shape (U,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. 
+ audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + texts = [] + text_lens = [] + utts = [] + for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [] + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + else: + tokens = text # token ids + tokens = tokens if isinstance(tokens, np.ndarray) else np.array( + tokens, dtype=np.int64) + texts.append(tokens) + text_lens.append(tokens.shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_texts = pad_sequence( + texts, padding_value=IGNORE_ID).astype(np.int64) + text_lens = np.array(text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, padded_texts, text_lens + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms + + +class TripletSpeechCollator(SpeechCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), translation) + else: + speech_segment = SpeechSegment.from_file(audio_file, translation) + + # audio augment + self._augmentation_pipeline.transform_audio(speech_segment) + + specgram, translation_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + transcript_part = self._speech_featurizer._text_featurizer.featurize( + transcript) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + specgram = specgram.transpose([1, 0]) + return specgram, translation_part, transcript_part + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (D, T) + text (List[int] or str): shape (U,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. 
+ audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] + + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = tokens[idx] if isinstance( + tokens[idx], np.ndarray) else np.array( + tokens[idx], dtype=np.int64) + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) + + +class KaldiPrePorocessedCollator(SpeechCollator): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + feat_dim=0, + stride_ms=10.0, + keep_transcription_text=False)) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. + """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'vocab_filepath' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + feat_dim=config.collator.feat_dim, + stride_ms=config.collator.stride_ms, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + def __init__(self, + aug_file, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + feat_dim=0, + stride_ms=10.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. 
char, word, spm + vocab_filepath (str): vocab file path. + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. + """ + self._keep_transcription_text = keep_transcription_text + self._feat_dim = feat_dim + self._stride_ms = stride_ms + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), random_seed=random_seed) + + self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, + spm_model_prefix) + + def process_utterance(self, audio_file, translation): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of kaldi processed feature. + :type audio_file: str | file + :param translation: Translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + specgram = kaldiio.load_mat(audio_file) + specgram = specgram.transpose([1, 0]) + assert specgram.shape[ + 0] == self._feat_dim, 'expect feat dim {}, but got {}'.format( + self._feat_dim, specgram.shape[0]) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + + specgram = specgram.transpose([1, 0]) + if self._keep_transcription_text: + return specgram, translation + else: + text_ids = self._text_featurizer.featurize(translation) + return specgram, text_ids + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._text_featurizer.vocab_size + + @property + def vocab_list(self): + return self._text_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._text_featurizer.vocab_dict + + @property + def text_feature(self): + return self._text_featurizer + + @property + def feature_size(self): + return self._feat_dim + + @property + def stride_ms(self): + return self._stride_ms + + +class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of kali processed feature. + :type audio_file: str | file + :param translation: Translation text. + :type translation: str + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of translation and transcription parts, + where translation and transcription parts could be token ids or text. 
+ :rtype: tuple of (2darray, (list, list)) + """ + specgram = kaldiio.load_mat(audio_file) + specgram = specgram.transpose([1, 0]) + assert specgram.shape[ + 0] == self._feat_dim, 'expect feat dim {}, but got {}'.format( + self._feat_dim, specgram.shape[0]) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + + specgram = specgram.transpose([1, 0]) + if self._keep_transcription_text: + return specgram, translation, transcript + else: + translation_text_ids = self._text_featurizer.featurize(translation) + transcript_text_ids = self._text_featurizer.featurize(transcript) + return specgram, translation_text_ids, transcript_text_ids + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (D, T) + translation (List[int] or str): shape (U,) + transcription (List[int] or str): shape (V,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. + audio : (B, Tmax, D) + audio_lens: (B) + translation_text : (B, Umax) + translation_text_lens: (B) + transcription_text : (B, Vmax) + transcription_text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] + + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = tokens[idx] if isinstance( + tokens[idx], np.ndarray) else np.array( + tokens[idx], dtype=np.int64) + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 3fc4e988..ac7be1f9 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,9 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = [ - "ManifestDataset", -] +__all__ = ["ManifestDataset", "TripletManifestDataset"] logger = Log(__name__).getlog() @@ -105,3 +103,16 @@ class ManifestDataset(Dataset): def __getitem__(self, idx): instance = self._manifest[idx] return instance["utt"], instance["feat"], instance["text"] + + +class TripletManifestDataset(ManifestDataset): + """ + For Joint Training of Speech Translation and ASR. 
+        text: translation,
+        text1: transcript.
+    """
+
+    def __getitem__(self, idx):
+        instance = self._manifest[idx]
+        return instance["utt"], instance["feat"], instance["text"], instance[
+            "text1"]
diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py
new file mode 100644
index 00000000..5eea139b
--- /dev/null
+++ b/deepspeech/models/u2_st.py
@@ -0,0 +1,734 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""U2 ST Model
+An adaptation of the Unified Streaming and Non-streaming Two-pass End-to-end
+Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) to
+Speech Translation.
+"""
+import sys
+import time
+from collections import defaultdict
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import paddle
+from paddle import jit
+from paddle import nn
+from yacs.config import CfgNode
+
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.frontend.utility import load_cmvn
+from deepspeech.modules.cmvn import GlobalCMVN
+from deepspeech.modules.ctc import CTCDecoder
+from deepspeech.modules.decoder import TransformerDecoder
+from deepspeech.modules.encoder import ConformerEncoder
+from deepspeech.modules.encoder import TransformerEncoder
+from deepspeech.modules.loss import LabelSmoothingLoss
+from deepspeech.modules.mask import make_pad_mask
+from deepspeech.modules.mask import mask_finished_preds
+from deepspeech.modules.mask import mask_finished_scores
+from deepspeech.modules.mask import subsequent_mask
+from deepspeech.utils import checkpoint
+from deepspeech.utils import layer_tools
+from deepspeech.utils.ctc_utils import remove_duplicates_and_blank
+from deepspeech.utils.log import Log
+from deepspeech.utils.tensor_utils import add_sos_eos
+from deepspeech.utils.tensor_utils import pad_sequence
+from deepspeech.utils.tensor_utils import th_accuracy
+from deepspeech.utils.utility import log_add
+
+__all__ = ["U2STModel", "U2STInferModel"]
+
+logger = Log(__name__).getlog()
+
+
+class U2STBaseModel(nn.Layer):
+    """CTC-Attention hybrid Encoder-Decoder model"""
+
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # network architecture
+        default = CfgNode()
+        # allow add new item when merge_with_file
+        default.cmvn_file = ""
+        default.cmvn_file_type = "json"
+        default.input_dim = 0
+        default.output_dim = 0
+        # encoder related
+        default.encoder = 'transformer'
+        default.encoder_conf = CfgNode(
+            dict(
+                output_size=256,  # dimension of attention
+                attention_heads=4,
+                linear_units=2048,  # the number of units of position-wise feed forward
+                num_blocks=12,  # the number of encoder blocks
+                dropout_rate=0.1,
+                positional_dropout_rate=0.1,
+                attention_dropout_rate=0.0,
+                input_layer='conv2d',  # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+                normalize_before=True,
+                # use_cnn_module=True,
+                # cnn_module_kernel=15,
+                # activation_type='swish',
+                # pos_enc_layer_type='rel_pos',
+                # selfattention_layer_type='rel_selfattn',
+            ))
+        # decoder
related + default.decoder = 'transformer' + default.decoder_conf = CfgNode( + dict( + attention_heads=4, + linear_units=2048, + num_blocks=6, + dropout_rate=0.1, + positional_dropout_rate=0.1, + self_attention_dropout_rate=0.0, + src_attention_dropout_rate=0.0, )) + # hybrid CTC/attention + default.model_conf = CfgNode( + dict( + asr_weight=0.0, + ctc_weight=0.0, + lsm_weight=0.1, # label smoothing option + length_normalized_loss=False, )) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, + vocab_size: int, + encoder: TransformerEncoder, + st_decoder: TransformerDecoder, + decoder: TransformerDecoder=None, + ctc: CTCDecoder=None, + ctc_weight: float=0.0, + asr_weight: float=0.0, + ignore_id: int=IGNORE_ID, + lsm_weight: float=0.0, + length_normalized_loss: bool=False): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + + super().__init__() + # note that eos is the same as sos (equivalent ID) + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.asr_weight = asr_weight + + self.encoder = encoder + self.st_decoder = st_decoder + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, ) + + def forward( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + asr_text: paddle.Tensor=None, + asr_text_lengths: paddle.Tensor=None, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[ + paddle.Tensor]]: + """Frontend + Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + Returns: + total_loss, attention_loss, ctc_loss + """ + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + start = time.time() + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_time = time.time() - start + #logger.debug(f"encoder time: {encoder_time}") + #TODO(Hui Zhang): sum not support bool type + #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( + 1) #[B, 1, T] -> [B] + + # 2a. ST-decoder branch + start = time.time() + loss_st, acc_st = self._calc_st_loss(encoder_out, encoder_mask, text, + text_lengths) + decoder_time = time.time() - start + + loss_asr_att = None + loss_asr_ctc = None + # 2b. ASR Attention-decoder branch + if self.asr_weight > 0.: + if self.ctc_weight != 1.0: + start = time.time() + loss_asr_att, acc_att = self._calc_att_loss( + encoder_out, encoder_mask, asr_text, asr_text_lengths) + decoder_time = time.time() - start + + # 2c. 
CTC branch + if self.ctc_weight != 0.0: + start = time.time() + loss_asr_ctc = self.ctc(encoder_out, encoder_out_lens, asr_text, + asr_text_lengths) + ctc_time = time.time() - start + + if loss_asr_ctc is None: + loss_asr = loss_asr_att + elif loss_asr_att is None: + loss_asr = loss_asr_ctc + else: + loss_asr = self.ctc_weight * loss_asr_ctc + (1 - self.ctc_weight + ) * loss_asr_att + loss = self.asr_weight * loss_asr + (1 - self.asr_weight) * loss_st + else: + loss = loss_st + return loss, loss_st, loss_asr_att, loss_asr_ctc + + def _calc_st_loss( + self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]: + """Calc attention loss. + + Args: + encoder_out (paddle.Tensor): [B, Tmax, D] + encoder_mask (paddle.Tensor): [B, 1, Tmax] + ys_pad (paddle.Tensor): [B, Umax] + ys_pad_lens (paddle.Tensor): [B] + + Returns: + Tuple[paddle.Tensor, float]: attention_loss, accuracy rate + """ + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, ) + return loss_att, acc_att + + def _calc_att_loss( + self, + encoder_out: paddle.Tensor, + encoder_mask: paddle.Tensor, + ys_pad: paddle.Tensor, + ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]: + """Calc attention loss. + + Args: + encoder_out (paddle.Tensor): [B, Tmax, D] + encoder_mask (paddle.Tensor): [B, 1, Tmax] + ys_pad (paddle.Tensor): [B, Umax] + ys_pad_lens (paddle.Tensor): [B] + + Returns: + Tuple[paddle.Tensor, float]: attention_loss, accuracy rate + """ + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # 1. Forward decoder + decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad, + ys_in_lens) + + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, ) + return loss_att, acc_att + + def _forward_encoder( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Encoder pass. + + Args: + speech (paddle.Tensor): [B, Tmax, D] + speech_lengths (paddle.Tensor): [B] + decoding_chunk_size (int, optional): chuck size. Defaults to -1. + num_decoding_left_chunks (int, optional): nums chunks. Defaults to -1. + simulate_streaming (bool, optional): streaming or not. Defaults to False. + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), + encoder hiddens mask (B, 1, Tmax). + """ + # Let's assume B = batch_size + # 1. 
Encoder + if simulate_streaming and decoding_chunk_size > 0: + encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( + speech, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: + encoder_out, encoder_mask = self.encoder( + speech, + speech_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask + + def translate( + self, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + beam_size: int=10, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False, ) -> paddle.Tensor: + """ Apply beam search on attention decoder + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + paddle.Tensor: decoding result, (batch, max_result_len) + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + device = speech.place + batch_size = speech.shape[0] + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + encoder_dim = encoder_out.size(2) + running_size = batch_size * beam_size + encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( + running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) + encoder_mask = encoder_mask.unsqueeze(1).repeat( + 1, beam_size, 1, 1).view(running_size, 1, + maxlen) # (B*N, 1, max_len) + + hyps = paddle.ones( + [running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1) + # log scale score + scores = paddle.to_tensor( + [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float) + scores = scores.to(device).repeat(batch_size).unsqueeze(1).to( + device) # (B*N, 1) + end_flag = paddle.zeros_like(scores, dtype=paddle.bool) # (B*N, 1) + cache: Optional[List[paddle.Tensor]] = None + # 2. Decoder forward step by step + for i in range(1, maxlen + 1): + # Stop if all batch and all beam produce eos + # TODO(Hui Zhang): if end_flag.sum() == running_size: + if end_flag.cast(paddle.int64).sum() == running_size: + break + + # 2.1 Forward decoder step + hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( + running_size, 1, 1).to(device) # (B*N, i, i) + # logp: (B*N, vocab) + logp, cache = self.st_decoder.forward_one_step( + encoder_out, encoder_mask, hyps, hyps_mask, cache) + + # 2.2 First beam prune: select topk best prob at current time + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) + + # 2.3 Seconde beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) + scores = scores.view(-1, 1) # (B*N, 1) + + # 2.4. 
Compute base index in top_k_index, + # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), + # then find offset_k_index in top_k_index + base_k_index = paddle.arange(batch_size).view(-1, 1).repeat( + 1, beam_size) # (B, N) + base_k_index = base_k_index * beam_size * beam_size + best_k_index = base_k_index.view(-1) + offset_k_index.view( + -1) # (B*N) + + # 2.5 Update best hyps + best_k_pred = paddle.index_select( + top_k_index.view(-1), index=best_k_index, axis=0) # (B*N) + best_hyps_index = best_k_index // beam_size + last_best_k_hyps = paddle.index_select( + hyps, index=best_hyps_index, axis=0) # (B*N, i) + hyps = paddle.cat( + (last_best_k_hyps, best_k_pred.view(-1, 1)), + dim=1) # (B*N, i+1) + + # 2.6 Update end flag + end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1) + + # 3. Select best of best + scores = scores.view(batch_size, beam_size) + # TODO: length normalization + best_index = paddle.argmax(scores, axis=-1).long() # (B) + best_hyps_index = best_index + paddle.arange( + batch_size, dtype=paddle.long) * beam_size + best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0) + best_hyps = best_hyps[:, 1:] + return best_hyps + + @jit.export + def subsampling_rate(self) -> int: + """ Export interface for c++ call, return subsampling_rate of the + model + """ + return self.encoder.embed.subsampling_rate + + @jit.export + def right_context(self) -> int: + """ Export interface for c++ call, return right_context of the model + """ + return self.encoder.embed.right_context + + @jit.export + def sos_symbol(self) -> int: + """ Export interface for c++ call, return sos symbol id of the model + """ + return self.sos + + @jit.export + def eos_symbol(self) -> int: + """ Export interface for c++ call, return eos symbol id of the model + """ + return self.eos + + @jit.export + def forward_encoder_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[ + paddle.Tensor]]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + Args: + xs (paddle.Tensor): chunk input + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer + cnn cache + Returns: + paddle.Tensor: output, it ranges from time 0 to current chunk. 
+ paddle.Tensor: subsampling cache + List[paddle.Tensor]: attention cache + List[paddle.Tensor]: conformer cnn cache + """ + return self.encoder.forward_chunk( + xs, offset, required_cache_size, subsampling_cache, + elayers_output_cache, conformer_cnn_cache) + + @jit.export + def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: + """ Export interface for c++ call, apply linear transform and log + softmax before ctc + Args: + xs (paddle.Tensor): encoder output + Returns: + paddle.Tensor: activation before ctc + """ + return self.ctc.log_softmax(xs) + + @jit.export + def forward_attention_decoder( + self, + hyps: paddle.Tensor, + hyps_lens: paddle.Tensor, + encoder_out: paddle.Tensor, ) -> paddle.Tensor: + """ Export interface for c++ call, forward decoder with multiple + hypothesis from ctc prefix beam search and one encoder output + Args: + hyps (paddle.Tensor): hyps from ctc prefix beam search, already + pad sos at the begining, (B, T) + hyps_lens (paddle.Tensor): length of each hyp in hyps, (B) + encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D) + Returns: + paddle.Tensor: decoder output, (B, L) + """ + assert encoder_out.size(0) == 1 + num_hyps = hyps.size(0) + assert hyps_lens.size(0) == num_hyps + encoder_out = encoder_out.repeat(num_hyps, 1, 1) + # (B, 1, T) + encoder_mask = paddle.ones( + [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) + # (num_hyps, max_hyps_len, vocab_size) + decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, + hyps_lens) + decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1) + return decoder_out + + @paddle.no_grad() + def decode(self, + feats: paddle.Tensor, + feats_lengths: paddle.Tensor, + text_feature: Dict[str, int], + decoding_method: str, + lang_model_path: str, + beam_alpha: float, + beam_beta: float, + beam_size: int, + cutoff_prob: float, + cutoff_top_n: int, + num_processes: int, + ctc_weight: float=0.0, + decoding_chunk_size: int=-1, + num_decoding_left_chunks: int=-1, + simulate_streaming: bool=False): + """u2 decoding. + + Args: + feats (Tenosr): audio features, (B, T, D) + feats_lengths (Tenosr): (B) + text_feature (TextFeaturizer): text feature object. + decoding_method (str): decoding mode, e.g. + 'fullsentence', + 'simultaneous' + lang_model_path (str): lm path. + beam_alpha (float): lm weight. + beam_beta (float): length penalty. + beam_size (int): beam size for search + cutoff_prob (float): for prune. + cutoff_top_n (int): for prune. + num_processes (int): + ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0. + decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): + number of left chunks for decoding. Defaults to -1. + simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. + + Raises: + ValueError: when not support decoding_method. + + Returns: + List[List[int]]: transcripts. 
+ """ + batch_size = feats.size(0) + + if decoding_method == 'fullsentence': + hyps = self.translate( + feats, + feats_lengths, + beam_size=beam_size, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + else: + raise ValueError(f"Not support decoding method: {decoding_method}") + + res = [text_feature.defeaturize(hyp) for hyp in hyps] + return res + + +class U2STModel(U2STBaseModel): + def __init__(self, configs: dict): + vocab_size, encoder, decoder = U2STModel._init_from_config(configs) + + if isinstance(decoder, Tuple): + st_decoder, asr_decoder, ctc = decoder + super().__init__( + vocab_size=vocab_size, + encoder=encoder, + st_decoder=st_decoder, + decoder=asr_decoder, + ctc=ctc, + **configs['model_conf']) + else: + super().__init__( + vocab_size=vocab_size, + encoder=encoder, + st_decoder=decoder, + **configs['model_conf']) + + @classmethod + def _init_from_config(cls, configs: dict): + """init sub module for model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + """ + if configs['cmvn_file'] is not None: + mean, istd = load_cmvn(configs['cmvn_file'], + configs['cmvn_file_type']) + global_cmvn = GlobalCMVN( + paddle.to_tensor(mean, dtype=paddle.float), + paddle.to_tensor(istd, dtype=paddle.float)) + else: + global_cmvn = None + + input_dim = configs['input_dim'] + vocab_size = configs['output_dim'] + assert input_dim != 0, input_dim + assert vocab_size != 0, vocab_size + + encoder_type = configs.get('encoder', 'transformer') + logger.info(f"U2 Encoder type: {encoder_type}") + if encoder_type == 'transformer': + encoder = TransformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + elif encoder_type == 'conformer': + encoder = ConformerEncoder( + input_dim, global_cmvn=global_cmvn, **configs['encoder_conf']) + else: + raise ValueError(f"not support encoder type:{encoder_type}") + + st_decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + + asr_weight = configs['model_conf']['asr_weight'] + logger.info(f"ASR Joint Training Weight: {asr_weight}") + + if asr_weight > 0.: + decoder = TransformerDecoder(vocab_size, + encoder.output_size(), + **configs['decoder_conf']) + ctc = CTCDecoder( + odim=vocab_size, + enc_n_units=encoder.output_size(), + blank_id=0, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True) # sum / batch_size + + return vocab_size, encoder, (st_decoder, decoder, ctc) + else: + return vocab_size, encoder, st_decoder + + @classmethod + def from_config(cls, configs: dict): + """init model. + + Args: + configs (dict): config dict. + + Raises: + ValueError: raise when using not support encoder type. + + Returns: + nn.Layer: U2STModel + """ + model = cls(configs) + return model + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + + Args: + dataloader (paddle.io.DataLoader): not used. + config (yacs.config.CfgNode): model configs + checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name + + Returns: + DeepSpeech2Model: The model built from pretrained result. 
+ """ + config.defrost() + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size + config.freeze() + model = cls.from_config(config) + + if checkpoint_path: + infos = checkpoint.load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + +class U2STInferModel(U2STModel): + def __init__(self, configs: dict): + super().__init__(configs) + + def forward(self, + feats, + feats_lengths, + decoding_chunk_size=-1, + num_decoding_left_chunks=-1, + simulate_streaming=False): + """export model function + + Args: + feats (Tensor): [B, T, D] + feats_lengths (Tensor): [B] + + Returns: + List[List[int]]: best path result + """ + return self.translate( + feats, + feats_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks, + simulate_streaming=simulate_streaming) diff --git a/deepspeech/utils/bleu_score.py b/deepspeech/utils/bleu_score.py new file mode 100644 index 00000000..580fbf61 --- /dev/null +++ b/deepspeech/utils/bleu_score.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module provides functions to calculate bleu score in different level. +e.g. wer for word-level, cer for char-level. +""" +import numpy as np +import sacrebleu + +__all__ = ['bleu', 'char_bleu'] + + +def bleu(hypothesis, reference): + """Calculate BLEU. BLEU compares reference text and + hypothesis text in word-level using scarebleu. + + + + :param reference: The reference sentences. + :type reference: list[list[str]] + :param hypothesis: The hypothesis sentence. + :type hypothesis: list[str] + :raises ValueError: If the reference length is zero. + """ + + return sacrebleu.corpus_bleu(hypothesis, reference) + +def char_bleu(hypothesis, reference): + """Calculate BLEU. BLEU compares reference text and + hypothesis text in char-level using scarebleu. + + + + :param reference: The reference sentences. + :type reference: list[list[str]] + :param hypothesis: The hypothesis sentence. + :type hypothesis: list[str] + :raises ValueError: If the reference number is zero. 
+ """ + hypothesis =[' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] + reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref ]for ref in reference ] + + return sacrebleu.corpus_bleu(hypothesis, reference) \ No newline at end of file From 9d05a749e22089797d4e1fbbd64f24aca576179c Mon Sep 17 00:00:00 2001 From: Junkun Date: Wed, 4 Aug 2021 13:45:44 -0700 Subject: [PATCH 07/21] script for TED-En-Zh translation --- examples/dataset/ted_en_zh/.gitignore | 6 + examples/dataset/ted_en_zh/ted_en_zh.py | 114 ++++++++++++++++++ examples/ted_en_zh/conf/transformer.yaml | 109 +++++++++++++++++ .../conf/transformer_joint_noam.yaml | 111 +++++++++++++++++ examples/ted_en_zh/local/data.sh | 111 +++++++++++++++++ examples/ted_en_zh/local/test.sh | 35 ++++++ examples/ted_en_zh/local/train.sh | 33 +++++ examples/ted_en_zh/path.sh | 14 +++ examples/ted_en_zh/run.sh | 40 ++++++ utils/build_vocab.py | 13 +- utils/format_triplet_data.py | 96 +++++++++++++++ 11 files changed, 679 insertions(+), 3 deletions(-) create mode 100644 examples/dataset/ted_en_zh/.gitignore create mode 100644 examples/dataset/ted_en_zh/ted_en_zh.py create mode 100644 examples/ted_en_zh/conf/transformer.yaml create mode 100644 examples/ted_en_zh/conf/transformer_joint_noam.yaml create mode 100755 examples/ted_en_zh/local/data.sh create mode 100755 examples/ted_en_zh/local/test.sh create mode 100755 examples/ted_en_zh/local/train.sh create mode 100644 examples/ted_en_zh/path.sh create mode 100755 examples/ted_en_zh/run.sh create mode 100755 utils/format_triplet_data.py diff --git a/examples/dataset/ted_en_zh/.gitignore b/examples/dataset/ted_en_zh/.gitignore new file mode 100644 index 00000000..ad6ab64a --- /dev/null +++ b/examples/dataset/ted_en_zh/.gitignore @@ -0,0 +1,6 @@ +*.tar.gz.* +manifest.* +*.md +EN-ZH/ +train-split/ +test-segment/ \ No newline at end of file diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py new file mode 100644 index 00000000..08f15119 --- /dev/null +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -0,0 +1,114 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Ted-En-Zh speech translation dataset + +Create manifest files from splited datased. +dev set: tst2010, test set: tst2015 +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os + +import soundfile + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--src_dir", + default="", + type=str, + help="Directory to kaldi splited data. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + + data_types_infos = [('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), + ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), + ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')] + for data_info in data_types_infos: + dtype, audio_relative_dir, text_relative_path = data_info + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + text_path = os.path.join(data_dir, text_relative_path) + audio_dir = os.path.join(data_dir, audio_relative_dir) + + for line in codecs.open(text_path, 'r', 'utf-8', errors='ignore'): + line = line.strip() + if len(line) < 1: + continue + audio_id, trancription, translation = line.split('\t') + utt = audio_id.split('.')[0] + + audio_path = os.path.join(audio_dir, audio_id) + if os.path.exists(audio_path): + if os.path.getsize(audio_path) < 30000: + continue + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + json_lines.append( + json.dumps( + { + 'utt': utt, + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': " ".join(translation.split()), + 'text1': " ".join(trancription.split()) + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(translation.split()) + total_num += 1 + if not total_num % 1000: + print(dtype, 'Processed:', total_num) + + manifest_path = manifest_path_prefix + '.' + dtype + '.raw' + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(src_dir, manifest_path=None): + """create manifest file.""" + if os.path.isdir(manifest_path): + manifest_path = os.path.join(manifest_path, 'manifest') + if manifest_path: + create_manifest(src_dir, manifest_path) + + +def main(): + if args.src_dir.startswith('~'): + args.src_dir = os.path.expanduser(args.src_dir) + + prepare_dataset(src_dir=args.src_dir, manifest_path=args.manifest_prefix) + + print("manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/examples/ted_en_zh/conf/transformer.yaml b/examples/ted_en_zh/conf/transformer.yaml new file mode 100644 index 00000000..10a3e7f5 --- /dev/null +++ b/examples/ted_en_zh/conf/transformer.yaml @@ -0,0 +1,109 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train.tiny + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 # second + max_input_len: 3000.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.01 + max_output_input_ratio: 20.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: data/bpe_unigram_8000 + mean_std_filepath: "" + # augmentation_config: conf/augmentation.json + batch_size: 10 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +# network architecture +model: + cmvn_file: "data/mean_std.json" + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + 
positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+  # decoder related
+  decoder: transformer
+  decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+  # hybrid CTC/attention
+  model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+training:
+  n_epoch: 120
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.004
+    weight_decay: 1e-06
+  scheduler: warmuplr
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 5
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+
+decoding:
+  batch_size: 5
+  error_rate_type: char-bleu
+  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                          # <0: for decoding, use full chunk.
+                          # >0: for decoding, use fixed chunk size as set.
+                          # 0: used for training, it's prohibited here.
+  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/ted_en_zh/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/conf/transformer_joint_noam.yaml
new file mode 100644
index 00000000..ba384f8c
--- /dev/null
+++ b/examples/ted_en_zh/conf/transformer_joint_noam.yaml
@@ -0,0 +1,111 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  min_input_len: 0.5 # second
+  max_input_len: 3000.0 # second
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.01
+  max_output_input_ratio: 20.0
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'spm'
+  spm_model_prefix: data/bpe_unigram_8000
+  mean_std_filepath: ""
+  # augmentation_config: conf/augmentation.json
+  batch_size: 10
+  raw_wav: True  # use raw_wav or kaldi feature
+  specgram_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+  cmvn_file: "data/mean_std.json"
+  cmvn_file_type: "json"
+  # encoder related
+  encoder: transformer
+  encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+  # decoder related
+  decoder: transformer
+  decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+  # hybrid CTC/attention
+  model_conf:
+    asr_weight: 0.5
+    ctc_weight: 0.3
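+    # Note (added for clarity): per U2STModel._init_from_config, asr_weight > 0
+    # enables the auxiliary ASR decoder and CTC branch; ctc_weight is then the
+    # CTC share of that auxiliary ASR loss.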
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+training:
+  n_epoch: 120
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 2.5
+    weight_decay: 1e-06
+  scheduler: noam
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 5
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+
+decoding:
+  batch_size: 5
+  error_rate_type: char-bleu
+  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                          # <0: for decoding, use full chunk.
+                          # >0: for decoding, use fixed chunk size as set.
+                          # 0: used for training, it's prohibited here.
+  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False # simulate streaming inference. Defaults to False.
+
+
diff --git a/examples/ted_en_zh/local/data.sh b/examples/ted_en_zh/local/data.sh
new file mode 100755
index 00000000..0a5c58aa
--- /dev/null
+++ b/examples/ted_en_zh/local/data.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+# bpemode (unigram or bpe)
+nbpe=8000
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+DATA_DIR=
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ! -d "${DATA_DIR}" ]; then
+    echo "Error: Dataset is not available. Please download and unzip the dataset."
+    echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
+    echo "The tree of the directory should be:"
+    echo "."
+    echo "|-- En-Zh"
+    echo "|-- test-segment"
+    echo "    |-- tst2010"
+    echo "    |-- ..."
+    echo "|-- train-split"
+    echo "    |-- train-segment"
+    echo "|-- README.md"
+
+    exit 1
+fi
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # generate manifests
+    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
+    --manifest_prefix="data/manifest" \
+    --src_dir="${DATA_DIR}"
+
+    echo "Complete raw data pre-process."
+fi
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --text_keys 'text' 'text1' \
+    --manifest_paths="data/manifest.train.raw"
+
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --num_samples=-1 \
+    --specgram_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --sample_rate=16000 \
+    --stride_ms=10.0 \
+    --window_ms=25.0 \
+    --use_dB_normalization=False \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for set in train dev test; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
+        --feat_type "raw" \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "spm" \
+        --spm_model_prefix ${bpeprefix} \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${set}.raw" \
+        --output_path="data/manifest.${set}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    }&
+    done
+    wait
+fi
+
+echo "Ted En-Zh Data preparation done."
+exit 0
diff --git a/examples/ted_en_zh/local/test.sh b/examples/ted_en_zh/local/test.sh
new file mode 100755
index 00000000..802bb13c
--- /dev/null
+++ b/examples/ted_en_zh/local/test.sh
@@ -0,0 +1,35 @@
+#! /usr/bin/env bash

+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ ngpu == 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
+for type in fullsentence; do
+    echo "decoding ${type}"
+    batch_size=32
+    python3 -u ${BIN_DIR}/test.py \
+    --device ${device} \
+    --nproc 1 \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+exit 0
diff --git a/examples/ted_en_zh/local/train.sh b/examples/ted_en_zh/local/train.sh
new file mode 100755
index 00000000..f3eb98da
--- /dev/null
+++ b/examples/ted_en_zh/local/train.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+echo "using ${device}..."
+
+mkdir -p exp
+
+python3 -u ${BIN_DIR}/train.py \
+--device ${device} \
+--nproc ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/ted_en_zh/path.sh b/examples/ted_en_zh/path.sh
new file mode 100644
index 00000000..881a5b91
--- /dev/null
+++ b/examples/ted_en_zh/path.sh
@@ -0,0 +1,14 @@
+export MAIN_ROOT=${PWD}/../../
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+
+MODEL=u2_st
+export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
diff --git a/examples/ted_en_zh/run.sh b/examples/ted_en_zh/run.sh
new file mode 100755
index 00000000..89048f3d
--- /dev/null
+++ b/examples/ted_en_zh/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+source path.sh
+
+stage=0
+stop_stage=100
+conf_path=conf/transformer_joint_noam.yaml
+avg_num=5
+data_path=./TED-En-Zh # path to unzipped data
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh --DATA_DIR ${data_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    ../../utils/avg.sh exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index 76092b25..151d52f8 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -44,6 +44,11 @@ add_arg('manifest_paths', str,
         "You can provide multiple manifest files.",
         nargs='+',
         required=True)
+add_arg('text_keys', str,
+        'text',
+        "keys of the text in manifest for building vocabulary. "
+        "You can provide multiple keys.",
+        nargs='+')
 # bpe
 add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
 add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
@@ -58,10 +63,10 @@ def count_manifest(counter, text_feature, manifest_path):
         line = text_feature.tokenize(line_json['text'])
         counter.update(line)
 
-def dump_text_manifest(fileobj, manifest_path):
+def dump_text_manifest(fileobj, manifest_path, key='text'):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        fileobj.write(line_json['text'] + "\n")
+        fileobj.write(line_json[key] + "\n")
 
 def main():
     print_arguments(args, globals())
@@ -78,7 +83,9 @@ def main():
 
     fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
     for manifest_path in args.manifest_paths:
-        dump_text_manifest(fp, manifest_path)
+        text_keys = [args.text_keys] if not isinstance(args.text_keys, list) else args.text_keys
+        for text_key in text_keys:
+            dump_text_manifest(fp, manifest_path, key=text_key)
     fp.close()
     # train
     spm.SentencePieceTrainer.Train(
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
new file mode 100755
index 00000000..f3dd7ca4
--- /dev/null
+++ b/utils/format_triplet_data.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""format manifest with more metadata.""" +import argparse +import functools +import json + +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer +from deepspeech.frontend.utility import load_cmvn +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments +from deepspeech.utils.utility import print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi") +add_arg('cmvn_path', str, + 'examples/librispeech/data/mean_std.json', + "Filepath of cmvn.") +add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm") +add_arg('vocab_path', str, + 'examples/librispeech/data/vocab.txt', + "Filepath of the vocabulary.") +add_arg('manifest_paths', str, + None, + "Filepaths of manifests for building vocabulary. " + "You can provide multiple manifest files.", + nargs='+', + required=True) +# bpe +add_arg('spm_model_prefix', str, None, + "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm") +add_arg('output_path', str, None, "filepath of formated manifest.", required=True) +# yapf: disable +args = parser.parse_args() + + +def main(): + print_arguments(args, globals()) + fout = open(args.output_path, 'w', encoding='utf-8') + + # get feat dim + mean, std = load_cmvn(args.cmvn_path, filetype='json') + feat_dim = mean.shape[0] #(D) + print(f"Feature dim: {feat_dim}") + + text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix) + vocab_size = text_feature.vocab_size + print(f"Vocab size: {vocab_size}") + + count = 0 + for manifest_path in args.manifest_paths: + manifest_jsons = read_manifest(manifest_path) + for line_json in manifest_jsons: + # text: translation text, text1: transcript text. + # Currently only support joint-vocab, will add separate vocabs setting. 
+            line = line_json['text']
+            tokens = text_feature.tokenize(line)
+            tokenids = text_feature.featurize(line)
+            line_json['token'] = tokens
+            line_json['token_id'] = tokenids
+            line_json['token_shape'] = (len(tokenids), vocab_size)
+            line = line_json['text1']
+            tokens = text_feature.tokenize(line)
+            tokenids = text_feature.featurize(line)
+            line_json['token1'] = tokens
+            line_json['token_id1'] = tokenids
+            line_json['token_shape1'] = (len(tokenids), vocab_size)
+            feat_shape = line_json['feat_shape']
+            assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+            if args.feat_type == 'raw':
+                feat_shape.append(feat_dim)
+            else:  # kaldi
+                raise NotImplementedError('kaldi feat is not supported yet!')
+            fout.write(json.dumps(line_json) + '\n')
+            count += 1
+
+    print(f"Examples number: {count}")
+    fout.close()
+
+
+if __name__ == '__main__':
+    main()

From dc44f552a1f3250cb6a05ab92a821ee96dc1e84f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 5 Aug 2021 02:57:05 +0000
Subject: [PATCH 08/21] update readme

---
 examples/librispeech/README.md    |  1 +
 examples/librispeech/s2/README.md | 29 ++++++++++++-----------------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 354baafa..2718988f 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -3,3 +3,4 @@
 * s0 is for deepspeech2 offline
 * s1 is for transformer/conformer/U2
 * s2 is for transformer/conformer/U2 w/ kaldi feat
+Note: s2 requires Kaldi to be installed.
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index f27b474c..8f092dd8 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -1,47 +1,42 @@
 # LibriSpeech
 
 ## Data
-
 | Data Subset | Duration in Seconds |
 | data/manifest.train | 0.83s ~ 29.735s |
 | data/manifest.dev | 1.065 ~ 35.155s |
 | data/manifest.test-clean | 1.285s ~ 34.955s |
 
 ## Conformer
-
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 |
-
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | - | - |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | | |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | | |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | | |
 
 ### Test w/o length filter
-
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | | |
+
 
 ## Chunk Conformer
-
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M |
conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.01250648 | 0.069548 | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.01250648 | 0.094753 | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 | - | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 | - | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | | | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | | | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | | - | +| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | | - | ## Transformer - | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | | | ### Test w/o length filter - | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | | From 3864e1130222336c93d5bc984b6d2f45c3428a9d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 02:58:32 +0000 Subject: [PATCH 09/21] update librispeech chunk conformer result --- examples/librispeech/s1/README.md | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index f27b474c..88ea216e 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -1,14 +1,14 @@ # LibriSpeech ## Data - | Data Subset | Duration in Seconds | +| --- | --- | | data/manifest.train | 0.83s ~ 29.735s | | data/manifest.dev | 1.065 ~ 35.155s | | data/manifest.test-clean | 1.285s ~ 34.955s | -## Conformer +## Conformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 | @@ -16,32 +16,27 @@ | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 | - ### Test w/o length filter - | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 | -## Chunk Conformer +## Chunk Conformer | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 
16, -1 | 7.01250648 | 0.069548 |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.01250648 | 0.094753 |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 | - |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 | - |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.11 | 0.063193 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.11 | 0.082394 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.11 | 0.082156 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.11 | 0.071000 |
 
 ## Transformer
-
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 |
 
 ### Test w/o length filter
-
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
-
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
\ No newline at end of file

From 515497ae1f3616efc1a8c020897d795fce5ecea1 Mon Sep 17 00:00:00 2001
From: Junkun
Date: Wed, 4 Aug 2021 21:33:23 -0700
Subject: [PATCH 10/21] refine the code

---
 deepspeech/io/collator_st.py            |  2 +-
 deepspeech/utils/bleu_score.py          |  8 +++++---
 examples/dataset/ted_en_zh/ted_en_zh.py | 10 ++++++----
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py
index 34933312..1be6445d 100644
--- a/deepspeech/io/collator_st.py
+++ b/deepspeech/io/collator_st.py
@@ -563,7 +563,7 @@ class KaldiPrePorocessedCollator(SpeechCollator):
     @property
     def feature_size(self):
         return self._feat_dim
-    
+
     @property
     def stride_ms(self):
         return self._stride_ms
diff --git a/deepspeech/utils/bleu_score.py b/deepspeech/utils/bleu_score.py
index 580fbf61..f1bf5261 100644
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
@@ -35,6 +35,7 @@ def bleu(hypothesis, reference):
 
     return sacrebleu.corpus_bleu(hypothesis, reference)
 
+
 def char_bleu(hypothesis, reference):
     """Calculate BLEU. BLEU compares reference text and
     hypothesis text in char-level using sacrebleu.
@@ -47,7 +48,8 @@ def char_bleu(hypothesis, reference):
     :type hypothesis: list[str]
     :raises ValueError: If the reference number is zero.
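 
     Example (illustrative; the exact score depends on the sacrebleu version):
         >>> char_bleu(['今天天气不错'], [['今天天气很好']])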
""" - hypothesis =[' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] - reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref ]for ref in reference ] + hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] + reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref] + for ref in reference] - return sacrebleu.corpus_bleu(hypothesis, reference) \ No newline at end of file + return sacrebleu.corpus_bleu(hypothesis, reference) diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 08f15119..14bef01d 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -44,9 +44,11 @@ def create_manifest(data_dir, manifest_path_prefix): print("Creating manifest %s ..." % manifest_path_prefix) json_lines = [] - data_types_infos = [('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), - ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), - ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')] + data_types_infos = [ + ('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), + ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), + ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh') + ] for data_info in data_types_infos: dtype, audio_relative_dir, text_relative_path = data_info del json_lines[:] @@ -63,7 +65,7 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_id, trancription, translation = line.split('\t') utt = audio_id.split('.')[0] - + audio_path = os.path.join(audio_dir, audio_id) if os.path.exists(audio_path): if os.path.getsize(audio_path) < 30000: From 45e71a0a64271bd8fbec85dc605088dae9186a59 Mon Sep 17 00:00:00 2001 From: Junkun Date: Wed, 4 Aug 2021 21:33:35 -0700 Subject: [PATCH 11/21] correct yaml --- examples/ted_en_zh/conf/transformer_joint_noam.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ted_en_zh/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/conf/transformer_joint_noam.yaml index ba384f8c..bc1f8890 100644 --- a/examples/ted_en_zh/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/conf/transformer_joint_noam.yaml @@ -3,8 +3,8 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - min_input_len: 0.5 # second - max_input_len: 3000.0 # second + min_input_len: 0.05 # second + max_input_len: 30.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.01 From 895330bad354d1447ae9cb5934f7a1bbebd561c4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 04:54:48 +0000 Subject: [PATCH 12/21] refactor ted en zh script; fix MAIN_ROOT path --- examples/aishell/s0/path.sh | 2 +- examples/aishell/s1/path.sh | 2 +- examples/callcenter/s1/path.sh | 2 +- examples/cc-cedict/path.sh | 2 +- examples/chinese_g2p/path.sh | 2 +- examples/librispeech/s0/path.sh | 2 +- examples/librispeech/s1/path.sh | 2 +- examples/librispeech/s2/path.sh | 2 +- examples/ngram_lm/s0/path.sh | 2 +- examples/spm/path.sh | 2 +- examples/ted_en_zh/README.md | 3 +++ examples/ted_en_zh/{ => t0}/conf/transformer.yaml | 4 ++-- examples/ted_en_zh/{ => t0}/conf/transformer_joint_noam.yaml | 0 examples/ted_en_zh/{ => t0}/local/data.sh | 0 examples/ted_en_zh/{ => t0}/local/test.sh | 0 examples/ted_en_zh/{ => t0}/local/train.sh | 0 examples/ted_en_zh/{ => t0}/path.sh | 2 +- examples/ted_en_zh/{ => t0}/run.sh | 0 examples/text_normalization/path.sh | 3 ++- examples/thchs30/a0/path.sh | 2 +- 
examples/timit/s1/path.sh | 3 ++- examples/tiny/s0/path.sh | 2 +- examples/tiny/s1/path.sh | 2 +- 23 files changed, 23 insertions(+), 18 deletions(-) create mode 100644 examples/ted_en_zh/README.md rename examples/ted_en_zh/{ => t0}/conf/transformer.yaml (97%) rename examples/ted_en_zh/{ => t0}/conf/transformer_joint_noam.yaml (100%) rename examples/ted_en_zh/{ => t0}/local/data.sh (100%) rename examples/ted_en_zh/{ => t0}/local/test.sh (100%) rename examples/ted_en_zh/{ => t0}/local/train.sh (100%) rename examples/ted_en_zh/{ => t0}/path.sh (88%) rename examples/ted_en_zh/{ => t0}/run.sh (100%) diff --git a/examples/aishell/s0/path.sh b/examples/aishell/s0/path.sh index 552b9678..e6d3a655 100644 --- a/examples/aishell/s0/path.sh +++ b/examples/aishell/s0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh index 6214c8ac..6807a950 100644 --- a/examples/aishell/s1/path.sh +++ b/examples/aishell/s1/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/s1/path.sh index 30adb6ca..29841bc1 100644 --- a/examples/callcenter/s1/path.sh +++ b/examples/callcenter/s1/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/cc-cedict/path.sh b/examples/cc-cedict/path.sh index 84e2de7d..f8fdd82d 100644 --- a/examples/cc-cedict/path.sh +++ b/examples/cc-cedict/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=`realpath ${PWD}/../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/chinese_g2p/path.sh b/examples/chinese_g2p/path.sh index b4c625f9..482177dc 100644 --- a/examples/chinese_g2p/path.sh +++ b/examples/chinese_g2p/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=`realpath ${PWD}/../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/s0/path.sh index 777da29e..8a9345f2 100644 --- a/examples/librispeech/s0/path.sh +++ b/examples/librispeech/s0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/s1/path.sh index 22fb1255..457f7e54 100644 --- a/examples/librispeech/s1/path.sh +++ b/examples/librispeech/s1/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH} export LC_ALL=C diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh index 22fb1255..457f7e54 100644 --- a/examples/librispeech/s2/path.sh +++ b/examples/librispeech/s2/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH} export LC_ALL=C diff --git a/examples/ngram_lm/s0/path.sh b/examples/ngram_lm/s0/path.sh index 5f580bc4..cbd1d82c 100644 --- a/examples/ngram_lm/s0/path.sh +++ b/examples/ngram_lm/s0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ 
+export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/spm/path.sh b/examples/spm/path.sh index 9da641e1..20237889 100644 --- a/examples/spm/path.sh +++ b/examples/spm/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=`realpath ${PWD}/../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md new file mode 100644 index 00000000..5664b06b --- /dev/null +++ b/examples/ted_en_zh/README.md @@ -0,0 +1,3 @@ +# TED En -> Zh + +* t0 for u2 speech translation diff --git a/examples/ted_en_zh/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml similarity index 97% rename from examples/ted_en_zh/conf/transformer.yaml rename to examples/ted_en_zh/t0/conf/transformer.yaml index 10a3e7f5..755e0446 100644 --- a/examples/ted_en_zh/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -3,8 +3,8 @@ data: train_manifest: data/manifest.train.tiny dev_manifest: data/manifest.dev test_manifest: data/manifest.test - min_input_len: 0.5 # second - max_input_len: 3000.0 # second + min_input_len: 0.05 # second + max_input_len: 30.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.01 diff --git a/examples/ted_en_zh/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml similarity index 100% rename from examples/ted_en_zh/conf/transformer_joint_noam.yaml rename to examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml diff --git a/examples/ted_en_zh/local/data.sh b/examples/ted_en_zh/t0/local/data.sh similarity index 100% rename from examples/ted_en_zh/local/data.sh rename to examples/ted_en_zh/t0/local/data.sh diff --git a/examples/ted_en_zh/local/test.sh b/examples/ted_en_zh/t0/local/test.sh similarity index 100% rename from examples/ted_en_zh/local/test.sh rename to examples/ted_en_zh/t0/local/test.sh diff --git a/examples/ted_en_zh/local/train.sh b/examples/ted_en_zh/t0/local/train.sh similarity index 100% rename from examples/ted_en_zh/local/train.sh rename to examples/ted_en_zh/t0/local/train.sh diff --git a/examples/ted_en_zh/path.sh b/examples/ted_en_zh/t0/path.sh similarity index 88% rename from examples/ted_en_zh/path.sh rename to examples/ted_en_zh/t0/path.sh index 881a5b91..a7f60425 100644 --- a/examples/ted_en_zh/path.sh +++ b/examples/ted_en_zh/t0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/ted_en_zh/run.sh b/examples/ted_en_zh/t0/run.sh similarity index 100% rename from examples/ted_en_zh/run.sh rename to examples/ted_en_zh/t0/run.sh diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh index 7cec3a24..30689eee 100644 --- a/examples/text_normalization/path.sh +++ b/examples/text_normalization/path.sh @@ -1,4 +1,5 @@ -export MAIN_ROOT=${PWD}/../../ +export MAIN_ROOT=`realpath ${PWD}/../../` + export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/a0/path.sh index 08e13c19..fc953beb 100644 --- a/examples/thchs30/a0/path.sh +++ b/examples/thchs30/a0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git 
a/examples/timit/s1/path.sh b/examples/timit/s1/path.sh index a632babe..29841bc1 100644 --- a/examples/timit/s1/path.sh +++ b/examples/timit/s1/path.sh @@ -1,4 +1,5 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` + export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/tiny/s0/path.sh b/examples/tiny/s0/path.sh index 777da29e..8a9345f2 100644 --- a/examples/tiny/s0/path.sh +++ b/examples/tiny/s0/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/tiny/s1/path.sh b/examples/tiny/s1/path.sh index 30adb6ca..29841bc1 100644 --- a/examples/tiny/s1/path.sh +++ b/examples/tiny/s1/path.sh @@ -1,4 +1,4 @@ -export MAIN_ROOT=${PWD}/../../../ +export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C From cc813b18d37e081df844953bc9ce56de3e5b104f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 07:04:30 +0000 Subject: [PATCH 13/21] fix install and format code --- deepspeech/exps/u2_st/model.py | 3 --- deepspeech/io/collator_st.py | 1 - deepspeech/models/u2_st.py | 6 ------ deepspeech/utils/bleu_score.py | 1 - examples/librispeech/s1/README.md | 4 ++-- examples/librispeech/s2/README.md | 1 - requirements.txt | 2 ++ setup.sh | 4 +++- 8 files changed, 7 insertions(+), 15 deletions(-) diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 867d1899..f72e2bbc 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -18,9 +18,7 @@ import sys import time from collections import defaultdict from pathlib import Path -from typing import List from typing import Optional -from typing import Tuple import numpy as np import paddle @@ -42,7 +40,6 @@ from deepspeech.training.scheduler import WarmupLR from deepspeech.training.trainer import Trainer from deepspeech.utils import bleu_score from deepspeech.utils import ctc_utils -from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils import text_grid diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py index 1be6445d..1ee36190 100644 --- a/deepspeech/io/collator_st.py +++ b/deepspeech/io/collator_st.py @@ -14,7 +14,6 @@ import io from collections import namedtuple from typing import Optional -from typing import Tuple import kaldiio import numpy as np diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index 5eea139b..a73f52e9 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -15,9 +15,7 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ -import sys import time -from collections import defaultdict from typing import Dict from typing import List from typing import Optional @@ -36,18 +34,14 @@ from deepspeech.modules.decoder import TransformerDecoder from deepspeech.modules.encoder import ConformerEncoder from deepspeech.modules.encoder import TransformerEncoder from deepspeech.modules.loss import LabelSmoothingLoss -from deepspeech.modules.mask import make_pad_mask from deepspeech.modules.mask import mask_finished_preds from deepspeech.modules.mask import mask_finished_scores from deepspeech.modules.mask import subsequent_mask from deepspeech.utils import checkpoint from deepspeech.utils import layer_tools -from 
deepspeech.utils.ctc_utils import remove_duplicates_and_blank
 from deepspeech.utils.log import Log
 from deepspeech.utils.tensor_utils import add_sos_eos
-from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
-from deepspeech.utils.utility import log_add
 
 __all__ = ["U2STModel", "U2STInferModel"]
 
diff --git a/deepspeech/utils/bleu_score.py b/deepspeech/utils/bleu_score.py
index f1bf5261..09646133 100644
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
@@ -14,7 +14,6 @@
 """This module provides functions to calculate BLEU score at different levels,
 e.g. bleu for word-level, char_bleu for char-level.
 """
-import numpy as np
 import sacrebleu
 
 __all__ = ['bleu', 'char_bleu']
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index 88ea216e..daa4d175 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -2,7 +2,7 @@
 
 ## Data
 | Data Subset | Duration in Seconds |
-| --- | --- | 
+| --- | --- |
 | data/manifest.train | 0.83s ~ 29.735s |
 | data/manifest.dev | 1.065 ~ 35.155s |
 | data/manifest.test-clean | 1.285s ~ 34.955s |
@@ -39,4 +39,4 @@
 ### Test w/o length filter
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
\ No newline at end of file
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index 8f092dd8..e4022f01 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -39,4 +39,3 @@
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | |
-
diff --git a/requirements.txt b/requirements.txt
index 57a951bb..baaa9ba9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,9 @@
 coverage
+gpustat
 pre-commit
 pybind11
 resampy==0.2.2
+sacrebleu
 scipy==1.2.1
 sentencepiece
 snakeviz
diff --git a/setup.sh b/setup.sh
index 503bc6b5..6e472c47 100644
--- a/setup.sh
+++ b/setup.sh
@@ -9,7 +9,7 @@ if [ $(id -u) -eq 0 ]; then
 fi
 
 if [ -e /etc/lsb-release ];then
-    #${SUDO} apt-get update
+    ${SUDO} apt-get update -y
     ${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
     if [ $? != 0 ]; then
         error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user."
@@ -22,6 +22,8 @@ fi rm tools/*.done pushd tools && make && popd +source tools/venv/bin/activate + # install python dependencies if [ -f "requirements.txt" ]; then pip3 install -r requirements.txt From 4d174b5e2f8eddb75c9c7838613c4fae306d6928 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 07:09:16 +0000 Subject: [PATCH 14/21] fix egs bugs --- examples/librispeech/s1/run.sh | 4 ++-- examples/librispeech/s2/run.sh | 4 ++-- examples/ted_en_zh/t0/local/test.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index b81e8dcf..2a8f2e2d 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -19,7 +19,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -29,7 +29,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index b81e8dcf..2a8f2e2d 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -19,7 +19,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -29,7 +29,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/t0/local/test.sh index 802bb13c..642328e8 100755 --- a/examples/ted_en_zh/t0/local/test.sh +++ b/examples/ted_en_zh/t0/local/test.sh @@ -9,7 +9,7 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
device=gpu -if [ ngpu == 0 ];then +if [ ${ngpu} == 0 ];then device=cpu fi config_path=$1 From 1cd4d4bf83b705378ab30c1e26d672d30cd13cbe Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 10:12:22 +0000 Subject: [PATCH 15/21] fix tiny conf and refactor optimizer and scheduler --- deepspeech/exps/u2/model.py | 87 ++++-- deepspeech/exps/u2_st/model.py | 5 +- deepspeech/models/deepspeech2.py | 262 ------------------ deepspeech/training/optimizer.py | 81 ++++++ deepspeech/training/scheduler.py | 47 +++- deepspeech/utils/dynamic_import.py | 50 ++++ examples/librispeech/s1/conf/transformer.yaml | 4 +- examples/tiny/s1/conf/chunk_confermer.yaml | 16 +- examples/tiny/s1/conf/chunk_transformer.yaml | 14 +- examples/tiny/s1/conf/conformer.yaml | 16 +- examples/tiny/s1/conf/transformer.yaml | 27 +- 11 files changed, 280 insertions(+), 329 deletions(-) delete mode 100644 deepspeech/models/deepspeech2.py create mode 100644 deepspeech/training/optimizer.py create mode 100644 deepspeech/utils/dynamic_import.py diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index dd62f537..34145780 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -31,8 +31,8 @@ from deepspeech.io.dataset import ManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2 import U2Model -from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog -from deepspeech.training.scheduler import WarmupLR +from deepspeech.training.optimizer import OptimizerFactory +from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.trainer import Trainer from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate @@ -41,6 +41,8 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log +# from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog +# from deepspeech.training.scheduler import WarmupLR logger = Log(__name__).getlog() @@ -312,30 +314,63 @@ class U2Trainer(Trainer): scheduler_type = train_config.scheduler scheduler_conf = train_config.scheduler_conf - grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) - - if scheduler_type == 'expdecaylr': - lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=optim_conf.lr, - gamma=scheduler_conf.lr_decay, - verbose=False) - elif scheduler_type == 'warmuplr': - lr_scheduler = WarmupLR( - learning_rate=optim_conf.lr, - warmup_steps=scheduler_conf.warmup_steps, - verbose=False) - else: - raise ValueError(f"Not support scheduler: {scheduler_type}") - - if optim_type == 'adam': - optimizer = paddle.optimizer.Adam( - learning_rate=lr_scheduler, - parameters=model.parameters(), - weight_decay=weight_decay, - grad_clip=grad_clip) - else: - raise ValueError(f"Not support optim: {optim_type}") + scheduler_args = { + "learning_rate": + optim_conf.lr, + "verbose": + False, + "warmup_steps": + scheduler_conf.warmup_steps + if "warmup_steps" in scheduler_conf else None, + "gamma": + scheduler_conf.lr_decay if "lr_decay" in scheduler_conf else None, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + # if scheduler_type == 'expdecaylr': + # lr_scheduler = paddle.optimizer.lr.ExponentialDecay( + # learning_rate=optim_conf.lr, + # gamma=scheduler_conf.lr_decay, + # 
verbose=False) + # elif scheduler_type == 'warmuplr': + # lr_scheduler = WarmupLR( + # learning_rate=optim_conf.lr, + # warmup_steps=scheduler_conf.warmup_steps, + # verbose=False) + # else: + # raise ValueError(f"Not support scheduler: {scheduler_type}") + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config.training + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "parameters": parameters, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + # grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) + # weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) + # if optim_type == 'adam': + # optimizer = paddle.optimizer.Adam( + # learning_rate=lr_scheduler, + # parameters=model.parameters(), + # weight_decay=weight_decay, + # grad_clip=grad_clip) + # else: + # raise ValueError(f"Not support optim: {optim_type}") self.model = model self.optimizer = optimizer diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index f72e2bbc..5734e15f 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -345,9 +345,6 @@ class U2STTrainer(Trainer): scheduler_type = train_config.scheduler scheduler_conf = train_config.scheduler_conf - grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) - if scheduler_type == 'expdecaylr': lr_scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=optim_conf.lr, @@ -367,6 +364,8 @@ class U2STTrainer(Trainer): else: raise ValueError(f"Not support scheduler: {scheduler_type}") + grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) + weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) if optim_type == 'adam': optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py deleted file mode 100644 index 233986a9..00000000 --- a/deepspeech/models/deepspeech2.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Deepspeech2 ASR Model""" -from typing import Optional - -import paddle -from paddle import nn -from yacs.config import CfgNode - -from deepspeech.modules.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder -from deepspeech.modules.rnn import RNNStack -from deepspeech.utils import layer_tools -from deepspeech.utils.checkpoint import Checkpoint -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['DeepSpeech2Model'] - - -class CRNNEncoder(nn.Layer): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True): - super().__init__() - self.rnn_size = rnn_size - self.feat_size = feat_size # 161 for linear - self.dict_size = dict_size - - self.conv = ConvStack(feat_size, num_conv_layers) - - i_size = self.conv.output_height # H after conv stack - self.rnn = RNNStack( - i_size=i_size, - h_size=rnn_size, - num_stacks=num_rnn_layers, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - - @property - def output_size(self): - return self.rnn_size * 2 - - def forward(self, audio, audio_len): - """Compute Encoder outputs - - Args: - audio (Tensor): [B, Tmax, D] - text (Tensor): [B, Umax] - audio_len (Tensor): [B] - text_len (Tensor): [B] - Returns: - x (Tensor): encoder outputs, [B, T, D] - x_lens (Tensor): encoder length, [B] - """ - # [B, T, D] -> [B, D, T] - audio = audio.transpose([0, 2, 1]) - # [B, D, T] -> [B, C=1, D, T] - x = audio.unsqueeze(1) - x_lens = audio_len - - # convolution group - x, x_lens = self.conv(x, x_lens) - - # convert data from convolution feature map to sequence of vectors - #B, C, D, T = paddle.shape(x) # not work under jit - x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] - #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit - x = x.reshape([0, 0, -1]) #[B, T, C*D] - - # remove padding part - x, x_lens = self.rnn(x, x_lens) #[B, T, D] - return x, x_lens - - -class DeepSpeech2Model(nn.Layer): - """The DeepSpeech2 network structure. - - :param audio_data: Audio spectrogram data layer. - :type audio_data: Variable - :param text_data: Transcription text data layer. - :type text_data: Variable - :param audio_len: Valid sequence length data layer. - :type audio_len: Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (dimension of RNN cells). - :type rnn_size: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward direction RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: A tuple of an output unnormalized log probability layer ( - before softmax) and a ctc cost layer. - :rtype: tuple of LayerOutput - """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. 
- share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True): - super().__init__() - self.encoder = CRNNEncoder( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - assert (self.encoder.output_size == rnn_size * 2) - - self.decoder = CTCDecoder( - odim=dict_size, # is in vocab - enc_n_units=self.encoder.output_size, - blank_id=0, # first token is - dropout_rate=0.0, - reduction=True, # sum - batch_average=True) # sum / batch_size - - def forward(self, audio, audio_len, text, text_len): - """Compute Model loss - - Args: - audio (Tenosr): [B, T, D] - audio_len (Tensor): [B] - text (Tensor): [B, U] - text_len (Tensor): [B] - - Returns: - loss (Tenosr): [1] - """ - eouts, eouts_len = self.encoder(audio, audio_len) - loss = self.decoder(eouts, eouts_len, text, text_len) - return loss - - @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once - # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - - eouts, eouts_len = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) - - @classmethod - def from_pretrained(cls, dataloader, config, checkpoint_path): - """Build a DeepSpeech2Model model from a pretrained model. - Parameters - ---------- - dataloader: paddle.io.DataLoader - - config: yacs.config.CfgNode - model configs - - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - DeepSpeech2Model - The model built from pretrained result. 
- """ - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - infos = Checkpoint().load_parameters( - model, checkpoint_path=checkpoint_path) - logger.info(f"checkpoint info: {infos}") - layer_tools.summary(model) - return model - - -class DeepSpeech2InferModel(DeepSpeech2Model): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - - def forward(self, audio, audio_len): - """export model function - - Args: - audio (Tensor): [B, T, D] - audio_len (Tensor): [B] - - Returns: - probs: probs after softmax - """ - eouts, eouts_len = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return probs diff --git a/deepspeech/training/optimizer.py b/deepspeech/training/optimizer.py new file mode 100644 index 00000000..adbc97ff --- /dev/null +++ b/deepspeech/training/optimizer.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any +from typing import Dict +from typing import Text + +from paddle.optimizer import Optimizer +from paddle.regularizer import L2Decay + +from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog +from deepspeech.utils.dynamic_import import dynamic_import +from deepspeech.utils.log import Log + +__all__ = ["OptimizerFactory"] + +logger = Log(__name__).getlog() + +OPTIMIZER_DICT = { + "sgd": "paddle.optimizer:SGD", + "momentum": "paddle.optimizer:Momentum", + "adadelta": "paddle.optimizer:Adadelta", + "adam": "paddle.optimizer:Adam", + "adamw": "paddle.optimizer:AdamW", +} + + +def register_optimizer(cls): + """Register optimizer.""" + alias = cls.__name__.lower() + OPTIMIZER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__ + return cls + + +def dynamic_import_optimizer(module): + """Import Optimizer class dynamically. + + Args: + module (str): module_name:class_name or alias in `OPTIMIZER_DICT` + + Returns: + type: Optimizer class + + """ + module_class = dynamic_import(module, OPTIMIZER_DICT) + assert issubclass(module_class, + Optimizer), f"{module} does not implement Optimizer" + return module_class + + +class OptimizerFactory(): + @classmethod + def from_args(cls, name: str, args: Dict[Text, Any]): + assert "parameters" in args, "parameters not in args." + assert "learning_rate" in args, "learning_rate not in args." 
+ + grad_clip = ClipGradByGlobalNormWithLog( + args['grad_clip']) if "grad_clip" in args else None + weight_decay = L2Decay( + args['weight_decay']) if "weight_decay" in args else None + module_class = dynamic_import_optimizer(name.lower()) + + if weight_decay: + logger.info(f'WeightDecay: {weight_decay}') + if grad_clip: + logger.info(f'GradClip: {grad_clip}') + logger.info( + f"Optimizer: {module_class.__name__} {args['learning_rate']}") + + args.update({"grad_clip": grad_clip, "weight_decay": weight_decay}) + return module_class(**args) diff --git a/deepspeech/training/scheduler.py b/deepspeech/training/scheduler.py index d3613028..b8f3ece7 100644 --- a/deepspeech/training/scheduler.py +++ b/deepspeech/training/scheduler.py @@ -11,18 +11,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any +from typing import Dict +from typing import Text from typing import Union from paddle.optimizer.lr import LRScheduler from typeguard import check_argument_types +from deepspeech.utils.dynamic_import import dynamic_import +from deepspeech.utils.dynamic_import import instance_class from deepspeech.utils.log import Log -__all__ = ["WarmupLR"] +__all__ = ["WarmupLR", "LRSchedulerFactory"] logger = Log(__name__).getlog() +SCHEDULER_DICT = { + "noam": "paddle.optimizer.lr:NoamDecay", + "expdecaylr": "paddle.optimizer.lr:ExponentialDecay", + "piecewisedecay": "paddle.optimizer.lr:PiecewiseDecay", +} + +def register_scheduler(cls): + """Register scheduler.""" + alias = cls.__name__.lower() + SCHEDULER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__ + return cls + + +def dynamic_import_scheduler(module): + """Import Scheduler class dynamically. + + Args: + module (str): module_name:class_name or alias in `SCHEDULER_DICT` + + Returns: + type: Scheduler class + + """ + module_class = dynamic_import(module, SCHEDULER_DICT) + assert issubclass(module_class, + LRScheduler), f"{module} does not implement LRScheduler" + return module_class + + +@register_scheduler class WarmupLR(LRScheduler): """The WarmupLR scheduler This scheduler is almost same as NoamLR Scheduler except for following @@ -40,7 +75,8 @@ class WarmupLR(LRScheduler): warmup_steps: Union[int, float]=25000, learning_rate=1.0, last_epoch=-1, - verbose=False): + verbose=False, + **kwargs): assert check_argument_types() self.warmup_steps = warmup_steps super().__init__(learning_rate, last_epoch, verbose) @@ -64,3 +100,10 @@ class WarmupLR(LRScheduler): None ''' self.step(epoch=step) + + +class LRSchedulerFactory(): + @classmethod + def from_args(cls, name: str, args: Dict[Text, Any]): + module_class = dynamic_import_scheduler(name.lower()) + return instance_class(module_class, args) diff --git a/deepspeech/utils/dynamic_import.py b/deepspeech/utils/dynamic_import.py new file mode 100644 index 00000000..81586e3e --- /dev/null +++ b/deepspeech/utils/dynamic_import.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +from typing import Any +from typing import Dict +from typing import Text + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["dynamic_import", "instance_class"] + + +def dynamic_import(import_path, alias=dict()): + """dynamic import module and class + + :param str import_path: syntax 'module_name:class_name' + e.g., 'deepspeech.models.u2:U2Model' + :param dict alias: shortcut for registered class + :return: imported class + """ + if import_path not in alias and ":" not in import_path: + raise ValueError("import_path should be one of {} or " + 'include ":", e.g. "deepspeech.models.u2:U2Model" : ' + "{}".format(set(alias), import_path)) + if ":" not in import_path: + import_path = alias[import_path] + + module_name, objname = import_path.split(":") + m = importlib.import_module(module_name) + return getattr(m, objname) + + +def instance_class(module_class, args: Dict[Text, Any]): + # filter out `val` which is None + new_args = {key: val for key, val in args.items() if val is not None} + logger.info(f"Instance: {module_class.__name__} {new_args}.") + return module_class(**new_args) diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 8a769dca..26188677 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -16,7 +16,7 @@ collator: spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json - batch_size: 64 + batch_size: 32 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -73,7 +73,7 @@ model: training: n_epoch: 120 - accum_grad: 2 + accum_grad: 4 global_grad_clip: 5.0 optim: adam optim_conf: diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index 606300bd..1b701aa2 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny + min_input_len: 0.5 # second + max_input_len: 20.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + mean_std_filepath: "" vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 4 - min_input_len: 0.5 - max_input_len: 20.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 72d36848..31dfd26c 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" - augmentation_config: conf/augmentation.json - batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # 
tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + mean_std_filepath: "" + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_202' + augmentation_config: conf/augmentation.json + batch_size: 4 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index a6f73050..b40e77e3 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny + min_input_len: 0.5 # second + max_input_len: 20.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + mean_std_filepath: "" vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 4 - min_input_len: 0.5 - max_input_len: 20.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 71cbdde7..e97ad756 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -11,30 +11,29 @@ data: max_output_input_ratio: 10.0 collator: - vocab_filepath: data/vocab.txt mean_std_filepath: "" - augmentation_config: conf/augmentation.json - random_seed: 0 + vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' - specgram_type: fbank + spm_model_prefix: 'data/bpe_unigram_202' + augmentation_config: conf/augmentation.json + batch_size: 4 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None + dither: 1.0 target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 use_dB_normalization: True target_dB: -20 - dither: 1.0 + random_seed: 0 keep_transcription_text: False - batch_size: 4 sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 #2 - raw_wav: True # use raw_wav or kaldi feature - + num_workers: 2 # network architecture model: From 3912c255ef39a712fb5b3630c111c08d7eac0149 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 10:33:23 +0000 Subject: [PATCH 16/21] support noam lr and opt --- deepspeech/exps/u2/model.py | 12 ++++++++++++ deepspeech/training/optimizer.py | 3 +++ deepspeech/utils/dynamic_import.py | 10 ++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 34145780..aefe73f8 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -324,6 +324,9 @@ class U2Trainer(Trainer): if "warmup_steps" in scheduler_conf else None, "gamma": scheduler_conf.lr_decay if "lr_decay" in scheduler_conf else None, + "d_model": + model_conf.encoder_conf.output_size + if scheduler_type == "noam" else None, } lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, scheduler_args) @@ -338,6 +341,12 @@ class U2Trainer(Trainer): # learning_rate=optim_conf.lr, # warmup_steps=scheduler_conf.warmup_steps, # verbose=False) + # 
elif scheduler_type == 'noam':
+        #     lr_scheduler = paddle.optimizer.lr.NoamDecay(
+        #         learning_rate=optim_conf.lr,
+        #         d_model=model_conf.encoder_conf.output_size,
+        #         warmup_steps=scheduler_conf.warmup_steps,
+        #         verbose=False)
+        # else:
+        #     raise ValueError(f"Not support scheduler: {scheduler_type}")
 
@@ -356,6 +365,9 @@ class U2Trainer(Trainer):
                 "learning_rate": lr_scheduler
                 if lr_scheduler else optim_conf.lr,
                 "parameters": parameters,
+                "epsilon": 1e-9 if optim_type == 'noam' else None,
+                "beta1": 0.9 if optim_type == 'noam' else None,
+                "beta2": 0.98 if optim_type == 'noam' else None,
             }
 
         optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
diff --git a/deepspeech/training/optimizer.py b/deepspeech/training/optimizer.py
index adbc97ff..2e62a7ed 100644
--- a/deepspeech/training/optimizer.py
+++ b/deepspeech/training/optimizer.py
@@ -20,6 +20,7 @@ from paddle.regularizer import L2Decay
 
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.dynamic_import import filter_valid_args
 from deepspeech.utils.log import Log
 
 __all__ = ["OptimizerFactory"]
@@ -78,4 +79,6 @@ class OptimizerFactory():
             f"Optimizer: {module_class.__name__} {args['learning_rate']}")
 
         args.update({"grad_clip": grad_clip, "weight_decay": weight_decay})
+
+        args = filter_valid_args(args)
         return module_class(**args)
diff --git a/deepspeech/utils/dynamic_import.py b/deepspeech/utils/dynamic_import.py
index 81586e3e..41978bc9 100644
--- a/deepspeech/utils/dynamic_import.py
+++ b/deepspeech/utils/dynamic_import.py
@@ -20,7 +20,7 @@ from deepspeech.utils.log import Log
 
 logger = Log(__name__).getlog()
 
-__all__ = ["dynamic_import", "instance_class"]
+__all__ = ["dynamic_import", "instance_class", "filter_valid_args"]
 
 
 def dynamic_import(import_path, alias=dict()):
@@ -43,8 +43,14 @@ def dynamic_import(import_path, alias=dict()):
     return getattr(m, objname)
 
 
-def instance_class(module_class, args: Dict[Text, Any]):
+def filter_valid_args(args: Dict[Text, Any]):
     # filter out `val` which is None
     new_args = {key: val for key, val in args.items() if val is not None}
+    return new_args
+
+
+def instance_class(module_class, args: Dict[Text, Any]):
+    # filter out `val` which is None
+    new_args = filter_valid_args(args)
     logger.info(f"Instance: {module_class.__name__} {new_args}.")
     return module_class(**new_args)
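
PATCH 16 above filters out `None`-valued kwargs before instantiating the optimizer; the next commit tightens this by also dropping any kwarg the target class does not declare. A minimal, self-contained sketch of that signature-based filtering — `Toy` and `instance_from_args` are illustrative names, not project code:

```python
import inspect


def instance_from_args(cls, args):
    """Instantiate `cls`, keeping only kwargs it declares and dropping None values."""
    valid_keys = inspect.signature(cls).parameters.keys()
    kwargs = {k: v for k, v in args.items() if k in valid_keys and v is not None}
    return cls(**kwargs)


class Toy:
    def __init__(self, learning_rate, warmup_steps=25000):
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps


# "gamma" is not a Toy parameter and "d_model" is None; both are filtered
# out instead of raising a TypeError from Toy(**args).
toy = instance_from_args(
    Toy, {"learning_rate": 0.002, "gamma": 0.9, "d_model": None})
assert toy.warmup_steps == 25000
```

This is what allows the trainer to hand one superset args dict to every scheduler and optimizer type instead of branching per type.
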
From c4da9a7f3ad89d9acc10034d70186823ed993570 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 5 Aug 2021 11:04:48 +0000
Subject: [PATCH 17/21] filter key by class signature, no print tensor

---
 deepspeech/training/gradclip.py    |  3 +++
 deepspeech/training/optimizer.py   |  5 ++---
 deepspeech/utils/dynamic_import.py | 25 ++++++++++++++++++-------
 deepspeech/utils/tensor_utils.py   | 15 ++++++++++++++-
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/deepspeech/training/gradclip.py b/deepspeech/training/gradclip.py
index d0f9803d..f46814eb 100644
--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
@@ -27,6 +27,9 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
     def __init__(self, clip_norm):
         super().__init__(clip_norm)
 
+    def __repr__(self):
+        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
+
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
diff --git a/deepspeech/training/optimizer.py b/deepspeech/training/optimizer.py
index 2e62a7ed..f7933f8d 100644
--- a/deepspeech/training/optimizer.py
+++ b/deepspeech/training/optimizer.py
@@ -20,7 +20,7 @@ from paddle.regularizer import L2Decay
 
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.utils.dynamic_import import dynamic_import
-from deepspeech.utils.dynamic_import import filter_valid_args
+from deepspeech.utils.dynamic_import import instance_class
 from deepspeech.utils.log import Log
 
 __all__ = ["OptimizerFactory"]
@@ -80,5 +80,4 @@ class OptimizerFactory():
 
         args.update({"grad_clip": grad_clip, "weight_decay": weight_decay})
 
-        args = filter_valid_args(args)
-        return module_class(**args)
+        return instance_class(module_class, args)
diff --git a/deepspeech/utils/dynamic_import.py b/deepspeech/utils/dynamic_import.py
index 41978bc9..533f15ee 100644
--- a/deepspeech/utils/dynamic_import.py
+++ b/deepspeech/utils/dynamic_import.py
@@ -12,15 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
+import inspect
 from typing import Any
 from typing import Dict
+from typing import List
 from typing import Text
 
 from deepspeech.utils.log import Log
+from deepspeech.utils.tensor_utils import has_tensor
 
 logger = Log(__name__).getlog()
 
-__all__ = ["dynamic_import", "instance_class", "filter_valid_args"]
+__all__ = ["dynamic_import", "instance_class"]
 
 
 def dynamic_import(import_path, alias=dict()):
@@ -43,14 +46,22 @@ def dynamic_import(import_path, alias=dict()):
     return getattr(m, objname)
 
 
-def filter_valid_args(args: Dict[Text, Any]):
-    # filter out `val` which is None
-    new_args = {key: val for key, val in args.items() if val is not None}
+def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]):
+    # keep only keys in `valid_keys` whose values are not None
+    new_args = {
+        key: val
+        for key, val in args.items() if (key in valid_keys and val is not None)
+    }
     return new_args
 
 
+def filter_out_tensor(args: Dict[Text, Any]):
+    return {key: val for key, val in args.items() if not has_tensor(val)}
+
+
 def instance_class(module_class, args: Dict[Text, Any]):
-    # filter out `val` which is None
-    new_args = filter_valid_args(args)
-    logger.info(f"Instance: {module_class.__name__} {new_args}.")
+    valid_keys = inspect.signature(module_class).parameters.keys()
+    new_args = filter_valid_args(args, valid_keys)
+    logger.info(
+        f"Instance: {module_class.__name__} {filter_out_tensor(new_args)}.")
     return module_class(**new_args)
diff --git a/deepspeech/utils/tensor_utils.py b/deepspeech/utils/tensor_utils.py
index 7679d9e1..9bff6b0f 100644
--- a/deepspeech/utils/tensor_utils.py
+++ b/deepspeech/utils/tensor_utils.py
@@ -19,11 +19,24 @@ import paddle
 
 from deepspeech.utils.log import Log
 
-__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy"]
+__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]
 
 logger = Log(__name__).getlog()
 
 
+def has_tensor(val):
+    if isinstance(val, (list, tuple)):
+        for item in val:
+            if has_tensor(item):
+                return True
+    elif isinstance(val, dict):
+        for k, v in val.items():
+            if has_tensor(v):
+                return True
+    else:
+        return paddle.is_tensor(val)
+
+
 def pad_sequence(sequences: List[paddle.Tensor],
                  batch_first: bool=False,
                  padding_value: float=0.0) -> paddle.Tensor:

From 820b4db287ce243be1333e4787c0634b5a1ee5eb Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 5 Aug 2021 11:08:15 +0000
Subject: [PATCH 18/21] with all args for scheduler

---
 deepspeech/exps/u2/model.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/deepspeech/exps/u2/model.py
b/deepspeech/exps/u2/model.py index aefe73f8..495da10c 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -315,18 +315,11 @@ class U2Trainer(Trainer): scheduler_conf = train_config.scheduler_conf scheduler_args = { - "learning_rate": - optim_conf.lr, - "verbose": - False, - "warmup_steps": - scheduler_conf.warmup_steps - if "warmup_steps" in scheduler_conf else None, - "gamma": - scheduler_conf.lr_decay if "lr_decay" in scheduler_conf else None, - "d_model": - model_conf.encoder_conf.output_size - if scheduler_type == "noam" else None, + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, } lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, scheduler_args) From e76123d4185a48a36af718258c788b51ce112eaa Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 11:14:20 +0000 Subject: [PATCH 19/21] rm useless --- deepspeech/exps/u2/model.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 495da10c..d661f078 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -41,8 +41,6 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log -# from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog -# from deepspeech.training.scheduler import WarmupLR logger = Log(__name__).getlog() @@ -324,25 +322,6 @@ class U2Trainer(Trainer): lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, scheduler_args) - # if scheduler_type == 'expdecaylr': - # lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - # learning_rate=optim_conf.lr, - # gamma=scheduler_conf.lr_decay, - # verbose=False) - # elif scheduler_type == 'warmuplr': - # lr_scheduler = WarmupLR( - # learning_rate=optim_conf.lr, - # warmup_steps=scheduler_conf.warmup_steps, - # verbose=False) - # elif scheduler_type == 'noam': - # lr_scheduler = paddle.optimizer.lr.NoamDecay( - # learning_rate=optim_conf.lr, - # d_model=model_conf.encoder_conf.output_size, - # warmup_steps=scheduler_conf.warmup_steps, - # verbose=False) - # else: - # raise ValueError(f"Not support scheduler: {scheduler_type}") - def optimizer_args( config, parameters, @@ -366,17 +345,6 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - # grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - # weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) - # if optim_type == 'adam': - # optimizer = paddle.optimizer.Adam( - # learning_rate=lr_scheduler, - # parameters=model.parameters(), - # weight_decay=weight_decay, - # grad_clip=grad_clip) - # else: - # raise ValueError(f"Not support optim: {optim_type}") - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler From 7d9dc28ee22763212daf6fc03d163cf4f6885bf0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 Aug 2021 03:09:40 +0000 Subject: [PATCH 20/21] fix config --- examples/tiny/s1/conf/chunk_transformer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 31dfd26c..1adb91c4 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ 
b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -14,7 +14,7 @@ collator: mean_std_filepath: "" vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_202' + spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature From 4c0ee8d354c67025e8ea8b1f29547e6baea95394 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 9 Aug 2021 11:21:16 +0000 Subject: [PATCH 21/21] fix conf and readme --- .bashrc | 15 +++++++++++++++ .gitignore | 1 + README.md | 2 +- README_cn.md | 2 +- examples/aishell/s0/conf/deepspeech2.yaml | 2 +- examples/librispeech/s0/conf/deepspeech2.yaml | 2 +- examples/librispeech/s1/conf/conformer.yaml | 4 ++-- examples/librispeech/s1/conf/transformer.yaml | 4 ++-- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- 10 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 .bashrc diff --git a/.bashrc b/.bashrc new file mode 100644 index 00000000..8abbb3c7 --- /dev/null +++ b/.bashrc @@ -0,0 +1,15 @@ +unset GREP_OPTIONS + +# https://zhuanlan.zhihu.com/p/33050965 +alias nvs='nvidia-smi' +alias his='history' +alias jobs='jobs -l' +alias ports='netstat -tulanp' +alias wget='wget -c' + +## Colorize the grep command output for ease of use (good for log files)## +alias grep='grep --color=auto' +alias egrep='egrep --color=auto' +alias fgrep='fgrep --color=auto' + + diff --git a/.gitignore b/.gitignore index 4ac2a36d..e4134a08 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ .ipynb_checkpoints *.npz *.done +*.whl tools/venv tools/kenlm diff --git a/README.md b/README.md index 424dc485..f7d1e088 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Setup * python>=3.7 -* paddlepaddle>=2.1.0 +* paddlepaddle>=2.1.2 Please see [install](doc/src/install.md). 
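
The README hunk above bumps the minimum paddle requirement to 2.1.2; a quick sanity check along these lines (illustrative only, not part of the patch — it assumes a release build where `paddle.__version__` is a plain `major.minor.patch` string) can catch a stale environment before training starts:

```python
import paddle

REQUIRED = (2, 1, 2)
# Parse "2.1.2" -> (2, 1, 2) and compare tuples lexicographically.
installed = tuple(int(part) for part in paddle.__version__.split(".")[:3])
if installed < REQUIRED:
    raise RuntimeError(
        f"paddlepaddle>=2.1.2 is required, found {paddle.__version__}")
```
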
diff --git a/README_cn.md b/README_cn.md index d762ec2b..019b38c1 100644 --- a/README_cn.md +++ b/README_cn.md @@ -17,7 +17,7 @@ ## 安装 * python>=3.7 -* paddlepaddle>=2.1.0 +* paddlepaddle>=2.1.2 参看 [安装](doc/src/install.md)。 diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 1c97fc60..c4ff246f 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -32,7 +32,7 @@ collator: keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 + num_workers: 2 model: num_conv_layers: 2 diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index acee94c3..dab8d046 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -32,7 +32,7 @@ collator: keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 + num_workers: 2 model: num_conv_layers: 2 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 955b6108..6d825f05 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -16,7 +16,7 @@ collator: spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json - batch_size: 16 + batch_size: 32 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -78,7 +78,7 @@ model: training: n_epoch: 120 - accum_grad: 8 + accum_grad: 4 global_grad_clip: 3.0 optim: adam optim_conf: diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 26188677..8a769dca 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -16,7 +16,7 @@ collator: spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/augmentation.json - batch_size: 32 + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -73,7 +73,7 @@ model: training: n_epoch: 120 - accum_grad: 4 + accum_grad: 2 global_grad_clip: 5.0 optim: adam optim_conf: diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index ea433f34..ab9a00d9 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -32,7 +32,7 @@ collator: keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 + num_workers: 2 batch_size: 4 model: diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index e97ad756..fd5adbde 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -14,7 +14,7 @@ collator: mean_std_filepath: "" vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_202' + spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature
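
As a closing reference for the scheduler registry introduced in this series: `WarmupLR` (registered via `@register_scheduler` earlier) follows the usual Noam-style warmup curve. Its `get_lr` body is not reproduced in these hunks, so the sketch below assumes the standard ESPnet-style formula rather than quoting the project code:

```python
def warmup_lr(step: int, base_lr: float = 0.002,
              warmup_steps: int = 25000) -> float:
    """Linear warmup followed by inverse-square-root decay.

    Scaled so the returned rate equals `base_lr` exactly at
    `step == warmup_steps`; `step` must be >= 1 to avoid 0 ** -0.5.
    """
    return base_lr * warmup_steps**0.5 * min(step**-0.5,
                                             step * warmup_steps**-1.5)


assert abs(warmup_lr(25000) - 0.002) < 1e-12
```
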