diff --git a/examples/aishell/asr0/local/test_hub_ori b/examples/aishell/asr0/local/test_hub_ori deleted file mode 100755 index ee1fb805..00000000 --- a/examples/aishell/asr0/local/test_hub_ori +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - -config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 - -mkdir -p data -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ -if [ $? -ne 0 ]; then - exit 1 -fi - -if [ ! -f ${audio_file} ]; then - echo "Plase input the right audio_file path" - exit 1 -fi - -# download language model -bash local/download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi - -python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ ---config ${config_path} \ ---result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} \ ---model_type ${model_type} \ ---audio_file ${audio_file} - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b404cce8..88a13fdc 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -15,7 +15,6 @@ from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +41,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index 4c20ffcd..003b02e2 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 53a4e629..246fb107 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -44,22 +44,6 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -246,27 +230,6 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 00000000..d3992cb9 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index a154621d..d113fc94 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: data/lang_char/bpe_unigram_8000 mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml batch_size: 16 maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced @@ -87,7 +87,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 2.5 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: noam scheduler_conf: warmup_steps: 25000 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index c80dea7e..a01ec1a6 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: data/lang_char/bpe_unigram_8000 mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml batch_size: 16 maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml new file mode 100644 index 00000000..bc86d98c --- /dev/null +++ b/examples/ted_en_zh/st1/conf/preprocess.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index 05674562..515edee2 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt unit_type: 'spm' spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 mean_std_filepath: "" -# augmentation_config: conf/augmentation.json +# preprocess_config: conf/augmentation.json batch_size: 20 feat_dim: 83 stride_ms: 10.0 @@ -27,7 +27,7 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: +preprocess_config: num_workers: 0 subsampling_factor: 1 num_encs: 1 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index ae246cbf..a5f956fa 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt unit_type: 'spm' spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 mean_std_filepath: "" -# augmentation_config: conf/augmentation.json +# preprocess_config: conf/augmentation.json batch_size: 20 feat_dim: 83 stride_ms: 10.0 @@ -27,7 +27,7 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: +preprocess_config: num_workers: 0 subsampling_factor: 1 num_encs: 1 diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index 9e24380d..be6f25f9 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -20,12 +20,7 @@ for type in fullsentence; do --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ -<<<<<<< HEAD - --opts decode.decoding_method ${type} \ - --opts decode.decode_batch_size ${batch_size} -======= --opts decoding.decoding_method ${type} \ ->>>>>>> 6272496d9c26736750b577fd832ea9dd4ddc4e6e if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index cd072c14..8f785121 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -58,7 +58,6 @@ mean_std_filepath: "" vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_200' -preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -72,7 +71,7 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f0..88955eac 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 88148323..ccb85906 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -20,7 +20,7 @@ from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -176,7 +176,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index dea6d975..85c2466f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -111,7 +111,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e84..090b5fab 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +41,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index 7ce921d6..388b380d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -14,7 +14,6 @@ """Evaluation for DeepSpeech2 model.""" from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -43,7 +42,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index 7a1801d4..176028ed 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -13,8 +13,6 @@ # limitations under the License. """Evaluation for DeepSpeech2 model.""" from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -48,7 +46,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index 28756b05..e2cb7e2f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -188,7 +188,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9..5e8c0fff 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -14,7 +14,7 @@ """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index d8eab50e..00000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline - - -def get_cfg_defaults(model_type='offline'): - _C = CfgNode() - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index fc214a8a..e7d5e20f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -49,22 +49,6 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -259,27 +243,6 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index 5d768536..e3390feb 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -14,7 +14,6 @@ """Alignment for U2 model.""" from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -33,14 +32,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py index 44fc7c3e..3907cebd 100644 --- a/paddlespeech/s2t/exps/u2/bin/export.py +++ b/paddlespeech/s2t/exps/u2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index d93954fe..f14d804f 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -16,7 +16,6 @@ import cProfile from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +43,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 554d6ca5..9904813a 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -20,7 +20,6 @@ import paddle import soundfile from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser @@ -129,7 +128,7 @@ if __name__ == "__main__": "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index d6ee8b30..d562278f 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -17,7 +17,7 @@ import os from paddle import distributed as dist -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +44,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index f1683d70..d0cea031 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -46,33 +46,6 @@ logger = Log(__name__).getlog() class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -401,35 +374,6 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - decode_batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 57d87316..ab87c30d 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -55,7 +55,6 @@ class U2Trainer(Trainer): collate_fn_train = SpeechCollator.from_config(config) - config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: @@ -103,7 +102,6 @@ class U2Trainer(Trainer): test_dataset = ManifestDataset.from_config(config) # return text ord id config.keep_transcription_text = True - config.augmentation_config = "" self.test_loader = DataLoader( test_dataset, batch_size=config.decode.batch_size, diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 887dd29e..780c5c08 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -42,40 +42,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - _C = CfgNode() - - _C.model = U2Model.params() - - _C.training = U2Trainer.params() - - _C.decoding = U2Tester.params() - - config = _C.clone() - config.set_new_allowed(True) - return config - - class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - checkpoint=dict( - kbest_n=50, - latest_n=5, ), )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -362,35 +329,6 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py index 69d9718f..1bc4e1f3 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/export.py +++ b/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 3ad5fc7d..1d70a310 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -16,7 +16,6 @@ import cProfile from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +43,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 58496c88..4dec9ec8 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py deleted file mode 100644 index a314a1ca..00000000 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2_st.model import U2STTester -from paddlespeech.s2t.exps.u2_st.model import U2STTrainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2_st import U2STModel - -_C = CfgNode() - -# _C.data = ManifestDataset.params() - -# _C.collator = SpeechCollator.params() - -# _C.model = U2STModel.params() - -# _C.training = U2STTrainer.params() - -# _C.decoding = U2STTester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 88aeb4a5..ca2c2c1d 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -45,33 +45,6 @@ logger = Log(__name__).getlog() class U2STTrainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -127,7 +100,7 @@ class U2STTrainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -236,7 +209,7 @@ class U2STTrainer(Trainer): msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 - ) % self.config.training.log_interval == 0: + ) % self.config.log_interval == 0: logger.info(msg) except Exception as e: logger.error(e) @@ -287,7 +260,7 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, @@ -308,7 +281,7 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, @@ -319,7 +292,7 @@ class U2STTrainer(Trainer): # test dataset, return raw text decode_batch_size = config.get('decode',dict()).get('decode_batch_size', 1) self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, batch_size=decode_batch_size, @@ -332,7 +305,7 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, @@ -379,7 +352,7 @@ class U2STTrainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -405,41 +378,12 @@ class U2STTrainer(Trainer): class U2STTester(U2STTrainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -526,7 +470,7 @@ class U2STTester(U2STTrainer): decode_cfg = self.config.decode bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms hyps, refs = [], [] len_refs, num_ins = 0, 0 num_frames = 0.0 diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 27bf20eb..3a14b2d5 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -219,33 +219,6 @@ class SpeechCollatorBase(): class SpeechCollator(SpeechCollatorBase): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - spectrum_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a SpeechCollator object from a config. diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index c76ccfce..9149fb27 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -28,22 +28,6 @@ logger = Log(__name__).getlog() class ManifestDataset(Dataset): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - manifest="", - max_input_len=27.0, - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a ManifestDataset object from a config. diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0414d04f..ddc3612d 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -119,21 +119,6 @@ class DeepSpeech2Model(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index f08e30d0..aae77f74 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -243,23 +243,6 @@ class DeepSpeech2ModelOnline(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=4, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=True, #Use gru if set True. Use simple rnn if set False. - blank_id=0, # index of blank in vocob.txt - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__( self, feat_size, diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 83eff467..26e81acf 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,57 +59,6 @@ logger = Log(__name__).getlog() class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - ctc_weight=0.3, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 8b07e389..1c5596ba 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -51,58 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -533,21 +481,21 @@ class U2STBaseModel(nn.Layer): feats (Tenosr): audio features, (B, T, D) feats_lengths (Tenosr): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. Returns: - int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'],