From c40b6f406208a283f799bab546bfca4020f66204 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 28 Dec 2021 08:47:25 +0000 Subject: [PATCH 1/9] refactor the train and test config,test=asr --- examples/aishell/asr1/conf/conformer.yaml | 180 +++++++++++----------- examples/aishell/asr1/conf/decode.yaml | 12 ++ examples/aishell/asr1/local/align.sh | 10 +- examples/aishell/asr1/local/test.sh | 11 +- examples/aishell/asr1/local/test_wav.sh | 12 +- examples/aishell/asr1/run.sh | 7 +- paddlespeech/s2t/exps/u2/bin/alignment.py | 6 + paddlespeech/s2t/exps/u2/bin/test.py | 10 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 36 ++--- paddlespeech/s2t/exps/u2/config.py | 11 +- paddlespeech/s2t/exps/u2/model.py | 109 +++++++------ paddlespeech/s2t/training/cli.py | 8 + paddlespeech/s2t/training/trainer.py | 10 +- paddlespeech/s2t/utils/utility.py | 2 +- 14 files changed, 229 insertions(+), 195 deletions(-) create mode 100644 examples/aishell/asr1/conf/decode.yaml diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 907e3a94..2ba96e76 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,97 +1,93 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +#model: +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # 
+########################################### +#data: +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +#collator: +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +augmentation_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# training # +########################################### +#training: +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/decode.yaml b/examples/aishell/asr1/conf/decode.yaml new file mode 100644 index 00000000..49364f5d --- /dev/null +++ b/examples/aishell/asr1/conf/decode.yaml @@ -0,0 +1,12 @@ +#decoding: +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. 
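+# Illustrative streaming example (not the default below): decoding_chunk_size: 16
+# with num_decoding_left_chunks: -1 and simulate_streaming: True decodes in
+# fixed 16-frame chunks, each chunk attending to all chunks on its left.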
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index c65d611c..f526c8a4 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decoding.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index da159de7..2c092127 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decoding.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,6 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decoding.decoding_method ${type} \ diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh index f85c1a47..4866e642 100755 --- a/examples/aishell/asr1/local/test_wav.sh +++ b/examples/aishell/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
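# Example invocation (checkpoint name is illustrative):
#   ./local/test_wav.sh conf/conformer.yaml conf/decode.yaml \
#       exp/conformer/checkpoints/avg_20 data/demo_01_03.wav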
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -42,10 +43,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decoding.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index d07a4ed5..11aff9c7 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav @@ -32,18 +33,18 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi # Not supported at now!!! diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index df95baeb..f8397ed0 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Alignment for U2 model.""" +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser @@ -41,6 +43,10 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_config: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_config) + config.decoding = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 48b0670d..f179ea81 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -14,12 +14,14 @@ """Evaluation for U2 model.""" import cProfile +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,7 +37,7 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() @@ -45,6 +47,10 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_config: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_config) + config.decoding = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 556316ec..e5671a43 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -36,23 +37,22 @@ class U2Infer(): self.args = args self.config = config self.audio_file = args.audio_file - self.sr = config.collator.target_sample_rate - self.preprocess_conf = config.collator.augmentation_config + self.preprocess_conf = config.augmentation_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) self.text_feature = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix) + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = config.collator.feat_dim + model_conf.input_dim = config.feat_dim model_conf.output_dim = self.text_feature.vocab_size model = U2Model.from_config(model_conf) self.model = model @@ -70,10 +70,6 @@ class U2Infer(): # read audio, sample_rate = soundfile.read( self.audio_file, dtype="int16", always_2d=True) - if sample_rate != self.sr: - logger.error( - f"sample rate error: {sample_rate}, need {self.sr} ") - sys.exit(-1) 
audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -85,17 +81,17 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - cfg = self.config.decoding + decode_config = self.config.decoding result_transcripts = self.model.decode( xs, ilen, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") @@ -136,6 +132,10 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_config: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_config) + config.decoding = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 898b0bb2..59376e95 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -19,19 +19,18 @@ from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.u2 import U2Model -_C = CfgNode() +_C = CfgNode(new_allowed=True) -_C.data = ManifestDataset.params() +ManifestDataset.params(_C) -_C.collator = SpeechCollator.params() +SpeechCollator.params(_C) -_C.model = U2Model.params() +U2Model.params(_C) -_C.training = U2Trainer.params() +U2Trainer.params(_C) _C.decoding = U2Tester.params() - def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" # Return a clone so that the defaults will not be altered diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 9fb7067f..1de9541d 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -77,7 +77,7 @@ class U2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -120,7 +120,7 @@ class U2Trainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -153,7 +153,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -182,7 +182,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -214,8 +214,7 @@ class 
U2Trainer(Trainer): k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," - if (batch_index + 1 - ) % self.config.training.log_interval == 0: + if (batch_index + 1) % self.config.log_interval == 0: logger.info(msg) data_start_time = time.time() except Exception as e: @@ -252,29 +251,29 @@ class U2Trainer(Trainer): if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, - sortagrad=config.collator.sortagrad, - batch_size=config.collator.batch_size, - maxlen_in=config.collator.maxlen_in, - maxlen_out=config.collator.maxlen_out, - minibatches=config.collator.minibatches, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, mini_batch_size=self.args.ngpu, - batch_count=config.collator.batch_count, - batch_bins=config.collator.batch_bins, - batch_frames_in=config.collator.batch_frames_in, - batch_frames_out=config.collator.batch_frames_out, - batch_frames_inout=config.collator.batch_frames_inout, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.augmentation_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -284,18 +283,18 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.augmentation_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) logger.info("Setup train/valid Dataloader!") else: # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=config.decoding.decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -305,16 +304,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.augmentation_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=config.decoding.decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -324,7 +323,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.augmentation_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -332,7 +331,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with 
UpdateConfig(model_conf): if self.train: @@ -355,7 +354,7 @@ class U2Trainer(Trainer): if not self.train: return - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -375,7 +374,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -415,7 +414,7 @@ class U2Tester(U2Trainer): error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' num_proc_bsearch=8, # # of CPUs for beam search. beam_size=10, # Beam search width. - batch_size=16, # decoding batch size + decode_batch_size=16, # decoding batch size ctc_weight=0.0, # ctc weight for attention rescoring decode mode. decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. @@ -432,9 +431,9 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -453,10 +452,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_config = self.config.decoding errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -464,12 +463,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( @@ -488,15 +487,15 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_config.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) 
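# (Above, compute_metrics() reads every decode option from config.decoding,
# the CfgNode merged in from --decode_config; feature-level fields such as
# stride_ms below stay on the flat top-level config.)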
@@ -507,7 +506,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,15 +557,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decoding.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -577,10 +576,10 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. """ from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + infer_model = U2InferModel.from_pretrained(self.train_loader, + self.config.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.feat_dim + feat_dim = self.train_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 3ef871c5..d4299ea3 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -97,6 +97,14 @@ def default_argument_parser(parser=None): train_group.add_argument( "--dump-config", metavar="FILE", help="dump config to `this` file.") + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_config", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + profile_group = parser.add_argument_group( title='Benchmark Options', description=None) profile_group.add_argument( diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index 9bf1ca4d..4b2011ec 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -117,8 +117,8 @@ class Trainer(): self.init_parallel() self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) # set random seed if needed if args.seed: @@ -129,8 +129,8 @@ class Trainer(): if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): - self.config.collator.batch_size = self.args.benchmark_batch_size - self.config.training.log_interval = 1 + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") @@ -260,7 +260,7 @@ class Trainer(): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index 73c79816..dc1be815 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,7 +130,7 @@ def get_subsample(config): Returns: int: subsample rate. 
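    Example (illustrative; after this patch the encoder config sits at the
    top level of the config dict)::

        >>> get_subsample({"encoder_conf": {"input_layer": "conv2d"}})
        4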
""" - input_layer = config["model"]["encoder_conf"]["input_layer"] + input_layer = config["encoder_conf"]["input_layer"] assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 From 960658f66924bf9471ed0c5e5132debae294f363 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 29 Dec 2021 02:48:16 +0000 Subject: [PATCH 2/9] add the whole of aishell asr1 --- .../aishell/asr1/conf/chunk_conformer.yaml | 183 +++++++++--------- examples/aishell/asr1/conf/conformer.yaml | 5 - examples/aishell/asr1/conf/transformer.yaml | 167 ++++++++-------- .../asr1/conf/tuning/chunk_decode.yaml | 11 ++ .../asr1/conf/{ => tuning}/decode.yaml | 1 - examples/aishell/asr1/run.sh | 2 +- 6 files changed, 177 insertions(+), 192 deletions(-) create mode 100644 examples/aishell/asr1/conf/tuning/chunk_decode.yaml rename examples/aishell/asr1/conf/{ => tuning}/decode.yaml (98%) diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 50eaef98..31e9be13 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,94 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Data # 
+########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
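# NOTE: the decoding block removed above is recreated later in this same patch
# as examples/aishell/asr1/conf/tuning/chunk_decode.yaml, with batch_size
# renamed to decode_batch_size.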
+########################################### +# training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 2ba96e76..d9e3daec 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,7 +1,6 @@ ############################################ # Network Architecture # ############################################ -#model: cmvn_file: cmvn_file_type: "json" # encoder related @@ -42,7 +41,6 @@ model_conf: ########################################### # Data # ########################################### -#data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test @@ -50,11 +48,9 @@ test_manifest: data/manifest.test ########################################### # Dataloader # ########################################### -#collator: vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' augmentation_config: conf/preprocess.yaml -spm_model_prefix: '' feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -75,7 +71,6 @@ num_encs: 1 ########################################### # training # ########################################### -#training: n_epoch: 240 accum_grad: 2 global_grad_clip: 5.0 diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 7c5fa624..e6684ec8 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -1,95 +1,84 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + 
lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - - -collator: - unit_type: 'char' - vocab_filepath: data/lang_char/vocab.txt - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml similarity index 98% rename from examples/aishell/asr1/conf/decode.yaml rename to examples/aishell/asr1/conf/tuning/decode.yaml index 49364f5d..72ede927 100644 --- a/examples/aishell/asr1/conf/decode.yaml +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -1,4 +1,3 @@ -#decoding: beam_size: 10 decode_batch_size: 128 error_rate_type: cer diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index 11aff9c7..c54dae9c 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,7 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml -decode_conf_path=conf/decode.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav From 2c5902d7c58d9ec437637332015a836ee05f6db7 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 29 Dec 2021 03:13:14 +0000 Subject: [PATCH 3/9] rename decoding to decode --- paddlespeech/s2t/exps/u2/config.py | 1 + paddlespeech/s2t/exps/u2/model.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 59376e95..537d0c21 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -31,6 +31,7 @@ U2Trainer.params(_C) _C.decoding = U2Tester.params() + def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" # Return a clone so that the defaults will not be altered diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 1de9541d..31610e15 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -294,7 +294,7 @@ class U2Trainer(Trainer): json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.decode_batch_size, + batch_size=config.decode.decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -313,7 +313,7 @@ class U2Trainer(Trainer): json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.decode_batch_size, + batch_size=config.decode.decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -452,7 +452,7 @@ class 
U2Tester(U2Trainer): texts, texts_len, fout=None): - decode_config = self.config.decoding + decode_config = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer @@ -564,7 +564,7 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.decode_batch_size, + self.config.decode.decode_batch_size, self.config.stride_ms, self.vocab_list, self.args.result_file) From 41eeed0450bea7c5fb097887d3a2e8dac55a6f28 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 29 Dec 2021 08:38:11 +0000 Subject: [PATCH 4/9] add librispeech asr1 --- examples/aishell/asr1/local/align.sh | 2 +- examples/aishell/asr1/local/test.sh | 10 +- examples/aishell/asr1/local/test_wav.sh | 4 +- examples/csmsc/voc5/README.md | 4 +- .../asr1/conf/chunk_conformer.yaml | 184 ++++++++-------- .../asr1/conf/chunk_transformer.yaml | 179 ++++++++-------- examples/librispeech/asr1/conf/conformer.yaml | 181 ++++++++-------- .../librispeech/asr1/conf/transformer.yaml | 183 +++++++--------- .../asr1/conf/tuning/chunk_decode.yaml | 11 + .../librispeech/asr1/conf/tuning/decode.yaml | 11 + examples/librispeech/asr1/local/align.sh | 10 +- examples/librispeech/asr1/local/test.sh | 22 +- examples/librispeech/asr1/local/test_wav.sh | 14 +- examples/librispeech/asr1/run.sh | 7 +- examples/tiny/asr1/conf/conformer.yaml | 197 +++++++++--------- examples/tiny/asr1/conf/transformer.yaml | 188 ++++++++--------- paddlespeech/s2t/exps/u2/bin/alignment.py | 2 +- paddlespeech/s2t/exps/u2/bin/test.py | 2 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 4 +- paddlespeech/s2t/exps/u2/config.py | 2 +- 20 files changed, 590 insertions(+), 627 deletions(-) create mode 100644 examples/librispeech/asr1/conf/tuning/chunk_decode.yaml create mode 100644 examples/librispeech/asr1/conf/tuning/decode.yaml diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index f526c8a4..95472e10 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \ --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.decode_batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 2c092127..cab7c34e 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do # stream decoding only support batchsize=1 batch_size=1 else - batch_size=64 + batch_size=1 fi output_dir=${ckpt_prefix} mkdir -p ${output_dir} @@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.decode_batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
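# Example invocation, as wired up in run.sh (checkpoint name is illustrative):
#   CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml \
#       conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20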
@@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
     --decode_config ${decode_config_path} \
     --result_file ${output_dir}/${type}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh
index 4866e642..661013b1 100755
--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@@ -46,8 +46,8 @@ for type in attention_rescoring; do
     --decode_config ${decode_config_path} \
     --result_file ${output_dir}/${type}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.decode_batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
     --audio_file ${audio_file}

    if [ $? -ne 0 ]; then
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index 21afe6ee..be06f830 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml # default config used to train hifigan
-├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
-└── snapshot_iter_2500000.pdz # generator parameters of hifigan
+├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz # generator parameters of hifigan
 ```

 ## Acknowledgement
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 662d559c..ace61d36 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -1,103 +1,99 @@
-# network architecture
-model:
-    cmvn_file:
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: conformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: True
-        use_cnn_module: True
-        cnn_module_kernel: 15
-        activation_type: 'swish'
-        pos_enc_layer_type: 'rel_pos'
-        selfattention_layer_type: 'rel_selfattn'
-        causal: True
-        use_dynamic_chunk: true
-        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-        use_dynamic_left_chunk: false
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+    causal: True
+ use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 8 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 120 - accum_grad: 8 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: attention # 'attention', 
'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- beam_size: 10
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: true # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index bc77ba41..d6d84eb1 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -1,103 +1,89 @@
-# network architecture
-model:
- cmvn_file:
- cmvn_file_type: "json"
- # encoder related
- encoder: transformer
- encoder_conf:
- output_size: 256 # dimension of attention
- attention_heads: 4
- linear_units: 2048 # the number of units of position-wise feed forward
- num_blocks: 12 # the number of encoder blocks
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.0
- input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
- normalize_before: true
- use_dynamic_chunk: true
- use_dynamic_left_chunk: false
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true
+ use_dynamic_chunk: true
+ use_dynamic_left_chunk: false

- # decoder related
- decoder: transformer
- decoder_conf:
- attention_heads: 4
- linear_units: 2048
- num_blocks: 6
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.0
- src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0

- # hybrid CTC/attention
- model_conf:
- ctc_weight: 0.3
- lsm_weight: 0.1 # label smoothing option
- length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false

-data:
- train_manifest: data/manifest.train
- dev_manifest: data/manifest.dev
- test_manifest: data/manifest.test
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test

-collator:
- vocab_filepath: data/lang_char/vocab.txt
- unit_type: 'spm'
- spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
- mean_std_filepath: ""
- augmentation_config: conf/preprocess.yaml
- feat_dim: 80
- stride_ms: 10.0
- window_ms: 25.0
- sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
- batch_size: 64
- maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
- maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
- minibatches: 0 # for debug
- batch_count: auto
- batch_bins: 0
- batch_frames_in: 0
- batch_frames_out: 0
- batch_frames_inout: 0
- augmentation_config: conf/preprocess.yaml
- num_workers: 0
- subsampling_factor: 1
- num_encs: 1
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1

-training:
- n_epoch: 120
- accum_grad: 1
- global_grad_clip: 5.0
- optim: adam
- optim_conf:
- lr: 0.001
- weight_decay: 1e-06
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 25000
- lr_decay: 1.0
- log_interval: 100
- checkpoint:
- kbest_n: 50
- latest_n: 5
-
-
-decoding:
- batch_size: 64
- error_rate_type: wer
- decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
- alpha: 2.5
- beta: 0.3
- beam_size: 10
- cutoff_prob: 1.0
- cutoff_top_n: 0
- num_proc_bsearch: 8
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
+###########################################
+# Training #
+###########################################
+n_epoch: 120
+accum_grad: 1
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1e-06
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
\ No newline at end of file
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 5a570897..bb028e69 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -1,104 +1,96 @@
-# network architecture
-model:
- cmvn_file:
- cmvn_file_type: "json"
- # encoder related
- encoder: conformer
- encoder_conf:
- output_size: 256 # dimension of attention
- attention_heads: 4
- linear_units: 2048 # the number of units of position-wise feed forward
- num_blocks: 12 # the number of encoder blocks
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.0
- input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
- normalize_before: True
- use_cnn_module: True
- cnn_module_kernel: 15
- activation_type: 'swish'
- pos_enc_layer_type: 'rel_pos'
- selfattention_layer_type: 'rel_selfattn'
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ use_cnn_module: True
+ cnn_module_kernel: 15
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rel_pos'
+ selfattention_layer_type: 'rel_selfattn'

- # decoder related
- decoder: transformer
- decoder_conf:
- attention_heads: 4
- linear_units: 2048
- num_blocks: 6
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.0
- src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0

- # hybrid CTC/attention
- model_conf:
- ctc_weight: 0.3
- ctc_grad_norm_type: null
- lsm_weight: 0.1 # label smoothing option
- length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ ctc_grad_norm_type: null
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false

-data:
- train_manifest: data/manifest.train
- dev_manifest: data/manifest.dev
- test_manifest: data/manifest.test-clean
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean

-collator:
- vocab_filepath: data/lang_char/vocab.txt
- unit_type: 'spm'
- spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
- mean_std_filepath: ""
- augmentation_config: conf/preprocess.yaml
- feat_dim: 80
- stride_ms: 10.0
- window_ms: 25.0
- sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
- batch_size: 16
- maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
- maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
- minibatches: 0 # for debug
- batch_count: auto
- batch_bins: 0
- batch_frames_in: 0
- batch_frames_out: 0
- batch_frames_inout: 0
- augmentation_config: conf/preprocess.yaml
- num_workers: 0
- subsampling_factor: 1
- num_encs: 1
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 16
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1

-training:
- n_epoch: 70
- accum_grad: 8
- global_grad_clip: 3.0
- optim: adam
- optim_conf:
- lr: 0.004
- weight_decay: 1e-06
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 25000
- lr_decay: 1.0
- log_interval: 100
- checkpoint:
- kbest_n: 50
- latest_n: 5
-
-
-decoding:
- batch_size: 64
- error_rate_type: wer
- decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- beam_size: 10
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: False # simulate streaming inference. Defaults to False.
+###########################################
+# Training #
+###########################################
+n_epoch: 70
+accum_grad: 8
+global_grad_clip: 3.0
+optim: adam
+optim_conf:
+ lr: 0.004
+ weight_decay: 1e-06
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index b7f33e22..f81234f1 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -1,110 +1,88 @@
-# network architecture
-model:
- cmvn_file:
- cmvn_file_type: "json"
- # encoder related
- encoder: transformer
- encoder_conf:
- output_size: 256 # dimension of attention
- attention_heads: 4
- linear_units: 2048 # the number of units of position-wise feed forward
- num_blocks: 12 # the number of encoder blocks
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.0
- input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
- normalize_before: true
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true

- # decoder related
- decoder: transformer
- decoder_conf:
- attention_heads: 4
- linear_units: 2048
- num_blocks: 6
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.0
- src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0

- # hybrid CTC/attention
- model_conf:
- ctc_weight: 0.3
- lsm_weight: 0.1 # label smoothing option
- length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false

# https://yaml.org/type/float.html
-data:
- train_manifest: data/manifest.train
- dev_manifest: data/manifest.dev
- test_manifest: data/manifest.test-clean
- min_input_len: 0.5 # second
- max_input_len: 30.0 # second
- min_output_len: 0.0 # tokens
- max_output_len: 400.0 # tokens
- min_output_input_ratio: 0.05
- max_output_input_ratio: 100.0
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean

-collator:
- vocab_filepath: data/lang_char/vocab.txt
- unit_type: 'spm'
- spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
- mean_std_filepath: ""
- augmentation_config: conf/preprocess.yaml
- feat_dim: 80
- stride_ms: 10.0
- window_ms: 25.0
- sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
- batch_size: 32
- maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
- maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
- minibatches: 0 # for debug
- batch_count: auto
- batch_bins: 0
- batch_frames_in: 0
- batch_frames_out: 0
- batch_frames_inout: 0
- augmentation_config: conf/preprocess.yaml
- num_workers: 0
- subsampling_factor: 1
- num_encs: 1
-
-
-training:
- n_epoch: 120
- accum_grad: 4
- global_grad_clip: 5.0
- optim: adam
- optim_conf:
- lr: 0.004
- weight_decay: 1e-06
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 25000
- lr_decay: 1.0
- log_interval: 100
- checkpoint:
- kbest_n: 50
- latest_n: 5
-
-
-decoding:
- batch_size: 64
- error_rate_type: wer
- decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
- alpha: 2.5
- beta: 0.3
- beam_size: 10
- cutoff_prob: 1.0
- cutoff_top_n: 0
- num_proc_bsearch: 8
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: False # simulate streaming inference. Defaults to False.
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+###########################################
+# Training #
+###########################################
+n_epoch: 120
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+ lr: 0.004
+ weight_decay: 1e-06
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..0760e721
--- /dev/null
+++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 128
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+ # <0: for decoding, use full chunk.
+ # >0: for decoding, use fixed chunk size as set.
+ # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: true # simulate streaming inference. Defaults to False.
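
For anyone tracing how these new conf/tuning/*.yaml files are consumed: at test time they are merged into the training config under a "decode" namespace, which is what makes the decode.* keys addressable from --opts. A minimal sketch of that flow with yacs (file paths are illustrative; the merge pattern mirrors the test.py/alignment.py changes elsewhere in this series):

    from yacs.config import CfgNode

    # Training/model config: flattened keys, no model:/collator:/training: nesting.
    config = CfgNode(new_allowed=True)
    config.merge_from_file('conf/transformer.yaml')

    # Decoding-only config, attached under the decode namespace.
    decode_confs = CfgNode(new_allowed=True)
    decode_confs.merge_from_file('conf/tuning/decode.yaml')
    config.decode = decode_confs

    # CLI overrides (--opts) target the same namespace; yacs literal-evals
    # string values, so '1' becomes the integer 1.
    config.merge_from_list(['decode.decoding_method', 'ctc_prefix_beam_search',
                            'decode.decode_batch_size', '1'])
    config.freeze()
    assert config.decode.beam_size == 10
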
diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index c65d611c..95472e10 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index aa06132e..ddb6c6b6 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -15,8 +15,8 @@ recog_set="test-clean" stage=0 stop_stage=100 -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -76,10 +78,11 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh index ab6d685d..60eaadbf 100755 --- a/examples/librispeech/asr1/local/test_wav.sh +++ b/examples/librispeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -49,10 +50,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_config ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index f839e5af..116dae12 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -8,6 +8,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 audio_file=data/demo_002_en.wav @@ -34,17 +35,17 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb850902..085581f2 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,105 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: 
data/manifest.tiny
- test_manifest: data/manifest.tiny
- min_input_len: 0.5 # second
- max_input_len: 20.0 # second
- min_output_len: 0.0 # tokens
- max_output_len: 400.0 # tokens
- min_output_input_ratio: 0.05
- max_output_input_ratio: 10.0
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.tiny
+dev_manifest: data/manifest.tiny
+test_manifest: data/manifest.tiny
+min_input_len: 0.5 # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0

-collator:
- mean_std_filepath: ""
- vocab_filepath: data/lang_char/vocab.txt
- unit_type: 'spm'
- spm_model_prefix: 'data/lang_char/bpe_unigram_200'
- augmentation_config: conf/preprocess.yaml
- batch_size: 4
- raw_wav: True # use raw_wav or kaldi feature
- spectrum_type: fbank #linear, mfcc, fbank
- feat_dim: 80
- delta_delta: False
- dither: 1.0
- target_sample_rate: 16000
- max_freq: None
- n_fft: None
- stride_ms: 10.0
- window_ms: 25.0
- use_dB_normalization: True
- target_dB: -20
- random_seed: 0
- keep_transcription_text: False
- sortagrad: True
- shuffle_method: batch_shuffle
- num_workers: 2
+###########################################
+# Dataloader #
+###########################################
+mean_std_filepath: ""
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_200'
+augmentation_config: conf/preprocess.yaml
+batch_size: 4
+raw_wav: True # use raw_wav or kaldi feature
+spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
+delta_delta: False
+dither: 1.0
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
+use_dB_normalization: True
+target_dB: -20
+random_seed: 0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2

-# network architecture
-model:
- cmvn_file: "data/mean_std.json"
- cmvn_file_type: "json"
- # encoder related
- encoder: conformer
- encoder_conf:
- output_size: 256 # dimension of attention
- attention_heads: 4
- linear_units: 2048 # the number of units of position-wise feed forward
- num_blocks: 12 # the number of encoder blocks
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.0
- input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
- normalize_before: true
- use_cnn_module: True
- cnn_module_kernel: 15
- activation_type: 'swish'
- pos_enc_layer_type: 'rel_pos'
- selfattention_layer_type: 'rel_selfattn'
- # decoder related
- decoder: transformer
- decoder_conf:
- attention_heads: 4
- linear_units: 2048
- num_blocks: 6
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.0
- src_attention_dropout_rate: 0.0
+############################################
+# Network Architecture #
+############################################
+cmvn_file: "data/mean_std.json"
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true
+ use_cnn_module: True
+ cnn_module_kernel: 15
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rel_pos'
+ selfattention_layer_type: 'rel_selfattn'

- # hybrid CTC/attention
- model_conf:
- ctc_weight: 0.3
- lsm_weight: 0.1 # label smoothing option
- length_normalized_loss: false
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false

-training:
- n_epoch: 5
- accum_grad: 4
- global_grad_clip: 5.0
- optim: adam
- optim_conf:
- lr: 0.002
- weight_decay: 1e-06
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 25000
- lr_decay: 1.0
- log_interval: 1
- checkpoint:
- kbest_n: 10
- latest_n: 1
+###########################################
+# training #
+###########################################
+n_epoch: 5
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+ lr: 0.002
+ weight_decay: 1e-06
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 1
+checkpoint:
+ kbest_n: 10
+ latest_n: 1

-decoding:
- batch_size: 64
- error_rate_type: wer
- decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
- alpha: 2.5
- beta: 0.3
- beam_size: 10
- cutoff_prob: 1.0
- cutoff_top_n: 0
- num_proc_bsearch: 8
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: False # simulate streaming inference. Defaults to False.
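
The practical effect of the flattening shown above for config consumers: values that used to be read from nested sections (config.data.*, config.collator.*, config.model.*, config.training.*) now sit at the root of the config node. A small, hedged illustration with yacs (the file path matches the diff above; the old nested attribute names are shown only for contrast and are not part of this patch):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.merge_from_file('examples/tiny/asr1/conf/conformer.yaml')

    # Formerly config.training.n_epoch, config.collator.batch_size and
    # config.model.encoder; after the refactor they are top-level keys:
    print(config.n_epoch)     # 5
    print(config.batch_size)  # 4
    print(config.encoder)     # 'conformer'
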
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index c641d1f5..95c7df50 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -1,110 +1,98 @@
# https://yaml.org/type/float.html
-data:
- train_manifest: data/manifest.tiny
- dev_manifest: data/manifest.tiny
- test_manifest: data/manifest.tiny
- min_input_len: 0.5 # second
- max_input_len: 20.0 # second
- min_output_len: 0.0 # tokens
- max_output_len: 400.0 # tokens
- min_output_input_ratio: 0.05
- max_output_input_ratio: 10.0
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.tiny
+dev_manifest: data/manifest.tiny
+test_manifest: data/manifest.tiny
+min_input_len: 0.5 # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0

-collator:
- mean_std_filepath: data/mean_std.json
- vocab_filepath: data/lang_char/vocab.txt
- unit_type: 'spm'
- spm_model_prefix: 'data/lang_char/bpe_unigram_200'
- augmentation_config: conf/preprocess.yaml
- batch_size: 4
- raw_wav: True # use raw_wav or kaldi feature
- spectrum_type: fbank #linear, mfcc, fbank
- feat_dim: 80
- delta_delta: False
- dither: 1.0
- target_sample_rate: 16000
- max_freq: None
- n_fft: None
- stride_ms: 10.0
- window_ms: 25.0
- use_dB_normalization: True
- target_dB: -20
- random_seed: 0
- keep_transcription_text: False
- sortagrad: True
- shuffle_method: batch_shuffle
- num_workers: 2
+###########################################
+# Dataloader #
+###########################################
+mean_std_filepath: data/mean_std.json
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_200'
+augmentation_config: conf/preprocess.yaml
+batch_size: 4
+raw_wav: True # use raw_wav or kaldi feature
+spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
+delta_delta: False
+dither: 1.0
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
+use_dB_normalization: True
+target_dB: -20
+random_seed: 0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2

-# network architecture
-model:
- cmvn_file:
- cmvn_file_type: "json"
- # encoder related
- encoder: transformer
- encoder_conf:
- output_size: 256 # dimension of attention
- attention_heads: 4
- linear_units: 2048 # the number of units of position-wise feed forward
- num_blocks: 12 # the number of encoder blocks
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- attention_dropout_rate: 0.0
- input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
- normalize_before: true
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+ input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: true

- # decoder related
- decoder: transformer
- decoder_conf:
- attention_heads: 4
- linear_units: 2048
- num_blocks: 6
- dropout_rate: 0.1
- positional_dropout_rate: 0.1
- self_attention_dropout_rate: 0.0
- src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0

- # hybrid CTC/attention
- model_conf:
- ctc_weight: 0.3
- lsm_weight: 0.1 # label smoothing option
- length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false

-training:
- n_epoch: 5
- accum_grad: 1
- global_grad_clip: 5.0
- optim: adam
- optim_conf:
- lr: 0.002
- weight_decay: 1e-06
- scheduler: warmuplr
- scheduler_conf:
- warmup_steps: 25000
- lr_decay: 1.0
- log_interval: 1
- checkpoint:
- kbest_n: 2
- latest_n: 1
+###########################################
+# training #
+###########################################
+n_epoch: 5
+accum_grad: 1
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+ lr: 0.002
+ weight_decay: 1e-06
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 1
+checkpoint:
+ kbest_n: 2
+ latest_n: 1

-decoding:
- batch_size: 8 #64
- error_rate_type: wer
- decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
- lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
- alpha: 2.5
- beta: 0.3
- beam_size: 10
- cutoff_prob: 1.0
- cutoff_top_n: 0
- num_proc_bsearch: 8
- ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
- # <0: for decoding, use full chunk.
- # >0: for decoding, use fixed chunk size as set.
- # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: False # simulate streaming inference. Defaults to False.
- diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index f8397ed0..229f696d 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -46,7 +46,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index f179ea81..419594bf 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -50,7 +50,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e5671a43..766e4173 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -81,7 +81,7 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - decode_config = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, @@ -135,7 +135,7 @@ if __name__ == "__main__": if args.decode_config: decode_confs = CfgNode(new_allowed=True) decode_confs.merge_from_file(args.decode_config) - config.decoding = decode_confs + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 537d0c21..2b4f6fb2 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -29,7 +29,7 @@ U2Model.params(_C) U2Trainer.params(_C) -_C.decoding = U2Tester.params() +_C.decode = U2Tester.params() def get_cfg_defaults(): From c907a8deda85f399978ce005cbf4f94d6238673b Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 31 Dec 2021 05:55:14 +0000 Subject: [PATCH 5/9] change all recipes --- examples/aishell/asr0/conf/deepspeech2.yaml | 124 +++++------ .../aishell/asr0/conf/deepspeech2_online.yaml | 126 ++++++----- .../asr0/conf/tuning/chunk_decode.yaml | 10 + examples/aishell/asr0/conf/tuning/decode.yaml | 10 + examples/aishell/asr0/local/test.sh | 10 +- examples/aishell/asr0/local/test_export.sh | 10 +- examples/aishell/asr0/local/test_hub_ori | 47 ++++ examples/aishell/asr0/local/test_wav.sh | 12 +- examples/aishell/asr0/run.sh | 7 +- .../aishell/asr1/conf/chunk_conformer.yaml | 7 +- examples/aishell/asr1/conf/conformer.yaml | 5 +- examples/aishell/asr1/conf/transformer.yaml | 7 +- examples/aishell/asr1/local/align.sh | 2 +- examples/aishell/asr1/local/test.sh | 6 +- examples/aishell/asr1/local/test_wav.sh | 2 +- .../callcenter/asr1/conf/chunk_conformer.yaml | 204 ++++++++--------- examples/callcenter/asr1/conf/conformer.yaml | 193 +++++++--------- examples/callcenter/asr1/conf/preprocess.yaml | 2 +- .../asr1/conf/tuning/chunk_decode.yaml | 11 + .../callcenter/asr1/conf/tuning/decode.yaml | 13 ++ examples/callcenter/asr1/local/align.sh | 10 +- examples/callcenter/asr1/local/test.sh | 18 +- examples/callcenter/asr1/run.sh | 5 +- .../librispeech/asr0/conf/deepspeech2.yaml | 123 +++++------ 
.../asr0/conf/deepspeech2_online.yaml | 127 ++++++----- .../asr0/conf/tuning/chunk_decode.yaml | 10 + .../librispeech/asr0/conf/tuning/decode.yaml | 10 + examples/librispeech/asr0/local/test.sh | 10 +- examples/librispeech/asr0/local/test_wav.sh | 12 +- examples/librispeech/asr0/run.sh | 5 +- .../asr1/conf/chunk_conformer.yaml | 8 +- .../asr1/conf/chunk_transformer.yaml | 5 +- examples/librispeech/asr1/conf/conformer.yaml | 5 +- .../librispeech/asr1/conf/transformer.yaml | 5 +- examples/librispeech/asr1/local/align.sh | 2 +- examples/librispeech/asr1/local/test.sh | 6 +- examples/librispeech/asr1/local/test_wav.sh | 2 +- .../asr2/conf/decode/decode_base.yaml | 11 + .../librispeech/asr2/conf/transformer.yaml | 151 ++++++------- examples/librispeech/asr2/local/align.sh | 12 +- examples/librispeech/asr2/local/test.sh | 10 +- examples/librispeech/asr2/run.sh | 8 +- .../other/1xt2x/aishell/conf/deepspeech2.yaml | 122 +++++------ .../1xt2x/aishell/conf/tuning/decode.yaml | 10 + examples/other/1xt2x/aishell/local/test.sh | 10 +- examples/other/1xt2x/aishell/run.sh | 3 +- .../1xt2x/baidu_en8k/conf/deepspeech2.yaml | 123 +++++------ .../1xt2x/baidu_en8k/conf/tuning/decode.yaml | 10 + examples/other/1xt2x/baidu_en8k/local/test.sh | 10 +- examples/other/1xt2x/baidu_en8k/run.sh | 3 +- .../1xt2x/librispeech/conf/deepspeech2.yaml | 123 +++++------ .../1xt2x/librispeech/conf/tuning/decode.yaml | 10 + .../other/1xt2x/librispeech/local/test.sh | 10 +- examples/other/1xt2x/librispeech/run.sh | 3 +- .../other/1xt2x/src_deepspeech2x/bin/test.py | 5 + .../models/ds2/deepspeech2.py | 12 +- .../1xt2x/src_deepspeech2x/test_model.py | 90 +++----- examples/ted_en_zh/st0/conf/transformer.yaml | 191 ++++++++-------- .../st0/conf/transformer_mtl_noam.yaml | 190 ++++++++-------- .../ted_en_zh/st0/conf/tuning/decode.yaml | 11 + examples/ted_en_zh/st0/local/test.sh | 12 +- examples/ted_en_zh/st0/run.sh | 3 +- examples/ted_en_zh/st1/conf/transformer.yaml | 191 ++++++++-------- .../st1/conf/transformer_mtl_noam.yaml | 191 ++++++++-------- .../ted_en_zh/st1/conf/tuning/decode.yaml | 12 + examples/ted_en_zh/st1/local/test.sh | 12 +- examples/ted_en_zh/st1/run.sh | 3 +- examples/timit/asr1/conf/transformer.yaml | 181 +++++++-------- examples/timit/asr1/conf/tuning/decode.yaml | 11 + examples/timit/asr1/local/align.sh | 10 +- examples/timit/asr1/local/test.sh | 22 +- examples/timit/asr1/run.sh | 13 +- examples/tiny/asr0/conf/deepspeech2.yaml | 121 +++++----- .../tiny/asr0/conf/deepspeech2_online.yaml | 126 ++++++----- .../tiny/asr0/conf/tuning/chunk_decode.yaml | 10 + examples/tiny/asr0/conf/tuning/decode.yaml | 10 + examples/tiny/asr0/local/test.sh | 10 +- examples/tiny/asr0/run.sh | 3 +- examples/tiny/asr1/conf/chunk_confermer.yaml | 206 ++++++++---------- .../tiny/asr1/conf/chunk_transformer.yaml | 190 +++++++--------- examples/tiny/asr1/conf/conformer.yaml | 80 +++---- examples/tiny/asr1/conf/transformer.yaml | 76 +++---- .../tiny/asr1/conf/tuning/chunk_decode.yaml | 11 + examples/tiny/asr1/conf/tuning/decode.yaml | 11 + examples/tiny/asr1/local/align.sh | 10 +- examples/tiny/asr1/local/test.sh | 17 +- examples/tiny/asr1/run.sh | 5 +- examples/wenetspeech/asr1/conf/conformer.yaml | 189 ++++++++-------- .../wenetspeech/asr1/conf/tuning/decode.yaml | 11 + examples/wenetspeech/asr1/local/test.sh | 17 +- examples/wenetspeech/asr1/local/test_wav.sh | 14 +- examples/wenetspeech/asr1/run.sh | 8 +- .../exps/deepspeech2/bin/deploy/runtime.py | 32 +-- .../s2t/exps/deepspeech2/bin/deploy/server.py | 32 +-- 
paddlespeech/s2t/exps/deepspeech2/bin/test.py | 6 + .../s2t/exps/deepspeech2/bin/test_export.py | 6 + .../s2t/exps/deepspeech2/bin/test_wav.py | 21 +- paddlespeech/s2t/exps/deepspeech2/config.py | 11 - paddlespeech/s2t/exps/deepspeech2/model.py | 132 ++++++----- paddlespeech/s2t/exps/u2/bin/alignment.py | 4 +- paddlespeech/s2t/exps/u2/bin/test.py | 4 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 6 +- paddlespeech/s2t/exps/u2/config.py | 10 +- paddlespeech/s2t/exps/u2/model.py | 16 +- paddlespeech/s2t/exps/u2/trainer.py | 58 ++--- paddlespeech/s2t/exps/u2_kaldi/bin/test.py | 4 + paddlespeech/s2t/exps/u2_kaldi/model.py | 75 ++++--- paddlespeech/s2t/exps/u2_st/bin/test.py | 10 +- paddlespeech/s2t/exps/u2_st/config.py | 10 +- paddlespeech/s2t/exps/u2_st/model.py | 110 +++++----- paddlespeech/s2t/io/collator.py | 58 +++-- paddlespeech/s2t/io/dataset.py | 18 +- paddlespeech/s2t/models/ds2/deepspeech2.py | 14 +- .../s2t/models/ds2_online/deepspeech2.py | 18 +- paddlespeech/s2t/training/cli.py | 2 +- tests/benchmark/conformer/run.sh | 5 +- tests/benchmark/conformer/run_benchmark.sh | 20 +- .../ds2/ds2_params_lite_train_infer.txt | 4 +- .../ds2/ds2_params_whole_train_infer.txt | 2 +- tests/chains/ds2/lite_train_infer.sh | 4 +- tests/chains/ds2/prepare.sh | 8 +- tests/chains/ds2/test.sh | 1 + 122 files changed, 2427 insertions(+), 2359 deletions(-) create mode 100644 examples/aishell/asr0/conf/tuning/chunk_decode.yaml create mode 100644 examples/aishell/asr0/conf/tuning/decode.yaml create mode 100755 examples/aishell/asr0/local/test_hub_ori create mode 100644 examples/callcenter/asr1/conf/tuning/chunk_decode.yaml create mode 100644 examples/callcenter/asr1/conf/tuning/decode.yaml create mode 100644 examples/librispeech/asr0/conf/tuning/chunk_decode.yaml create mode 100644 examples/librispeech/asr0/conf/tuning/decode.yaml create mode 100644 examples/librispeech/asr2/conf/decode/decode_base.yaml create mode 100644 examples/other/1xt2x/aishell/conf/tuning/decode.yaml create mode 100644 examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml create mode 100644 examples/other/1xt2x/librispeech/conf/tuning/decode.yaml create mode 100644 examples/ted_en_zh/st0/conf/tuning/decode.yaml create mode 100644 examples/ted_en_zh/st1/conf/tuning/decode.yaml create mode 100644 examples/timit/asr1/conf/tuning/decode.yaml create mode 100644 examples/tiny/asr0/conf/tuning/chunk_decode.yaml create mode 100644 examples/tiny/asr0/conf/tuning/decode.yaml create mode 100644 examples/tiny/asr1/conf/tuning/chunk_decode.yaml create mode 100644 examples/tiny/asr1/conf/tuning/decode.yaml create mode 100644 examples/wenetspeech/asr1/conf/tuning/decode.yaml diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index bdfa4219..1dc8581e 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -1,68 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 
+max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 0 - ctc_grad_norm_type: instance +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 0 +ctc_grad_norm_type: instance -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 1.9 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 2f63f4de..c49973a2 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear #linear, mfcc, fbank - feat_dim: - delta_delta: False - stride_ms: 10.0 - 
window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear #linear, mfcc, fbank +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 5 - rnn_layer_size: 1024 - rnn_direction: forward # [forward, bidirect] - num_fc_layers: 0 - fc_layers_size_list: -1, - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 5 +rnn_layer_size: 1024 +rnn_direction: forward # [forward, bidirect] +num_fc_layers: 0 +fc_layers_size_list: -1, +use_gru: False +blank_id: 0 -training: - n_epoch: 65 - accum_grad: 1 - lr: 5e-4 - lr_decay: 0.93 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 65 +accum_grad: 1 +lr: 5e-4 +lr_decay: 0.93 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.2 #1.9 - beta: 4.3 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..9de06711 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +chunk_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.2 #1.9 +beta: 4.3 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..5778e656 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 1.9 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4

# download language model
bash local/download_lm_ch.sh
@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
+--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
--model_type ${model_type}
diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh
index 4f5e5c8b..7a4b87f8 100755
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
@@ -1,7 +1,7 @@
#!/bin/bash
-if [ $# != 3 ];then
- echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+ echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1
fi
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
-jit_model_export_path=$2
-model_type=$3
+decode_config_path=$2
+jit_model_export_path=$3
+model_type=$4
# download language model
bash local/download_lm_ch.sh > /dev/null 2>&1
@@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test_export.py \
--ngpu ${ngpu} \
--config ${config_path} \
+--decode_cfg ${decode_config_path} \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \
--model_type ${model_type}
diff --git a/examples/aishell/asr0/local/test_hub_ori b/examples/aishell/asr0/local/test_hub_ori
new file mode 100755
index 00000000..ee1fb805
--- /dev/null
+++ b/examples/aishell/asr0/local/test_hub_ori
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+ echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+model_type=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+if [ ! -f ${audio_file} ]; then
+ echo "Please input the right audio_file path"
+ exit 1
+fi
+
+# download language model
+bash local/download_lm_ch.sh
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+python3 -u ${BIN_DIR}/test_hub.py \
+--nproc ${ngpu} \
+--config ${config_path} \
+--result_file ${ckpt_prefix}.rsl \
+--checkpoint_path ${ckpt_prefix} \
+--model_type ${model_type} \
+--audio_file ${audio_file}
+
+if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+fi
+
+
+exit 0
diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh
index 4a6d92fb..62b005a6 100755
--- a/examples/aishell/asr0/local/test_wav.sh
+++ b/examples/aishell/asr0/local/test_wav.sh
@@ -1,7 +1,7 @@
#!/bin/bash
-if [ $# != 4 ];then
- echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 5 ];then
+ echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
exit -1
fi
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 270b88fc..15685f21 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline # offline or online audio_file=data/demo_01_03.wav @@ -34,7 +35,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -44,11 +45,11 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 31e9be13..68e852ba 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -54,8 +54,9 @@ test_manifest: data/manifest.test ########################################### vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' unit_type: 'char' -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -74,7 +75,7 @@ subsampling_factor: 1 num_encs: 1 ########################################### -# training # +# Training # ########################################### n_epoch: 240 accum_grad: 2 @@ -82,7 +83,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-6 + weight_decay: 1.0e-6 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index d9e3daec..0a931e95 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -49,8 +49,9 @@ test_manifest: data/manifest.test # Dataloader # ########################################### vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' unit_type: 'char' -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 
stride_ms: 10.0 window_ms: 25.0 @@ -69,7 +70,7 @@ subsampling_factor: 1 num_encs: 1 ########################################### -# training # +# Training # ########################################### n_epoch: 240 accum_grad: 2 diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index e6684ec8..9d294653 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -46,6 +46,7 @@ test_manifest: data/manifest.test ########################################### unit_type: 'char' vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -59,13 +60,13 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 ########################################### -# training # +# Training # ########################################### n_epoch: 240 accum_grad: 2 @@ -73,7 +74,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-6 + weight_decay: 1.0e-6 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index 95472e10..14d91d68 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -21,7 +21,7 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ ---decode_config ${decode_config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decode_batch_size ${batch_size} diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index cab7c34e..65b884e5 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do # stream decoding only support batchsize=1 batch_size=1 else - batch_size=1 + batch_size=64 fi output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ @@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh index 661013b1..d029f2fd 100755 --- a/examples/aishell/asr1/local/test_wav.sh +++ b/examples/aishell/asr1/local/test_wav.sh @@ -43,7 +43,7 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index 69959c68..19e783a6 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ 
b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -1,120 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +batch_size: 32 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 8000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of 
position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+  causal: true
+  use_dynamic_chunk: true
+  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+  use_dynamic_left_chunk: false

-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.

+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
index 80c15abb..f6fcb949 100644
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -1,117 +1,92 @@
# https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.0
-  max_output_input_ratio: .inf
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test

-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 8000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+spm_model_prefix: ''
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1

-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'

-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0

-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-
-training:
-  n_epoch: 100 # 50 will be lowest
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-6
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-
-
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
- num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 100 # 50 will be lowest +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml index f7f4c58d..877e7d5a 100644 --- a/examples/callcenter/asr1/conf/preprocess.yaml +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -1,7 +1,7 @@ process: # extract kaldi fbank from PCM - type: fbank_kaldi - fs: 16000 + fs: 8000 n_mels: 80 n_shift: 160 win_length: 400 diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..49a6a114 --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d2e0b72d --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/decode.yaml @@ -0,0 +1,13 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index 681c77ed..1397ae57 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3

ckpt_name=$(basename ${ckpt_prefix})

@@ -25,9 +26,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \
--config ${config_path} \
+--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
---opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh
index fc43c5a2..b7ff722a 100755
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
@@ -1,7 +1,7 @@
#! /usr/bin/env bash

-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
fi

@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3
+

ckpt_name=$(basename ${ckpt_prefix})

@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
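The --opts overrides above now target the decode.* namespace instead of the old decoding.*, and batch_size is renamed decode_batch_size. A minimal sketch of how such dotted overrides behave under yacs, the config library these entry points rely on; the starting values below are illustrative, not taken from a real run:

    from yacs.config import CfgNode

    # A config shaped like the ones these scripts build: a root node with a
    # `decode` sub-node attached, as elsewhere in this patch.
    config = CfgNode(new_allowed=True)
    config.decode = CfgNode(new_allowed=True)
    config.decode.decoding_method = "attention"
    config.decode.decode_batch_size = 128

    # What `--opts decode.decoding_method ctc_greedy_search \
    #        --opts decode.decode_batch_size 1` boils down to:
    config.merge_from_list([
        "decode.decoding_method", "ctc_greedy_search",
        "decode.decode_batch_size", 1,
    ])

    assert config.decode.decoding_method == "ctc_greedy_search"
    assert config.decode.decode_batch_size == 1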
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index e9be3d03..86730ce1 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index f3574e15..0b0a1550 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -1,68 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 20 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 20 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 50 - 
accum_grad: 1 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 0d16bc57..8bd5a672 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 15 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 15 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: False +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 4 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: 
data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 4 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh index e8337da7..25cfc45e 100755 --- a/examples/librispeech/asr0/local/test_wav.sh +++ b/examples/librispeech/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
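The ctc_beam_search decode configs above carry alpha and beta next to the language model path. In the usual DeepSpeech2-style shallow fusion (a sketch of the standard formulation; the exact scoring lives inside the beam-search decoder, which this patch does not touch), alpha weights the external language model and beta is a word-insertion bonus:

    Q(y) = \log p_{ctc}(y \mid x) + \alpha \log p_{lm}(y) + \beta \, wc(y)

where wc(y) is the word count of candidate transcription y; cutoff_prob and cutoff_top_n prune the per-step character candidates before beams are extended, and num_proc_bsearch sets the number of parallel beam-search workers.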
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 5d811b65..ca2c2b9d 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 model_type=offline audio_file=data/demo_002_en.wav @@ -33,7 +34,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -43,5 +44,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index ace61d36..72b9cb7b 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -70,8 +70,7 @@ batch_count: auto batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 -batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml +batch_frames_inout: 0 num_workers: 0 subsampling_factor: 1 num_encs: 1 @@ -85,10 +84,11 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.001 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 + lr_decay: 1.0 log_interval: 100 checkpoint: kbest_n: 50 diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index d6d84eb1..19ade8ad 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -64,7 +64,6 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 @@ -79,7 +78,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.001 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr 
scheduler_conf: warmup_steps: 25000 diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index bb028e69..4f7b759b 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -69,7 +69,6 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 @@ -84,7 +83,7 @@ global_grad_clip: 3.0 optim: adam optim_conf: lr: 0.004 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index f81234f1..740ce78f 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -63,7 +63,6 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 @@ -78,7 +77,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.004 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index 95472e10..14d91d68 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -21,7 +21,7 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ ---decode_config ${decode_config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decode_batch_size ${batch_size} diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index ddb6c6b6..51ced18b 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ @@ -78,7 +78,7 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ @@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ - --decode_config ${decode_config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path 
${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh
index 60eaadbf..e70fc83c 100755
--- a/examples/librispeech/asr1/local/test_wav.sh
+++ b/examples/librispeech/asr1/local/test_wav.sh
@@ -50,7 +50,7 @@ for type in attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-    --decode_config ${decode_config_path} \
+    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml
new file mode 100644
index 00000000..384ed197
--- /dev/null
+++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 1
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index a16563a5..32d95b41 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -1,73 +1,80 @@
# https://yaml.org/type/float.html
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: true

-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0

-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+
lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/train_960_unigram5000_units.txt +unit_type: spm +spm_model_prefix: data/lang_char/train_960_unigram5000 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 30 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 2 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 optim: adam optim_conf: @@ -79,23 +86,5 @@ scheduler_conf: warmup_steps: 25000 lr_decay: 1.0 -decoding: - batch_size: 1 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
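The transformer.yaml rewrite just above repeats the pattern applied throughout this patch: the nested data/collator/model/training sections collapse into one flat namespace, and the old decoding block moves out into conf/decode/decode_base.yaml. A minimal sketch of how the two files meet at run time, assuming yacs and the file names from this example (it mirrors the logic the test entry points adopt later in the patch):

    from yacs.config import CfgNode

    # One flat config for data/model/training ...
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/transformer.yaml")

    # ... plus a decode-only config, attached under the `decode` namespace.
    decode_confs = CfgNode(new_allowed=True)
    decode_confs.merge_from_file("conf/decode/decode_base.yaml")
    config.decode = decode_confs

    config.freeze()
    # Flat access replaces the old nested access:
    #   config.training.n_epoch    -> config.n_epoch
    #   config.collator.batch_size -> config.batch_size
    print(config.n_epoch, config.decode.decoding_method)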
diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index 626c3574..60a16f42 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -dict_path=$2 -ckpt_prefix=$3 +decode_config_path=$2 +dict_path=$3 +ckpt_prefix=$4 batch_size=1 output_dir=${ckpt_prefix} @@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ --dict-path ${dict_path} \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index d210f2a8..bf6428d6 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} bpemodel=${bpeprefix}.model config_path=conf/transformer.yaml +decode_config_path=conf/decode/decode_base.yaml dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt -ckpt_prefix= +ckpt_prefix=exp/transformer/checkpoints/init source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --checkpoint_path ${ckpt_prefix} \ --result-file ${decode_dir}/data.JOB.json \ - --opts decoding.decoding_method ${dmethd} \ - --opts decoding.batch_size ${batch_size} \ - --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} + --opts decode.decoding_method ${dmethd} \ + --opts decode.decode_batch_size ${batch_size} \ + --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh index 5b7596f2..56671233 100755 --- a/examples/librispeech/asr2/run.sh +++ b/examples/librispeech/asr2/run.sh @@ -9,12 +9,14 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=50 conf_path=conf/transformer.yaml -dict_path=lang_char/train_960_unigram5000_units.txt +decode_conf_path=conf/decode/decode_base.yaml +dict_path=data/lang_char/train_960_unigram5000_units.txt avg_num=10 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} +avg_ckpt=init ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') echo "checkpoint name ${ckpt}" @@ -35,7 +37,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # attetion resocre decoder - ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -45,7 +47,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml index c2d69226..c2db2c7c 100644 --- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml @@ -1,67 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 4333 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 4333 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 80 
+accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.6 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml new file mode 100644 index 00000000..b5283a93 --- /dev/null +++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.6 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh index 1ccac1c3..89a63411 100755 --- a/examples/other/1xt2x/aishell/run.sh +++ b/examples/other/1xt2x/aishell/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=2 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml index be51a9b9..0c08fbc6 100644 --- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: .inf # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: .inf # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: 
.inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.4 - beta: 0.35 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml new file mode 100644 index 00000000..f52dde32 --- /dev/null +++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.4 +beta: 0.35 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh index b7f69f6b..82de56b0 100755 --- a/examples/other/1xt2x/baidu_en8k/run.sh +++ b/examples/other/1xt2x/baidu_en8k/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=0 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml index ad7fb2c1..a2a5649b 100644 --- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 1000.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 1000.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False 
+share_rnn_weights: True +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml new file mode 100644 index 00000000..f3b51def --- /dev/null +++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2..8b614bbb 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9..b404cce8 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser @@ -44,6 +45,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41d..4c20ffcd 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d8..53a4e629 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = 
len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def __init__(self, config, 
args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 36f287b1..5fe04619 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,109 +1,96 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train.tiny +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 
+keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
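With the data/collator/model/training wrappers gone, code reads keys directly off the root config; only decoding options keep a namespace, renamed from `decoding` to `decode`. A minimal sketch of the new key layout (values taken from this recipe, built in memory rather than loaded from the yaml files):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.batch_size = 10                            # was config.collator.batch_size
    config.n_epoch = 120                              # was config.training.n_epoch
    config.decode = CfgNode(new_allowed=True)
    config.decode.decoding_method = "fullsentence"    # was config.decoding.decoding_method
    config.freeze()
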
+########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 78887d3c..128561f7 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,112 +1,100 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise 
feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 00000000..ed081cf4 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index a9b18dd9..5c782e5b 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" @@ -17,10 +18,11 @@ for type in fullsentence; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a..1746c025 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index 609c5824..bea8d9ab 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,110 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train.tiny +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 5.0 # frame +max_input_len: 3000.0 # frame +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: 
conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 83 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
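The `--opts decode.<key> <value>` pairs that the updated test scripts pass end up in config.merge_from_list, so command-line overrides now use the `decode.` prefix instead of `decoding.`. A sketch of that override path (values illustrative; yacs coerces the override strings to the types of the existing keys):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.decode = CfgNode(new_allowed=True)
    config.decode.decoding_method = "fullsentence"
    config.decode.decode_batch_size = 5

    # what `--opts decode.decoding_method ... decode.decode_batch_size ...` becomes
    config.merge_from_list(["decode.decoding_method", "fullsentence",
                            "decode.decode_batch_size", "1"])
    assert config.decode.decode_batch_size == 1    # "1" coerced to int
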
+########################################### +# Training # +########################################### +n_epoch: 20 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 10eccd1e..31f7245d 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,110 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 5.0 # frame +max_input_len: 3000.0 # frame +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 83 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed 
forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 20 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d6104dbc --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,12 @@ + +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index a9b18dd9..5c782e5b 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" @@ -17,10 +18,11 @@ for type in fullsentence; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b..67309919 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi \ No newline at end of file diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4..4731395f 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced 

+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
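Because the merged config is frozen after parsing, trainer code that injects derived values (feature size, vocabulary size) into the now-flat namespace clones and defrosts it first, as DeepSpeech2Trainer.setup_model does in the test_model.py hunk above. The pattern, sketched with made-up values:

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.batch_size = 64
    config.freeze()

    cfg = config.clone()     # keep the shared config immutable
    cfg.defrost()            # unfreeze the copy before mutating it
    cfg.feat_size = 80       # from dataloader.collate_fn.feature_size at runtime
    cfg.dict_size = 5000     # hypothetical len(collate_fn.vocab_list)
    cfg.freeze()
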
- +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e36..88192c58 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3a..0d84be9f 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d47..a16a79d3 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - 
window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 0.8 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439..5458cfb3 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt 
+augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 1.0 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa..25f04624 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478d..cd072c14 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,98 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" 
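With decoding options split into their own file, the test stage now takes two configs, e.g. ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/${ckpt}/checkpoints/${avg_ckpt} offline. On the Python side (the same pattern repeats verbatim in the entry-point hunks further down) the file named by --decode_cfg is merged onto the training config as a decode sub-node. A condensed sketch with illustrative paths:

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/deepspeech2.yaml")  # flattened training config

    # What --decode_cfg does: attach the decode YAML as config.decode.*
    decode_confs = CfgNode(new_allowed=True)
    decode_confs.merge_from_file("conf/tuning/decode.yaml")
    config.decode = decode_confs

    print(config.decode.decoding_method)    # e.g. ctc_beam_search
    print(config.decode.decode_batch_size)  # replaces the old decoding.batch_size
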
+cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb..2570bb85 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder 
input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index 085581f2..eb8f0ab9 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,46 +1,4 @@ # https://yaml.org/type/float.html -########################################### -# Data # -########################################### -train_manifest: data/manifest.tiny -dev_manifest: data/manifest.tiny -test_manifest: data/manifest.tiny -min_input_len: 0.5 # second -max_input_len: 20.0 # second -min_output_len: 0.0 # tokens -max_output_len: 400.0 # tokens -min_output_input_ratio: 0.05 -max_output_input_ratio: 10.0 - - -########################################### -# Dataloader # -########################################### -mean_std_filepath: "" -vocab_filepath: data/lang_char/vocab.txt -unit_type: 'spm' -spm_model_prefix: 'data/lang_char/bpe_unigram_200' -augmentation_config: conf/preprocess.yaml -batch_size: 4 -raw_wav: True # use raw_wav or kaldi feature -spectrum_type: fbank #linear, mfcc, fbank -feat_dim: 80 -delta_delta: False -dither: 1.0 -target_sample_rate: 16000 -max_freq: None -n_fft: None -stride_ms: 10.0 -window_ms: 25.0 -use_dB_normalization: True -target_dB: -20 -random_seed: 0 -keep_transcription_text: False -sortagrad: True -shuffle_method: batch_shuffle -num_workers: 2 - - ############################################ # Network Architecture # ############################################ @@ -83,7 +41,41 @@ model_conf: ########################################### -# training # +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + + +########################################### +# Training # ########################################### n_epoch: 5 accum_grad: 4 @@ -91,7 +83,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index 95c7df50..4e3068d1 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,44 +1,4 @@ # https://yaml.org/type/float.html -########################################### -# Data # 
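The weight_decay: 1e-06 to 1.0e-06 rewrites in the conformer.yaml hunk above (and in transformer.yaml just below) are not cosmetic. The "# https://yaml.org/type/float.html" comment these files carry points at a YAML 1.1 quirk: scientific notation without a dot in the mantissa does not match the float tag, so spec-conforming loaders read 1e-6 as a string. A quick demonstration, assuming PyYAML as the loader:

    import yaml

    # YAML 1.1 requires a "." in exponent-form floats, so "1e-6" falls
    # through the float resolver and loads as a plain string.
    print(type(yaml.safe_load("weight_decay: 1e-6")["weight_decay"]))    # <class 'str'>
    print(type(yaml.safe_load("weight_decay: 1.0e-6")["weight_decay"]))  # <class 'float'>
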
-########################################### -train_manifest: data/manifest.tiny -dev_manifest: data/manifest.tiny -test_manifest: data/manifest.tiny -min_input_len: 0.5 # second -max_input_len: 20.0 # second -min_output_len: 0.0 # tokens -max_output_len: 400.0 # tokens -min_output_input_ratio: 0.05 -max_output_input_ratio: 10.0 - -########################################### -# Dataloader # -########################################### -mean_std_filepath: data/mean_std.json -vocab_filepath: data/lang_char/vocab.txt -unit_type: 'spm' -spm_model_prefix: 'data/lang_char/bpe_unigram_200' -augmentation_config: conf/preprocess.yaml -batch_size: 4 -raw_wav: True # use raw_wav or kaldi feature -spectrum_type: fbank #linear, mfcc, fbank -feat_dim: 80 -delta_delta: False -dither: 1.0 -target_sample_rate: 16000 -max_freq: None -n_fft: None -stride_ms: 10.0 -window_ms: 25.0 -use_dB_normalization: True -target_dB: -20 -random_seed: 0 -keep_transcription_text: False -sortagrad: True -shuffle_method: batch_shuffle -num_workers: 2 - ############################################ # Network Architecture # ############################################ @@ -74,9 +34,41 @@ model_conf: lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + ########################################### -# training # +# Training # ########################################### n_epoch: 5 accum_grad: 1 @@ -84,7 +76,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..c5b641da --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..a0984f9e --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacff..79df969b 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
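The align.sh and test.sh hunks above also rename the override keys passed through --opts from decoding.* to decode.*, matching the new sub-node, and batch_size becomes decode_batch_size. These dotted key/value pairs go through yacs' merge_from_list; a self-contained sketch with illustrative paths and values:

    from yacs.config import CfgNode

    # Rebuild the merged config from the earlier sketch, then apply the
    # dotted overrides exactly as the shell scripts pass them via --opts.
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/transformer.yaml")
    decode_confs = CfgNode(new_allowed=True)
    decode_confs.merge_from_file("conf/tuning/decode.yaml")
    config.decode = decode_confs

    config.merge_from_list([
        "decode.decoding_method", "ctc_prefix_beam_search",
        "decode.decode_batch_size", 8,
    ])
    print(config.decode.decoding_method)  # ctc_prefix_beam_search
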
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a56..1651c034 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d..6c2bbca4 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..6924bfa6 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c779474..47464262 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
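Note that the new wenetspeech tuning/decode.yaml above keeps only the attention/CTC knobs; the LM-fusion keys from the removed decoding: block (lang_model_path, alpha, beta, cutoff_prob, cutoff_top_n, num_proc_bsearch) are gone. Since CfgNode subclasses dict, optional keys like these can be read defensively; a hedged, illustrative sketch:

    from yacs.config import CfgNode

    decode = CfgNode(new_allowed=True)
    decode.merge_from_file("conf/tuning/decode.yaml")  # illustrative path

    # Keys the slimmed-down u2 decode configs no longer define get a
    # fallback instead of a hard attribute access.
    lm_path = decode.get("lang_model_path", None)
    if lm_path is None:
        print("no external LM configured; plain attention/CTC decoding")
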
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409f..9995bc63 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c..88148323 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + 
decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -179,12 +179,16 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f..dea6d975 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -114,12 +114,16 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615fa..7ce921d6 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ 
-12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser @@ -44,6 +46,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf..7a1801d4 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser @@ -49,6 +51,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d6..28756b05 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = 
DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -190,6 +191,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py index 58dc05ff..d8eab50e 100644 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ b/paddlespeech/s2t/exps/deepspeech2/config.py @@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline def get_cfg_defaults(model_type='offline'): _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern config = _C.clone() config.set_new_allowed(True) return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64..fc214a8a 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = 
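The "with UpdateConfig(config):" blocks above and below wrap the spots where derived fields (config.input_dim, config.output_dim) are filled in from the data pipeline. Its body lives in paddlespeech/s2t/utils/utility.py, which this patch touches but does not show here; a plausible minimal sketch, assuming it is a defrost/re-freeze context manager:

    from contextlib import contextmanager

    @contextmanager
    def update_config(config):
        """Temporarily unfreeze a yacs CfgNode so derived fields can be set."""
        config.defrost()
        try:
            yield config
        finally:
            config.freeze()
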
DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, 
collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") @@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + beam_beta=decode_cfg.beta, + beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, 
num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index 229f696d..5d768536 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -43,9 +43,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 419594bf..d93954fe 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -47,9 +47,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 766e4173..554d6ca5 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -38,7 +38,7 @@ class U2Infer(): self.config = config self.audio_file = args.audio_file - self.preprocess_conf = config.augmentation_config + self.preprocess_conf = 
config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) @@ -132,9 +132,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 2b4f6fb2..44780d2e 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model _C = CfgNode(new_allowed=True) -ManifestDataset.params(_C) +# ManifestDataset.params(_C) -SpeechCollator.params(_C) +# SpeechCollator.params(_C) -U2Model.params(_C) +# U2Model.params(_C) -U2Trainer.params(_C) +# U2Trainer.params(_C) -_C.decode = U2Tester.params() +# _C.decode = U2Tester.params() def get_cfg_defaults(): diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 31610e15..f1683d70 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -264,7 +264,7 @@ class U2Trainer(Trainer): batch_frames_in=config.batch_frames_in, batch_frames_out=config.batch_frames_out, batch_frames_inout=config.batch_frames_inout, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) @@ -283,18 +283,20 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decode.decode_batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -304,7 +306,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -313,7 +315,7 @@ class U2Trainer(Trainer): json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decode.decode_batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -323,7 +325,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -557,7 +559,7 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c5..57d87316 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,77 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() 
config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
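# Note: the collator/data prefixes removed in this hunk leave a single flat
# CfgNode. A minimal sketch of the resulting access pattern, using only yacs
# (already a dependency of paddlespeech.s2t); the keys and values below are
# illustrative stand-ins for what conformer.yaml provides.
#
#     from yacs.config import CfgNode
#
#     config = CfgNode(new_allowed=True)   # stands in for merge_from_file('conf/conformer.yaml')
#     config.merge_from_other_cfg(CfgNode({
#         'train_manifest': 'data/manifest.train',
#         'batch_size': 64,
#         'sortagrad': 0,
#     }))
#
#     assert config.batch_size == 64                         # was config.collator.batch_size
#     assert config.train_manifest == 'data/manifest.train'  # was config.data.train_manifest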
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +122,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +136,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +156,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +182,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed349..422483b9 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad..887dd29e 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -80,7 +80,7 @@ class U2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +122,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, 
".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +157,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +186,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +235,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +248,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +268,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +295,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +318,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -392,9 +394,9 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = 
self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +415,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +426,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +451,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +471,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +522,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +542,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0..3ad5fc7d 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,14 @@ """Evaluation for U2 model.""" import cProfile +from yacs.config import CfgNode + from 
paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,7 +37,7 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() @@ -45,6 +47,10 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py index a48f9106..a314a1ca 100644 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ b/paddlespeech/s2t/exps/u2_st/config.py @@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel _C = CfgNode() -_C.data = ManifestDataset.params() +# _C.data = ManifestDataset.params() -_C.collator = SpeechCollator.params() +# _C.collator = SpeechCollator.params() -_C.model = U2STModel.params() +# _C.model = U2STModel.params() -_C.training = U2STTrainer.params() +# _C.training = U2STTrainer.params() -_C.decoding = U2STTester.params() +# _C.decoding = U2STTester.params() def get_cfg_defaults(): diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index a3b39df7..00f11599 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -78,7 +78,7 @@ class U2STTrainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -127,7 +127,7 @@ class U2STTrainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -168,7 +168,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -197,7 +197,7 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -245,91 +245,93 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - 
config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - if config.model.model_conf.asr_weight > 0.: + if config.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator TestCollator = SpeechCollator else: TestCollator = Collator = SpeechCollator collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = Collator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
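# Note: the decode section stays optional at train time, so the dataloader
# code added below (and earlier in u2/model.py) guards the lookup with a
# get() chain. An illustrative check of that fallback:
#
#     from yacs.config import CfgNode
#
#     config = CfgNode(new_allowed=True)                   # training run: no decode section merged
#     assert config.get('decode', dict()).get('decode_batch_size', 1) == 1
#
#     config.decode = CfgNode({'decode_batch_size': 128})  # test run: decode.yaml merged
#     assert config.get('decode', dict()).get('decode_batch_size', 1) == 128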
- # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') + # config.min_input_len = 0.0 # second + # config.max_input_len = float('inf') # second + # config.min_output_len = 0.0 # tokens + # config.max_output_len = float('inf') # tokens + # config.min_output_input_ratio = 0.00 + # config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -342,7 +344,7 @@ class U2STTrainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -428,7 +430,7 @@ class U2STTester(U2STTrainer): def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding + decode_cfg = self.config.decode text_feature = self.test_loader.collate_fn.text_feature self.model.eval() @@ -436,12 +438,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -452,7 +454,7 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() @@ -467,12 +469,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - 
num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for utt, target, result in zip(utts, refs, hyps): @@ -502,8 +504,8 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - cfg = self.config.decoding - bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu stride_ms = self.test_loader.collate_fn.stride_ms hyps, refs = [], [] @@ -549,15 +551,15 @@ class U2STTester(U2STTrainer): "num_examples": num_ins, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -569,7 +571,7 @@ class U2STTester(U2STTrainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size input_spec = [ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f233549..27bf20eb 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. 
""" - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index d64d7d3e..c76ccfce 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -54,17 +54,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. 
""" - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29..0414d04f 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 85876bce..f08e30d0 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index d4299ea3..bb85732a 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -101,7 +101,7 @@ def 
default_argument_parser(parser=None): title='Test Options', description=None) test_group.add_argument( - "--decode_config", + "--decode_cfg", metavar="DECODE_CONFIG_FILE", help="decode config file.") diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index fcd0c235..c9d640ed 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) bs_item=(16) config_path=conf/benchmark/conformer.yaml +decode_config_path=conf/tuning/decode.yaml seed=0 output=exp/conformer profiler_options=None @@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 sleep 60 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 done done diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 5b83b15c..d5902c51 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -5,13 +5,14 @@ function _set_params(){ run_mode=${1:-"sp"} # 单卡sp|多卡mp config_path=${2:-"conf/conformer.yaml"} - output=${3:-"exp/conformer"} - seed=${4:-"0"} - ngpu=${5:-"1"} - profiler_options=${6:-"None"} - batch_size=${7:-"32"} - fp_item=${8:-"fp32"} - model_item=${9:-"conformer"} + decode_config_path=${3:-"conf/tuning/decode.yaml"} + output=${4:-"exp/conformer"} + seed=${5:-"0"} + ngpu=${6:-"1"} + profiler_options=${7:-"None"} + batch_size=${8:-"32"} + fp_item=${9:-"fp32"} + model_item=${10:-"conformer"} benchmark_max_step=0 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 # 添加日志解析需要的参数 @@ -35,6 +36,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" train_cmd="--config=${config_path} \ + --decode_cfg=${decode_config_path} \ --output=${output} \ --seed=${seed} \ --ngpu=${ngpu} \ @@ -68,7 +70,7 @@ function _train(){ } source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 -_set_params $@ -# _train # 如果只想产出训练log,不解析,可取消注释 +#_set_params $@ +#_train # 如果只想产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 diff --git 
a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index b11872bd..cad8efa3 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index 875e3ccf..5c619506 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh index 76b22a38..1dce1b29 100644 --- a/tests/chains/ds2/lite_train_infer.sh +++ b/tests/chains/ds2/lite_train_infer.sh @@ -1,5 +1,5 @@ bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer -cd ../../examples/tiny/s0 +cd ../../../examples/tiny/asr0 source path.sh -bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer +bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer cd ../../../tests/chains diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh index 73a30283..4913ce42 100644 --- a/tests/chains/ds2/prepare.sh +++ b/tests/chains/ds2/prepare.sh @@ -34,7 +34,7 @@ MODE=$2 if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/tiny/s0 + cd ${curPath}/../../../examples/tiny/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd 
${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then cd ${curPath} elif [ ${MODE} = "whole_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then cd ${curPath} else curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index c9307820..26917c67 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -324,6 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` cmd="${python} ${run_train} --ngpu=$nump" + export CUDA_VISIBLE_DEVICES=${gpu} else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi From 3e2cc898cb28edef51c729e7f3c64f9c054a5032 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 4 Jan 2022 07:40:40 +0000 Subject: [PATCH 6/9] remove default cfg and fix some bugs,test=asr --- examples/aishell/asr0/local/test_hub_ori | 47 ----------- .../other/1xt2x/src_deepspeech2x/bin/test.py | 3 +- .../models/ds2/deepspeech2.py | 14 ---- .../1xt2x/src_deepspeech2x/test_model.py | 37 --------- examples/ted_en_zh/st0/conf/preprocess.yaml | 25 ++++++ examples/ted_en_zh/st0/conf/transformer.yaml | 4 +- .../st0/conf/transformer_mtl_noam.yaml | 2 +- examples/ted_en_zh/st1/conf/preprocess.yaml | 16 ++++ examples/ted_en_zh/st1/conf/transformer.yaml | 4 +- .../st1/conf/transformer_mtl_noam.yaml | 4 +- examples/ted_en_zh/st1/local/test.sh | 5 -- examples/tiny/asr1/conf/chunk_confermer.yaml | 3 +- paddlespeech/s2t/decoders/recog.py | 2 +- .../exps/deepspeech2/bin/deploy/runtime.py | 4 +- .../s2t/exps/deepspeech2/bin/deploy/server.py | 4 +- .../s2t/exps/deepspeech2/bin/export.py | 4 +- paddlespeech/s2t/exps/deepspeech2/bin/test.py | 3 +- .../s2t/exps/deepspeech2/bin/test_export.py | 4 +- .../s2t/exps/deepspeech2/bin/test_wav.py | 2 +- .../s2t/exps/deepspeech2/bin/train.py | 4 +- paddlespeech/s2t/exps/deepspeech2/config.py | 28 ------- paddlespeech/s2t/exps/deepspeech2/model.py | 37 --------- paddlespeech/s2t/exps/u2/bin/alignment.py | 5 +- paddlespeech/s2t/exps/u2/bin/export.py | 6 +- paddlespeech/s2t/exps/u2/bin/test.py | 3 +- paddlespeech/s2t/exps/u2/bin/test_wav.py | 3 +- paddlespeech/s2t/exps/u2/bin/train.py | 4 +- paddlespeech/s2t/exps/u2/model.py | 56 ------------- paddlespeech/s2t/exps/u2/trainer.py | 2 - paddlespeech/s2t/exps/u2_kaldi/model.py | 62 --------------- paddlespeech/s2t/exps/u2_st/bin/export.py | 6 +- paddlespeech/s2t/exps/u2_st/bin/test.py | 3 +- paddlespeech/s2t/exps/u2_st/bin/train.py | 4 +- paddlespeech/s2t/exps/u2_st/config.py | 41 ---------- paddlespeech/s2t/exps/u2_st/model.py | 78 +++---------------- paddlespeech/s2t/io/collator.py | 27 ------- paddlespeech/s2t/io/dataset.py | 16 ---- paddlespeech/s2t/models/ds2/deepspeech2.py | 15 ---- .../s2t/models/ds2_online/deepspeech2.py | 17 ---- paddlespeech/s2t/models/u2/u2.py | 51 ------------ paddlespeech/s2t/models/u2_st/u2_st.py | 70 +++-------------- 41 files changed, 97 
insertions(+), 628 deletions(-) delete mode 100755 examples/aishell/asr0/local/test_hub_ori create mode 100644 examples/ted_en_zh/st0/conf/preprocess.yaml create mode 100644 examples/ted_en_zh/st1/conf/preprocess.yaml delete mode 100644 paddlespeech/s2t/exps/deepspeech2/config.py delete mode 100644 paddlespeech/s2t/exps/u2_st/config.py diff --git a/examples/aishell/asr0/local/test_hub_ori b/examples/aishell/asr0/local/test_hub_ori deleted file mode 100755 index ee1fb805..00000000 --- a/examples/aishell/asr0/local/test_hub_ori +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - -config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 - -mkdir -p data -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ -if [ $? -ne 0 ]; then - exit 1 -fi - -if [ ! -f ${audio_file} ]; then - echo "Plase input the right audio_file path" - exit 1 -fi - -# download language model -bash local/download_lm_ch.sh -if [ $? -ne 0 ]; then - exit 1 -fi - -python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ ---config ${config_path} \ ---result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} \ ---model_type ${model_type} \ ---audio_file ${audio_file} - -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" - exit 1 -fi - - -exit 0 diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b404cce8..88a13fdc 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -15,7 +15,6 @@ from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +41,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index 4c20ffcd..003b02e2 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
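# Note: with the hard-coded params() defaults deleted here, every
# hyper-parameter must come from the merged YAML; only genuinely optional
# keys keep a get() fallback, as the from_config hunks elsewhere in this
# patch do for ctc_grad_norm_type. A hedged sketch of the pattern:
#
#     from yacs.config import CfgNode
#
#     config = CfgNode(new_allowed=True)
#     config.num_rnn_layers = 3      # normally supplied by the merged YAML
#     config.rnn_layer_size = 1024
#
#     layers = config.num_rnn_layers                      # required key: raises if the YAML omits it
#     norm_type = config.get('ctc_grad_norm_type', None)  # optional key: explicit fallback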
- )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 53a4e629..246fb107 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -44,22 +44,6 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) @@ -246,27 +230,6 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 00000000..d3992cb9 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
index a154621d..d113fc94 100644
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: data/lang_char/bpe_unigram_8000
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 batch_size: 16
 maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
@@ -87,7 +87,7 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 2.5
-  weight_decay: 1e-06
+  weight_decay: 1.0e-06
 scheduler: noam
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
index c80dea7e..a01ec1a6 100644
--- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@@ -19,7 +19,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: data/lang_char/bpe_unigram_8000
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 batch_size: 16
 maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml
new file mode 100644
index 00000000..bc86d98c
--- /dev/null
+++ b/examples/ted_en_zh/st1/conf/preprocess.yaml
@@ -0,0 +1,16 @@
+process:
+    # these three processes are a.k.a. 
SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
index 05674562..515edee2 100644
--- a/examples/ted_en_zh/st1/conf/transformer.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
 unit_type: 'spm'
 spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
 mean_std_filepath: ""
-# augmentation_config: conf/augmentation.json
+# preprocess_config: conf/augmentation.json
 batch_size: 20
 feat_dim: 83
 stride_ms: 10.0
@@ -27,7 +27,7 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config:
+preprocess_config:
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
index ae246cbf..a5f956fa 100644
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -13,7 +13,7 @@ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
 unit_type: 'spm'
 spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
 mean_std_filepath: ""
-# augmentation_config: conf/augmentation.json
+# preprocess_config: conf/augmentation.json
 batch_size: 20
 feat_dim: 83
 stride_ms: 10.0
@@ -27,7 +27,7 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config:
+preprocess_config:
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh
index 9e24380d..be6f25f9 100755
--- a/examples/ted_en_zh/st1/local/test.sh
+++ b/examples/ted_en_zh/st1/local/test.sh
@@ -20,12 +20,7 @@ for type in fullsentence; do
         --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-<<<<<<< HEAD
-        --opts decode.decoding_method ${type} \
-        --opts decode.decode_batch_size ${batch_size}
-=======
         --opts decoding.decoding_method ${type} \
->>>>>>> 6272496d9c26736750b577fd832ea9dd4ddc4e6e
 
     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
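To tie the renamed preprocess_config key to its consumer (see the U2Infer hunk at the top of this patch): the YAML's process list, fbank/CMVN plus the three SpecAugment stages added above, is replayed by Transformation at load time. A hedged usage sketch, assuming Transformation keeps its current location under paddlespeech.s2t.transform; paths are the ones used in these examples:

    from yacs.config import CfgNode
    from paddlespeech.s2t.transform.transformation import Transformation

    config = CfgNode(new_allowed=True)
    config.merge_from_file('examples/ted_en_zh/st0/conf/transformer.yaml')
    preprocessing = Transformation(config.preprocess_config)  # -> conf/preprocess.yaml
    # feats = preprocessing(audio, **{"train": False})  # False skips SpecAugment, as in test_wav.py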
diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index cd072c14..8f785121 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -58,7 +58,6 @@ mean_std_filepath: "" vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/lang_char/bpe_unigram_200' -preprocess_config: conf/preprocess.yaml feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -72,7 +71,7 @@ batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -augmentation_config: conf/preprocess.yaml +preprocess_config: conf/preprocess.yaml num_workers: 0 subsampling_factor: 1 num_encs: 1 diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f0..88955eac 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 88148323..ccb85906 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -20,7 +20,7 @@ from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -176,7 +176,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index dea6d975..85c2466f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -111,7 +111,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e84..090b5fab 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
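# Note: the deploy/runtime.py, server.py and export.py hunks in this area all
# make the same mechanical swap: the deleted get_cfg_defaults() modules give
# way to an empty, schema-less node. A minimal sketch of why new_allowed=True
# suffices; the YAML path is illustrative.
#
#     from yacs.config import CfgNode
#
#     config = CfgNode(new_allowed=True)               # empty: no Python-side defaults any more
#     config.merge_from_file('conf/deepspeech2.yaml')  # new_allowed lets the YAML define every key
#     config.freeze()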
"""Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +41,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index 7ce921d6..388b380d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -14,7 +14,6 @@ """Evaluation for DeepSpeech2 model.""" from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -43,7 +42,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index 7a1801d4..176028ed 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -13,8 +13,6 @@ # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -48,7 +46,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index 28756b05..e2cb7e2f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -188,7 +188,7 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.decode_cfg: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9..5e8c0fff 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -14,7 +14,7 @@ """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index d8eab50e..00000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode
-
-from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester
-from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
-
-
-def get_cfg_defaults(model_type='offline'):
-    _C = CfgNode()
-    config = _C.clone()
-    config.set_new_allowed(True)
-    return config
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index fc214a8a..e7d5e20f 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -49,22 +49,6 @@ logger = Log(__name__).getlog()
 
 
 class DeepSpeech2Trainer(Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # training config
-        default = CfgNode(
-            dict(
-                lr=5e-4,  # learning rate
-                lr_decay=1.0,  # learning rate decay
-                weight_decay=1e-6,  # the coeff of weight decay
-                global_grad_clip=5.0,  # the global norm clip
-                n_epoch=50,  # train epochs
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
 
@@ -259,27 +243,6 @@ class DeepSpeech2Trainer(Trainer):
 
 
 class DeepSpeech2Tester(DeepSpeech2Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # testing config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
-                cutoff_top_n=40,  # Cutoff number for pruning.
-                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=500,  # Beam search width.
-                batch_size=128,  # decoding batch size
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py
index 5d768536..e3390feb 100644
--- a/paddlespeech/s2t/exps/u2/bin/alignment.py
+++ b/paddlespeech/s2t/exps/u2/bin/alignment.py
@@ -14,7 +14,6 @@
 """Alignment for U2 model."""
 from yacs.config import CfgNode
 
-from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
 from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -33,14 +32,14 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save asr result to 
+    # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.decode_cfg:
diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py
index 44fc7c3e..3907cebd 100644
--- a/paddlespeech/s2t/exps/u2/bin/export.py
+++ b/paddlespeech/s2t/exps/u2/bin/export.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Export for U2 model."""
-from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
+from yacs.config import CfgNode
 from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -31,14 +31,14 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save jit model to 
+    # save jit model to
    parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
     args = parser.parse_args()
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py
index d93954fe..f14d804f 100644
--- a/paddlespeech/s2t/exps/u2/bin/test.py
+++ b/paddlespeech/s2t/exps/u2/bin/test.py
@@ -16,7 +16,6 @@ import cProfile
 
 from yacs.config import CfgNode
 
-from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
 from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +43,7 @@ if __name__ == "__main__":
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.decode_cfg:
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 554d6ca5..9904813a 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -20,7 +20,6 @@ import paddle
 import soundfile
 from yacs.config import CfgNode
 
-from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.models.u2 import U2Model
 from paddlespeech.s2t.training.cli import default_argument_parser
@@ -129,7 +128,7 @@ if __name__ == "__main__":
         "--audio_file", type=str, help="path of the input audio file")
     args = parser.parse_args()
 
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.decode_cfg:
diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py
index d6ee8b30..d562278f 100644
--- a/paddlespeech/s2t/exps/u2/bin/train.py
+++ b/paddlespeech/s2t/exps/u2/bin/train.py
@@ -17,7 +17,7 @@ import os
 
 from paddle import distributed as dist
 
-from paddlespeech.s2t.exps.u2.config import get_cfg_defaults
+from yacs.config import CfgNode
 from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +44,7 @@ if __name__ == "__main__":
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index f1683d70..d0cea031 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -46,33 +46,6 @@ logger = Log(__name__).getlog()
 
 
 class U2Trainer(Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # training config
-        default = CfgNode(
-            dict(
-                n_epoch=50,  # train epochs
-                log_interval=100,  # steps
-                accum_grad=1,  # accum grad by # steps
-                global_grad_clip=5.0,  # the global norm clip
-            ))
-        default.optim = 'adam'
-        default.optim_conf = CfgNode(
-            dict(
-                lr=5e-4,  # learning rate
-                weight_decay=1e-6,  # the coeff of weight decay
-            ))
-        default.scheduler = 'warmuplr'
-        default.scheduler_conf = CfgNode(
-            dict(
-                warmup_steps=25000,
-                lr_decay=1.0,  # learning rate decay
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
 
@@ -401,35 +374,6 @@ class U2Trainer(Trainer):
 
 
 class U2Tester(U2Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # decoding config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
-                cutoff_top_n=40,  # Cutoff number for pruning.
-                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
-                # 'ctc_prefix_beam_search', 'attention_rescoring'
-                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=10,  # Beam search width.
-                decode_batch_size=16,  # decoding batch size
-                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
-                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
-                # <0: for decoding, use full chunk.
-                # >0: for decoding, use fixed chunk size as set.
-                # 0: used for training, it's prohibited here.
-                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
-                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
         self.text_feature = TextFeaturizer(
diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py
index 57d87316..ab87c30d 100644
--- a/paddlespeech/s2t/exps/u2/trainer.py
+++ b/paddlespeech/s2t/exps/u2/trainer.py
@@ -55,7 +55,6 @@ class U2Trainer(Trainer):
 
         collate_fn_train = SpeechCollator.from_config(config)
 
-        config.augmentation_config = ""
         collate_fn_dev = SpeechCollator.from_config(config)
 
         if self.parallel:
@@ -103,7 +102,6 @@ class U2Trainer(Trainer):
             test_dataset = ManifestDataset.from_config(config)
             # return text ord id
             config.keep_transcription_text = True
-            config.augmentation_config = ""
             self.test_loader = DataLoader(
                 test_dataset,
                 batch_size=config.decode.batch_size,
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index 887dd29e..780c5c08 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -42,40 +42,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
-
-def get_cfg_defaults():
-    """Get a yacs CfgNode object with default values for my_project."""
-    # Return a clone so that the defaults will not be altered
-    # This is for the "local variable" use pattern
-    _C = CfgNode()
-
-    _C.model = U2Model.params()
-
-    _C.training = U2Trainer.params()
-
-    _C.decoding = U2Tester.params()
-
-    config = _C.clone()
-    config.set_new_allowed(True)
-    return config
-
-
 class U2Trainer(Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # training config
-        default = CfgNode(
-            dict(
-                n_epoch=50,  # train epochs
-                log_interval=100,  # steps
-                accum_grad=1,  # accum grad by # steps
-                checkpoint=dict(
-                    kbest_n=50,
-                    latest_n=5, ), ))
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
 
@@ -362,35 +329,6 @@ class U2Trainer(Trainer):
 
 
 class U2Tester(U2Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # decoding config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
-                cutoff_top_n=40,  # Cutoff number for pruning.
-                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
-                # 'ctc_prefix_beam_search', 'attention_rescoring'
-                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=10,  # Beam search width.
-                batch_size=16,  # decoding batch size
-                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
-                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
-                # <0: for decoding, use full chunk.
-                # >0: for decoding, use fixed chunk size as set.
-                # 0: used for training, it's prohibited here.
-                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
-                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
         self.text_feature = TextFeaturizer(
diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py
index 69d9718f..1bc4e1f3 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/export.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/export.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Export for U2 model."""
-from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
+from yacs.config import CfgNode
 from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -31,14 +31,14 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save jit model to 
+    # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
     args = parser.parse_args()
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py
index 3ad5fc7d..1d70a310 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/test.py
@@ -16,7 +16,6 @@ import cProfile
 
 from yacs.config import CfgNode
 
-from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
 from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -44,7 +43,7 @@ if __name__ == "__main__":
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.decode_cfg:
diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py
index 58496c88..4dec9ec8 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/train.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/train.py
@@ -16,8 +16,8 @@ import cProfile
 import os
 
 from paddle import distributed as dist
+from yacs.config import CfgNode
 
-from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults
 from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@@ -42,7 +42,7 @@ if __name__ == "__main__":
     print_arguments(args, globals())
 
     # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = CfgNode(new_allowed=True)
     if args.config:
         config.merge_from_file(args.config)
     if args.opts:
diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py
deleted file mode 100644
index a314a1ca..00000000
--- a/paddlespeech/s2t/exps/u2_st/config.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from yacs.config import CfgNode
-
-from paddlespeech.s2t.exps.u2_st.model import U2STTester
-from paddlespeech.s2t.exps.u2_st.model import U2STTrainer
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.models.u2_st import U2STModel
-
-_C = CfgNode()
-
-# _C.data = ManifestDataset.params()
-
-# _C.collator = SpeechCollator.params()
-
-# _C.model = U2STModel.params()
-
-# _C.training = U2STTrainer.params()
-
-# _C.decoding = U2STTester.params()
-
-
-def get_cfg_defaults():
-    """Get a yacs CfgNode object with default values for my_project."""
-    # Return a clone so that the defaults will not be altered
-    # This is for the "local variable" use pattern
-    config = _C.clone()
-    config.set_new_allowed(True)
-    return config
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index 88aeb4a5..ca2c2c1d 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -45,33 +45,6 @@ logger = Log(__name__).getlog()
 
 
 class U2STTrainer(Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # training config
-        default = CfgNode(
-            dict(
-                n_epoch=50,  # train epochs
-                log_interval=100,  # steps
-                accum_grad=1,  # accum grad by # steps
-                global_grad_clip=5.0,  # the global norm clip
-            ))
-        default.optim = 'adam'
-        default.optim_conf = CfgNode(
-            dict(
-                lr=5e-4,  # learning rate
-                weight_decay=1e-6,  # the coeff of weight decay
-            ))
-        default.scheduler = 'warmuplr'
-        default.scheduler_conf = CfgNode(
-            dict(
-                warmup_steps=25000,
-                lr_decay=1.0,  # learning rate decay
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
 
@@ -127,7 +100,7 @@ class U2STTrainer(Trainer):
 
             for k, v in losses_np.items():
                 report(k, v)
-            report("batch_size", self.config.collator.batch_size)
+            report("batch_size", self.config.batch_size)
             report("accum", train_conf.accum_grad)
             report("step_cost", iteration_time)
 
@@ -236,7 +209,7 @@ class U2STTrainer(Trainer):
                         msg += ","
                     msg = msg[:-1]  # remove the last ","
                     if (batch_index + 1
-                        ) % self.config.training.log_interval == 0:
+                        ) % self.config.log_interval == 0:
                         logger.info(msg)
             except Exception as e:
                 logger.error(e)
@@ -287,7 +260,7 @@ class U2STTrainer(Trainer):
             batch_frames_in=0,
             batch_frames_out=0,
             batch_frames_inout=0,
-            preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+            preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
             n_iter_processes=config.num_workers,
            subsampling_factor=1,
             load_aux_output=load_transcript,
@@ -308,7 +281,7 @@ class U2STTrainer(Trainer):
             batch_frames_in=0,
             batch_frames_out=0,
             batch_frames_inout=0,
-            preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+            preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
             n_iter_processes=config.num_workers,
             subsampling_factor=1,
             load_aux_output=load_transcript,
@@ -319,7 +292,7 @@ class U2STTrainer(Trainer):
 
         # test dataset, return raw text
        decode_batch_size = config.get('decode',dict()).get('decode_batch_size', 1)
         self.test_loader = BatchDataLoader(
-            json_file=config.data.test_manifest,
+            json_file=config.test_manifest,
             train_mode=False,
             sortagrad=False,
             batch_size=decode_batch_size,
@@ -332,7 +305,7 @@ class U2STTrainer(Trainer):
             batch_frames_in=0,
             batch_frames_out=0,
             batch_frames_inout=0,
-            preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+            preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
             n_iter_processes=config.num_workers,
             subsampling_factor=1,
             num_encs=1,
@@ -379,7 +352,7 @@ class U2STTrainer(Trainer):
                       config,
                       parameters,
                       lr_scheduler=None, ):
-        train_config = config.training
+        train_config = config
         optim_type = train_config.optim
         optim_conf = train_config.optim_conf
         scheduler_type = train_config.scheduler
@@ -405,41 +378,12 @@ class U2STTrainer(Trainer):
 
 
 class U2STTester(U2STTrainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # decoding config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
-                cutoff_top_n=40,  # Cutoff number for pruning.
-                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
-                # 'ctc_prefix_beam_search', 'attention_rescoring'
-                error_rate_type='bleu',  # Error rate type for evaluation. Options `bleu`, 'char_bleu'
-                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=10,  # Beam search width.
-                batch_size=16,  # decoding batch size
-                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
-                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
-                # <0: for decoding, use full chunk.
-                # >0: for decoding, use fixed chunk size as set.
-                # 0: used for training, it's prohibited here.
-                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
-                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
         self.text_feature = TextFeaturizer(
-            unit_type=self.config.collator.unit_type,
-            vocab_filepath=self.config.collator.vocab_filepath,
-            spm_model_prefix=self.config.collator.spm_model_prefix)
+            unit_type=self.config.unit_type,
+            vocab=self.config.vocab_filepath,
+            spm_model_prefix=self.config.spm_model_prefix)
         self.vocab_list = self.text_feature.vocab_list
 
     def id2token(self, texts, texts_len, text_feature):
@@ -526,7 +470,7 @@ class U2STTester(U2STTrainer):
         decode_cfg = self.config.decode
         bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
 
-        stride_ms = self.config.collator.stride_ms
+        stride_ms = self.config.stride_ms
         hyps, refs = [], []
         len_refs, num_ins = 0, 0
         num_frames = 0.0
diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py
index 27bf20eb..3a14b2d5 100644
--- a/paddlespeech/s2t/io/collator.py
+++ b/paddlespeech/s2t/io/collator.py
@@ -219,33 +219,6 @@ class SpeechCollatorBase():
 
 
 class SpeechCollator(SpeechCollatorBase):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                augmentation_config="",
-                random_seed=0,
-                mean_std_filepath="",
-                unit_type="char",
-                vocab_filepath="",
-                spm_model_prefix="",
-                spectrum_type='linear',  # 'linear', 'mfcc', 'fbank'
-                feat_dim=0,  # 'mfcc', 'fbank'
-                delta_delta=False,  # 'mfcc', 'fbank'
-                stride_ms=10.0,  # ms
-                window_ms=20.0,  # ms
-                n_fft=None,  # fft points
-                max_freq=None,  # None for samplerate/2
-                target_sample_rate=16000,  # target sample rate
-                use_dB_normalization=True,
-                target_dB=-20,
-                dither=1.0,  # feature dither
-                keep_transcription_text=False))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     @classmethod
     def from_config(cls, config):
         """Build a SpeechCollator object from a config.
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index c76ccfce..9149fb27 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -28,22 +28,6 @@ logger = Log(__name__).getlog()
 
 
 class ManifestDataset(Dataset):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                manifest="",
-                max_input_len=27.0,
-                min_input_len=0.0,
-                max_output_len=float('inf'),
-                min_output_len=0.0,
-                max_output_input_ratio=float('inf'),
-                min_output_input_ratio=0.0, ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     @classmethod
     def from_config(cls, config):
         """Build a ManifestDataset object from a config.
diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index 0414d04f..ddc3612d 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -119,21 +119,6 @@ class DeepSpeech2Model(nn.Layer):
         before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
     """
-
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                num_conv_layers=2,  #Number of stacking convolution layers.
-                num_rnn_layers=3,  #Number of stacking RNN layers.
-                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-                ctc_grad_norm_type=None, ))
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self,
                  feat_size,
                  dict_size,
diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index f08e30d0..aae77f74 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -243,23 +243,6 @@ class DeepSpeech2ModelOnline(nn.Layer):
         before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
     """
-
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                num_conv_layers=2,  #Number of stacking convolution layers.
-                num_rnn_layers=4,  #Number of stacking RNN layers.
-                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-                num_fc_layers=2,
-                fc_layers_size_list=[512, 256],
-                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                blank_id=0,  # index of blank in vocob.txt
-                ctc_grad_norm_type=None, ))
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(
             self,
             feat_size,
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 83eff467..26e81acf 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -59,57 +59,6 @@ logger = Log(__name__).getlog()
 
 class U2BaseModel(ASRInterface, nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""
-
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # network architecture
-        default = CfgNode()
-        # allow add new item when merge_with_file
-        default.cmvn_file = ""
-        default.cmvn_file_type = "json"
-        default.input_dim = 0
-        default.output_dim = 0
-        # encoder related
-        default.encoder = 'transformer'
-        default.encoder_conf = CfgNode(
-            dict(
-                output_size=256,  # dimension of attention
-                attention_heads=4,
-                linear_units=2048,  # the number of units of position-wise feed forward
-                num_blocks=12,  # the number of encoder blocks
-                dropout_rate=0.1,
-                positional_dropout_rate=0.1,
-                attention_dropout_rate=0.0,
-                input_layer='conv2d',  # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-                normalize_before=True,
-                # use_cnn_module=True,
-                # cnn_module_kernel=15,
-                # activation_type='swish',
-                # pos_enc_layer_type='rel_pos',
-                # selfattention_layer_type='rel_selfattn',
-            ))
-        # decoder related
-        default.decoder = 'transformer'
-        default.decoder_conf = CfgNode(
-            dict(
-                attention_heads=4,
-                linear_units=2048,
-                num_blocks=6,
-                dropout_rate=0.1,
-                positional_dropout_rate=0.1,
-                self_attention_dropout_rate=0.0,
-                src_attention_dropout_rate=0.0, ))
-        # hybrid CTC/attention
-        default.model_conf = CfgNode(
-            dict(
-                ctc_weight=0.3,
-                lsm_weight=0.1,  # label smoothing option
-                length_normalized_loss=False, ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self,
                  vocab_size: int,
                  encoder: TransformerEncoder,
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 8b07e389..1c5596ba 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
"""U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -51,58 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -533,21 +481,21 @@ class U2STBaseModel(nn.Layer): feats (Tenosr): audio features, (B, T, D) feats_lengths (Tenosr): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. 
 
         Returns:
-            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc 
+            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
         """
         if configs['cmvn_file'] is not None:
             mean, istd = load_cmvn(configs['cmvn_file'],
From 8b63485ce3a6e4c95106e46746aae6a4ef5b3b16 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Tue, 4 Jan 2022 13:22:32 +0000
Subject: [PATCH 7/9] fix some bug, test=asr

---
 examples/csmsc/voc5/README.md        |  4 ++--
 examples/ted_en_zh/st1/local/test.sh |  2 +-
 examples/ted_en_zh/st1/run.sh        |  6 +++---
 paddlespeech/s2t/exps/u2/config.py   | 41 ----------------------------
 4 files changed, 6 insertions(+), 47 deletions(-)
 delete mode 100644 paddlespeech/s2t/exps/u2/config.py

diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index be06f830..21afe6ee 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml               # default config used to train hifigan
-├── feats_stats.npy            # generator parameters of hifigan
-└── snapshot_iter_2500000.pdz  # statistics used to normalize spectrogram when training hifigan
+├── feats_stats.npy            # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz  # generator parameters of hifigan
 ```
 
 ## Acknowledgement
diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh
index be6f25f9..904f95c4 100755
--- a/examples/ted_en_zh/st1/local/test.sh
+++ b/examples/ted_en_zh/st1/local/test.sh
@@ -20,7 +20,7 @@ for type in fullsentence; do
         --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
 
     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh
index 67309919..1808e37b 100755
--- a/examples/ted_en_zh/st1/run.sh
+++ b/examples/ted_en_zh/st1/run.sh
@@ -28,7 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     if [ -n "${ckpt_path}" ]; then
         echo "Finetune from Pretrained Model" ${ckpt_path}
         ./local/download_pretrain.sh || exit -1
-    fi 
+    fi
     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
 fi
 
@@ -39,5 +39,5 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_pat} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
-fi
\ No newline at end of file
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py
deleted file mode 100644
index 44780d2e..00000000
--- a/paddlespeech/s2t/exps/u2/config.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from yacs.config import CfgNode
-
-from paddlespeech.s2t.exps.u2.model import U2Tester
-from paddlespeech.s2t.exps.u2.model import U2Trainer
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.models.u2 import U2Model
-
-_C = CfgNode(new_allowed=True)
-
-# ManifestDataset.params(_C)
-
-# SpeechCollator.params(_C)
-
-# U2Model.params(_C)
-
-# U2Trainer.params(_C)
-
-# _C.decode = U2Tester.params()
-
-
-def get_cfg_defaults():
-    """Get a yacs CfgNode object with default values for my_project."""
-    # Return a clone so that the defaults will not be altered
-    # This is for the "local variable" use pattern
-    config = _C.clone()
-    config.set_new_allowed(True)
-    return config
From d5f05edc2e608f5ac9d7793f089194d31772c688 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Wed, 5 Jan 2022 03:15:52 +0000
Subject: [PATCH 8/9] fix some bug, test=asr

---
 examples/librispeech/asr2/run.sh           | 1 -
 tests/benchmark/conformer/run_benchmark.sh | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh
index 56671233..c9a794e3 100755
--- a/examples/librispeech/asr2/run.sh
+++ b/examples/librispeech/asr2/run.sh
@@ -16,7 +16,6 @@ avg_num=10
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 avg_ckpt=avg_${avg_num}
-avg_ckpt=init
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index d5902c51..16cd410e 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -70,7 +70,7 @@ function _train(){
 }
 
 source ${BENCHMARK_ROOT}/scripts/run_model.sh   # This script parses benchmark-format logs with analysis.py; when integrating, download it from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh. Comment this line out if you only want the training log; re-enable it before submitting.
-#_set_params $@
+_set_params $@
 #_train       # Uncomment to produce the training log only, without parsing.
 _run     # Defined in run_model.sh; it calls _train when executed. Comment this line out if you only want the training log; re-enable it before submitting.
From 455bf477a44e80e71eaf697830c77aceb403d09a Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Wed, 5 Jan 2022 03:31:16 +0000
Subject: [PATCH 9/9] fix some bug, test=asr

---
 examples/aishell/asr1/conf/conformer.yaml | 2 +-
 examples/callcenter/asr1/run.sh           | 4 ++--
 examples/librispeech/asr2/local/test.sh   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
index 490b577f..775a4527 100644
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -65,7 +65,7 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-num_workers: 8
+num_workers: 2
 subsampling_factor: 1
 num_encs: 1
 
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh
index 86730ce1..0c7ffc1e 100644
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
@@ -4,7 +4,7 @@ source path.sh
 
 gpus=0,1,2,3
 stage=0
-stop_stage=100
+stop_stage=50
 conf_path=conf/conformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
@@ -40,7 +40,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh
index bf6428d6..8cf3b52c 100755
--- a/examples/librispeech/asr2/local/test.sh
+++ b/examples/librispeech/asr2/local/test.sh
@@ -21,7 +21,7 @@ bpemodel=${bpeprefix}.model
 config_path=conf/transformer.yaml
 decode_config_path=conf/decode/decode_base.yaml
 dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
-ckpt_prefix=exp/transformer/checkpoints/init
+ckpt_prefix=
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
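
A closing note on the `--opts` fix in PATCH 7/9: yacs resolves each dotted override key against the merged config tree, so once the decode options are attached under `decode`, only `decode.*` keys can be overridden. A small sketch of that behaviour (the `merge_from_list` wiring is assumed, not quoted from the repo; key names follow the decode.yaml added in PATCH 1):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.decode = CfgNode(new_allowed=True)
    config.decode.decoding_method = "attention"

    # what `--opts decode.decoding_method fullsentence` amounts to internally
    config.merge_from_list(["decode.decoding_method", "fullsentence"])
    assert config.decode.decoding_method == "fullsentence"

    # `--opts decoding.decoding_method ...` would raise here: after the
    # refactor there is no `decoding` section left in the config tree.

This is why examples/ted_en_zh/st1/local/test.sh has to switch from `decoding.decoding_method` to `decode.decoding_method`.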