diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py
deleted file mode 100644
index 94a9b6c47..000000000
--- a/deepspeech/exps/deepspeech2/bin/tune.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Beam search parameters tuning for DeepSpeech2 model."""
-import functools
-import sys
-
-import numpy as np
-from paddle.io import DataLoader
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils import error_rate
-from deepspeech.utils.utility import add_arguments
-from deepspeech.utils.utility import print_arguments
-
-
-def tune(config, args):
-    """Tune parameters alpha and beta incrementally."""
-    if not args.num_alphas >= 0:
-        raise ValueError("num_alphas must be non-negative!")
-    if not args.num_betas >= 0:
-        raise ValueError("num_betas must be non-negative!")
-    config.defrost()
-    config.data.manfiest = config.data.dev_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
-    dev_dataset = ManifestDataset.from_config(config)
-
-    valid_loader = DataLoader(
-        dev_dataset,
-        batch_size=config.data.batch_size,
-        shuffle=False,
-        drop_last=False,
-        collate_fn=SpeechCollator(keep_transcription_text=True))
-
-    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
-                                             args.checkpoint_path)
-    model.eval()
-
-    # decoders only accept string encoded in utf-8
-    vocab_list = valid_loader.dataset.vocab_list
-    errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors
-
-    # create grid for search
-    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
-    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
-    params_grid = [(alpha, beta) for alpha in cand_alphas
-                   for beta in cand_betas]
-
-    err_sum = [0.0 for i in range(len(params_grid))]
-    err_ave = [0.0 for i in range(len(params_grid))]
-
-    num_ins, len_refs, cur_batch = 0, 0, 0
-    # initialize external scorer
-    model.decoder.init_decode(args.alpha_from, args.beta_from,
-                              config.decoding.lang_model_path, vocab_list,
-                              config.decoding.decoding_method)
-    ## incremental tuning parameters over multiple batches
-    print("start tuning ...")
-    for infer_data in valid_loader():
-        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
-            break
-
-        def ordid2token(texts, texts_len):
-            """ ord() id to chr() chr """
-            trans = []
-            for text, n in zip(texts, texts_len):
-                n = n.numpy().item()
-                ids = text[:n]
-                trans.append(''.join([chr(i) for i in ids]))
-            return trans
-
-        audio, audio_len, text, text_len = infer_data
-        target_transcripts = ordid2token(text, text_len)
-        num_ins += audio.shape[0]
-
-        # model infer
-        eouts, eouts_len = model.encoder(audio, audio_len)
-        probs = model.decoder.softmax(eouts)
-
-        # grid search
-        for index, (alpha, beta) in enumerate(params_grid):
-            print(f"tuneing: alpha={alpha} beta={beta}")
-            result_transcripts = model.decoder.decode_probs(
-                probs.numpy(), eouts_len, vocab_list,
-                config.decoding.decoding_method,
-                config.decoding.lang_model_path, alpha, beta,
-                config.decoding.beam_size, config.decoding.cutoff_prob,
-                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
-
-            for target, result in zip(target_transcripts, result_transcripts):
-                errors, len_ref = errors_func(target, result)
-                err_sum[index] += errors
-
-                # accumulate the length of references of every batch
-                # in the first iteration
-                if args.alpha_from == alpha and args.beta_from == beta:
-                    len_refs += len_ref
-
-            err_ave[index] = err_sum[index] / len_refs
-            if index % 2 == 0:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-            print("tuneing: one grid done!")
-
-        # output on-line tuning result at the end of current batch
-        err_ave_min = min(err_ave)
-        min_index = err_ave.index(err_ave_min)
-        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
-              " min [%s] = %f" %
-              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
-               "%.3f" % params_grid[min_index][1],
-               config.decoding.error_rate_type, err_ave_min))
-        cur_batch += 1
-
-    # output WER/CER at every (alpha, beta)
-    print("\nFinal %s:\n" % config.decoding.error_rate_type)
-    for index in range(len(params_grid)):
-        print("(alpha, beta) = (%s, %s), [%s] = %f" %
-              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
-               config.decoding.error_rate_type, err_ave[index]))
-
-    err_ave_min = min(err_ave)
-    min_index = err_ave.index(err_ave_min)
-    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
-          (cur_batch, "%.3f" % params_grid[min_index][0],
-           "%.3f" % params_grid[min_index][1]))
-
-    print("finish tuning")
-
-
-def main(config, args):
-    tune(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    add_arg = functools.partial(add_arguments, argparser=parser)
-    add_arg('num_batches', int, -1, "# of batches tuning on. "
-            "Default -1, on whole dev set.")
-    add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
-    add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
-    add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
-    add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
-    add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
-    add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
-
-    add_arg('batch_size', int, 256, "# of samples per batch.")
-    add_arg('beam_size', int, 500, "Beam search width.")
-    add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
-    add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
-    add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
-
-    args = parser.parse_args()
-    print_arguments(args, globals())
-
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-
-    config.data.batch_size = args.batch_size
-    config.decoding.beam_size = args.beam_size
-    config.decoding.num_proc_bsearch = args.num_proc_bsearch
-    config.decoding.cutoff_prob = args.cutoff_prob
-    config.decoding.cutoff_top_n = args.cutoff_top_n
-
-    config.freeze()
-    print(config)
-
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index fbc357ca0..df35c52c6 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -41,6 +41,7 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
@@ -147,10 +148,9 @@ class DeepSpeech2Trainer(Trainer):
 
     def setup_model(self):
         config = self.config.clone()
-        config.defrost()
-        config.model.feat_size = self.train_loader.collate_fn.feature_size
-        config.model.dict_size = self.train_loader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.model.feat_size = self.train_loader.collate_fn.feature_size
+            config.model.dict_size = self.train_loader.collate_fn.vocab_size
 
         if self.args.model_type == 'offline':
             model = DeepSpeech2Model.from_config(config.model)
diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 0d17d9fd2..89d443e03 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -43,6 +43,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
@@ -315,10 +316,11 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
         model = U2Model.from_config(model_conf)
 
         if self.parallel:
diff --git a/deepspeech/exps/u2/trainer.py b/deepspeech/exps/u2/trainer.py
index fa3e6d9d7..8e8634ac3 100644
--- a/deepspeech/exps/u2/trainer.py
+++ b/deepspeech/exps/u2/trainer.py
@@ -32,6 +32,7 @@ from deepspeech.training.trainer import Trainer
 from deepspeech.training.updaters.trainer import Trainer as NewTrainer
 from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
@@ -121,10 +122,10 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
         model = U2Model.from_config(model_conf)
 
         if self.parallel:
diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py
index 3d15e0259..edcc34012 100644
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@@ -41,6 +41,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
@@ -319,10 +320,10 @@ class U2Trainer(Trainer):
 
         # model
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.feat_dim
-        model_conf.output_dim = self.train_loader.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.feat_dim
+            model_conf.output_dim = self.train_loader.vocab_size
+
         model = U2Model.from_config(model_conf)
         if self.parallel:
             model = paddle.DataParallel(model)
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index 91a81503f..0fa8ed735 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -47,6 +47,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
 
 logger = Log(__name__).getlog()
 
@@ -345,10 +346,10 @@ class U2STTrainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
         model = U2STModel.from_config(model_conf)
 
         if self.parallel:
diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py
index fd8f15471..39ed9d5d1 100644
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
+from deepspeech.utils.utility import UpdateConfig
 
 __all__ = ["U2Model", "U2InferModel"]
 
@@ -903,10 +904,10 @@ class U2Model(U2BaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
+
         model = cls.from_config(config)
 
         if checkpoint_path:
diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py
index 6737a549d..87ca68b29 100644
--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
 from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import th_accuracy
+from deepspeech.utils.utility import UpdateConfig
 
 __all__ = ["U2STModel", "U2STInferModel"]
 
@@ -686,10 +687,10 @@ class U2STModel(U2STBaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
+
         model = cls.from_config(config)
 
         if checkpoint_path: