From 45f73c507cbb6b4e7e16d9b4988011bf97f8e446 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 1 Mar 2021 07:13:56 +0000 Subject: [PATCH] refactor repo fix decoding --- .gitignore | 3 +- {data_utils => deepspeech}/__init__.py | 0 .../decoders}/__init__.py | 0 .../decoders}/decoders_deprecated.py | 0 .../decoders}/scorer_deprecated.py | 0 .../decoders/swig}/__init__.py | 0 .../decoders}/swig/_init_paths.py | 0 .../swig/ctc_beam_search_decoder.cpp | 0 .../decoders}/swig/ctc_beam_search_decoder.h | 0 .../decoders}/swig/ctc_greedy_decoder.cpp | 0 .../decoders}/swig/ctc_greedy_decoder.h | 0 .../decoders}/swig/decoder_utils.cpp | 0 .../decoders}/swig/decoder_utils.h | 0 .../decoders}/swig/decoders.i | 0 .../decoders}/swig/path_trie.cpp | 0 .../decoders}/swig/path_trie.h | 0 .../decoders}/swig/scorer.cpp | 0 .../decoders}/swig/scorer.h | 0 .../decoders}/swig/setup.py | 0 .../decoders}/swig/setup.sh | 0 .../decoders}/swig_wrapper.py | 0 .../decoders}/tests/test_decoders.py | 0 {decoders => deepspeech/exps}/__init__.py | 0 .../exps/deepspeech2}/__init__.py | 0 .../exps/deepspeech2/bin/infer.py | 11 +- .../exps/deepspeech2/bin/test.py | 10 +- .../exps/deepspeech2/bin/train.py | 8 +- .../exps/deepspeech2/bin/tune.py | 36 ++--- .../exps/deepspeech2}/config.py | 4 - .../exps/deepspeech2}/dataset.py | 10 +- .../exps/deepspeech2}/model.py | 97 ++++++-------- .../frontend}/__init__.py | 0 {data_utils => deepspeech/frontend}/audio.py | 0 .../frontend/augmentor}/__init__.py | 0 .../frontend}/augmentor/augmentation.py | 14 +- .../frontend}/augmentor/base.py | 0 .../frontend}/augmentor/impulse_response.py | 6 +- .../frontend}/augmentor/noise_perturb.py | 6 +- .../online_bayesian_normalization.py | 2 +- .../frontend}/augmentor/resample.py | 2 +- .../frontend}/augmentor/shift_perturb.py | 2 +- .../frontend}/augmentor/speed_perturb.py | 2 +- .../frontend}/augmentor/volume_perturb.py | 2 +- .../frontend/featurizer}/__init__.py | 2 - .../frontend}/featurizer/audio_featurizer.py | 4 +- 
.../frontend}/featurizer/speech_featurizer.py | 4 +- .../frontend}/featurizer/text_featurizer.py | 0 .../frontend}/normalizer.py | 4 +- {data_utils => deepspeech/frontend}/speech.py | 27 ++-- .../frontend}/utility.py | 49 +------ deepspeech/models/__init__.py | 13 ++ {model_utils => deepspeech/models}/network.py | 35 ++--- deepspeech/modules/__init__.py | 13 ++ deepspeech/training/__init__.py | 15 +++ {training => deepspeech/training}/cli.py | 3 +- {training => deepspeech/training}/trainer.py | 43 +++--- deepspeech/utils/__init__.py | 13 ++ {utils => deepspeech/utils}/checkpoint.py | 4 +- {utils => deepspeech/utils}/error_rate.py | 0 {utils => deepspeech/utils}/mp_tools.py | 0 deepspeech/utils/utility.py | 57 ++++++++ deploy/demo_server.py | 11 +- examples/aishell/.gitignore | 2 + examples/aishell/conf/deepspeech2.yaml | 10 +- examples/aishell/local/data.sh | 11 +- .../aishell/local}/download_lm_ch.sh | 7 +- .../aishell/local}/download_model.sh | 9 +- examples/aishell/local/infer.sh | 6 +- examples/aishell/local/infer_golden.sh | 18 +-- examples/aishell/local/test.sh | 9 +- examples/aishell/local/test_golden.sh | 41 ++---- examples/aishell/local/train.sh | 9 +- examples/aishell/local/tune.sh | 2 +- examples/aishell/models | 1 - examples/aishell/path.sh | 3 + examples/aishell/run.sh | 13 +- .../{conf => aug_conf}/augmentation.config | 0 .../augmentation.config.example | 0 .../baidu_en8k}/download_lm_en.sh | 8 +- .../baidu_en8k/download_model.sh | 7 +- examples/baidu_en8k/path.sh | 5 + examples/baidu_en8k/run_infer_golden.sh | 13 +- examples/baidu_en8k/run_test_golden.sh | 14 +- examples/dataset/aishell/.gitignore | 1 + .../local => dataset/aishell}/aishell.py | 2 +- .../chime3_background}/chime3_background.py | 3 +- examples/dataset/librispeech/.gitignore | 7 + .../librispeech}/librispeech.py | 4 +- examples/dataset/mini_librispeech/.gitignore | 4 + .../mini_librispeech/mini_librispeech.py | 115 ++++++++++++++++ examples/dataset/musan/musan.py | 123 
++++++++++++++++++ examples/dataset/rir_noise/rir_noise.py | 123 ++++++++++++++++++ .../dataset}/voxforge/run_data.sh | 9 +- .../dataset}/voxforge/voxforge.py | 4 +- examples/librispeech/.gitignore | 2 + examples/librispeech/conf/deepspeech2.yaml | 24 ++-- examples/librispeech/local/data.sh | 13 +- examples/librispeech/local/download_lm_en.sh | 20 +++ .../librispeech/local}/download_model.sh | 10 +- examples/librispeech/local/infer.sh | 34 +---- examples/librispeech/local/infer_golden.sh | 10 +- examples/librispeech/local/test.sh | 33 +---- examples/librispeech/local/test_golden.sh | 19 +-- examples/librispeech/local/train.sh | 37 ++---- examples/librispeech/local/tune.sh | 25 ++-- examples/librispeech/models | 1 - examples/librispeech/path.sh | 4 + examples/librispeech/run.sh | 10 +- examples/tiny/.gitignore | 2 + examples/tiny/conf/deepspeech2.yaml | 7 +- examples/tiny/local/data.sh | 16 +-- examples/tiny/local/download_lm_en.sh | 20 +++ examples/tiny/local/download_model.sh | 21 +++ examples/tiny/local/infer.sh | 8 +- examples/tiny/local/infer_golden.sh | 10 +- examples/tiny/local/test.sh | 6 +- examples/tiny/local/test_golden.sh | 9 +- examples/tiny/local/train.sh | 3 +- examples/tiny/local/tune.sh | 25 ++-- examples/tiny/models | 1 - examples/tiny/path.sh | 4 + examples/tiny/run.sh | 3 - dataloader.ipynb => notebook/dataloader.ipynb | 0 train_test.ipynb => notebook/train_test.ipynb | 0 setup.sh | 2 +- tests/network_test.py | 3 +- {utils/tests => tests}/test_error_rate.py | 2 +- {tools => utils}/build_vocab.py | 8 +- {tools => utils}/compute_mean_std.py | 8 +- {tools => utils}/profile.sh | 0 utils/utility.py | 90 +++++++------ 131 files changed, 968 insertions(+), 617 deletions(-) rename {data_utils => deepspeech}/__init__.py (100%) rename {data_utils/augmentor => deepspeech/decoders}/__init__.py (100%) rename {decoders => deepspeech/decoders}/decoders_deprecated.py (100%) rename {decoders => deepspeech/decoders}/scorer_deprecated.py (100%) rename 
{data_utils/featurizer => deepspeech/decoders/swig}/__init__.py (100%) rename {decoders => deepspeech/decoders}/swig/_init_paths.py (100%) rename {decoders => deepspeech/decoders}/swig/ctc_beam_search_decoder.cpp (100%) rename {decoders => deepspeech/decoders}/swig/ctc_beam_search_decoder.h (100%) rename {decoders => deepspeech/decoders}/swig/ctc_greedy_decoder.cpp (100%) rename {decoders => deepspeech/decoders}/swig/ctc_greedy_decoder.h (100%) rename {decoders => deepspeech/decoders}/swig/decoder_utils.cpp (100%) rename {decoders => deepspeech/decoders}/swig/decoder_utils.h (100%) rename {decoders => deepspeech/decoders}/swig/decoders.i (100%) rename {decoders => deepspeech/decoders}/swig/path_trie.cpp (100%) rename {decoders => deepspeech/decoders}/swig/path_trie.h (100%) rename {decoders => deepspeech/decoders}/swig/scorer.cpp (100%) rename {decoders => deepspeech/decoders}/swig/scorer.h (100%) rename {decoders => deepspeech/decoders}/swig/setup.py (100%) rename {decoders => deepspeech/decoders}/swig/setup.sh (100%) rename {decoders => deepspeech/decoders}/swig_wrapper.py (100%) rename {decoders => deepspeech/decoders}/tests/test_decoders.py (100%) rename {decoders => deepspeech/exps}/__init__.py (100%) rename {decoders/swig => deepspeech/exps/deepspeech2}/__init__.py (100%) rename infer.py => deepspeech/exps/deepspeech2/bin/infer.py (79%) rename test.py => deepspeech/exps/deepspeech2/bin/test.py (81%) rename train.py => deepspeech/exps/deepspeech2/bin/train.py (85%) rename tune.py => deepspeech/exps/deepspeech2/bin/tune.py (89%) rename {model_utils => deepspeech/exps/deepspeech2}/config.py (93%) rename {data_utils => deepspeech/exps/deepspeech2}/dataset.py (98%) rename {model_utils => deepspeech/exps/deepspeech2}/model.py (89%) rename {model_utils => deepspeech/frontend}/__init__.py (100%) rename {data_utils => deepspeech/frontend}/audio.py (100%) rename {utils => deepspeech/frontend/augmentor}/__init__.py (100%) rename {data_utils => 
deepspeech/frontend}/augmentor/augmentation.py (90%) rename {data_utils => deepspeech/frontend}/augmentor/base.py (100%) rename {data_utils => deepspeech/frontend}/augmentor/impulse_response.py (90%) rename {data_utils => deepspeech/frontend}/augmentor/noise_perturb.py (93%) rename {data_utils => deepspeech/frontend}/augmentor/online_bayesian_normalization.py (97%) rename {data_utils => deepspeech/frontend}/augmentor/resample.py (95%) rename {data_utils => deepspeech/frontend}/augmentor/shift_perturb.py (96%) rename {data_utils => deepspeech/frontend}/augmentor/speed_perturb.py (97%) rename {data_utils => deepspeech/frontend}/augmentor/volume_perturb.py (96%) rename {training => deepspeech/frontend/featurizer}/__init__.py (95%) rename {data_utils => deepspeech/frontend}/featurizer/audio_featurizer.py (98%) rename {data_utils => deepspeech/frontend}/featurizer/speech_featurizer.py (95%) rename {data_utils => deepspeech/frontend}/featurizer/text_featurizer.py (100%) rename {data_utils => deepspeech/frontend}/normalizer.py (97%) rename {data_utils => deepspeech/frontend}/speech.py (91%) rename {data_utils => deepspeech/frontend}/utility.py (53%) create mode 100644 deepspeech/models/__init__.py rename {model_utils => deepspeech/models}/network.py (96%) create mode 100644 deepspeech/modules/__init__.py create mode 100644 deepspeech/training/__init__.py rename {training => deepspeech/training}/cli.py (88%) rename {training => deepspeech/training}/trainer.py (93%) create mode 100644 deepspeech/utils/__init__.py rename {utils => deepspeech/utils}/checkpoint.py (98%) rename {utils => deepspeech/utils}/error_rate.py (100%) rename {utils => deepspeech/utils}/mp_tools.py (100%) create mode 100644 deepspeech/utils/utility.py create mode 100644 examples/aishell/.gitignore rename {models/lm => examples/aishell/local}/download_lm_ch.sh (73%) rename {models/aishell => examples/aishell/local}/download_model.sh (68%) delete mode 120000 examples/aishell/models rename examples/{conf => 
aug_conf}/augmentation.config (100%) rename examples/{conf => aug_conf}/augmentation.config.example (100%) rename {models/lm => examples/baidu_en8k}/download_lm_en.sh (73%) rename {models => examples}/baidu_en8k/download_model.sh (73%) create mode 100644 examples/dataset/aishell/.gitignore rename examples/{aishell/local => dataset/aishell}/aishell.py (98%) rename {data/noise => examples/dataset/chime3_background}/chime3_background.py (97%) create mode 100644 examples/dataset/librispeech/.gitignore rename examples/{librispeech/local => dataset/librispeech}/librispeech.py (98%) create mode 100644 examples/dataset/mini_librispeech/.gitignore create mode 100644 examples/dataset/mini_librispeech/mini_librispeech.py create mode 100644 examples/dataset/musan/musan.py create mode 100644 examples/dataset/rir_noise/rir_noise.py rename {data => examples/dataset}/voxforge/run_data.sh (58%) rename {data => examples/dataset}/voxforge/voxforge.py (98%) create mode 100644 examples/librispeech/.gitignore create mode 100644 examples/librispeech/local/download_lm_en.sh rename {models/librispeech => examples/librispeech/local}/download_model.sh (68%) delete mode 120000 examples/librispeech/models create mode 100644 examples/tiny/.gitignore create mode 100644 examples/tiny/local/download_lm_en.sh create mode 100644 examples/tiny/local/download_model.sh delete mode 120000 examples/tiny/models rename dataloader.ipynb => notebook/dataloader.ipynb (100%) rename train_test.ipynb => notebook/train_test.ipynb (100%) rename {utils/tests => tests}/test_error_rate.py (99%) rename {tools => utils}/build_vocab.py (92%) rename {tools => utils}/compute_mean_std.py (87%) rename {tools => utils}/profile.sh (100%) diff --git a/.gitignore b/.gitignore index 2ec11b5ee..dee7e4b33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .DS_Store *.pyc tools/venv -dataset -models/* +.vscode diff --git a/data_utils/__init__.py b/deepspeech/__init__.py similarity index 100% rename from data_utils/__init__.py 
rename to deepspeech/__init__.py diff --git a/data_utils/augmentor/__init__.py b/deepspeech/decoders/__init__.py similarity index 100% rename from data_utils/augmentor/__init__.py rename to deepspeech/decoders/__init__.py diff --git a/decoders/decoders_deprecated.py b/deepspeech/decoders/decoders_deprecated.py similarity index 100% rename from decoders/decoders_deprecated.py rename to deepspeech/decoders/decoders_deprecated.py diff --git a/decoders/scorer_deprecated.py b/deepspeech/decoders/scorer_deprecated.py similarity index 100% rename from decoders/scorer_deprecated.py rename to deepspeech/decoders/scorer_deprecated.py diff --git a/data_utils/featurizer/__init__.py b/deepspeech/decoders/swig/__init__.py similarity index 100% rename from data_utils/featurizer/__init__.py rename to deepspeech/decoders/swig/__init__.py diff --git a/decoders/swig/_init_paths.py b/deepspeech/decoders/swig/_init_paths.py similarity index 100% rename from decoders/swig/_init_paths.py rename to deepspeech/decoders/swig/_init_paths.py diff --git a/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp similarity index 100% rename from decoders/swig/ctc_beam_search_decoder.cpp rename to deepspeech/decoders/swig/ctc_beam_search_decoder.cpp diff --git a/decoders/swig/ctc_beam_search_decoder.h b/deepspeech/decoders/swig/ctc_beam_search_decoder.h similarity index 100% rename from decoders/swig/ctc_beam_search_decoder.h rename to deepspeech/decoders/swig/ctc_beam_search_decoder.h diff --git a/decoders/swig/ctc_greedy_decoder.cpp b/deepspeech/decoders/swig/ctc_greedy_decoder.cpp similarity index 100% rename from decoders/swig/ctc_greedy_decoder.cpp rename to deepspeech/decoders/swig/ctc_greedy_decoder.cpp diff --git a/decoders/swig/ctc_greedy_decoder.h b/deepspeech/decoders/swig/ctc_greedy_decoder.h similarity index 100% rename from decoders/swig/ctc_greedy_decoder.h rename to deepspeech/decoders/swig/ctc_greedy_decoder.h diff --git 
a/decoders/swig/decoder_utils.cpp b/deepspeech/decoders/swig/decoder_utils.cpp similarity index 100% rename from decoders/swig/decoder_utils.cpp rename to deepspeech/decoders/swig/decoder_utils.cpp diff --git a/decoders/swig/decoder_utils.h b/deepspeech/decoders/swig/decoder_utils.h similarity index 100% rename from decoders/swig/decoder_utils.h rename to deepspeech/decoders/swig/decoder_utils.h diff --git a/decoders/swig/decoders.i b/deepspeech/decoders/swig/decoders.i similarity index 100% rename from decoders/swig/decoders.i rename to deepspeech/decoders/swig/decoders.i diff --git a/decoders/swig/path_trie.cpp b/deepspeech/decoders/swig/path_trie.cpp similarity index 100% rename from decoders/swig/path_trie.cpp rename to deepspeech/decoders/swig/path_trie.cpp diff --git a/decoders/swig/path_trie.h b/deepspeech/decoders/swig/path_trie.h similarity index 100% rename from decoders/swig/path_trie.h rename to deepspeech/decoders/swig/path_trie.h diff --git a/decoders/swig/scorer.cpp b/deepspeech/decoders/swig/scorer.cpp similarity index 100% rename from decoders/swig/scorer.cpp rename to deepspeech/decoders/swig/scorer.cpp diff --git a/decoders/swig/scorer.h b/deepspeech/decoders/swig/scorer.h similarity index 100% rename from decoders/swig/scorer.h rename to deepspeech/decoders/swig/scorer.h diff --git a/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py similarity index 100% rename from decoders/swig/setup.py rename to deepspeech/decoders/swig/setup.py diff --git a/decoders/swig/setup.sh b/deepspeech/decoders/swig/setup.sh similarity index 100% rename from decoders/swig/setup.sh rename to deepspeech/decoders/swig/setup.sh diff --git a/decoders/swig_wrapper.py b/deepspeech/decoders/swig_wrapper.py similarity index 100% rename from decoders/swig_wrapper.py rename to deepspeech/decoders/swig_wrapper.py diff --git a/decoders/tests/test_decoders.py b/deepspeech/decoders/tests/test_decoders.py similarity index 100% rename from decoders/tests/test_decoders.py 
rename to deepspeech/decoders/tests/test_decoders.py diff --git a/decoders/__init__.py b/deepspeech/exps/__init__.py similarity index 100% rename from decoders/__init__.py rename to deepspeech/exps/__init__.py diff --git a/decoders/swig/__init__.py b/deepspeech/exps/deepspeech2/__init__.py similarity index 100% rename from decoders/swig/__init__.py rename to deepspeech/exps/deepspeech2/__init__.py diff --git a/infer.py b/deepspeech/exps/deepspeech2/bin/infer.py similarity index 79% rename from infer.py rename to deepspeech/exps/deepspeech2/bin/infer.py index 52d3a6744..6f52c812f 100644 --- a/infer.py +++ b/deepspeech/exps/deepspeech2/bin/infer.py @@ -20,12 +20,13 @@ import functools from paddle import distributed as dist -from utils.utility import print_arguments -from training.cli import default_argument_parser +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.error_rate import char_errors, word_errors -from model_utils.config import get_cfg_defaults -from model_utils.model import DeepSpeech2Tester as Tester -from utils.error_rate import char_errors, word_errors +# TODO(hui zhang): dynamic load +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester def main_sp(config, args): diff --git a/test.py b/deepspeech/exps/deepspeech2/bin/test.py similarity index 81% rename from test.py rename to deepspeech/exps/deepspeech2/bin/test.py index 7758ddfd2..72b38f485 100644 --- a/test.py +++ b/deepspeech/exps/deepspeech2/bin/test.py @@ -20,12 +20,12 @@ import functools from paddle import distributed as dist -from utils.utility import print_arguments -from training.cli import default_argument_parser +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.error_rate import char_errors, word_errors -from model_utils.config import 
get_cfg_defaults -from model_utils.model import DeepSpeech2Tester as Tester -from utils.error_rate import char_errors, word_errors +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester def main_sp(config, args): diff --git a/train.py b/deepspeech/exps/deepspeech2/bin/train.py similarity index 85% rename from train.py rename to deepspeech/exps/deepspeech2/bin/train.py index 87bd33d07..0c1d08914 100644 --- a/train.py +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -20,11 +20,11 @@ import functools from paddle import distributed as dist -from utils.utility import print_arguments -from training.cli import default_argument_parser +from deepspeech.utils.utility import print_arguments +from deepspeech.training.cli import default_argument_parser -from model_utils.config import get_cfg_defaults -from model_utils.model import DeepSpeech2Trainer as Trainer +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer def main_sp(config, args): diff --git a/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py similarity index 89% rename from tune.py rename to deepspeech/exps/deepspeech2/bin/tune.py index b269265ae..33ecfe926 100644 --- a/tune.py +++ b/deepspeech/exps/deepspeech2/bin/tune.py @@ -20,22 +20,21 @@ import argparse import functools import gzip import logging -import paddle.fluid as fluid -from training.cli import default_argument_parser -from model_utils.config import get_cfg_defaults - -from data_utils.dataset import SpeechCollator -from data_utils.dataset import DeepSpeech2Dataset -from data_utils.dataset import DeepSpeech2DistributedBatchSampler -from data_utils.dataset import DeepSpeech2BatchSampler from paddle.io import DataLoader -from model_utils.network import DeepSpeech2 -from model_utils.network import DeepSpeech2Loss +from deepspeech.training.cli import default_argument_parser +from 
deepspeech.utils.error_rate import char_errors, word_errors +from deepspeech.utils.utility import add_arguments, print_arguments + +from deepspeech.models.network import DeepSpeech2 +from deepspeech.models.network import DeepSpeech2Loss -from utils.error_rate import char_errors, word_errors -from utils.utility import add_arguments, print_arguments +from deepspeech.exps.deepspeech2.dataset import SpeechCollator +from deepspeech.exps.deepspeech2.dataset import DeepSpeech2Dataset +from deepspeech.exps.deepspeech2.dataset import DeepSpeech2DistributedBatchSampler +from deepspeech.exps.deepspeech2.dataset import DeepSpeech2BatchSampler +from deepspeech.exps.deepspeech2.config import get_cfg_defaults def tune(config, args): @@ -114,7 +113,7 @@ def tune(config, args): return trans audio, text, audio_len, text_len = infer_data - _, probs, _ = model.predict(audio, audio_len) + _, probs, logits_lens = model.predict(audio, audio_len) target_transcripts = ordid2token(text, text_len) num_ins += audio.shape[0] @@ -122,17 +121,17 @@ def tune(config, args): for index, (alpha, beta) in enumerate(params_grid): print(f"tuneing: alpha={alpha} beta={beta}") result_transcripts = model.decode_probs( - probs.numpy(), vocab_list, config.decoding.decoding_method, + probs.numpy(), logits_lens, vocab_list, + config.decoding.decoding_method, config.decoding.lang_model_path, alpha, beta, config.decoding.beam_size, config.decoding.cutoff_prob, config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch) for target, result in zip(target_transcripts, result_transcripts): - #print(f"tuneing: {target} {result}") errors, len_ref = errors_func(target, result) err_sum[index] += errors - # accumulate the length of references of every batch + # accumulate the length of references of every batch # in the first iteration if args.alpha_from == alpha and args.beta_from == beta: len_refs += len_ref @@ -148,8 +147,9 @@ def tune(config, args): min_index = err_ave.index(err_ave_min) print("\nBatch %d [%d/?], 
current opt (alpha, beta) = (%s, %s), " " min [%s] = %f" % - (cur_batch, num_ins, "%.3f" % params_grid[min_index][0], "%.3f" % - params_grid[min_index][1], args.error_rate_type, err_ave_min)) + (cur_batch, num_ins, "%.3f" % params_grid[min_index][0], + "%.3f" % params_grid[min_index][1], + config.decoding.error_rate_type, err_ave_min)) cur_batch += 1 # output WER/CER at every (alpha, beta) diff --git a/model_utils/config.py b/deepspeech/exps/deepspeech2/config.py similarity index 93% rename from model_utils/config.py rename to deepspeech/exps/deepspeech2/config.py index a6b99a61d..455f5b6c1 100644 --- a/model_utils/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -56,10 +56,6 @@ _C.training = CN( lr_decay=1.0, # learning rate decay weight_decay=1e-6, # the coeff of weight decay global_grad_clip=5.0, # the global norm clip - plot_interval=1000, # plot attention and spectrogram by step - valid_interval=1000, # validation by step - save_interval=1000, # checkpoint by step - max_iteration=500000, # max iteration to train by step n_epoch=50, # train epochs )) diff --git a/data_utils/dataset.py b/deepspeech/exps/deepspeech2/dataset.py similarity index 98% rename from data_utils/dataset.py rename to deepspeech/exps/deepspeech2/dataset.py index 6be0c0455..72e3d840d 100644 --- a/data_utils/dataset.py +++ b/deepspeech/exps/deepspeech2/dataset.py @@ -27,11 +27,11 @@ from paddle.io import BatchSampler from paddle.io import DistributedBatchSampler from paddle import distributed as dist -from data_utils.utility import read_manifest -from data_utils.augmentor.augmentation import AugmentationPipeline -from data_utils.featurizer.speech_featurizer import SpeechFeaturizer -from data_utils.speech import SpeechSegment -from data_utils.normalizer import FeatureNormalizer +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer 
+from deepspeech.frontend.speech import SpeechSegment +from deepspeech.frontend.normalizer import FeatureNormalizer logger = logging.getLogger(__name__) diff --git a/model_utils/model.py b/deepspeech/exps/deepspeech2/model.py similarity index 89% rename from model_utils/model.py rename to deepspeech/exps/deepspeech2/model.py index 6520d94a3..633569fcf 100644 --- a/model_utils/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -29,26 +29,23 @@ from paddle.io import DataLoader from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import layers -from paddle.fluid import framework from paddle.fluid import core -from paddle.fluid import name_scope -from utils import mp_tools -from training import Trainer +from deepspeech.training import Trainer +from deepspeech.utils import mp_tools +from deepspeech.utils.error_rate import char_errors, word_errors, cer, wer -from model_utils.network import DeepSpeech2 -from model_utils.network import DeepSpeech2Loss +from deepspeech.models.network import DeepSpeech2 +from deepspeech.models.network import DeepSpeech2Loss -from data_utils.dataset import SpeechCollator -from data_utils.dataset import DeepSpeech2Dataset -from data_utils.dataset import DeepSpeech2DistributedBatchSampler -from data_utils.dataset import DeepSpeech2BatchSampler +from deepspeech.decoders.swig_wrapper import Scorer +from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder +from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch -from decoders.swig_wrapper import Scorer -from decoders.swig_wrapper import ctc_greedy_decoder -from decoders.swig_wrapper import ctc_beam_search_decoder_batch - -from utils.error_rate import char_errors, word_errors, cer, wer +from deepspeech.exps.deepspeech2.dataset import SpeechCollator +from deepspeech.exps.deepspeech2.dataset import DeepSpeech2Dataset +from deepspeech.exps.deepspeech2.dataset import DeepSpeech2DistributedBatchSampler +from deepspeech.exps.deepspeech2.dataset import 
DeepSpeech2BatchSampler logger = logging.getLogger(__name__) @@ -161,46 +158,6 @@ class DeepSpeech2Trainer(Trainer): self.visualizer.add_scalar("train/{}".format(k), v, self.iteration) - def new_epoch(self): - """Reset the train loader and increment ``epoch``. - """ - if self.parallel: - # batch sampler epoch start from 0 - self.train_loader.batch_sampler.set_epoch(self.epoch) - self.epoch += 1 - - def train(self): - """The training process. - - It includes forward/backward/update and periodical validation and - saving. - """ - self.logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") - self.new_epoch() - while self.epoch <= self.config.training.n_epoch: - try: - for batch in self.train_loader: - self.iteration += 1 - self.train_batch(batch) - - # if self.iteration % self.config.training.valid_interval == 0: - # self.valid() - - # if self.iteration % self.config.training.save_interval == 0: - # self.save() - except Exception as e: - self.logger.error(e) - pass - - self.valid() - self.save() - self.lr_scheduler.step() - self.new_epoch() - - def compute_metrics(self, inputs, outputs): - pass - @mp_tools.rank_zero_only @paddle.no_grad() def valid(self): @@ -212,7 +169,7 @@ class DeepSpeech2Trainer(Trainer): audio, text, audio_len, text_len = batch outputs = self.model(*batch) loss = self.compute_losses(batch, outputs) - metrics = self.compute_metrics(batch, outputs) + #metrics = self.compute_metrics(batch, outputs) valid_losses['val_loss'].append(float(loss)) valid_losses['val_loss_div_batchsize'].append( @@ -373,6 +330,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): target_transcripts = self.ordid2token(texts, texts_len) result_transcripts = self.model.decode_probs( probs.numpy(), + logits_len, vocab_list, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, @@ -446,15 +404,37 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): """ # output dir if self.args.output: - output_dir = Path(self.args.output).expanduser() / "infer" + 
output_dir = Path(self.args.output).expanduser() output_dir.mkdir(parents=True, exist_ok=True) else: output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent / "infer" + self.args.checkpoint_path).expanduser().parent.parent output_dir.mkdir(parents=True, exist_ok=True) self.output_dir = output_dir + def setup_logger(self): + """Initialize a text logger to log the experiment. + + Each process has its own text logger. The logging message is write to + the standard output and a text file named ``worker_n.log`` in the + output directory, where ``n`` means the rank of the process. + """ + format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S') + + logger.setLevel("INFO") + + # global logger + stdout = True + save_path = "" + logging.basicConfig( + level=logging.DEBUG if stdout else logging.INFO, + format=format, + datefmt='%Y/%m/%d %H:%M:%S', + filename=save_path if not stdout else None) + self.logger = logger + def setup(self): """Setup the experiment. 
""" @@ -463,6 +443,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.init_parallel() self.setup_output_dir() + self.setup_checkpointer() self.setup_logger() self.setup_dataloader() diff --git a/model_utils/__init__.py b/deepspeech/frontend/__init__.py similarity index 100% rename from model_utils/__init__.py rename to deepspeech/frontend/__init__.py diff --git a/data_utils/audio.py b/deepspeech/frontend/audio.py similarity index 100% rename from data_utils/audio.py rename to deepspeech/frontend/audio.py diff --git a/utils/__init__.py b/deepspeech/frontend/augmentor/__init__.py similarity index 100% rename from utils/__init__.py rename to deepspeech/frontend/augmentor/__init__.py diff --git a/data_utils/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py similarity index 90% rename from data_utils/augmentor/augmentation.py rename to deepspeech/frontend/augmentor/augmentation.py index f36d993e1..e50084a00 100644 --- a/data_utils/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -15,13 +15,13 @@ import json import random -from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor -from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor -from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor -from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor -from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor -from data_utils.augmentor.resample import ResampleAugmentor -from data_utils.augmentor.online_bayesian_normalization import \ +from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor +from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor +from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor +from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor +from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor +from 
deepspeech.frontend.augmentor.resample import ResampleAugmentor +from deepspeech.frontend.augmentor.online_bayesian_normalization import \ OnlineBayesianNormalizationAugmentor diff --git a/data_utils/augmentor/base.py b/deepspeech/frontend/augmentor/base.py similarity index 100% rename from data_utils/augmentor/base.py rename to deepspeech/frontend/augmentor/base.py diff --git a/data_utils/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py similarity index 90% rename from data_utils/augmentor/impulse_response.py rename to deepspeech/frontend/augmentor/impulse_response.py index 839c6a809..40aa3d47e 100644 --- a/data_utils/augmentor/impulse_response.py +++ b/deepspeech/frontend/augmentor/impulse_response.py @@ -13,9 +13,9 @@ # limitations under the License. """Contains the impulse response augmentation model.""" -from data_utils.augmentor.base import AugmentorBase -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class ImpulseResponseAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py similarity index 93% rename from data_utils/augmentor/noise_perturb.py rename to deepspeech/frontend/augmentor/noise_perturb.py index 954d1b419..350370b8f 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/deepspeech/frontend/augmentor/noise_perturb.py @@ -13,9 +13,9 @@ # limitations under the License. 
"""Contains the noise perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py similarity index 97% rename from data_utils/augmentor/online_bayesian_normalization.py rename to deepspeech/frontend/augmentor/online_bayesian_normalization.py index f5c7d99fd..14c260dfd 100644 --- a/data_utils/augmentor/online_bayesian_normalization.py +++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py @@ -13,7 +13,7 @@ # limitations under the License. """Contain the online bayesian normalization augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class OnlineBayesianNormalizationAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py similarity index 95% rename from data_utils/augmentor/resample.py rename to deepspeech/frontend/augmentor/resample.py index 3732e09cd..8ef574cbb 100644 --- a/data_utils/augmentor/resample.py +++ b/deepspeech/frontend/augmentor/resample.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Contain the resample augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class ResampleAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py similarity index 96% rename from data_utils/augmentor/shift_perturb.py rename to deepspeech/frontend/augmentor/shift_perturb.py index 8b8e60362..2edbf594d 100644 --- a/data_utils/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. """Contains the volume perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class ShiftPerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py similarity index 97% rename from data_utils/augmentor/speed_perturb.py rename to deepspeech/frontend/augmentor/speed_perturb.py index 7b28f7ec6..6518382db 100644 --- a/data_utils/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. """Contain the speech perturbation augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class SpeedPerturbAugmentor(AugmentorBase): diff --git a/data_utils/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py similarity index 96% rename from data_utils/augmentor/volume_perturb.py rename to deepspeech/frontend/augmentor/volume_perturb.py index b98c7a3b4..dc64d0e9e 100644 --- a/data_utils/augmentor/volume_perturb.py +++ b/deepspeech/frontend/augmentor/volume_perturb.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Contains the volume perturb augmentation model.""" -from data_utils.augmentor.base import AugmentorBase +from deepspeech.frontend.augmentor.base import AugmentorBase class VolumePerturbAugmentor(AugmentorBase): diff --git a/training/__init__.py b/deepspeech/frontend/featurizer/__init__.py similarity index 95% rename from training/__init__.py rename to deepspeech/frontend/featurizer/__init__.py index 932432db1..185a92b8d 100644 --- a/training/__init__.py +++ b/deepspeech/frontend/featurizer/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from training.trainer import * \ No newline at end of file diff --git a/data_utils/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py similarity index 98% rename from data_utils/featurizer/audio_featurizer.py rename to deepspeech/frontend/featurizer/audio_featurizer.py index b410b0217..b5edb32d5 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -14,8 +14,8 @@ """Contains the audio featurizer class.""" import numpy as np -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment from python_speech_features import mfcc from python_speech_features import delta diff --git a/data_utils/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py similarity index 95% rename from data_utils/featurizer/speech_featurizer.py rename to deepspeech/frontend/featurizer/speech_featurizer.py index 1bbf2bf58..d4de96adc 100644 --- a/data_utils/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -13,8 +13,8 @@ # limitations under the License. 
"""Contains the speech featurizer class.""" -from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from data_utils.featurizer.text_featurizer import TextFeaturizer +from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(object): diff --git a/data_utils/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py similarity index 100% rename from data_utils/featurizer/text_featurizer.py rename to deepspeech/frontend/featurizer/text_featurizer.py diff --git a/data_utils/normalizer.py b/deepspeech/frontend/normalizer.py similarity index 97% rename from data_utils/normalizer.py rename to deepspeech/frontend/normalizer.py index 83a008f10..8e50566c6 100644 --- a/data_utils/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -15,8 +15,8 @@ import numpy as np import random -from data_utils.utility import read_manifest -from data_utils.audio import AudioSegment +from deepspeech.frontend.utility import read_manifest +from deepspeech.frontend.audio import AudioSegment class FeatureNormalizer(object): diff --git a/data_utils/speech.py b/deepspeech/frontend/speech.py similarity index 91% rename from data_utils/speech.py rename to deepspeech/frontend/speech.py index 01c1787a4..2883405bb 100644 --- a/data_utils/speech.py +++ b/deepspeech/frontend/speech.py @@ -14,28 +14,33 @@ """Contains the speech segment class.""" import numpy as np -from data_utils.audio import AudioSegment +from deepspeech.frontend.audio import AudioSegment class SpeechSegment(AudioSegment): - """Speech segment abstraction, a subclass of AudioSegment, - with an additional transcript. - - :param samples: Audio samples [num_samples x num_channels]. - :type samples: ndarray.float32 - :param sample_rate: Audio sample rate. - :type sample_rate: int - :param transcript: Transcript text for the speech. 
- :type transript: str - :raises TypeError: If the sample data type is not float or int. + """Speech Segment with Text + + Args: + AudioSegment (AudioSegment): Audio Segment """ def __init__(self, samples, sample_rate, transcript): + """Speech segment abstraction, a subclass of AudioSegment, + with an additional transcript. + + Args: + samples (ndarray.float32): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + """ AudioSegment.__init__(self, samples, sample_rate) self._transcript = transcript def __eq__(self, other): """Return whether two objects are equal. + + Returns: + bool: True, when equal to other """ if not AudioSegment.__eq__(self, other): return False diff --git a/data_utils/utility.py b/deepspeech/frontend/utility.py similarity index 53% rename from data_utils/utility.py rename to deepspeech/frontend/utility.py index 6cc1b2713..3694e106a 100644 --- a/data_utils/utility.py +++ b/deepspeech/frontend/utility.py @@ -20,6 +20,7 @@ import tarfile import time from threading import Thread from multiprocessing import Process, Manager, Value + from paddle.dataset.common import md5file @@ -49,51 +50,3 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): json_data["duration"] >= min_duration): manifest.append(json_data) return manifest - - -def getfile_insensitive(path): - """Get the actual file path when given insensitive filename.""" - directory, filename = os.path.split(path) - directory, filename = (directory or '.'), filename.lower() - for f in os.listdir(directory): - newpath = os.path.join(directory, f) - if os.path.isfile(newpath) and f.lower() == filename: - return newpath - - -def download_multi(url, target_dir, extra_args): - """Download multiple files from url to target_dir.""" - if not os.path.exists(target_dir): os.makedirs(target_dir) - print("Downloading %s ..." 
% url) - ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " + - target_dir) - return ret_code - - -def download(url, md5sum, target_dir): - """Download file from url to target_dir, and check md5sum.""" - if not os.path.exists(target_dir): os.makedirs(target_dir) - filepath = os.path.join(target_dir, url.split("/")[-1]) - if not (os.path.exists(filepath) and md5file(filepath) == md5sum): - print("Downloading %s ..." % url) - os.system("wget -c " + url + " -P " + target_dir) - print("\nMD5 Chesksum %s ..." % filepath) - if not md5file(filepath) == md5sum: - raise RuntimeError("MD5 checksum failed.") - else: - print("File exists, skip downloading. (%s)" % filepath) - return filepath - - -def unpack(filepath, target_dir, rm_tar=False): - """Unpack the file to the target_dir.""" - print("Unpacking %s ..." % filepath) - tar = tarfile.open(filepath) - tar.extractall(target_dir) - tar.close() - if rm_tar == True: - os.remove(filepath) - - -class XmapEndSignal(): - pass diff --git a/deepspeech/models/__init__.py b/deepspeech/models/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/deepspeech/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/model_utils/network.py b/deepspeech/models/network.py similarity index 96% rename from model_utils/network.py rename to deepspeech/models/network.py index 1e7545ee6..a3ea771dc 100644 --- a/model_utils/network.py +++ b/deepspeech/models/network.py @@ -22,11 +22,10 @@ from paddle import nn from paddle.nn import functional as F from paddle.nn import initializer as I -from utils import checkpoint - -from decoders.swig_wrapper import Scorer -from decoders.swig_wrapper import ctc_greedy_decoder -from decoders.swig_wrapper import ctc_beam_search_decoder_batch +from deepspeech.utils import checkpoint +from deepspeech.decoders.swig_wrapper import Scorer +from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder +from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch logger = logging.getLogger(__name__) @@ -661,16 +660,19 @@ class DeepSpeech2(nn.Layer): self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path, vocab_list) - def decode_probs(self, probs, vocab_list, decoding_method, lang_model_path, - beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - """ probs: activation after softmax """ + def decode_probs(self, probs, logits_lens, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + """ probs: activation after softmax + logits_len: audio output lens + """ + probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] if decoding_method == "ctc_greedy": result_transcripts = self._decode_batch_greedy( - probs_split=probs, vocab_list=vocab_list) + probs_split=probs_split, vocab_list=vocab_list) elif decoding_method == "ctc_beam_search": result_transcripts = self._decode_batch_beam_search( - probs_split=probs, + probs_split=probs_split, beam_alpha=beam_alpha, beam_beta=beam_beta, beam_size=beam_size, @@ -686,12 +688,11 @@ class DeepSpeech2(nn.Layer): def decode(self, audio, audio_len, vocab_list, decoding_method, 
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes): - _, probs, audio_lens = self.predict(audio, audio_len) - probs_split = [probs[i, :l, :] for i, l in enumerate(audio_lens)] - return self.decode_probs(probs_split, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, - beam_size, cutoff_prob, cutoff_top_n, - num_processes) + _, probs, logits_lens = self.predict(audio, audio_len) + return self.decode_probs(probs.numpy(), logits_lens, vocab_list, + decoding_method, lang_model_path, beam_alpha, + beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) def from_pretrained(self, checkpoint_path): """Build a model from a pretrained model. diff --git a/deepspeech/modules/__init__.py b/deepspeech/modules/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/deepspeech/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/deepspeech/training/__init__.py b/deepspeech/training/__init__.py new file mode 100644 index 000000000..1071a3dd7 --- /dev/null +++ b/deepspeech/training/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepspeech.training.trainer import * diff --git a/training/cli.py b/deepspeech/training/cli.py similarity index 88% rename from training/cli.py rename to deepspeech/training/cli.py index e0ebfc7de..1076fe0c7 100644 --- a/training/cli.py +++ b/deepspeech/training/cli.py @@ -59,7 +59,8 @@ def default_argument_parser(): parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") # overwrite extra config and default config - parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") + #parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") + parser.add_argument("--opts", type=str, default=[], nargs='+', help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") # yapd: enable return parser diff --git a/training/trainer.py b/deepspeech/training/trainer.py similarity index 93% rename from training/trainer.py rename to deepspeech/training/trainer.py index 3fac31d70..f93bc5754 100644 --- a/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -24,8 +24,8 @@ from paddle import distributed as dist from paddle.distributed.utils import get_gpus from tensorboardX import SummaryWriter -from utils import checkpoint -from utils import mp_tools +from deepspeech.utils import checkpoint +from deepspeech.utils import mp_tools __all__ = ["Trainer"] @@ -148,20 +148,6 @@ class Trainer(): 
checkpoint_path=self.args.checkpoint_path) self.iteration = iteration - def read_batch(self): - """Read a batch from the train_loader. - Returns - ------- - List[Tensor] - A batch. - """ - try: - batch = next(self.iterator) - except StopIteration: - self.new_epoch() - batch = next(self.iterator) - return batch - def new_epoch(self): """Reset the train loader and increment ``epoch``. """ @@ -169,7 +155,6 @@ class Trainer(): # batch sampler epoch start from 0 self.train_loader.batch_sampler.set_epoch(self.epoch) self.epoch += 1 - self.iterator = iter(self.train_loader) def train(self): """The training process. @@ -177,16 +162,22 @@ class Trainer(): It includes forward/backward/update and periodical validation and saving. """ + self.logger.info( + f"Train Total Examples: {len(self.train_loader.dataset)}") self.new_epoch() - while self.iteration < self.config.training.max_iteration: - self.iteration += 1 - self.train_batch() - - if self.iteration % self.config.training.valid_interval == 0: - self.valid() - - if self.iteration % self.config.training.save_interval == 0: - self.save() + while self.epoch <= self.config.training.n_epoch: + try: + for batch in self.train_loader: + self.iteration += 1 + self.train_batch(batch) + except Exception as e: + self.logger.error(e) + pass + + self.valid() + self.save() + self.lr_scheduler.step() + self.new_epoch() def run(self): """The routine of the experiment after setup. This method is intended diff --git a/deepspeech/utils/__init__.py b/deepspeech/utils/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/deepspeech/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/utils/checkpoint.py b/deepspeech/utils/checkpoint.py similarity index 98% rename from utils/checkpoint.py rename to deepspeech/utils/checkpoint.py index e0c6938c9..f2066fdec 100644 --- a/utils/checkpoint.py +++ b/deepspeech/utils/checkpoint.py @@ -16,15 +16,15 @@ import os import time import logging import numpy as np + import paddle from paddle import distributed as dist from paddle.nn import Layer from paddle.optimizer import Optimizer -from utils import mp_tools +from deepspeech.utils import mp_tools logger = logging.getLogger(__name__) -logger.setLevel("INFO") __all__ = ["load_parameters", "save_parameters"] diff --git a/utils/error_rate.py b/deepspeech/utils/error_rate.py similarity index 100% rename from utils/error_rate.py rename to deepspeech/utils/error_rate.py diff --git a/utils/mp_tools.py b/deepspeech/utils/mp_tools.py similarity index 100% rename from utils/mp_tools.py rename to deepspeech/utils/mp_tools.py diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py new file mode 100644 index 000000000..cd7166593 --- /dev/null +++ b/deepspeech/utils/utility.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains common utility functions.""" + +import distutils.util + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).items()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. 
code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) diff --git a/deploy/demo_server.py b/deploy/demo_server.py index bfc48c9f1..299b58091 100644 --- a/deploy/demo_server.py +++ b/deploy/demo_server.py @@ -23,11 +23,12 @@ import struct import wave import paddle.fluid as fluid import numpy as np -import _init_paths -from data_utils.data import DataGenerator -from model_utils.model import DeepSpeech2Model -from data_utils.utility import read_manifest -from utils.utility import add_arguments, print_arguments + +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments, print_arguments + +from deepspeech.exps.deepspeech2.model import DeepSpeech2Model +from deepspeech.exps.deepspeech2.dataset import DataGenerator parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/examples/aishell/.gitignore b/examples/aishell/.gitignore new file mode 100644 index 000000000..44038ca5b --- /dev/null +++ b/examples/aishell/.gitignore @@ -0,0 +1,2 @@ +data +ckpt* diff --git a/examples/aishell/conf/deepspeech2.yaml b/examples/aishell/conf/deepspeech2.yaml index e2e08e1a9..821c183e5 100644 --- a/examples/aishell/conf/deepspeech2.yaml +++ b/examples/aishell/conf/deepspeech2.yaml @@ -34,18 +34,14 @@ training: lr_decay: 0.83 weight_decay: 1e-06 global_grad_clip: 5.0 - max_iteration: 500000 - plot_interval: 1000 - save_interval: 1000 - valid_interval: 1000 decoding: - batch_size: 10 + batch_size: 128 error_rate_type: cer decoding_method: ctc_beam_search - lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm + lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm alpha: 2.6 beta: 
5.0 beam_size: 300 - cutoff_prob: 1.0 + cutoff_prob: 0.99 cutoff_top_n: 40 num_proc_bsearch: 10 diff --git a/examples/aishell/local/data.sh b/examples/aishell/local/data.sh index b874b2df8..6eeb3d8fc 100644 --- a/examples/aishell/local/data.sh +++ b/examples/aishell/local/data.sh @@ -2,10 +2,13 @@ mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 local/aishell.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/aishell/aishell.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/aishell" +--target_dir="${TARGET_DIR}/aishell" if [ $? -ne 0 ]; then echo "Prepare Aishell failed. Terminated." @@ -14,7 +17,7 @@ fi # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths "data/manifest.train" "data/manifest.dev" @@ -26,7 +29,7 @@ fi # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train" \ --num_samples=2000 \ --specgram_type="linear" \ diff --git a/models/lm/download_lm_ch.sh b/examples/aishell/local/download_lm_ch.sh similarity index 73% rename from models/lm/download_lm_ch.sh rename to examples/aishell/local/download_lm_ch.sh index 0e4915262..f9e2261fd 100644 --- a/models/lm/download_lm_ch.sh +++ b/examples/aishell/local/download_lm_ch.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm' MD5="29e02312deb2e59b3c8686c7966d4fe3" -TARGET=./zh_giga.no_cna_cmn.prune01244.klm +TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm echo "Download language model ..." 
diff --git a/models/aishell/download_model.sh b/examples/aishell/local/download_model.sh similarity index 68% rename from models/aishell/download_model.sh rename to examples/aishell/local/download_model.sh index 76ac4d005..2f9f40fb3 100644 --- a/models/aishell/download_model.sh +++ b/examples/aishell/local/download_model.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz' MD5=2bf0cc8b6d5da2a2a787b5cc36a496b5 -TARGET=./aishell_model_fluid.tar.gz +TARGET=${DIR}/aishell_model_fluid.tar.gz echo "Download Aishell model ..." @@ -13,7 +16,7 @@ if [ $? -ne 0 ]; then echo "Fail to download Aishell model!" exit 1 fi -tar -zxvf $TARGET +tar -zxvf $TARGET -C ${DIR} exit 0 diff --git a/examples/aishell/local/infer.sh b/examples/aishell/local/infer.sh index bc413be11..4b4c9381b 100644 --- a/examples/aishell/local/infer.sh +++ b/examples/aishell/local/infer.sh @@ -2,14 +2,12 @@ # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh +bash local/download_lm_ch.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null -python3 -u ${MAIN_ROOT}/infer.py \ +python3 -u ${BIN_DIR}/infer.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ diff --git a/examples/aishell/local/infer_golden.sh b/examples/aishell/local/infer_golden.sh index 296c0d5b4..1727bcbad 100644 --- a/examples/aishell/local/infer_golden.sh +++ b/examples/aishell/local/infer_golden.sh @@ -1,22 +1,16 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh +bash local/download_lm_ch.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd ${MAIN_ROOT}/models/aishell > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null - # infer CUDA_VISIBLE_DEVICES=0 \ @@ -35,10 +29,10 @@ python3 -u ${MAIN_ROOT}/infer.py \ --use_gpu=False \ --share_rnn_weights=False \ --infer_manifest="data/manifest.test" \ ---mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/aishell" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ +--mean_std_path="data/pretrain/mean_std.npz" \ +--vocab_path="data/pretrain/vocab.txt" \ +--model_path="data/pretrain" \ +--lang_model_path="data/lm/zh_giga.no_cna_cmn.prune01244.klm" \ --decoding_method="ctc_beam_search" \ --error_rate_type="cer" \ --specgram_type="linear" diff --git a/examples/aishell/local/test.sh b/examples/aishell/local/test.sh index 6e6544bdb..74015f5d5 100644 --- a/examples/aishell/local/test.sh +++ b/examples/aishell/local/test.sh @@ -1,19 +1,16 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh +bash local/download_lm_ch.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - -python3 -u ${MAIN_ROOT}/test.py \ +python3 -u ${BIN_DIR}/test.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ ---output ckpt +--checkpoint_path ${1} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/aishell/local/test_golden.sh b/examples/aishell/local/test_golden.sh index 062a1b99b..86abd38cb 100644 --- a/examples/aishell/local/test_golden.sh +++ b/examples/aishell/local/test_golden.sh @@ -1,47 +1,26 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_ch.sh +bash local/download_lm_ch.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd ${MAIN_ROOT}/models/aishell > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null - # evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=300 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=1024 \ ---alpha=2.6 \ ---beta=5.0 \ ---cutoff_prob=0.99 \ ---cutoff_top_n=40 \ ---use_gru=True \ ---use_gpu=True \ ---share_rnn_weights=False \ ---test_manifest="data/manifest.test" \ ---mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/aishell" \ ---lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="cer" \ ---specgram_type="linear" +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--checkpoint_path data/pretrain/params.pdparams \ +--opts data.mean_std_filepath data/pretrain/mean_std.npz \ +--opts data.vocab_filepath data/pretrain/vocab.txt if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/aishell/local/train.sh b/examples/aishell/local/train.sh index ce30c4a11..3e13a79e3 100644 --- a/examples/aishell/local/train.sh +++ b/examples/aishell/local/train.sh @@ -4,11 +4,14 @@ # if you wish to resume from an exists model, uncomment --init_from_pretrained_model export FLAGS_sync_nccl_allreduce=0 -python3 -u ${MAIN_ROOT}/train.py \ +ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') +echo "using $ngpu gpus..." + +python3 -u ${BIN_DIR}/train.py \ --device 'gpu' \ ---nproc 4 \ +--nproc ${ngpu} \ --config conf/deepspeech2.yaml \ ---output ckpt-${1} +--output ckpt if [ $? 
-ne 0 ]; then diff --git a/examples/aishell/local/tune.sh b/examples/aishell/local/tune.sh index a11137706..9ff5e8b99 100644 --- a/examples/aishell/local/tune.sh +++ b/examples/aishell/local/tune.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash # grid-search for hyper-parameters in language model -python3 -u ${MAIN_ROOT}/tune.py \ +python3 -u ${BIN_DIR}/tune.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ diff --git a/examples/aishell/models b/examples/aishell/models deleted file mode 120000 index 9e68e9945..000000000 --- a/examples/aishell/models +++ /dev/null @@ -1 +0,0 @@ -../../models \ No newline at end of file diff --git a/examples/aishell/path.sh b/examples/aishell/path.sh index a55139e11..debdbba46 100644 --- a/examples/aishell/path.sh +++ b/examples/aishell/path.sh @@ -8,3 +8,6 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/aishell/run.sh b/examples/aishell/run.sh index 6cf8af2ba..dc762df99 100644 --- a/examples/aishell/run.sh +++ b/examples/aishell/run.sh @@ -1,21 +1,16 @@ #!/bin/bash source path.sh +# only demos # prepare data bash ./local/data.sh -# test pretrain model -bash ./local/test_golden.sh - -# test pretain model -bash ./local/infer_golden.sh - # train model -bash ./local/train.sh +CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh # test model -bash ./local/test.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284 # infer model -bash ./local/infer.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284 diff --git a/examples/conf/augmentation.config b/examples/aug_conf/augmentation.config similarity index 100% rename from examples/conf/augmentation.config rename to examples/aug_conf/augmentation.config diff --git a/examples/conf/augmentation.config.example b/examples/aug_conf/augmentation.config.example 
similarity index 100% rename from examples/conf/augmentation.config.example rename to examples/aug_conf/augmentation.config.example diff --git a/models/lm/download_lm_en.sh b/examples/baidu_en8k/download_lm_en.sh similarity index 73% rename from models/lm/download_lm_en.sh rename to examples/baidu_en8k/download_lm_en.sh index cc8d32035..05ea793fb 100644 --- a/models/lm/download_lm_en.sh +++ b/examples/baidu_en8k/download_lm_en.sh @@ -1,11 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm MD5="099a601759d467cd0a8523ff939819c5" -TARGET=./common_crawl_00.prune01111.trie.klm - +TARGET=${DIR}/common_crawl_00.prune01111.trie.klm echo "Download language model ..." download $URL $MD5 $TARGET diff --git a/models/baidu_en8k/download_model.sh b/examples/baidu_en8k/download_model.sh similarity index 73% rename from models/baidu_en8k/download_model.sh rename to examples/baidu_en8k/download_model.sh index bbdb32b61..3fc36b514 100644 --- a/models/baidu_en8k/download_model.sh +++ b/examples/baidu_en8k/download_model.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz' MD5=7e58fbf64aa4ecf639b049792ddcf788 -TARGET=./baidu_en8k_model_fluid.tar.gz +TARGET=${DIR}/baidu_en8k_model_fluid.tar.gz echo "Download BaiduEn8k model ..." 
diff --git a/examples/baidu_en8k/path.sh b/examples/baidu_en8k/path.sh index fd1cebba8..1b150ca40 100644 --- a/examples/baidu_en8k/path.sh +++ b/examples/baidu_en8k/path.sh @@ -6,3 +6,8 @@ export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin \ No newline at end of file diff --git a/examples/baidu_en8k/run_infer_golden.sh b/examples/baidu_en8k/run_infer_golden.sh index 11d7541ee..32916f21f 100644 --- a/examples/baidu_en8k/run_infer_golden.sh +++ b/examples/baidu_en8k/run_infer_golden.sh @@ -3,22 +3,17 @@ source path.sh # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null # download well-trained model -cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null bash download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null - # infer CUDA_VISIBLE_DEVICES=0 \ @@ -37,10 +32,10 @@ python3 -u ${MAIN_ROOT}/infer.py \ --use_gpu=False \ --share_rnn_weights=False \ --infer_manifest="${MAIN_ROOT}/examples/librispeech/data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/baidu_en8k" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ +--mean_std_path="data/pretrain/baidu_en8k/mean_std.npz" \ +--vocab_path="data/pretrain/baidu_en8k/vocab.txt" \ +--model_path="data/pretrain/baidu_en8k" \ +--lang_model_path="data/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method="ctc_beam_search" \ --error_rate_type="wer" \ --specgram_type="linear" diff --git a/examples/baidu_en8k/run_test_golden.sh b/examples/baidu_en8k/run_test_golden.sh index 10c61a096..eb51d8e33 100644 --- a/examples/baidu_en8k/run_test_golden.sh +++ b/examples/baidu_en8k/run_test_golden.sh @@ -3,21 +3,17 @@ source path.sh # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null # download well-trained model -cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null bash download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null # evaluate model @@ -37,11 +33,11 @@ python3 -u ${MAIN_ROOT}/test.py \ --use_gru=True \ --use_gpu=False \ --share_rnn_weights=False \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/baidu_en8k" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ +--test_manifest="${MAIN_ROOT}/examples/librispeech/data/manifest.test-clean" \ +--mean_std_path="data/pretrain/baidu_en8k/mean_std.npz" \ +--vocab_path="data/pretrain/baidu_en8k/vocab.txt" \ +--model_path="data/pretrain/baidu_en8k" \ +--lang_model_path="data/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method="ctc_beam_search" \ --error_rate_type="wer" \ --specgram_type="linear" diff --git a/examples/dataset/aishell/.gitignore b/examples/dataset/aishell/.gitignore new file mode 100644 index 000000000..9c6e517e5 --- /dev/null +++ b/examples/dataset/aishell/.gitignore @@ -0,0 +1 @@ +data_aishell* diff --git a/examples/aishell/local/aishell.py b/examples/dataset/aishell/aishell.py similarity index 98% rename from examples/aishell/local/aishell.py rename to examples/dataset/aishell/aishell.py index ba59b744d..38d0c28a3 100644 --- a/examples/aishell/local/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -24,7 +24,7 @@ import codecs import soundfile import json import argparse -from data_utils.utility import download, unpack +from utils.utility import download, unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') diff --git a/data/noise/chime3_background.py b/examples/dataset/chime3_background/chime3_background.py similarity index 97% rename from data/noise/chime3_background.py rename to examples/dataset/chime3_background/chime3_background.py index 8db09204e..31208d147 100644 --- a/data/noise/chime3_background.py +++ 
b/examples/dataset/chime3_background/chime3_background.py @@ -29,7 +29,8 @@ import json import io from paddle.v2.dataset.common import md5file -DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') +#DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') +DATA_HOME = os.path.expanduser('.') URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ" MD5 = "c3ff512618d7a67d4f85566ea1bc39ec" diff --git a/examples/dataset/librispeech/.gitignore b/examples/dataset/librispeech/.gitignore new file mode 100644 index 000000000..a8d8eb76d --- /dev/null +++ b/examples/dataset/librispeech/.gitignore @@ -0,0 +1,7 @@ +dev-clean/ +dev-other/ +test-clean/ +test-other/ +train-clean-100/ +train-clean-360/ +train-other-500/ diff --git a/examples/librispeech/local/librispeech.py b/examples/dataset/librispeech/librispeech.py similarity index 98% rename from examples/librispeech/local/librispeech.py rename to examples/dataset/librispeech/librispeech.py index ae1bae2de..4cf0f5541 100644 --- a/examples/librispeech/local/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -27,10 +27,10 @@ import soundfile import json import codecs import io -from data_utils.utility import download, unpack +from utils.utility import download, unpack URL_ROOT = "http://www.openslr.org/resources/12" -URL_ROOT = "https://openslr.magicdatatech.com/resources/12" +#URL_ROOT = "https://openslr.magicdatatech.com/resources/12" URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" diff --git a/examples/dataset/mini_librispeech/.gitignore b/examples/dataset/mini_librispeech/.gitignore new file mode 100644 index 000000000..61f54c966 --- /dev/null +++ b/examples/dataset/mini_librispeech/.gitignore @@ -0,0 +1,4 @@ +dev-clean/ +manifest.dev-clean +manifest.train-clean +train-clean/ diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py 
b/examples/dataset/mini_librispeech/mini_librispeech.py new file mode 100644 index 000000000..883a322dc --- /dev/null +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" + +import distutils.util +import os +import sys +import argparse +import soundfile +import json +import codecs +import io +from utils.utility import download, unpack + +URL_ROOT = "http://www.openslr.org/resources/31" +URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean-2.tar.gz" + +MD5_TRAIN_CLEAN = "5df7d4e78065366204ca6845bb08f490" +MD5_DEV_CLEAN = "6d7ab67ac6a1d2c993d050e16d61080d" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default='~/.cache/paddle/dataset/speech/libri', + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. 
(default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(subfolder, text_filelist[0]) + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(subfolder, segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s."
% + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=URL_TRAIN_CLEAN, + md5sum=MD5_TRAIN_CLEAN, + target_dir=os.path.join(args.target_dir, "train-clean"), + manifest_path=args.manifest_prefix + ".train-clean") + prepare_dataset( + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") + + +if __name__ == '__main__': + main() diff --git a/examples/dataset/musan/musan.py b/examples/dataset/musan/musan.py new file mode 100644 index 000000000..0d01057e4 --- /dev/null +++ b/examples/dataset/musan/musan.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs +import soundfile +import json +import argparse +from utils.utility import download, unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'https://www.openslr.org/resources/17' +DATA_URL = URL_ROOT + '/musan.tar.gz' +MD5_DATA = '' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/musan", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': continue + audio_id, text = line.split(' ', 1) + # remove whitespace + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for type in data_types: + del json_lines[:] + audio_dir = os.path.join(data_dir, 'wav', type) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.join(subfolder, fname) + audio_id = fname[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'audio_filepath': audio_path, + 'duration': duration, + 'text': text + }, + ensure_ascii=False)) + manifest_path = manifest_path_prefix + '.'
+ type + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/examples/dataset/rir_noise/rir_noise.py b/examples/dataset/rir_noise/rir_noise.py new file mode 100644 index 000000000..dd2b5c64f --- /dev/null +++ b/examples/dataset/rir_noise/rir_noise.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. 
+Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs +import soundfile +import json +import argparse +from data_utils.utility import download, unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT = 'http://www.openslr.org/resources/28' +DATA_URL = URL_ROOT + '/rirs_noises.zip' +MD5_DATA = '' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." 
% manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': continue + audio_id, text = line.split(' ', 1) + # remove whitespace + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for type in data_types: + del json_lines[:] + audio_dir = os.path.join(data_dir, 'wav', type) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.join(subfolder, fname) + audio_id = fname[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'audio_filepath': audio_path, + 'duration': duration, + 'text': text + }, + ensure_ascii=False)) + manifest_path = manifest_path_prefix + '.' + type + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s."
% + target_dir) + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/data/voxforge/run_data.sh b/examples/dataset/voxforge/run_data.sh similarity index 58% rename from data/voxforge/run_data.sh rename to examples/dataset/voxforge/run_data.sh index 0276744ae..5af9d0cc6 100644 --- a/data/voxforge/run_data.sh +++ b/examples/dataset/voxforge/run_data.sh @@ -1,9 +1,12 @@ #! /usr/bin/env bash +TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge +mkdir -p ${TARGET_DIR} + # download data, generate manifests -PYTHONPATH=../../:$PYTHONPATH python voxforge.py \ ---manifest_prefix='./manifest' \ ---target_dir='./dataset/VoxForge' \ +python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \ +--manifest_prefix="${TARGET_DIR}/manifest" \ +--target_dir="${TARGET_DIR}" \ --is_merge_dialect=True \ --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian' diff --git a/data/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py similarity index 98% rename from data/voxforge/voxforge.py rename to examples/dataset/voxforge/voxforge.py index 3fb0ded88..abf1ccff6 100644 --- a/data/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -27,9 +27,9 @@ import json import argparse import shutil import subprocess -from data_utils.utility import download_multi, unpack, getfile_insensitive +from utils.utility import download_multi, unpack, getfile_insensitive -DATA_HOME = './dataset' +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \ 'Audio/Main/16kHz_16bit' diff --git a/examples/librispeech/.gitignore b/examples/librispeech/.gitignore new file mode 100644 index 000000000..44038ca5b 
--- /dev/null +++ b/examples/librispeech/.gitignore @@ -0,0 +1,2 @@ +data +ckpt* diff --git a/examples/librispeech/conf/deepspeech2.yaml b/examples/librispeech/conf/deepspeech2.yaml index 457a56b2e..9e2e29396 100644 --- a/examples/librispeech/conf/deepspeech2.yaml +++ b/examples/librispeech/conf/deepspeech2.yaml @@ -1,12 +1,12 @@ # https://yaml.org/type/float.html data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev-clean + test_manifest: data/manifest.test-clean mean_std_filepath: data/mean_std.npz vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.config - batch_size: 4 + batch_size: 20 max_duration: 27.0 min_duration: 0.0 specgram_type: linear @@ -26,26 +26,22 @@ model: num_conv_layers: 2 num_rnn_layers: 3 rnn_layer_size: 2048 - use_gru: True + use_gru: False share_rnn_weights: True training: - n_epoch: 20 - lr: 1e-5 + n_epoch: 50 + lr: 5e-4 + lr_decay: 0.83 weight_decay: 1e-06 - global_grad_clip: 400.0 - max_iteration: 500000 - plot_interval: 1000 - save_interval: 1000 - valid_interval: 1000 + global_grad_clip: 5.0 decoding: batch_size: 128 error_rate_type: wer decoding_method: ctc_beam_search - lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm alpha: 2.5 beta: 0.3 beam_size: 500 cutoff_prob: 1.0 cutoff_top_n: 40 num_proc_bsearch: 8 - diff --git a/examples/librispeech/local/data.sh b/examples/librispeech/local/data.sh index cbcad7b8d..ca65d640c 100644 --- a/examples/librispeech/local/data.sh +++ b/examples/librispeech/local/data.sh @@ -1,11 +1,13 @@ #! 
/usr/bin/env bash mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 local/librispeech.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/librispeech" \ +--target_dir="${TARGET_DIR}/librispeech" \ --full_download="True" if [ $? -ne 0 ]; then @@ -15,9 +17,8 @@ fi cat data/manifest.train-* | shuf > data/manifest.train - # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths="data/manifest.train" @@ -27,9 +28,8 @@ if [ $? -ne 0 ]; then exit 1 fi - # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train" \ --num_samples=2000 \ --specgram_type="linear" \ @@ -40,6 +40,5 @@ if [ $? -ne 0 ]; then exit 1 fi - echo "LibriSpeech Data preparation done." exit 0 diff --git a/examples/librispeech/local/download_lm_en.sh b/examples/librispeech/local/download_lm_en.sh new file mode 100644 index 000000000..05ea793fb --- /dev/null +++ b/examples/librispeech/local/download_lm_en.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} + +URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm +MD5="099a601759d467cd0a8523ff939819c5" +TARGET=${DIR}/common_crawl_00.prune01111.trie.klm + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download the language model!" 
+ exit 1 +fi + + +exit 0 diff --git a/models/librispeech/download_model.sh b/examples/librispeech/local/download_model.sh similarity index 68% rename from models/librispeech/download_model.sh rename to examples/librispeech/local/download_model.sh index edf853054..f13bde0f2 100644 --- a/models/librispeech/download_model.sh +++ b/examples/librispeech/local/download_model.sh @@ -1,10 +1,13 @@ #! /usr/bin/env bash -. ../../utils/utility.sh +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz' MD5=fafb11fe57c3ecd107147056453f5348 -TARGET=./librispeech_model_fluid.tar.gz +TARGET=${DIR}/librispeech_model_fluid.tar.gz echo "Download LibriSpeech model ..." @@ -13,7 +16,6 @@ if [ $? -ne 0 ]; then echo "Fail to download LibriSpeech model!" exit 1 fi -tar -zxvf $TARGET - +tar -zxvf $TARGET -C ${DIR} exit 0 diff --git a/examples/librispeech/local/infer.sh b/examples/librispeech/local/infer.sh index 33959b381..4b6a0b01f 100644 --- a/examples/librispeech/local/infer.sh +++ b/examples/librispeech/local/infer.sh @@ -1,43 +1,21 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null +python3 -u ${BIN_DIR}/infer.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt -# infer -CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/infer.py \ ---num_samples=10 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---infer_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" if [ $? -ne 0 ]; then echo "Failed in inference!" exit 1 fi - exit 0 diff --git a/examples/librispeech/local/infer_golden.sh b/examples/librispeech/local/infer_golden.sh index 21663681b..d17b4328d 100644 --- a/examples/librispeech/local/infer_golden.sh +++ b/examples/librispeech/local/infer_golden.sh @@ -1,22 +1,16 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # infer CUDA_VISIBLE_DEVICES=0 \ diff --git a/examples/librispeech/local/test.sh b/examples/librispeech/local/test.sh index cd8c07542..f39fbaef1 100644 --- a/examples/librispeech/local/test.sh +++ b/examples/librispeech/local/test.sh @@ -1,38 +1,17 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null +python3 -u ${BIN_DIR}/test.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ +--output ckpt -# evaluate model -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ ---batch_size=128 \ ---beam_size=500 \ ---num_proc_bsearch=8 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---alpha=2.5 \ ---beta=0.3 \ ---cutoff_prob=1.0 \ ---cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---test_manifest="data/manifest.test-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="checkpoints/step_final" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---decoding_method="ctc_beam_search" \ ---error_rate_type="wer" \ ---specgram_type="linear" if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/local/test_golden.sh b/examples/librispeech/local/test_golden.sh index 54ec6ad03..d6b1bc8e9 100644 --- a/examples/librispeech/local/test_golden.sh +++ b/examples/librispeech/local/test_golden.sh @@ -1,26 +1,21 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null # evaluate model CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/test.py \ +python3 -u $MAIN_ROOT/test.py \ --batch_size=128 \ --beam_size=500 \ --num_proc_bsearch=8 \ @@ -35,10 +30,10 @@ python3 -u ${MAIN_ROOT}/test.py \ --use_gpu=True \ --share_rnn_weights=True \ --test_manifest="data/manifest.test-clean" \ ---mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ +--mean_std_path="$MAIN_ROOT/models/librispeech/mean_std.npz" \ +--vocab_path="$MAIN_ROOT/models/librispeech/vocab.txt" \ +--model_path="$MAIN_ROOT/models/librispeech" \ +--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method="ctc_beam_search" \ --error_rate_type="wer" \ --specgram_type="linear" diff --git a/examples/librispeech/local/train.sh b/examples/librispeech/local/train.sh index 32aa2657b..59a94181b 100644 --- a/examples/librispeech/local/train.sh +++ b/examples/librispeech/local/train.sh @@ -1,36 +1,15 @@ #! 
/usr/bin/env bash -# train model -# if you wish to resume from an exists model, uncomment --init_from_pretrained_model export FLAGS_sync_nccl_allreduce=0 -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u ${MAIN_ROOT}/train.py \ ---batch_size=20 \ ---num_epoch=50 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ ---num_iter_print=100 \ ---save_epoch=1 \ ---num_samples=280000 \ ---learning_rate=5e-4 \ ---max_duration=27.0 \ ---min_duration=0.0 \ ---test_off=False \ ---use_sortagrad=True \ ---use_gru=False \ ---use_gpu=True \ ---is_local=True \ ---share_rnn_weights=True \ ---train_manifest="data/manifest.train" \ ---dev_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---output_model_dir="./checkpoints/libri" \ ---augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \ ---specgram_type="linear" \ ---shuffle_method="batch_shuffle_clipped" \ +ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') +echo "using $ngpu gpus..." + +python3 -u ${BIN_DIR}/train.py \ +--device 'gpu' \ +--nproc ${ngpu} \ +--config conf/deepspeech2.yaml \ +--output ckpt if [ $? -ne 0 ]; then echo "Failed in training!" diff --git a/examples/librispeech/local/tune.sh b/examples/librispeech/local/tune.sh index 848f0b8f9..4bb81d29b 100644 --- a/examples/librispeech/local/tune.sh +++ b/examples/librispeech/local/tune.sh @@ -1,15 +1,19 @@ #! 
/usr/bin/env bash +if [ $# != 1 ];then + echo "usage: tune ckpt_path" + exit 1 +fi + # grid-search for hyper-parameters in language model -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u ${MAIN_ROOT}tools/tune.py \ +python3 -u ${BIN_DIR}/tune.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ --num_batches=-1 \ --batch_size=128 \ --beam_size=500 \ --num_proc_bsearch=12 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ --num_alphas=45 \ --num_betas=8 \ --alpha_from=1.0 \ @@ -18,16 +22,7 @@ python3 -u ${MAIN_ROOT}tools/tune.py \ --beta_to=0.45 \ --cutoff_prob=1.0 \ --cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---tune_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ ---model_path="${MAIN_ROOT}/models/librispeech" \ ---lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ ---error_rate_type="wer" \ ---specgram_type="linear" +--checkpoint_path ${1} if [ $? -ne 0 ]; then echo "Failed in tuning!" 
diff --git a/examples/librispeech/models b/examples/librispeech/models deleted file mode 120000 index 9e68e9945..000000000 --- a/examples/librispeech/models +++ /dev/null @@ -1 +0,0 @@ -../../models \ No newline at end of file diff --git a/examples/librispeech/path.sh b/examples/librispeech/path.sh index a55139e11..a179631b3 100644 --- a/examples/librispeech/path.sh +++ b/examples/librispeech/path.sh @@ -8,3 +8,7 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/librispeech/run.sh b/examples/librispeech/run.sh index c5f66ae1d..ff87d38bf 100644 --- a/examples/librispeech/run.sh +++ b/examples/librispeech/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e source path.sh @@ -6,13 +7,10 @@ source path.sh bash ./local/data.sh # train model -bash ./local/train.sh +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./local/train.sh # test model -bash ./local/test.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh # infer model -bash ./local/infer.sh - -# tune model -#bash ./local/tune.sh +CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh diff --git a/examples/tiny/.gitignore b/examples/tiny/.gitignore new file mode 100644 index 000000000..073c3b9eb --- /dev/null +++ b/examples/tiny/.gitignore @@ -0,0 +1,2 @@ +ckpt* +data diff --git a/examples/tiny/conf/deepspeech2.yaml b/examples/tiny/conf/deepspeech2.yaml index dc7d59d47..c7dd83f3c 100644 --- a/examples/tiny/conf/deepspeech2.yaml +++ b/examples/tiny/conf/deepspeech2.yaml @@ -34,19 +34,14 @@ training: lr_decay: 1.0 weight_decay: 1e-06 global_grad_clip: 5.0 - max_iteration: 500000 - plot_interval: 1000 - save_interval: 1000 - valid_interval: 1000 decoding: batch_size: 128 error_rate_type: wer decoding_method: ctc_beam_search - lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm alpha: 2.5 
beta: 0.3 beam_size: 500 cutoff_prob: 1.0 cutoff_top_n: 40 num_proc_bsearch: 8 - diff --git a/examples/tiny/local/data.sh b/examples/tiny/local/data.sh index 3ad387dbc..d834ec677 100644 --- a/examples/tiny/local/data.sh +++ b/examples/tiny/local/data.sh @@ -1,14 +1,13 @@ #! /usr/bin/env bash -# prepare folder -if [ ! -e data ]; then - mkdir data -fi +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 ../librispeech/local/librispeech.py \ +PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \ --manifest_prefix="data/manifest" \ ---target_dir="${MAIN_ROOT}/dataset/librispeech" \ +--target_dir="${TARGET_DIR}/librispeech" \ --full_download="False" if [ $? -ne 0 ]; then @@ -19,7 +18,7 @@ fi head -n 64 data/manifest.dev-clean > data/manifest.tiny # build vocabulary -python3 ${MAIN_ROOT}/tools/build_vocab.py \ +python3 ${MAIN_ROOT}/utils/build_vocab.py \ --count_threshold=0 \ --vocab_path="data/vocab.txt" \ --manifest_paths="data/manifest.tiny" @@ -31,7 +30,7 @@ fi # compute mean and stddev for normalizer -python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny" \ --num_samples=64 \ --specgram_type="linear" \ @@ -42,6 +41,5 @@ if [ $? -ne 0 ]; then exit 1 fi - echo "LibriSpeech Data preparation done." exit 0 diff --git a/examples/tiny/local/download_lm_en.sh b/examples/tiny/local/download_lm_en.sh new file mode 100644 index 000000000..05ea793fb --- /dev/null +++ b/examples/tiny/local/download_lm_en.sh @@ -0,0 +1,20 @@ +#! /usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} + +URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm +MD5="099a601759d467cd0a8523ff939819c5" +TARGET=${DIR}/common_crawl_00.prune01111.trie.klm + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? 
-ne 0 ]; then + echo "Fail to download the language model!" + exit 1 +fi + + +exit 0 diff --git a/examples/tiny/local/download_model.sh b/examples/tiny/local/download_model.sh new file mode 100644 index 000000000..f13bde0f2 --- /dev/null +++ b/examples/tiny/local/download_model.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/pretrain +mkdir -p ${DIR} + +URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz' +MD5=fafb11fe57c3ecd107147056453f5348 +TARGET=${DIR}/librispeech_model_fluid.tar.gz + + +echo "Download LibriSpeech model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download LibriSpeech model!" + exit 1 +fi +tar -zxvf $TARGET -C ${DIR} + +exit 0 diff --git a/examples/tiny/local/infer.sh b/examples/tiny/local/infer.sh index 74e8982d2..3aff6b78b 100644 --- a/examples/tiny/local/infer.sh +++ b/examples/tiny/local/infer.sh @@ -1,15 +1,13 @@ #! /usr/bin/env bash # download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u ${MAIN_ROOT}/infer.py \ +CUDA_VISIBLE_DEVICES=0 \ +python3 -u ${BIN_DIR}/infer.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ diff --git a/examples/tiny/local/infer_golden.sh b/examples/tiny/local/infer_golden.sh index 21663681b..d17b4328d 100644 --- a/examples/tiny/local/infer_golden.sh +++ b/examples/tiny/local/infer_golden.sh @@ -1,22 +1,16 @@ #! /usr/bin/env bash # download language model -cd ${MAIN_ROOT}/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd ${MAIN_ROOT}/models/librispeech > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? 
-ne 0 ]; then exit 1 fi -cd - > /dev/null - # infer CUDA_VISIBLE_DEVICES=0 \ diff --git a/examples/tiny/local/test.sh b/examples/tiny/local/test.sh index cfedd1ca8..fedebf96d 100644 --- a/examples/tiny/local/test.sh +++ b/examples/tiny/local/test.sh @@ -1,15 +1,13 @@ #! /usr/bin/env bash # download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/test.py \ +python3 -u ${BIN_DIR}/test.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ diff --git a/examples/tiny/local/test_golden.sh b/examples/tiny/local/test_golden.sh index 9983fade8..d6b1bc8e9 100644 --- a/examples/tiny/local/test_golden.sh +++ b/examples/tiny/local/test_golden.sh @@ -1,21 +1,16 @@ #! /usr/bin/env bash # download language model -cd $MAIN_ROOT/models/lm > /dev/null -bash download_lm_en.sh +bash local/download_lm_en.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null - # download well-trained model -cd $MAIN_ROOT/models/librispeech > /dev/null -bash download_model.sh +bash local/download_model.sh if [ $? -ne 0 ]; then exit 1 fi -cd - > /dev/null # evaluate model diff --git a/examples/tiny/local/train.sh b/examples/tiny/local/train.sh index dfd229172..369ccc924 100644 --- a/examples/tiny/local/train.sh +++ b/examples/tiny/local/train.sh @@ -2,9 +2,8 @@ export FLAGS_sync_nccl_allreduce=0 -#CUDA_VISIBLE_DEVICES=0,1,2,3 \ CUDA_VISIBLE_DEVICES=0 \ -python3 -u ${MAIN_ROOT}/train.py \ +python3 -u ${BIN_DIR}/train.py \ --device 'gpu' \ --nproc 1 \ --config conf/deepspeech2.yaml \ diff --git a/examples/tiny/local/tune.sh b/examples/tiny/local/tune.sh index b5cc4d6a1..4bb81d29b 100644 --- a/examples/tiny/local/tune.sh +++ b/examples/tiny/local/tune.sh @@ -1,15 +1,19 @@ #! 
/usr/bin/env bash +if [ $# != 1 ];then + echo "usage: tune ckpt_path" + exit 1 +fi + # grid-search for hyper-parameters in language model -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u $MAIN_ROOT/tools/tune.py \ +python3 -u ${BIN_DIR}/tune.py \ +--device 'gpu' \ +--nproc 1 \ +--config conf/deepspeech2.yaml \ --num_batches=-1 \ --batch_size=128 \ --beam_size=500 \ --num_proc_bsearch=12 \ ---num_conv_layers=2 \ ---num_rnn_layers=3 \ ---rnn_layer_size=2048 \ --num_alphas=45 \ --num_betas=8 \ --alpha_from=1.0 \ @@ -18,16 +22,7 @@ python3 -u $MAIN_ROOT/tools/tune.py \ --beta_to=0.45 \ --cutoff_prob=1.0 \ --cutoff_top_n=40 \ ---use_gru=False \ ---use_gpu=True \ ---share_rnn_weights=True \ ---tune_manifest="data/manifest.dev-clean" \ ---mean_std_path="data/mean_std.npz" \ ---vocab_path="data/vocab.txt" \ ---model_path="$MAIN_ROOT/models/librispeech" \ ---lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ ---error_rate_type="wer" \ ---specgram_type="linear" +--checkpoint_path ${1} if [ $? -ne 0 ]; then echo "Failed in tuning!" 
diff --git a/examples/tiny/models b/examples/tiny/models deleted file mode 120000 index 150d99d4d..000000000 --- a/examples/tiny/models +++ /dev/null @@ -1 +0,0 @@ -../../models/ \ No newline at end of file diff --git a/examples/tiny/path.sh b/examples/tiny/path.sh index a55139e11..a179631b3 100644 --- a/examples/tiny/path.sh +++ b/examples/tiny/path.sh @@ -8,3 +8,7 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin diff --git a/examples/tiny/run.sh b/examples/tiny/run.sh index 01ad06516..2b5ed5308 100644 --- a/examples/tiny/run.sh +++ b/examples/tiny/run.sh @@ -14,6 +14,3 @@ bash ./local/test.sh # infer model bash ./local/infer.sh - -## tune model -#bash ./local/tune.sh diff --git a/dataloader.ipynb b/notebook/dataloader.ipynb similarity index 100% rename from dataloader.ipynb rename to notebook/dataloader.ipynb diff --git a/train_test.ipynb b/notebook/train_test.ipynb similarity index 100% rename from train_test.ipynb rename to notebook/train_test.ipynb diff --git a/setup.sh b/setup.sh index 115ddcdc1..8d3a0994e 100644 --- a/setup.sh +++ b/setup.sh @@ -38,7 +38,7 @@ fi # install decoders python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" if [ $? != 0 ]; then - cd decoders/swig > /dev/null + cd deepspeech/decoders/swig > /dev/null sh setup.sh cd - > /dev/null fi diff --git a/tests/network_test.py b/tests/network_test.py index 7e35c05cc..7e8d62c2b 100644 --- a/tests/network_test.py +++ b/tests/network_test.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from model_utils.network import DeepSpeech2 import paddle import numpy as np +from deepspeech.models.network import DeepSpeech2 + if __name__ == '__main__': batch_size = 2 diff --git a/utils/tests/test_error_rate.py b/tests/test_error_rate.py similarity index 99% rename from utils/tests/test_error_rate.py rename to tests/test_error_rate.py index 80c5b192a..646d5739f 100644 --- a/utils/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -14,7 +14,7 @@ """Test error rate.""" import unittest -from utils import error_rate +from deepspeech.utils import error_rate class TestParse(unittest.TestCase): diff --git a/tools/build_vocab.py b/utils/build_vocab.py similarity index 92% rename from tools/build_vocab.py rename to utils/build_vocab.py index 5dc6f35bb..cb17de57c 100644 --- a/tools/build_vocab.py +++ b/utils/build_vocab.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Build vocabulary from manifest files. - Each item in vocabulary file is a character. 
""" @@ -22,15 +21,16 @@ import codecs import json from collections import Counter import os.path -from data_utils.utility import read_manifest -from utils.utility import add_arguments, print_arguments + +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") add_arg('vocab_path', str, - 'data/librispeech/vocab.txt', + 'examples/librispeech/data/vocab.txt', "Filepath to write the vocabulary.") add_arg('manifest_paths', str, None, diff --git a/tools/compute_mean_std.py b/utils/compute_mean_std.py similarity index 87% rename from tools/compute_mean_std.py rename to utils/compute_mean_std.py index e0245fc5b..80fe88813 100644 --- a/tools/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -15,10 +15,10 @@ import argparse import functools -from data_utils.normalizer import FeatureNormalizer -from data_utils.augmentor.augmentation import AugmentationPipeline -from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from utils.utility import add_arguments, print_arguments +from deepspeech.frontend.normalizer import FeatureNormalizer +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer +from deepspeech.utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tools/profile.sh b/utils/profile.sh similarity index 100% rename from tools/profile.sh rename to utils/profile.sh diff --git a/utils/utility.py b/utils/utility.py index cd7166593..1d3be04d4 100644 --- a/utils/utility.py +++ b/utils/utility.py @@ -11,47 +11,51 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Contains common utility functions.""" -import distutils.util - - -def print_arguments(args): - """Print argparse's arguments. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() - print_arguments(args) - - :param args: Input argparse.Namespace for printing. - :type args: argparse.Namespace - """ - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - -def add_arguments(argname, type, default, help, argparser, **kwargs): - """Add argparse's argument. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - add_argument("name", str, "Jonh", "User name.", parser) - args = parser.parse_args() - """ - type = distutils.util.strtobool if type == bool else type - argparser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) +import os +import tarfile +from paddle.dataset.common import md5file + + +def getfile_insensitive(path): + """Get the actual file path when given insensitive filename.""" + directory, filename = os.path.split(path) + directory, filename = (directory or '.'), filename.lower() + for f in os.listdir(directory): + newpath = os.path.join(directory, f) + if os.path.isfile(newpath) and f.lower() == filename: + return newpath + + +def download_multi(url, target_dir, extra_args): + """Download multiple files from url to target_dir.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + print("Downloading %s ..." 
% url) + ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " + + target_dir) + return ret_code + + +def download(url, md5sum, target_dir): + """Download file from url to target_dir, and check md5sum.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + filepath = os.path.join(target_dir, url.split("/")[-1]) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + os.system("wget -c " + url + " -P " + target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir, rm_tar=False): + """Unpack the file to the target_dir.""" + print("Unpacking %s ..." % filepath) + tar = tarfile.open(filepath) + tar.extractall(target_dir) + tar.close() + if rm_tar == True: + os.remove(filepath)