refactor repo

fix decoding
pull/538/head
Hui Zhang 5 years ago
parent 49d55a865c
commit 45f73c507c

.gitignore vendored

@ -1,5 +1,4 @@
.DS_Store
*.pyc
tools/venv
dataset
models/*
.vscode

@ -20,12 +20,13 @@ import functools
from paddle import distributed as dist
from utils.utility import print_arguments
from training.cli import default_argument_parser
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.error_rate import char_errors, word_errors
from model_utils.config import get_cfg_defaults
from model_utils.model import DeepSpeech2Tester as Tester
from utils.error_rate import char_errors, word_errors
# TODO(hui zhang): dynamic load
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
def main_sp(config, args):

@ -20,12 +20,12 @@ import functools
from paddle import distributed as dist
from utils.utility import print_arguments
from training.cli import default_argument_parser
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.error_rate import char_errors, word_errors
from model_utils.config import get_cfg_defaults
from model_utils.model import DeepSpeech2Tester as Tester
from utils.error_rate import char_errors, word_errors
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
def main_sp(config, args):

@ -20,11 +20,11 @@ import functools
from paddle import distributed as dist
from utils.utility import print_arguments
from training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
from deepspeech.training.cli import default_argument_parser
from model_utils.config import get_cfg_defaults
from model_utils.model import DeepSpeech2Trainer as Trainer
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
def main_sp(config, args):

@ -20,22 +20,21 @@ import argparse
import functools
import gzip
import logging
import paddle.fluid as fluid
from training.cli import default_argument_parser
from model_utils.config import get_cfg_defaults
from data_utils.dataset import SpeechCollator
from data_utils.dataset import DeepSpeech2Dataset
from data_utils.dataset import DeepSpeech2DistributedBatchSampler
from data_utils.dataset import DeepSpeech2BatchSampler
from paddle.io import DataLoader
from model_utils.network import DeepSpeech2
from model_utils.network import DeepSpeech2Loss
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.error_rate import char_errors, word_errors
from deepspeech.utils.utility import add_arguments, print_arguments
from deepspeech.models.network import DeepSpeech2
from deepspeech.models.network import DeepSpeech2Loss
from utils.error_rate import char_errors, word_errors
from utils.utility import add_arguments, print_arguments
from deepspeech.exps.deepspeech2.dataset import SpeechCollator
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2Dataset
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2DistributedBatchSampler
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2BatchSampler
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
def tune(config, args):
@ -114,7 +113,7 @@ def tune(config, args):
return trans
audio, text, audio_len, text_len = infer_data
_, probs, _ = model.predict(audio, audio_len)
_, probs, logits_lens = model.predict(audio, audio_len)
target_transcripts = ordid2token(text, text_len)
num_ins += audio.shape[0]
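For context, a minimal sketch of what a helper like ordid2token does here (a hypothetical reimplementation; it assumes transcripts are padded sequences of ord()-encoded character ids trimmed by text_len):

    import numpy as np

    def ordid2token(texts, texts_len):
        # Trim each padded id sequence to its true length and map the
        # ids back to characters (ord() encoding is an assumption here).
        trans = []
        for text, n in zip(texts, texts_len):
            ids = np.asarray(text)[:int(n)]
            trans.append(''.join(chr(int(i)) for i in ids))
        return trans

    print(ordid2token([[104, 105, 0]], [2]))  # ['hi']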
@ -122,17 +121,17 @@ def tune(config, args):
for index, (alpha, beta) in enumerate(params_grid):
print(f"tuneing: alpha={alpha} beta={beta}")
result_transcripts = model.decode_probs(
probs.numpy(), vocab_list, config.decoding.decoding_method,
probs.numpy(), logits_lens, vocab_list,
config.decoding.decoding_method,
config.decoding.lang_model_path, alpha, beta,
config.decoding.beam_size, config.decoding.cutoff_prob,
config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
for target, result in zip(target_transcripts, result_transcripts):
#print(f"tuneing: {target} {result}")
errors, len_ref = errors_func(target, result)
err_sum[index] += errors
# accumulate the length of references of every batch
# in the first iteration
if args.alpha_from == alpha and args.beta_from == beta:
len_refs += len_ref
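A sketch of the (alpha, beta) grid this loop walks over; the bounds and counts come from CLI flags such as --alpha_from/--alpha_to, so the values below are illustrative only:

    import numpy as np

    # Illustrative bounds; the real ones come from args.alpha_from, args.alpha_to, etc.
    alphas = np.linspace(1.0, 3.2, num=45)
    betas = np.linspace(0.1, 0.45, num=8)
    params_grid = [(alpha, beta) for alpha in alphas for beta in betas]
    print(len(params_grid))  # 360 candidate (alpha, beta) pairs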
@ -148,8 +147,9 @@ def tune(config, args):
min_index = err_ave.index(err_ave_min)
print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
" min [%s] = %f" %
(cur_batch, num_ins, "%.3f" % params_grid[min_index][0], "%.3f" %
params_grid[min_index][1], args.error_rate_type, err_ave_min))
(cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
"%.3f" % params_grid[min_index][1],
config.decoding.error_rate_type, err_ave_min))
cur_batch += 1
# output WER/CER at every (alpha, beta)

@ -56,10 +56,6 @@ _C.training = CN(
lr_decay=1.0, # learning rate decay
weight_decay=1e-6, # the coeff of weight decay
global_grad_clip=5.0, # the global norm clip
plot_interval=1000, # plot attention and spectrogram by step
valid_interval=1000, # validation by step
save_interval=1000, # checkpoint by step
max_iteration=500000, # max iteration to train by step
n_epoch=50, # train epochs
))
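The surviving keys as a standalone yacs node (a sketch assuming the yacs package, which the CN(...) calls above imply); the step-interval keys disappear because training is now epoch-driven:

    from yacs.config import CfgNode as CN

    training = CN(dict(
        lr_decay=1.0,          # learning rate decay
        weight_decay=1e-6,     # coefficient of weight decay
        global_grad_clip=5.0,  # global norm clip
        n_epoch=50,            # train epochs
    ))
    print(training.n_epoch)  # 50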

@ -27,11 +27,11 @@ from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from data_utils.utility import read_manifest
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.normalizer import FeatureNormalizer
logger = logging.getLogger(__name__)

@ -29,26 +29,23 @@ from paddle.io import DataLoader
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import layers
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid import name_scope
from utils import mp_tools
from training import Trainer
from deepspeech.training import Trainer
from deepspeech.utils import mp_tools
from deepspeech.utils.error_rate import char_errors, word_errors, cer, wer
from model_utils.network import DeepSpeech2
from model_utils.network import DeepSpeech2Loss
from deepspeech.models.network import DeepSpeech2
from deepspeech.models.network import DeepSpeech2Loss
from data_utils.dataset import SpeechCollator
from data_utils.dataset import DeepSpeech2Dataset
from data_utils.dataset import DeepSpeech2DistributedBatchSampler
from data_utils.dataset import DeepSpeech2BatchSampler
from deepspeech.decoders.swig_wrapper import Scorer
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
from decoders.swig_wrapper import Scorer
from decoders.swig_wrapper import ctc_greedy_decoder
from decoders.swig_wrapper import ctc_beam_search_decoder_batch
from utils.error_rate import char_errors, word_errors, cer, wer
from deepspeech.exps.deepspeech2.dataset import SpeechCollator
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2Dataset
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2DistributedBatchSampler
from deepspeech.exps.deepspeech2.dataset import DeepSpeech2BatchSampler
logger = logging.getLogger(__name__)
@ -161,46 +158,6 @@ class DeepSpeech2Trainer(Trainer):
self.visualizer.add_scalar("train/{}".format(k), v,
self.iteration)
def new_epoch(self):
"""Reset the train loader and increment ``epoch``.
"""
if self.parallel:
# batch sampler epoch start from 0
self.train_loader.batch_sampler.set_epoch(self.epoch)
self.epoch += 1
def train(self):
"""The training process.
It includes forward/backward/update and periodical validation and
saving.
"""
self.logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
self.new_epoch()
while self.epoch <= self.config.training.n_epoch:
try:
for batch in self.train_loader:
self.iteration += 1
self.train_batch(batch)
# if self.iteration % self.config.training.valid_interval == 0:
# self.valid()
# if self.iteration % self.config.training.save_interval == 0:
# self.save()
except Exception as e:
self.logger.error(e)
pass
self.valid()
self.save()
self.lr_scheduler.step()
self.new_epoch()
def compute_metrics(self, inputs, outputs):
pass
@mp_tools.rank_zero_only
@paddle.no_grad()
def valid(self):
@ -212,7 +169,7 @@ class DeepSpeech2Trainer(Trainer):
audio, text, audio_len, text_len = batch
outputs = self.model(*batch)
loss = self.compute_losses(batch, outputs)
metrics = self.compute_metrics(batch, outputs)
#metrics = self.compute_metrics(batch, outputs)
valid_losses['val_loss'].append(float(loss))
valid_losses['val_loss_div_batchsize'].append(
@ -373,6 +330,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
target_transcripts = self.ordid2token(texts, texts_len)
result_transcripts = self.model.decode_probs(
probs.numpy(),
logits_len,
vocab_list,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
@ -446,15 +404,37 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
"""
# output dir
if self.args.output:
output_dir = Path(self.args.output).expanduser() / "infer"
output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
else:
output_dir = Path(
self.args.checkpoint_path).expanduser().parent.parent / "infer"
self.args.checkpoint_path).expanduser().parent.parent
output_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = output_dir
def setup_logger(self):
"""Initialize a text logger to log the experiment.
Each process has its own text logger. The logging message is written to
the standard output and a text file named ``worker_n.log`` in the
output directory, where ``n`` means the rank of the process.
"""
format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
formatter = logging.Formatter(fmt=format, datefmt='%Y/%m/%d %H:%M:%S')
logger.setLevel("INFO")
# global logger
stdout = True
save_path = ""
logging.basicConfig(
level=logging.DEBUG if stdout else logging.INFO,
format=format,
datefmt='%Y/%m/%d %H:%M:%S',
filename=save_path if not stdout else None)
self.logger = logger
def setup(self):
"""Setup the experiment.
"""
@ -463,6 +443,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.init_parallel()
self.setup_output_dir()
self.setup_checkpointer()
self.setup_logger()
self.setup_dataloader()

@ -15,13 +15,13 @@
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.online_bayesian_normalization import \
from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
from deepspeech.frontend.augmentor.resample import ResampleAugmentor
from deepspeech.frontend.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor

@ -13,9 +13,9 @@
# limitations under the License.
"""Contains the impulse response augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.audio import AudioSegment
class ImpulseResponseAugmentor(AugmentorBase):

@ -13,9 +13,9 @@
# limitations under the License.
"""Contains the noise perturb augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.audio import AudioSegment
class NoisePerturbAugmentor(AugmentorBase):

@ -13,7 +13,7 @@
# limitations under the License.
"""Contain the online bayesian normalization augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from deepspeech.frontend.augmentor.base import AugmentorBase
class OnlineBayesianNormalizationAugmentor(AugmentorBase):

@ -13,7 +13,7 @@
# limitations under the License.
"""Contain the resample augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from deepspeech.frontend.augmentor.base import AugmentorBase
class ResampleAugmentor(AugmentorBase):

@ -13,7 +13,7 @@
# limitations under the License.
"""Contains the volume perturb augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from deepspeech.frontend.augmentor.base import AugmentorBase
class ShiftPerturbAugmentor(AugmentorBase):

@ -13,7 +13,7 @@
# limitations under the License.
"""Contain the speech perturbation augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from deepspeech.frontend.augmentor.base import AugmentorBase
class SpeedPerturbAugmentor(AugmentorBase):

@ -13,7 +13,7 @@
# limitations under the License.
"""Contains the volume perturb augmentation model."""
from data_utils.augmentor.base import AugmentorBase
from deepspeech.frontend.augmentor.base import AugmentorBase
class VolumePerturbAugmentor(AugmentorBase):

@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from training.trainer import *

@ -14,8 +14,8 @@
"""Contains the audio featurizer class."""
import numpy as np
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.audio import AudioSegment
from python_speech_features import mfcc
from python_speech_features import delta

@ -13,8 +13,8 @@
# limitations under the License.
"""Contains the speech featurizer class."""
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer(object):

@ -15,8 +15,8 @@
import numpy as np
import random
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.audio import AudioSegment
class FeatureNormalizer(object):

@ -14,28 +14,33 @@
"""Contains the speech segment class."""
import numpy as np
from data_utils.audio import AudioSegment
from deepspeech.frontend.audio import AudioSegment
class SpeechSegment(AudioSegment):
"""Speech segment abstraction, a subclass of AudioSegment,
with an additional transcript.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:param transcript: Transcript text for the speech.
:type transcript: str
:raises TypeError: If the sample data type is not float or int.
"""Speech Segment with Text
Args:
AudioSegment (AudioSegment): Audio Segment
"""
def __init__(self, samples, sample_rate, transcript):
"""Speech segment abstraction, a subclass of AudioSegment,
with an additional transcript.
Args:
samples (ndarray.float32): Audio samples [num_samples x num_channels].
sample_rate (int): Audio sample rate.
transcript (str): Transcript text for the speech.
"""
AudioSegment.__init__(self, samples, sample_rate)
self._transcript = transcript
def __eq__(self, other):
"""Return whether two objects are equal.
Returns:
bool: True, when equal to other
"""
if not AudioSegment.__eq__(self, other):
return False
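A hedged usage sketch of SpeechSegment under the new import path (assumes the deepspeech package is importable; sample values are illustrative):

    import numpy as np
    from deepspeech.frontend.speech import SpeechSegment

    samples = np.zeros(16000, dtype='float32')  # one second of silence at 16 kHz
    seg = SpeechSegment(samples, sample_rate=16000, transcript="hello")
    same = SpeechSegment(samples, sample_rate=16000, transcript="hello")
    print(seg == same)  # __eq__ first defers to AudioSegment, then presumably compares transcripts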

@ -20,6 +20,7 @@ import tarfile
import time
from threading import Thread
from multiprocessing import Process, Manager, Value
from paddle.dataset.common import md5file
@ -49,51 +50,3 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
json_data["duration"] >= min_duration):
manifest.append(json_data)
return manifest
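For reference, read_manifest consumes one JSON object per line and keeps entries whose duration falls inside [min_duration, max_duration]; a minimal sketch with an illustrative manifest line:

    import json

    line = '{"audio_filepath": "a.wav", "duration": 3.2, "text": "hello"}'
    json_data = json.loads(line)
    max_duration, min_duration = float('inf'), 0.0
    if min_duration <= json_data["duration"] <= max_duration:
        print("kept:", json_data["audio_filepath"])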
def getfile_insensitive(path):
"""Get the actual file path when given insensitive filename."""
directory, filename = os.path.split(path)
directory, filename = (directory or '.'), filename.lower()
for f in os.listdir(directory):
newpath = os.path.join(directory, f)
if os.path.isfile(newpath) and f.lower() == filename:
return newpath
def download_multi(url, target_dir, extra_args):
"""Download multiple files from url to target_dir."""
if not os.path.exists(target_dir): os.makedirs(target_dir)
print("Downloading %s ..." % url)
ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
target_dir)
return ret_code
def download(url, md5sum, target_dir):
"""Download file from url to target_dir, and check md5sum."""
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1])
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
os.system("wget -c " + url + " -P " + target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
def unpack(filepath, target_dir, rm_tar=False):
"""Unpack the file to the target_dir."""
print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
if rm_tar == True:
os.remove(filepath)
class XmapEndSignal():
pass

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -22,11 +22,10 @@ from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from utils import checkpoint
from decoders.swig_wrapper import Scorer
from decoders.swig_wrapper import ctc_greedy_decoder
from decoders.swig_wrapper import ctc_beam_search_decoder_batch
from deepspeech.utils import checkpoint
from deepspeech.decoders.swig_wrapper import Scorer
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
logger = logging.getLogger(__name__)
@ -661,16 +660,19 @@ class DeepSpeech2(nn.Layer):
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
def decode_probs(self, probs, vocab_list, decoding_method, lang_model_path,
beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes):
""" probs: activation after softmax """
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size,
cutoff_prob, cutoff_top_n, num_processes):
""" probs: activation after softmax
logits_lens: audio output lengths
"""
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
if decoding_method == "ctc_greedy":
result_transcripts = self._decode_batch_greedy(
probs_split=probs, vocab_list=vocab_list)
probs_split=probs_split, vocab_list=vocab_list)
elif decoding_method == "ctc_beam_search":
result_transcripts = self._decode_batch_beam_search(
probs_split=probs,
probs_split=probs_split,
beam_alpha=beam_alpha,
beam_beta=beam_beta,
beam_size=beam_size,
@ -686,12 +688,11 @@ class DeepSpeech2(nn.Layer):
def decode(self, audio, audio_len, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes):
_, probs, audio_lens = self.predict(audio, audio_len)
probs_split = [probs[i, :l, :] for i, l in enumerate(audio_lens)]
return self.decode_probs(probs_split, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n,
num_processes)
_, probs, logits_lens = self.predict(audio, audio_len)
return self.decode_probs(probs.numpy(), logits_lens, vocab_list,
decoding_method, lang_model_path, beam_alpha,
beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes)
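The crux of the decoding fix: probs is a padded batch, so each utterance must be sliced to its true number of output frames before CTC decoding. A minimal numpy sketch (shapes illustrative):

    import numpy as np

    batch, t_max, vocab = 2, 5, 4
    probs = np.random.rand(batch, t_max, vocab).astype('float32')  # padded softmax output
    logits_lens = [3, 5]  # valid frame counts per utterance, from model.predict

    probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
    assert [p.shape[0] for p in probs_split] == logits_lens  # padding frames dropped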
def from_pretrained(self, checkpoint_path):
"""Build a model from a pretrained model.

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from deepspeech.training.trainer import *

@ -59,7 +59,8 @@ def default_argument_parser():
parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
# overwrite extra config and default config
parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
#parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
parser.add_argument("--opts", type=str, default=[], nargs='+', help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
# yapf: enable
return parser
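A hedged sketch of how the new --opts value plugs into a yacs config like the one get_cfg_defaults() returns; CfgNode.merge_from_list is the standard yacs call for KEY VALUE pairs:

    from yacs.config import CfgNode as CN

    config = CN(dict(decoding=CN(dict(beam_size=300, cutoff_prob=1.0))))
    opts = ["decoding.beam_size", "500", "decoding.cutoff_prob", "0.99"]  # KEY VALUE pairs
    config.merge_from_list(opts)
    print(config.decoding.beam_size, config.decoding.cutoff_prob)  # 500 0.99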

@ -24,8 +24,8 @@ from paddle import distributed as dist
from paddle.distributed.utils import get_gpus
from tensorboardX import SummaryWriter
from utils import checkpoint
from utils import mp_tools
from deepspeech.utils import checkpoint
from deepspeech.utils import mp_tools
__all__ = ["Trainer"]
@ -148,20 +148,6 @@ class Trainer():
checkpoint_path=self.args.checkpoint_path)
self.iteration = iteration
def read_batch(self):
"""Read a batch from the train_loader.
Returns
-------
List[Tensor]
A batch.
"""
try:
batch = next(self.iterator)
except StopIteration:
self.new_epoch()
batch = next(self.iterator)
return batch
def new_epoch(self):
"""Reset the train loader and increment ``epoch``.
"""
@ -169,7 +155,6 @@ class Trainer():
# batch sampler epoch start from 0
self.train_loader.batch_sampler.set_epoch(self.epoch)
self.epoch += 1
self.iterator = iter(self.train_loader)
def train(self):
"""The training process.
@ -177,16 +162,22 @@ class Trainer():
It includes forward/backward/update and periodical validation and
saving.
"""
self.logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
self.new_epoch()
while self.iteration < self.config.training.max_iteration:
self.iteration += 1
self.train_batch()
if self.iteration % self.config.training.valid_interval == 0:
self.valid()
if self.iteration % self.config.training.save_interval == 0:
self.save()
while self.epoch <= self.config.training.n_epoch:
try:
for batch in self.train_loader:
self.iteration += 1
self.train_batch(batch)
except Exception as e:
self.logger.error(e)
pass
self.valid()
self.save()
self.lr_scheduler.step()
self.new_epoch()
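Why new_epoch() calls set_epoch: a DistributedBatchSampler only reshuffles per epoch when told the epoch number. A minimal standalone sketch, assuming a working paddle install:

    import paddle
    from paddle.io import Dataset, DistributedBatchSampler

    class ToySet(Dataset):
        def __len__(self):
            return 8
        def __getitem__(self, idx):
            return paddle.to_tensor([idx])

    sampler = DistributedBatchSampler(ToySet(), batch_size=2, shuffle=True)
    for epoch in range(2):          # batch sampler epoch starts from 0
        sampler.set_epoch(epoch)    # reseeds the shuffle for this epoch
        order = [batch for batch in sampler]  # lists of dataset indices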
def run(self):
"""The routine of the experiment after setup. This method is intended

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -16,15 +16,15 @@ import os
import time
import logging
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from utils import mp_tools
from deepspeech.utils import mp_tools
logger = logging.getLogger(__name__)
logger.setLevel("INFO")
__all__ = ["load_parameters", "save_parameters"]

@ -0,0 +1,57 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common utility functions."""
import distutils.util
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
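Mirroring how the repo's scripts consume this helper via functools.partial (a usage sketch; the flag name is illustrative):

    import argparse
    import functools

    parser = argparse.ArgumentParser()
    add_arg = functools.partial(add_arguments, argparser=parser)
    add_arg('use_gpu', bool, True, "Whether to use GPU.")
    args = parser.parse_args(['--use_gpu', 'False'])
    print(args.use_gpu)  # 0, since bool flags go through distutils strtobool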

@ -23,11 +23,12 @@ import struct
import wave
import paddle.fluid as fluid
import numpy as np
import _init_paths
from data_utils.data import DataGenerator
from model_utils.model import DeepSpeech2Model
from data_utils.utility import read_manifest
from utils.utility import add_arguments, print_arguments
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.utility import add_arguments, print_arguments
from deepspeech.exps.deepspeech2.model import DeepSpeech2Model
from deepspeech.exps.deepspeech2.dataset import DataGenerator
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

@ -0,0 +1,2 @@
data
ckpt*

@ -34,18 +34,14 @@ training:
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 5.0
max_iteration: 500000
plot_interval: 1000
save_interval: 1000
valid_interval: 1000
decoding:
batch_size: 10
batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: models/lm/zh_giga.no_cna_cmn.prune01244.klm
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.6
beta: 5.0
beam_size: 300
cutoff_prob: 1.0
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10

@ -2,10 +2,13 @@
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
# download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 local/aishell.py \
PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${MAIN_ROOT}/dataset/aishell"
--target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
@ -14,7 +17,7 @@ fi
# build vocabulary
python3 ${MAIN_ROOT}/tools/build_vocab.py \
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train" "data/manifest.dev"
@ -26,7 +29,7 @@ fi
# compute mean and stddev for normalizer
python3 ${MAIN_ROOT}/tools/compute_mean_std.py \
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train" \
--num_samples=2000 \
--specgram_type="linear" \

@ -1,10 +1,13 @@
#! /usr/bin/env bash
. ../../utils/utility.sh
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/lm
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=./zh_giga.no_cna_cmn.prune01244.klm
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
echo "Download language model ..."

@ -1,10 +1,13 @@
#! /usr/bin/env bash
. ../../utils/utility.sh
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/pretrain
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_fluid.tar.gz'
MD5=2bf0cc8b6d5da2a2a787b5cc36a496b5
TARGET=./aishell_model_fluid.tar.gz
TARGET=${DIR}/aishell_model_fluid.tar.gz
echo "Download Aishell model ..."
@ -13,7 +16,7 @@ if [ $? -ne 0 ]; then
echo "Fail to download Aishell model!"
exit 1
fi
tar -zxvf $TARGET
tar -zxvf $TARGET -C ${DIR}
exit 0

@ -2,14 +2,12 @@
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_ch.sh
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
python3 -u ${MAIN_ROOT}/infer.py \
python3 -u ${BIN_DIR}/infer.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \

@ -1,22 +1,16 @@
#! /usr/bin/env bash
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_ch.sh
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# download well-trained model
cd ${MAIN_ROOT}/models/aishell > /dev/null
bash download_model.sh
bash local/download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
@ -35,10 +29,10 @@ python3 -u ${MAIN_ROOT}/infer.py \
--use_gpu=False \
--share_rnn_weights=False \
--infer_manifest="data/manifest.test" \
--mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \
--vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \
--model_path="${MAIN_ROOT}/models/aishell" \
--lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
--mean_std_path="data/pretrain/mean_std.npz" \
--vocab_path="data/pretrain/vocab.txt" \
--model_path="data/pretrain" \
--lang_model_path="data/lm/zh_giga.no_cna_cmn.prune01244.klm" \
--decoding_method="ctc_beam_search" \
--error_rate_type="cer" \
--specgram_type="linear"

@ -1,19 +1,16 @@
#! /usr/bin/env bash
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_ch.sh
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
python3 -u ${MAIN_ROOT}/test.py \
python3 -u ${BIN_DIR}/test.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--output ckpt
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"

@ -1,47 +1,26 @@
#! /usr/bin/env bash
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_ch.sh
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# download well-trained model
cd ${MAIN_ROOT}/models/aishell > /dev/null
bash download_model.sh
bash local/download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python3 -u ${MAIN_ROOT}/test.py \
--batch_size=128 \
--beam_size=300 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=1024 \
--alpha=2.6 \
--beta=5.0 \
--cutoff_prob=0.99 \
--cutoff_top_n=40 \
--use_gru=True \
--use_gpu=True \
--share_rnn_weights=False \
--test_manifest="data/manifest.test" \
--mean_std_path="${MAIN_ROOT}/models/aishell/mean_std.npz" \
--vocab_path="${MAIN_ROOT}/models/aishell/vocab.txt" \
--model_path="${MAIN_ROOT}/models/aishell" \
--lang_model_path="${MAIN_ROOT}/models/lm/zh_giga.no_cna_cmn.prune01244.klm" \
--decoding_method="ctc_beam_search" \
--error_rate_type="cer" \
--specgram_type="linear"
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/test.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--checkpoint_path data/pretrain/params.pdparams \
--opts data.mean_std_filepath data/pretrain/mean_std.npz \
--opts data.vocab_filepath data/pretrain/vocab.txt
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"

@ -4,11 +4,14 @@
# if you wish to resume from an exists model, uncomment --init_from_pretrained_model
export FLAGS_sync_nccl_allreduce=0
python3 -u ${MAIN_ROOT}/train.py \
ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
echo "using $ngpu gpus..."
python3 -u ${BIN_DIR}/train.py \
--device 'gpu' \
--nproc 4 \
--nproc ${ngpu} \
--config conf/deepspeech2.yaml \
--output ckpt-${1}
--output ckpt
if [ $? -ne 0 ]; then

@ -1,7 +1,7 @@
#! /usr/bin/env bash
# grid-search for hyper-parameters in language model
python3 -u ${MAIN_ROOT}/tune.py \
python3 -u ${BIN_DIR}/tune.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \

@ -1 +0,0 @@
../../models

@ -8,3 +8,6 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin

@ -1,21 +1,16 @@
#!/bin/bash
source path.sh
# only demos
# prepare data
bash ./local/data.sh
# test pretrained model
bash ./local/test_golden.sh
# infer with pretrained model
bash ./local/infer_golden.sh
# train model
bash ./local/train.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
# test model
bash ./local/test.sh
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284
# infer model
bash ./local/infer.sh
CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284

@ -1,11 +1,13 @@
#! /usr/bin/env bash
. ../../utils/utility.sh
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/lm
mkdir -p ${DIR}
URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=./common_crawl_00.prune01111.trie.klm
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET

@ -1,10 +1,13 @@
#! /usr/bin/env bash
. ../../utils/utility.sh
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/pretrain
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/demo_models/baidu_en8k_model_fluid.tar.gz'
MD5=7e58fbf64aa4ecf639b049792ddcf788
TARGET=./baidu_en8k_model_fluid.tar.gz
TARGET=${DIR}/baidu_en8k_model_fluid.tar.gz
echo "Download BaiduEn8k model ..."

@ -6,3 +6,8 @@ export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin

@ -3,22 +3,17 @@
source path.sh
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# download well-trained model
cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
@ -37,10 +32,10 @@ python3 -u ${MAIN_ROOT}/infer.py \
--use_gpu=False \
--share_rnn_weights=False \
--infer_manifest="${MAIN_ROOT}/examples/librispeech/data/manifest.test-clean" \
--mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \
--vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \
--model_path="${MAIN_ROOT}/models/baidu_en8k" \
--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \
--mean_std_path="data/pretrain/baidu_en8k/mean_std.npz" \
--vocab_path="data/pretrain/baidu_en8k/vocab.txt" \
--model_path="data/pretrain/baidu_en8k" \
--lang_model_path="data/lm/common_crawl_00.prune01111.trie.klm" \
--decoding_method="ctc_beam_search" \
--error_rate_type="wer" \
--specgram_type="linear"

@ -3,21 +3,17 @@
source path.sh
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# download well-trained model
cd ${MAIN_ROOT}/models/baidu_en8k > /dev/null
bash download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
# evaluate model
@ -37,11 +33,11 @@ python3 -u ${MAIN_ROOT}/test.py \
--use_gru=True \
--use_gpu=False \
--share_rnn_weights=False \
--test_manifest="data/manifest.test-clean" \
--mean_std_path="${MAIN_ROOT}/models/baidu_en8k/mean_std.npz" \
--vocab_path="${MAIN_ROOT}/models/baidu_en8k/vocab.txt" \
--model_path="${MAIN_ROOT}/models/baidu_en8k" \
--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \
--test_manifest="${MAIN_ROOT}/examples/librispeech/data/manifest.test-clean" \
--mean_std_path="data/pretrain/baidu_en8k/mean_std.npz" \
--vocab_path="data/pretrain/baidu_en8k/vocab.txt" \
--model_path="data/pretrain/baidu_en8k" \
--lang_model_path="data/lm/common_crawl_00.prune01111.trie.klm" \
--decoding_method="ctc_beam_search" \
--error_rate_type="wer" \
--specgram_type="linear"

@ -0,0 +1 @@
data_aishell*

@ -24,7 +24,7 @@ import codecs
import soundfile
import json
import argparse
from data_utils.utility import download, unpack
from utils.utility import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@ -29,7 +29,8 @@ import json
import io
from paddle.v2.dataset.common import md5file
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
#DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
DATA_HOME = os.path.expanduser('.')
URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

@ -0,0 +1,7 @@
dev-clean/
dev-other/
test-clean/
test-other/
train-clean-100/
train-clean-360/
train-other-500/

@ -27,10 +27,10 @@ import soundfile
import json
import codecs
import io
from data_utils.utility import download, unpack
from utils.utility import download, unpack
URL_ROOT = "http://www.openslr.org/resources/12"
URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"

@ -0,0 +1,4 @@
dev-clean/
manifest.dev-clean
manifest.train-clean
train-clean/

@ -0,0 +1,115 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Librispeech ASR datasets.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import distutils.util
import os
import sys
import argparse
import soundfile
import json
import codecs
import io
from utils.utility import download, unpack
URL_ROOT = "http://www.openslr.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean-2.tar.gz"
MD5_TRAIN_CLEAN = "5df7d4e78065366204ca6845bb08f490"
MD5_DEV_CLEAN = "6d7ab67ac6a1d2c993d050e16d61080d"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default='~/.cache/paddle/dataset/speech/libri',
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path):
"""Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
if len(text_filelist) > 0:
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
json_lines.append(
json.dumps({
'audio_filepath': audio_filepath,
'duration': duration,
'text': text
}))
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
for line in json_lines:
out_file.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file.
"""
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
filepath = download(url, md5sum, target_dir)
# unpack
unpack(filepath, target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=URL_TRAIN_CLEAN,
md5sum=MD5_TRAIN_CLEAN,
target_dir=os.path.join(args.target_dir, "train-clean"),
manifest_path=args.manifest_prefix + ".train-clean")
prepare_dataset(
url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
if __name__ == '__main__':
main()

@ -0,0 +1,123 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import codecs
import soundfile
import json
import argparse
from utils.utility import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'https://www.openslr.org/resources/17'
DATA_URL = URL_ROOT + '/musan.tar.gz'
MD5_DATA = ''
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/musan",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '': continue
audio_id, text = line.split(' ', 1)
# remove whitespace
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for type in data_types:
del json_lines[:]
audio_dir = os.path.join(data_dir, 'wav', type)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.join(subfolder, fname)
audio_id = fname[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'audio_filepath': audio_path,
'duration': duration,
'text': text
},
ensure_ascii=False))
manifest_path = manifest_path_prefix + '.' + type
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
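An illustrative transcript line in the aishell_transcript_v0.8.txt format this parser expects (an utterance id, a space, then whitespace-separated tokens; the sample content is illustrative):

    line = "BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购"
    audio_id, text = line.strip().split(' ', 1)
    text = ''.join(text.split())  # drop whitespace between tokens
    print(audio_id, text)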
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix)
if __name__ == '__main__':
main()

@ -0,0 +1,123 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import codecs
import soundfile
import json
import argparse
from data_utils.utility import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/28'
DATA_URL = URL_ROOT + '/rirs_noises.zip'
MD5_DATA = ''
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '': continue
audio_id, text = line.split(' ', 1)
# remove whitespace
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for type in data_types:
del json_lines[:]
audio_dir = os.path.join(data_dir, 'wav', type)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.join(subfolder, fname)
audio_id = fname[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'audio_filepath': audio_path,
'duration': duration,
'text': text
},
ensure_ascii=False))
manifest_path = manifest_path_prefix + '.' + type
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix)
if __name__ == '__main__':
main()

@ -1,9 +1,12 @@
#! /usr/bin/env bash
TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge
mkdir -p ${TARGET_DIR}
# download data, generate manifests
PYTHONPATH=../../:$PYTHONPATH python voxforge.py \
--manifest_prefix='./manifest' \
--target_dir='./dataset/VoxForge' \
python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \
--manifest_prefix="${TARGET_DIR}/manifest" \
--target_dir="${TARGET_DIR}" \
--is_merge_dialect=True \
--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'

@ -27,9 +27,9 @@ import json
import argparse
import shutil
import subprocess
from data_utils.utility import download_multi, unpack, getfile_insensitive
from utils.utility import download_multi, unpack, getfile_insensitive
DATA_HOME = './dataset'
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
'Audio/Main/16kHz_16bit'

@ -0,0 +1,2 @@
data
ckpt*

@ -1,12 +1,12 @@
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean
mean_std_filepath: data/mean_std.npz
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.config
batch_size: 4
batch_size: 20
max_duration: 27.0
min_duration: 0.0
specgram_type: linear
@ -26,26 +26,22 @@ model:
num_conv_layers: 2
num_rnn_layers: 3
rnn_layer_size: 2048
use_gru: True
use_gru: False
share_rnn_weights: True
training:
n_epoch: 20
lr: 1e-5
n_epoch: 50
lr: 5e-4
lr_decay: 0.83
weight_decay: 1e-06
global_grad_clip: 400.0
max_iteration: 500000
plot_interval: 1000
save_interval: 1000
valid_interval: 1000
global_grad_clip: 5.0
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: models/lm/common_crawl_00.prune01111.trie.klm
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8

@ -1,11 +1,13 @@
#! /usr/bin/env bash
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
# download data, generate manifests
PYTHONPATH=.:$PYTHONPATH python3 local/librispeech.py \
PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \
--manifest_prefix="data/manifest" \
--target_dir="${MAIN_ROOT}/dataset/librispeech" \
--target_dir="${TARGET_DIR}/librispeech" \
--full_download="True"
if [ $? -ne 0 ]; then
@ -15,9 +17,8 @@ fi
cat data/manifest.train-* | shuf > data/manifest.train
# build vocabulary
python3 ${MAIN_ROOT}/tools/build_vocab.py \
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths="data/manifest.train"
@ -27,9 +28,8 @@ if [ $? -ne 0 ]; then
exit 1
fi
# compute mean and stddev for normalizer
python3 ${MAIN_ROOT}/tools/compute_mean_std.py \
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train" \
--num_samples=2000 \
--specgram_type="linear" \
@ -40,6 +40,5 @@ if [ $? -ne 0 ]; then
exit 1
fi
echo "LibriSpeech Data preparation done."
exit 0

@ -0,0 +1,20 @@
#! /usr/bin/env bash
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/lm
mkdir -p ${DIR}
URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0

@ -1,10 +1,13 @@
#! /usr/bin/env bash
. ../../utils/utility.sh
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/pretrain
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET=./librispeech_model_fluid.tar.gz
TARGET=${DIR}/librispeech_model_fluid.tar.gz
echo "Download LibriSpeech model ..."
@ -13,7 +16,6 @@ if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET
tar -zxvf $TARGET -C ${DIR}
exit 0

@ -1,43 +1,21 @@
#! /usr/bin/env bash
# download language model
cd ${MAIN_ROOT}/models/lm > /dev/null
bash download_lm_en.sh
bash local/download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
cd - > /dev/null
python3 -u ${BIN_DIR}/infer.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--output ckpt
# infer
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${MAIN_ROOT}/infer.py \
--num_samples=10 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.5 \
--beta=0.3 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest="data/manifest.test-clean" \
--mean_std_path="data/mean_std.npz" \
--vocab_path="data/vocab.txt" \
--model_path="checkpoints/step_final" \
--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \
--decoding_method="ctc_beam_search" \
--error_rate_type="wer" \
--specgram_type="linear"
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0

Some files were not shown because too many files have changed in this diff.
