fix cmvn

4 years ago · 1635e000b3
parent 2aed275233
commit 1635e000b3
7 changed files with 51 additions and 40 deletions
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -22,9 +22,12 @@ from paddle.io import Dataset
 from deepspeech.frontend.audio import AudioSegment
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
 __all__ = ["FeatureNormalizer"]
 logger = Log(__name__).getlog()
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
@@ -176,7 +179,7 @@ class FeatureNormalizer(object):
                wav_number += batch_size
                if wav_number % 1000 == 0:
-                    print('process {} wavs,{} frames'.format(wav_number,
+                    logger.info('process {} wavs,{} frames'.format(wav_number,
                                                                   all_number))
        self.cmvn_info = {
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@@ -17,6 +17,12 @@ import os
 import socket
 import sys
 FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
 DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
 logging.basicConfig(
    level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
 def find_log_dir(log_dir=None):
    """Returns the most suitable directory to put log files into.
@@ -123,12 +129,10 @@ class Log():
            pass
        if not self.logger.hasHandlers():
-            format = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+            formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
-            formatter = logging.Formatter(
-                fmt=format, datefmt='%Y/%m/%d %H:%M:%S')
            fh = logging.FileHandler(Log.log_name)
            fh.setFormatter(formatter)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)
            ch = logging.StreamHandler()
@@ -136,9 +140,6 @@ class Log():
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)
            #fh.close()
            #ch.close()
        # stop propagate for propagating may print
        # log multiple times
        self.logger.propagate = False
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -51,6 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
    --num_workers=16 \
    --output_path="data/mean_std.json"
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -73,6 +73,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -57,6 +57,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --use_dB_normalization=False \
    --num_workers=2 \
    --output_path="data/mean_std.json"
--- a/utils/avg_model.py
+++ b/utils/avg_model.py
@@ -21,6 +21,8 @@ import paddle
 def main(args):
    paddle.set_device('cpu')
    val_scores = []
    beat_val_scores = []
    selected_epochs = []
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -25,17 +25,19 @@ parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples',      int,    -1,    "# of samples to for statistics.")
 add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc, fbank.",
        choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim',    int, 13, "Audio feature dim.")
-add_arg('delta_delta',    bool,
+add_arg('delta_delta', bool,  False, "Audio feature with delta delta.")
-        False,
-        "Audio feature with delta delta.")
 add_arg('stride_ms', float, 10.0,  "stride length in ms.")
 add_arg('window_ms', float, 20.0,  "stride length in ms.")
 add_arg('sample_rate',  int, 16000,  "target sample rate.")
 add_arg('use_dB_normalization', bool, False, "do dB normalization.")
 add_arg('target_dB',   int, -20,  "target dB.")
 add_arg('manifest_path',    str,
        'data/librispeech/manifest.train',
        "Filepath of manifest to compute normalizer's mean and stddev.")
@@ -63,8 +65,8 @@ def main():
        n_fft=None,
        max_freq=None,
        target_sample_rate=args.sample_rate,
-        use_dB_normalization=True,
+        use_dB_normalization=args.use_dB_normalization,
-        target_dB=-20,
+        target_dB=args.target_dB,
        dither=0.0)
    def augment_and_featurize(audio_segment):