parent
a00a436b52
commit
ae7ef7929a
@ -0,0 +1,126 @@
|
||||
"""Prepare Librispeech ASR datasets.
|
||||
|
||||
Download, unpack and create manifest files.
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
of each audio file in the data set.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import distutils.util
|
||||
import os
|
||||
import sys
|
||||
import tarfile
|
||||
import argparse
|
||||
import soundfile
|
||||
import json
|
||||
import codecs
|
||||
from paddle.v2.dataset.common import md5file
|
||||
|
||||
# Cache directory where downloaded/unpacked speech datasets are kept.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR resource 12 hosts the LibriSpeech corpus; only the dev-clean
# split is fetched here to build a tiny sample dataset.
URL_ROOT = "http://www.openslr.org/resources/12"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
# Expected MD5 of dev-clean.tar.gz, used to validate the download.
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"

# Command-line interface.  Note: args is parsed at import time and read as
# a module-level global by main().
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/tiny",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
|
||||
|
||||
|
||||
def download(url, md5sum, target_dir):
    """Download ``url`` into ``target_dir`` and verify its MD5 checksum.

    The download is skipped when a file with the expected checksum is
    already present.  Returns the local filepath of the archive.  Raises
    RuntimeError when wget fails or the checksum does not match.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        # -c resumes a partial download; a non-zero exit status means
        # wget itself failed, so fail loudly instead of checking a
        # missing/partial file's checksum.
        if os.system("wget -c " + url + " -P " + target_dir) != 0:
            raise RuntimeError("Download from %s failed." % url)
        print("\nMD5 Checksum %s ..." % filepath)
        if md5file(filepath) != md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath
||||
|
||||
|
||||
def unpack(filepath, target_dir):
    """Extract the tar archive at ``filepath`` into ``target_dir``."""
    print("Unpacking %s ..." % filepath)
    with tarfile.open(filepath) as archive:
        archive.extractall(target_dir)
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set.

    Walks ``data_dir`` for LibriSpeech ``*trans.txt`` transcript files;
    for each utterance listed, writes one json line to ``manifest_path``
    containing the audio filepath, duration (seconds) and lower-cased
    transcript text.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            # os.walk already yields the full subfolder path; re-joining it
            # with data_dir only worked when data_dir was absolute (join
            # discards the first part), and broke for relative paths.
            text_filepath = os.path.join(subfolder, text_filelist[0])
            # with-statement so the transcript file is always closed.
            with open(text_filepath) as text_file:
                for line in text_file:
                    # Transcript line format: "<utterance-id> WORD WORD ..."
                    segments = line.strip().split()
                    text = ' '.join(segments[1:]).lower()
                    audio_filepath = os.path.join(subfolder,
                                                  segments[0] + '.flac')
                    audio_data, samplerate = soundfile.read(audio_filepath)
                    duration = float(len(audio_data)) / samplerate
                    json_lines.append(
                        json.dumps({
                            'audio_filepath': audio_filepath,
                            'duration': duration,
                            'text': text
                        }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')
|
||||
|
||||
|
||||
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download and unpack the archive, then build its summary manifest.

    Downloading/unpacking is skipped when an unpacked ``LibriSpeech``
    directory is already present under ``target_dir``.
    """
    unpacked_dir = os.path.join(target_dir, "LibriSpeech")
    if os.path.exists(unpacked_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
    # Summarize the unpacked data into a json-lines manifest.
    create_manifest(target_dir, manifest_path)
|
||||
|
||||
|
||||
def main():
    """Prepare the dev-clean split under the user-selected target dir."""
    target_dir = os.path.join(args.target_dir, "dev-clean")
    manifest_path = args.manifest_prefix + ".dev-clean"
    prepare_dataset(URL_DEV_CLEAN, MD5_DEV_CLEAN, target_dir, manifest_path)


if __name__ == '__main__':
    main()
|
@ -1,39 +0,0 @@
|
||||
#! /usr/bin/bash

# Run from the repository root so relative data/ and tools/ paths resolve.
pushd ../..

# Download data and generate the per-split manifest files.
if ! python data/librispeech/librispeech.py \
    --manifest_prefix='data/librispeech/manifest' \
    --full_download='True' \
    --target_dir='~/.cache/paddle/dataset/speech/Libri'; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# Merge and shuffle all training splits into a single train manifest.
cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train


# build vocabulary (for English data, we can just skip this)
# python tools/build_vocab.py \
# --count_threshold=0 \
# --vocab_path='data/librispeech/eng_vocab.txt' \
# --manifest_paths='data/librispeech/manifest.train'


# Compute feature mean and stddev for the input normalizer.
if ! python tools/compute_mean_std.py \
    --manifest_path='data/librispeech/manifest.train' \
    --num_samples=2000 \
    --specgram_type='linear' \
    --output_path='data/librispeech/mean_std.npz'; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
|
@ -0,0 +1,45 @@
|
||||
#! /usr/bin/bash
# Prepare the "tiny" dataset: a 64-utterance slice of LibriSpeech
# dev-clean split into train/dev/test manifests, plus a vocabulary and
# feature-normalizer statistics.

# Run from the repository root so relative data/ and tools/ paths resolve.
pushd ../..

# download data, generate manifests
python data/tiny/tiny.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir=$HOME'/.cache/paddle/dataset/speech/tiny'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# Carve three disjoint splits out of the dev-clean manifest:
# lines 1-32 -> train, lines 33-48 -> dev, lines 49-64 -> test.
cat data/tiny/manifest.dev-clean | head -n 32 > data/tiny/manifest.train
cat data/tiny/manifest.dev-clean | head -n 48 | tail -n 16 > data/tiny/manifest.dev
cat data/tiny/manifest.dev-clean | head -n 64 | tail -n 16 > data/tiny/manifest.test


# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.train'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
# NOTE(review): num_samples=32 matches the train split size above.
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.train' \
--num_samples=32 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "Tiny data preparation done."
|
Loading…
Reference in new issue