parent
a00a436b52
commit
ae7ef7929a
@ -0,0 +1,126 @@
|
|||||||
|
"""Prepare Librispeech ASR datasets.
|
||||||
|
|
||||||
|
Download, unpack and create manifest files.
|
||||||
|
Manifest file is a json-format file with each line containing the
|
||||||
|
meta data (i.e. audio filepath, transcript and audio duration)
|
||||||
|
of each audio file in the data set.
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import distutils.util
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tarfile
|
||||||
|
import argparse
|
||||||
|
import soundfile
|
||||||
|
import json
|
||||||
|
import codecs
|
||||||
|
from paddle.v2.dataset.common import md5file
|
||||||
|
|
||||||
|
# Cache directory where downloaded archives and unpacked data are stored.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR mirror of the LibriSpeech corpus (resource id 12).
URL_ROOT = "http://www.openslr.org/resources/12"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
# Expected md5 of the dev-clean archive, used to validate the download.
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"

# Command-line interface; the module docstring doubles as the help text.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/tiny",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def download(url, md5sum, target_dir):
    """Download the file at `url` into `target_dir` and verify its md5.

    Skips the download when a file with the expected checksum already
    exists. Returns the local filepath of the archive.

    :param url: Source URL of the archive.
    :param md5sum: Expected md5 hex digest of the archive.
    :param target_dir: Directory to place the downloaded file in
                       (created if missing).
    :raises RuntimeError: If the download command fails or the checksum
                          of the downloaded file does not match.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        # `wget -c` resumes partial downloads. Fail fast on a non-zero
        # exit status instead of letting a truncated file reach the md5
        # check with a confusing error.
        if os.system("wget -c " + url + " -P " + target_dir) != 0:
            raise RuntimeError("Download %s failed." % url)
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath
|
||||||
|
|
||||||
|
|
||||||
|
def unpack(filepath, target_dir):
    """Extract the tar archive at `filepath` into `target_dir`.

    :param filepath: Path of the tar archive (any compression tarfile
                     auto-detects, e.g. .tar.gz).
    :param target_dir: Directory to extract into.
    """
    print("Unpacking %s ..." % filepath)
    # Context manager closes the archive even if extractall() raises,
    # fixing a handle leak in the open/extract/close sequence.
    with tarfile.open(filepath) as tar:
        tar.extractall(target_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set.

    Each output line is a json document with the meta data (audio
    filepath, duration in seconds, transcription text) of one audio
    file found under `data_dir`.

    :param data_dir: Root directory of the unpacked LibriSpeech data.
    :param manifest_path: Path of the manifest file to write (UTF-8).
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if len(text_filelist) > 0:
            # os.walk already yields `subfolder` joined with `data_dir`;
            # joining `data_dir` in again would double the prefix for a
            # relative `data_dir` (it only worked before because absolute
            # second args win in os.path.join).
            text_filepath = os.path.join(subfolder, text_filelist[0])
            # Each transcript line is "<utterance-id> <TEXT ...>".
            with open(text_filepath) as text_file:
                for line in text_file:
                    segments = line.strip().split()
                    text = ' '.join(segments[1:]).lower()
                    audio_filepath = os.path.join(subfolder,
                                                  segments[0] + '.flac')
                    audio_data, samplerate = soundfile.read(audio_filepath)
                    duration = float(len(audio_data)) / samplerate
                    json_lines.append(
                        json.dumps({
                            'audio_filepath': audio_filepath,
                            'duration': duration,
                            'text': text
                        }))
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download and unpack one LibriSpeech split, then write its manifest.

    Download/unpack are skipped when the data already exists; the
    manifest is (re)created either way.
    """
    unpacked_dir = os.path.join(target_dir, "LibriSpeech")
    if os.path.exists(unpacked_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        # download
        archive_path = download(url, md5sum, target_dir)
        # unpack
        unpack(archive_path, target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Prepare the dev-clean subset under the configured target dir."""
    split_dir = os.path.join(args.target_dir, "dev-clean")
    split_manifest = args.manifest_prefix + ".dev-clean"
    prepare_dataset(
        url=URL_DEV_CLEAN,
        md5sum=MD5_DEV_CLEAN,
        target_dir=split_dir,
        manifest_path=split_manifest)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the preparation only when executed directly.
if __name__ == '__main__':
    main()
|
@ -1,39 +0,0 @@
|
|||||||
#! /usr/bin/bash

# Run from the repository root so the relative data/ and tools/ paths resolve.
pushd ../..

# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
--target_dir='~/.cache/paddle/dataset/speech/Libri'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# Merge all per-split training manifests and shuffle into one train manifest.
cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train


# build vocabulary (for English data, we can just skip this)
# python tools/build_vocab.py \
# --count_threshold=0 \
# --vocab_path='data/librispeech/eng_vocab.txt' \
# --manifest_paths='data/librispeech/manifest.train'


# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "LibriSpeech Data preparation done."
|
|
@ -0,0 +1,45 @@
|
|||||||
|
#! /usr/bin/bash

# Run from the repository root so the relative data/ and tools/ paths resolve.
pushd ../..

# download data, generate manifests
python data/tiny/tiny.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir=$HOME'/.cache/paddle/dataset/speech/tiny'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# Carve three small, non-overlapping splits out of the dev-clean manifest:
# lines 1-32 -> train, lines 33-48 -> dev, lines 49-64 -> test.
head -n 32 data/tiny/manifest.dev-clean > data/tiny/manifest.train
head -n 48 data/tiny/manifest.dev-clean | tail -n 16 > data/tiny/manifest.dev
head -n 64 data/tiny/manifest.dev-clean | tail -n 16 > data/tiny/manifest.test


# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.train'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi


# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.train' \
--num_samples=32 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi


echo "Tiny data preparation done."
|
Loading…
Reference in new issue