refactor tiny egs

5 years ago · 75c8018eab
parent b882ba5000
commit 75c8018eab
12 changed files with 238 additions and 69 deletions
--- a/examples/librispeech/local/librispeech.py
+++ b/examples/librispeech/local/librispeech.py
@ -0,0 +1,146 @@
 """Prepare Librispeech ASR datasets.
 Download, unpack and create manifest files.
 Manifest file is a json-format file with each line containing the
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
 import distutils.util
 import os
 import sys
 import argparse
 import soundfile
 import json
 import codecs
 import io
 from data_utils.utility import download, unpack
 # Base URL every LibriSpeech archive is fetched from. An earlier assignment
 # to "http://www.openslr.org/resources/12" was immediately overwritten by this
 # mirror URL, so only the value that is actually used is kept.
 URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
 # Archive URLs, one per LibriSpeech subset.
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
 URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
 URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
 URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
 URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
 URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
 URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
 # MD5 checksums handed to download() together with the matching archive URL.
 MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
 MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
 MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
 MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
 MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
 MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
 MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 # Command-line interface. The module docstring doubles as the --help
 # description.
 parser = argparse.ArgumentParser(description=__doc__)
 # Root directory for downloads; main() expands a leading "~" and appends one
 # sub-directory per subset.
 parser.add_argument(
     "--target_dir",
     default='~/.cache/paddle/dataset/speech/libri',
     type=str,
     help="Directory to save the dataset. (default: %(default)s)")
 # Each manifest is written to "<manifest_prefix>.<subset name>".
 parser.add_argument(
     "--manifest_prefix",
     default="manifest",
     type=str,
     help="Filepath prefix for output manifests. (default: %(default)s)")
 # strtobool turns "True"/"False"-style strings into 1/0. The help text lists
 # exactly the subsets main() prepares when this flag is false.
 parser.add_argument(
     "--full_download",
     default="True",
     type=distutils.util.strtobool,
     help="Download all datasets for Librispeech."
     " If False, only download a minimal requirement (test-clean,"
     " dev-clean). (default: %(default)s)")
 # Parsed once at import time from the process command line; main() reads this
 # namespace and rewrites args.target_dir in place when it starts with "~".
 args = parser.parse_args()
 def create_manifest(data_dir, manifest_path):
     """Create a manifest file summarizing the audio found under ``data_dir``.

     The directory tree is walked in sorted order. In every folder that holds
     a file whose name ends with ``trans.txt``, each non-blank line of that
     file is read as ``<utterance-id> <word> <word> ...``. For each utterance
     the sibling ``<utterance-id>.flac`` file is read with ``soundfile`` to
     obtain its duration (sample count divided by sample rate).

     One JSON object per utterance is written to ``manifest_path``, one per
     line, with the keys ``audio_filepath``, ``duration`` and ``text`` (the
     transcript, lower-cased and single-space separated).

     Args:
         data_dir: Root directory to scan for transcripts and audio.
         manifest_path: Output file; overwritten if it already exists.
     """
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
     for subfolder, _, filelist in sorted(os.walk(data_dir)):
         text_filelist = [
             filename for filename in filelist if filename.endswith('trans.txt')
         ]
         if not text_filelist:
             continue
         text_filepath = os.path.join(subfolder, text_filelist[0])
         # Use a context manager so the transcript handle is closed promptly
         # instead of being left to the garbage collector.
         with io.open(text_filepath, encoding="utf8") as text_file:
             for line in text_file:
                 segments = line.strip().split()
                 if not segments:
                     # Blank line: there is no utterance id to look up.
                     continue
                 text = ' '.join(segments[1:]).lower()
                 audio_filepath = os.path.join(subfolder,
                                               segments[0] + '.flac')
                 audio_data, samplerate = soundfile.read(audio_filepath)
                 duration = float(len(audio_data)) / samplerate
                 json_lines.append(
                     json.dumps({
                         'audio_filepath': audio_filepath,
                         'duration': duration,
                         'text': text
                     }))
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """Fetch one archive, extract it and write its summary manifest.

     Downloading and unpacking are skipped when ``target_dir`` already
     contains a ``LibriSpeech`` entry; the manifest is regenerated either way.
     """
     unpacked_root = os.path.join(target_dir, "LibriSpeech")
     if os.path.exists(unpacked_root):
         print("Skip downloading and unpacking. Data already exists in %s." %
               target_dir)
     else:
         # Fetch the archive (download() receives the expected checksum), then
         # extract it into the target directory.
         archive_path = download(url, md5sum, target_dir)
         unpack(archive_path, target_dir)
     # The manifest is always rebuilt from whatever is on disk.
     create_manifest(target_dir, manifest_path)
 def main():
     """Prepare every requested LibriSpeech subset, one after another.

     test-clean and dev-clean are always prepared; the remaining subsets are
     prepared only when ``args.full_download`` is truthy. Each subset is
     stored under ``<target_dir>/<subset>`` and its manifest is written to
     ``<manifest_prefix>.<subset>``.
     """
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)
     # (subset name, archive URL, archive MD5), in processing order.
     subsets = [
         ("test-clean", URL_TEST_CLEAN, MD5_TEST_CLEAN),
         ("dev-clean", URL_DEV_CLEAN, MD5_DEV_CLEAN),
     ]
     if args.full_download:
         subsets.extend([
             ("train-clean-100", URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100),
             ("test-other", URL_TEST_OTHER, MD5_TEST_OTHER),
             ("dev-other", URL_DEV_OTHER, MD5_DEV_OTHER),
             ("train-clean-360", URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360),
             ("train-other-500", URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500),
         ])
     for subset, subset_url, subset_md5 in subsets:
         prepare_dataset(
             url=subset_url,
             md5sum=subset_md5,
             target_dir=os.path.join(args.target_dir, subset),
             manifest_path=args.manifest_prefix + "." + subset)
 if __name__ == '__main__':
     main()
--- a/examples/tiny/README.md
+++ b/examples/tiny/README.md
@ -0,0 +1,4 @@
 # Tiny Example
 1. `source path.sh`
 2. `bash run.sh`
--- a/examples/tiny/local/run_data.sh
+++ b/examples/tiny/local/run_data.sh
@ -1,16 +1,14 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # prepare folder
-if [ ! -e data/tiny ]; then
+if [ ! -e data ]; then
-    mkdir data/tiny
+    mkdir data
 fi
 # download data, generate manifests
-PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \
+PYTHONPATH=.:$PYTHONPATH python3 ../librispeech/local/librispeech.py \
--manifest_prefix='data/tiny/manifest' \
+--manifest_prefix='data/manifest' \
--target_dir='./dataset/librispeech' \
+--target_dir="${MAIN_ROOT}/dataset/librispeech" \
 --full_download='False'
 if [ $? -ne 0 ]; then
@ -18,13 +16,13 @@ if [ $? -ne 0 ]; then
    exit 1
 fi
-head -n 64 data/tiny/manifest.dev-clean  > data/tiny/manifest.tiny
+head -n 64 data/manifest.dev-clean  > data/manifest.tiny
 # build vocabulary
-python3 tools/build_vocab.py \
+python3 ${MAIN_ROOT}/tools/build_vocab.py \
 --count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
+--vocab_path='data/vocab.txt' \
--manifest_paths='data/tiny/manifest.tiny'
+--manifest_paths='data/manifest.tiny'
 if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
@ -33,11 +31,11 @@ fi
 # compute mean and stddev for normalizer
-python3 tools/compute_mean_std.py \
+python3 ${MAIN_ROOT}/tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.tiny' \
+--manifest_path='data/manifest.tiny' \
 --num_samples=64 \
 --specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'
+--output_path='data/mean_std.npz'
 if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
--- a/examples/tiny/local/run_infer.sh
+++ b/examples/tiny/local/run_infer.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -13,7 +11,7 @@ cd - > /dev/null
 # infer
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u infer.py \
+python3 -u $MAIN_ROOT/infer.py \
 --num_samples=10 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -27,11 +25,11 @@ python3 -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
+--infer_manifest='data/manifest.test-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
+--mean_std_path='data/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
+--vocab_path='data/vocab.txt' \
--model_path='./checkpoints/tiny/step_final' \
+--model_path='checkpoints/step_final' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_infer_golden.sh
+++ b/examples/tiny/local/run_infer_golden.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # download language model
-cd models/lm > /dev/null
+cd ${MAIN_ROOT}/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -12,7 +10,7 @@ cd - > /dev/null
 # download well-trained model
-cd models/librispeech > /dev/null
+cd ${MAIN_ROOT}/models/librispeech > /dev/null
 bash download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -22,7 +20,7 @@ cd - > /dev/null
 # infer
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u infer.py \
+python3 -u ${MAIN_ROOT}/infer.py \
 --num_samples=10 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -36,11 +34,11 @@ python3 -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
+--infer_manifest='data/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
+--mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \
--vocab_path='models/librispeech/vocab.txt' \
+--vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \
--model_path='models/librispeech' \
+--model_path="${MAIN_ROOT}/models/librispeech" \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_test.sh
+++ b/examples/tiny/local/run_test.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -13,7 +11,7 @@ cd - > /dev/null
 # evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python3 -u test.py \
+python3 -u $MAIN_ROOT/test.py \
 --batch_size=128 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -27,11 +25,11 @@ python3 -u test.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
+--test_manifest='data/manifest.test-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
+--mean_std_path='data/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
+--vocab_path='data/vocab.txt' \
--model_path='checkpoints/tiny/step_final' \
+--model_path='checkpoints/step_final' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_test_golden.sh
+++ b/examples/tiny/local/run_test_golden.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -12,7 +10,7 @@ cd - > /dev/null
 # download well-trained model
-cd models/librispeech > /dev/null
+cd $MAIN_ROOT/models/librispeech > /dev/null
 bash download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -22,7 +20,7 @@ cd - > /dev/null
 # evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python3 -u test.py \
+python3 -u $MAIN_ROOT/test.py \
 --batch_size=128 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -36,11 +34,11 @@ python3 -u test.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
+--test_manifest='data/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
+--mean_std_path="$MAIN_ROOT/models/librispeech/mean_std.npz" \
--vocab_path='models/librispeech/vocab.txt' \
+--vocab_path="$MAIN_ROOT/models/librispeech/vocab.txt" \
--model_path='models/librispeech' \
+--model_path="$MAIN_ROOT/models/librispeech" \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_train.sh
+++ b/examples/tiny/local/run_train.sh
@ -1,12 +1,10 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # train model
 # if you wish to resume from an existing model, uncomment --init_from_pretrained_model
 export FLAGS_sync_nccl_allreduce=0
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-python3 -u train.py \
+python3 -u ${MAIN_ROOT}/train.py \
 --batch_size=4 \
 --num_epoch=20 \
 --num_conv_layers=2 \
@ -24,12 +22,12 @@ python3 -u train.py \
 --use_gpu=True \
 --is_local=True \
 --share_rnn_weights=True \
--train_manifest='data/tiny/manifest.tiny' \
+--train_manifest='data/manifest.tiny' \
--dev_manifest='data/tiny/manifest.tiny' \
+--dev_manifest='data/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
+--mean_std_path='data/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
+--vocab_path='data/vocab.txt' \
--output_model_dir='./checkpoints/tiny' \
+--output_model_dir='./checkpoints/' \
--augment_conf_path='conf/augmentation.config' \
+--augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped' \
--- a/examples/tiny/local/run_tune.sh
+++ b/examples/tiny/local/run_tune.sh
@ -1,10 +1,8 @@
 #! /usr/bin/env bash
 cd ../.. > /dev/null
 # grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-python3 -u tools/tune.py \
+python3 -u $MAIN_ROOT/tools/tune.py \
 --num_batches=-1 \
 --batch_size=128 \
 --beam_size=500 \
@ -23,11 +21,11 @@ python3 -u tools/tune.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--tune_manifest='data/tiny/manifest.dev-clean' \
+--tune_manifest='data/manifest.dev-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
+--mean_std_path='data/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
+--vocab_path='data/vocab.txt' \
--model_path='models/librispeech' \
+--model_path="$MAIN_ROOT/models/librispeech" \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/path.sh
+++ b/examples/tiny/path.sh
@ -0,0 +1,8 @@
 # Repository root, taken relative to the current directory (this file sits in
 # examples/tiny and is meant to be sourced from there).
 MAIN_ROOT="${PWD}/../../"
 export MAIN_ROOT
 # Put the repository root and ./tools ahead of the existing search path.
 PATH="${MAIN_ROOT}:${PWD}/tools:${PATH}"
 export PATH
 LC_ALL=C
 export LC_ALL
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 PYTHONIOENCODING=UTF-8
 export PYTHONIOENCODING
 # Make modules under the repository root importable.
 PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
 export PYTHONPATH
--- a/examples/tiny/run.sh
+++ b/examples/tiny/run.sh
@ -0,0 +1,24 @@
 #!/bin/bash
 # Runs every stage of the tiny example in order. Each stage's exit status is
 # checked so that a failure (for example in data preparation) stops the
 # pipeline instead of letting later stages run on missing inputs.
 source path.sh || exit 1
 # prepare data
 bash ./local/run_data.sh || exit 1
 # test pretrained model
 bash ./local/run_test_golden.sh || exit 1
 # infer with pretrained model
 bash ./local/run_infer_golden.sh || exit 1
 # train model
 bash ./local/run_train.sh || exit 1
 # test model
 bash ./local/run_test.sh || exit 1
 # infer model
 bash ./local/run_infer.sh || exit 1
 # tune model
 bash ./local/run_tune.sh || exit 1
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@ scipy==1.2.1
 resampy==0.1.5
 SoundFile==0.9.0.post1
 python_speech_features
 paddlepaddle-gpu==1.8.0.post107