refactor tiny egs

5 years ago · 75c8018eab
parent b882ba5000
commit 75c8018eab
12 changed files with 238 additions and 69 deletions
--- a/examples/librispeech/local/librispeech.py
+++ b/examples/librispeech/local/librispeech.py
@ -0,0 +1,146 @@
+"""Prepare Librispeech ASR datasets.
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+
+import distutils.util
+import os
+import sys
+import argparse
+import soundfile
+import json
+import codecs
+import io
+from data_utils.utility import download, unpack
+
+# Base URL of the LibriSpeech archives. The corpus is also listed under
+# http://www.openslr.org/resources/12; the host below is the one actually
+# used for downloads.
+URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
+URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
+URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
+URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
+URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
+URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
+URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
+URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
+
+# MD5 digest of each archive; handed to download() alongside the matching URL.
+MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
+MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
+MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
+MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
+MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
+MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
+
+# Command-line options. They are parsed at import time, so `args` is a
+# module-level global read by main().
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default='~/.cache/paddle/dataset/speech/libri',
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# The string default is run through `type` by argparse, so
+# args.full_download ends up as the 1/0 value returned by strtobool.
+# The help text lists only the subsets main() prepares unconditionally.
+parser.add_argument(
+    "--full_download",
+    default="True",
+    type=distutils.util.strtobool,
+    help="Download all datasets for Librispeech."
+    " If False, only download a minimal requirement (test-clean,"
+    " dev-clean). (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path):
+    """Create a manifest file summarizing the data set.
+
+    Walks ``data_dir`` and, in every folder containing a file whose name
+    ends with ``trans.txt``, treats each transcript line as
+    ``<utterance-id> <word> <word> ...``. The utterance id is mapped to
+    ``<utterance-id>.flac`` in the same folder, the audio is read to compute
+    its duration, and one JSON object per utterance (keys
+    ``audio_filepath``, ``duration``, ``text``) is written as one line of
+    ``manifest_path``.
+
+    :param data_dir: Root directory that is searched recursively.
+    :param manifest_path: Path of the manifest file to write (UTF-8).
+    """
+    print("Creating manifest %s ..." % manifest_path)
+    json_lines = []
+    for subfolder, _, filelist in sorted(os.walk(data_dir)):
+        text_filelist = [
+            filename for filename in filelist if filename.endswith('trans.txt')
+        ]
+        if not text_filelist:
+            continue
+        # Only the first matching transcript file of a folder is used.
+        text_filepath = os.path.join(subfolder, text_filelist[0])
+        # A context manager closes the transcript promptly; iterating over a
+        # bare io.open() call would leave every handle open until collected.
+        with io.open(text_filepath, encoding="utf8") as text_file:
+            for line in text_file:
+                segments = line.strip().split()
+                if not segments:
+                    # Skip blank lines instead of failing on segments[0].
+                    continue
+                text = ' '.join(segments[1:]).lower()
+                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
+                audio_data, samplerate = soundfile.read(audio_filepath)
+                # Length of the decoded audio divided by its sample rate.
+                duration = float(len(audio_data)) / samplerate
+                json_lines.append(
+                    json.dumps({
+                        'audio_filepath': audio_filepath,
+                        'duration': duration,
+                        'text': text
+                    }))
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
+        for line in json_lines:
+            out_file.write(line + '\n')
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Fetch one data subset if it is missing, then build its manifest.
+
+    The subset counts as present when ``<target_dir>/LibriSpeech`` exists;
+    in that case downloading and unpacking are skipped. The manifest is
+    (re)created in either case.
+
+    :param url: Archive URL passed to ``download``.
+    :param md5sum: Checksum passed to ``download``.
+    :param target_dir: Directory that receives the archive and its contents.
+    :param manifest_path: Output path handed to ``create_manifest``.
+    """
+    extracted_dir = os.path.join(target_dir, "LibriSpeech")
+    if os.path.exists(extracted_dir):
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    else:
+        # The value returned by download() is what unpack() is given.
+        archive_path = download(url, md5sum, target_dir)
+        unpack(archive_path, target_dir)
+    create_manifest(target_dir, manifest_path)
+
+
+def main():
+    """Prepare every requested LibriSpeech subset.
+
+    test-clean and dev-clean are always prepared; the remaining subsets
+    (train-clean-100, test-other, dev-other, train-clean-360,
+    train-other-500) only when ``args.full_download`` is true. Each subset
+    goes to ``<target_dir>/<subset>`` and its manifest to
+    ``<manifest_prefix>.<subset>``.
+    """
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    # (archive url, md5 checksum, subset name), in processing order.
+    subsets = [
+        (URL_TEST_CLEAN, MD5_TEST_CLEAN, "test-clean"),
+        (URL_DEV_CLEAN, MD5_DEV_CLEAN, "dev-clean"),
+    ]
+    if args.full_download:
+        subsets += [
+            (URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100, "train-clean-100"),
+            (URL_TEST_OTHER, MD5_TEST_OTHER, "test-other"),
+            (URL_DEV_OTHER, MD5_DEV_OTHER, "dev-other"),
+            (URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360, "train-clean-360"),
+            (URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500, "train-other-500"),
+        ]
+
+    for subset_url, subset_md5, subset_name in subsets:
+        prepare_dataset(
+            url=subset_url,
+            md5sum=subset_md5,
+            target_dir=os.path.join(args.target_dir, subset_name),
+            manifest_path=args.manifest_prefix + "." + subset_name)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/tiny/README.md
+++ b/examples/tiny/README.md
@ -0,0 +1,4 @@
+# Tiny Example
+
+1. `source path.sh` (run from this directory; `path.sh` derives `MAIN_ROOT` from `$PWD`)
+2. `bash run.sh`
--- a/examples/tiny/local/run_data.sh
+++ b/examples/tiny/local/run_data.sh
@ -1,16 +1,14 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # prepare folder
-if [ ! -e data/tiny ]; then
-    mkdir data/tiny
+if [ ! -e data ]; then
+    mkdir data
 fi

 # download data, generate manifests
-PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir='./dataset/librispeech' \
+PYTHONPATH=.:$PYTHONPATH python3 ../librispeech/local/librispeech.py \
+--manifest_prefix='data/manifest' \
+--target_dir="${MAIN_ROOT}/dataset/librispeech" \
 --full_download='False'

 if [ $? -ne 0 ]; then
@ -18,13 +16,13 @@ if [ $? -ne 0 ]; then
    exit 1
 fi

-head -n 64 data/tiny/manifest.dev-clean  > data/tiny/manifest.tiny
+head -n 64 data/manifest.dev-clean  > data/manifest.tiny

 # build vocabulary
-python3 tools/build_vocab.py \
+python3 ${MAIN_ROOT}/tools/build_vocab.py \
 --count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.tiny'
+--vocab_path='data/vocab.txt' \
+--manifest_paths='data/manifest.tiny'

 if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
@ -33,11 +31,11 @@ fi


 # compute mean and stddev for normalizer
-python3 tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.tiny' \
+python3 ${MAIN_ROOT}/tools/compute_mean_std.py \
+--manifest_path='data/manifest.tiny' \
 --num_samples=64 \
 --specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'
+--output_path='data/mean_std.npz'

 if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
--- a/examples/tiny/local/run_infer.sh
+++ b/examples/tiny/local/run_infer.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -13,7 +11,7 @@ cd - > /dev/null

 # infer
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u infer.py \
+python3 -u $MAIN_ROOT/infer.py \
 --num_samples=10 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -27,11 +25,11 @@ python3 -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='./checkpoints/tiny/step_final' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--infer_manifest='data/manifest.test-clean' \
+--mean_std_path='data/mean_std.npz' \
+--vocab_path='data/vocab.txt' \
+--model_path='checkpoints/step_final' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_infer_golden.sh
+++ b/examples/tiny/local/run_infer_golden.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # download language model
-cd models/lm > /dev/null
+cd ${MAIN_ROOT}/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -12,7 +10,7 @@ cd - > /dev/null


 # download well-trained model
-cd models/librispeech > /dev/null
+cd ${MAIN_ROOT}/models/librispeech > /dev/null
 bash download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -22,7 +20,7 @@ cd - > /dev/null

 # infer
 CUDA_VISIBLE_DEVICES=0 \
-python3 -u infer.py \
+python3 -u ${MAIN_ROOT}/infer.py \
 --num_samples=10 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -36,11 +34,11 @@ python3 -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--infer_manifest='data/manifest.test-clean' \
+--mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \
+--vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \
+--model_path="${MAIN_ROOT}/models/librispeech" \
+--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_test.sh
+++ b/examples/tiny/local/run_test.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -13,7 +11,7 @@ cd - > /dev/null

 # evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python3 -u test.py \
+python3 -u $MAIN_ROOT/test.py \
 --batch_size=128 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -27,11 +25,11 @@ python3 -u test.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/step_final' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--test_manifest='data/manifest.test-clean' \
+--mean_std_path='data/mean_std.npz' \
+--vocab_path='data/vocab.txt' \
+--model_path='checkpoints/step_final' \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_test_golden.sh
+++ b/examples/tiny/local/run_test_golden.sh
@ -1,9 +1,7 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # download language model
-cd models/lm > /dev/null
+cd $MAIN_ROOT/models/lm > /dev/null
 bash download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -12,7 +10,7 @@ cd - > /dev/null


 # download well-trained model
-cd models/librispeech > /dev/null
+cd $MAIN_ROOT/models/librispeech > /dev/null
 bash download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
@ -22,7 +20,7 @@ cd - > /dev/null

 # evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-python3 -u test.py \
+python3 -u $MAIN_ROOT/test.py \
 --batch_size=128 \
 --beam_size=500 \
 --num_proc_bsearch=8 \
@ -36,11 +34,11 @@ python3 -u test.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--test_manifest='data/manifest.test-clean' \
+--mean_std_path="$MAIN_ROOT/models/librispeech/mean_std.npz" \
+--vocab_path="$MAIN_ROOT/models/librispeech/vocab.txt" \
+--model_path="$MAIN_ROOT/models/librispeech" \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
--- a/examples/tiny/local/run_train.sh
+++ b/examples/tiny/local/run_train.sh
@ -1,12 +1,10 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # train model
 # if you wish to resume from an exists model, uncomment --init_from_pretrained_model
 export FLAGS_sync_nccl_allreduce=0
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-python3 -u train.py \
+python3 -u ${MAIN_ROOT}/train.py \
 --batch_size=4 \
 --num_epoch=20 \
 --num_conv_layers=2 \
@ -24,12 +22,12 @@ python3 -u train.py \
 --use_gpu=True \
 --is_local=True \
 --share_rnn_weights=True \
--train_manifest='data/tiny/manifest.tiny' \
--dev_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--output_model_dir='./checkpoints/tiny' \
--augment_conf_path='conf/augmentation.config' \
+--train_manifest='data/manifest.tiny' \
+--dev_manifest='data/manifest.tiny' \
+--mean_std_path='data/mean_std.npz' \
+--vocab_path='data/vocab.txt' \
+--output_model_dir='./checkpoints/' \
+--augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped' \

--- a/examples/tiny/local/run_tune.sh
+++ b/examples/tiny/local/run_tune.sh
@ -1,10 +1,8 @@
 #! /usr/bin/env bash

-cd ../.. > /dev/null
-
 # grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-python3 -u tools/tune.py \
+python3 -u $MAIN_ROOT/tools/tune.py \
 --num_batches=-1 \
 --batch_size=128 \
 --beam_size=500 \
@ -23,11 +21,11 @@ python3 -u tools/tune.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
--tune_manifest='data/tiny/manifest.dev-clean' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='models/librispeech' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
+--tune_manifest='data/manifest.dev-clean' \
+--mean_std_path='data/mean_std.npz' \
+--vocab_path='data/vocab.txt' \
+--model_path="$MAIN_ROOT/models/librispeech" \
+--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \
 --error_rate_type='wer' \
 --specgram_type='linear'

--- a/examples/tiny/path.sh
+++ b/examples/tiny/path.sh
@ -0,0 +1,8 @@
+# Repository root, two levels above the working directory at source time.
+export MAIN_ROOT="${PWD}/../../"
+
+# Make the repository root and ./tools (relative to $PWD) searchable.
+export PATH="${MAIN_ROOT}:${PWD}/tools:${PATH}"
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
--- a/examples/tiny/run.sh
+++ b/examples/tiny/run.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Run the tiny example end to end. Start it from this directory: path.sh and
+# the ./local scripts are referenced relative to the working directory.
+
+# Stop at the first failing stage; later stages read what earlier ones write
+# (manifests, vocabulary, mean/std statistics, checkpoints).
+set -e
+
+source path.sh
+
+# prepare data
+bash ./local/run_data.sh
+
+# test pretrained model
+bash ./local/run_test_golden.sh
+
+# infer with pretrained model
+bash ./local/run_infer_golden.sh
+
+# train model
+bash ./local/run_train.sh
+
+# test model
+bash ./local/run_test.sh
+
+# infer model
+bash ./local/run_infer.sh
+
+# tune model
+bash ./local/run_tune.sh
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,4 @@ scipy==1.2.1
 resampy==0.1.5
 SoundFile==0.9.0.post1
 python_speech_features
+paddlepaddle-gpu==1.8.0.post107