From 9800d1495690d2d85305be750317e8cd623222b4 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 16 Nov 2017 11:49:43 +0800 Subject: [PATCH 01/18] fix the data path in the librispeech example --- examples/librispeech/run_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/librispeech/run_data.sh b/examples/librispeech/run_data.sh index 12f2dc6d..6e170c12 100644 --- a/examples/librispeech/run_data.sh +++ b/examples/librispeech/run_data.sh @@ -5,7 +5,7 @@ cd ../.. > /dev/null # download data, generate manifests PYTHONPATH=.:$PYTHONPATH python data/librispeech/librispeech.py \ --manifest_prefix='data/librispeech/manifest' \ ---target_dir='~/.cache/paddle/dataset/speech/Libri' \ +--target_dir='~/.cache/paddle/dataset/speech/libri' \ --full_download='True' if [ $? -ne 0 ]; then From 24cb1866a8227d76d8d342d7886f6652ad1ed697 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 16 Nov 2017 16:29:27 +0800 Subject: [PATCH 02/18] Adapt settings for Aishell example. 
--- examples/aishell/run_infer.sh | 6 +++--- examples/aishell/run_infer_golden.sh | 6 +++--- examples/aishell/run_test.sh | 6 +++--- examples/aishell/run_test_golden.sh | 6 +++--- examples/aishell/run_train.sh | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/aishell/run_infer.sh b/examples/aishell/run_infer.sh index 404555e8..e8bd9eab 100644 --- a/examples/aishell/run_infer.sh +++ b/examples/aishell/run_infer.sh @@ -21,11 +21,11 @@ python -u infer.py \ --num_conv_layers=2 \ --num_rnn_layers=3 \ --rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=2.4 \ +--alpha=2.6 \ +--beta=5.0 \ --cutoff_prob=0.99 \ --cutoff_top_n=40 \ ---use_gru=False \ +--use_gru=True \ --use_gpu=True \ --share_rnn_weights=False \ --infer_manifest='data/aishell/manifest.test' \ diff --git a/examples/aishell/run_infer_golden.sh b/examples/aishell/run_infer_golden.sh index 4701bdaa..68f5a521 100644 --- a/examples/aishell/run_infer_golden.sh +++ b/examples/aishell/run_infer_golden.sh @@ -30,11 +30,11 @@ python -u infer.py \ --num_conv_layers=2 \ --num_rnn_layers=3 \ --rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=2.4 \ +--alpha=2.6 \ +--beta=5.0 \ --cutoff_prob=0.99 \ --cutoff_top_n=40 \ ---use_gru=False \ +--use_gru=True \ --use_gpu=True \ --share_rnn_weights=False \ --infer_manifest='data/aishell/manifest.test' \ diff --git a/examples/aishell/run_test.sh b/examples/aishell/run_test.sh index feec95cb..35dfca82 100644 --- a/examples/aishell/run_test.sh +++ b/examples/aishell/run_test.sh @@ -22,11 +22,11 @@ python -u test.py \ --num_conv_layers=2 \ --num_rnn_layers=3 \ --rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=2.4 \ +--alpha=2.6 \ +--beta=5.0 \ --cutoff_prob=0.99 \ --cutoff_top_n=40 \ ---use_gru=False \ +--use_gru=True \ --use_gpu=True \ --share_rnn_weights=False \ --test_manifest='data/aishell/manifest.test' \ diff --git a/examples/aishell/run_test_golden.sh b/examples/aishell/run_test_golden.sh index 387d54f3..8b5e6559 100644 --- a/examples/aishell/run_test_golden.sh 
+++ b/examples/aishell/run_test_golden.sh @@ -31,11 +31,11 @@ python -u test.py \ --num_conv_layers=2 \ --num_rnn_layers=3 \ --rnn_layer_size=1024 \ ---alpha=1.4 \ ---beta=2.4 \ +--alpha=2.6 \ +--beta=5.0 \ --cutoff_prob=0.99 \ --cutoff_top_n=40 \ ---use_gru=False \ +--use_gru=True \ --use_gpu=True \ --share_rnn_weights=False \ --test_manifest='data/aishell/manifest.test' \ diff --git a/examples/aishell/run_train.sh b/examples/aishell/run_train.sh index 077fabcd..e09205cb 100644 --- a/examples/aishell/run_train.sh +++ b/examples/aishell/run_train.sh @@ -19,7 +19,7 @@ python -u train.py \ --min_duration=0.0 \ --test_off=False \ --use_sortagrad=True \ ---use_gru=False \ +--use_gru=True \ --use_gpu=True \ --is_local=True \ --share_rnn_weights=False \ From 35ef4624b0a5e6b657b62980a44a502b868e6cbf Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 16 Nov 2017 20:46:02 +0800 Subject: [PATCH 03/18] Update url for Aishell model. --- README.md | 2 +- models/aishell/download_model.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ca146926..2e74b575 100644 --- a/README.md +++ b/README.md @@ -481,7 +481,7 @@ Language | Model Name | Training Data | Hours of Speech :-----------: | :------------: | :----------: | -------: English | [LibriSpeech Model](http://cloud.dlnel.org/filepub/?uuid=17404caf-cf19-492f-9707-1fad07c19aae) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h English | [BaiduEN8k Model](http://cloud.dlnel.org/filepub/?uuid=37a1c211-ec47-494c-973c-31437a10ae90) | Baidu Internal English Dataset | 8628 h -Mandarin | [Aishell Model](http://cloud.dlnel.org/filepub/?uuid=6c83b9d8-3255-4adf-9726-0fe0be3d0274) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h +Mandarin | [Aishell Model](http://cloud.dlnel.org/filepub/?uuid=61de63b9-6904-4809-ad95-0cc5104ab973) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h Mandarin | [BaiduCN1.2k Model](to-be-added) | Baidu Internal Mandarin Dataset | 1204 h 
#### Language Model Released diff --git a/models/aishell/download_model.sh b/models/aishell/download_model.sh index 19aec554..072fc6fa 100644 --- a/models/aishell/download_model.sh +++ b/models/aishell/download_model.sh @@ -2,8 +2,8 @@ . ../../utils/utility.sh -URL='http://cloud.dlnel.org/filepub/?uuid=6c83b9d8-3255-4adf-9726-0fe0be3d0274' -MD5=28521a58552885a81cf92a1e9b133a71 +URL='http://cloud.dlnel.org/filepub/?uuid=61de63b9-6904-4809-ad95-0cc5104ab973' +MD5=0ee83aa15fba421e5de8fc66c8feb350 TARGET=./aishell_model.tar.gz From abbfa43b22d19b990df9a239fee5a4fbdd06b996 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 16 Nov 2017 23:04:35 +0800 Subject: [PATCH 04/18] Add script for VoxForge data preparation. --- README.md | 2 + data/voxforge/run_data.sh | 18 ++++ data/voxforge/voxforge.py | 221 ++++++++++++++++++++++++++++++++++++++ data_utils/utility.py | 19 ++++ 4 files changed, 260 insertions(+) create mode 100644 data/voxforge/run_data.sh create mode 100644 data/voxforge/voxforge.py diff --git a/README.md b/README.md index ca146926..6f282a28 100644 --- a/README.md +++ b/README.md @@ -506,6 +506,8 @@ VoxForge European | 31.21 | 20.47 VoxForge Indian | 56.79 | 28.15 Baidu Internal Testset  |   47.73 |   8.92 +For reproducing results on VoxForge data, we provide a script to generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updated and the generated manifest files may have difference from those we evaluated. + #### Benchmark Results for Mandarin Model (Character Error Rate) Test Set | Aishell Model | BaiduCN1.2k Model diff --git a/data/voxforge/run_data.sh b/data/voxforge/run_data.sh new file mode 100644 index 00000000..e0a9f1b3 --- /dev/null +++ b/data/voxforge/run_data.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +cd ../.. 
> /dev/null + +# download data, generate manifests +PYTHONPATH=.:$PYTHONPATH python data/voxforge/voxforge.py \ +--manifest_prefix='data/voxforge/manifest' \ +--target_dir='~/.cache/paddle/dataset/speech/VoxForge' \ +--is_merge_dialect=True \ +--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian' + +if [ $? -ne 0 ]; then + echo "Prepare VoxForge failed. Terminated." + exit 1 +fi + +echo "VoxForge Data preparation done." +exit 0 diff --git a/data/voxforge/voxforge.py b/data/voxforge/voxforge.py new file mode 100644 index 00000000..63f052bd --- /dev/null +++ b/data/voxforge/voxforge.py @@ -0,0 +1,221 @@ +"""Prepare VoxForge dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import codecs +import soundfile +import json +import argparse +import shutil +import subprocess +from data_utils.utility import download_multi, unpack, getfile_insensitive + +DATA_HOME = '~/.cache/paddle/dataset/speech' + +DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \ + 'Audio/Main/16kHz_16bit' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/VoxForge", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--dialects", + default=[ + 'american', 'british', 'australian', 'european', 'irish', 'canadian', + 'indian' + ], + nargs='+', + type=str, + help="Dialect types. 
(default: %(default)s)") +parser.add_argument( + "--is_merge_dialect", + default=True, + type=bool, + help="If set True, manifests of american dialect and canadian dialect will " + "be merged to american-canadian dialect; manifests of british " + "dialect, irish dialect and australian dialect will be merged to " + "commonwealth dialect. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def download_and_unpack(target_dir, url): + wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np' + tgz_dir = os.path.join(target_dir, 'tgz') + exit_code = download_multi(url, tgz_dir, wget_args) + if exit_code != 0: + print('Download tgz audio files failed with exit code %d.' % exit_code) + else: + print('Download done, start unpacking ...') + audio_dir = os.path.join(target_dir, 'audio') + for root, dirs, files in os.walk(tgz_dir): + for file in files: + print(file) + if file.endswith('.tgz'): + unpack(os.path.join(root, file), audio_dir) + + +def select_dialects(target_dir, dialect_list): + """Classify audio files by dialect.""" + dialect_root_dir = os.path.join(target_dir, 'dialect') + if os.path.exists(dialect_root_dir): + shutil.rmtree(dialect_root_dir) + os.mkdir(dialect_root_dir) + audio_dir = os.path.abspath(os.path.join(target_dir, 'audio')) + for dialect in dialect_list: + # filter files by dialect + command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \ + "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect) + p = subprocess.Popen( + command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) + output, err = p.communicate() + dialect_dir = os.path.join(dialect_root_dir, dialect) + if os.path.exists(dialect_dir): + shutil.rmtree(dialect_dir) + os.mkdir(dialect_dir) + for path in output.splitlines(): + src_dir = os.path.dirname(os.path.dirname(path)) + link = 
os.path.basename(os.path.normpath(src_dir)) + os.symlink(src_dir, os.path.join(dialect_dir, link)) + + +def generate_manifest(data_dir, manifest_path): + json_lines = [] + + for path in os.listdir(data_dir): + audio_link = os.path.join(data_dir, path) + assert os.path.islink( + audio_link), '%s should be symbolic link.' % audio_link + actual_audio_dir = os.path.abspath(os.readlink(audio_link)) + + audio_type = '' + if os.path.isdir(os.path.join(actual_audio_dir, 'wav')): + audio_type = 'wav' + elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')): + audio_type = 'flac' + else: + print('Unknown audio type, skipped processing %s.' % + actual_audio_dir) + continue + + etc_dir = os.path.join(actual_audio_dir, 'etc') + prompts_file = os.path.join(etc_dir, 'PROMPTS') + if not os.path.isfile(prompts_file): + print('PROMPTS file missing, skip processing %s.' % + actual_audio_dir) + continue + + readme_file = getfile_insensitive(os.path.join(etc_dir, 'README')) + if readme_file is None: + print('README file missing, skip processing %s.' % actual_audio_dir) + continue + + for line in file(prompts_file): + u, trans = line.strip().split(None, 1) + u_parts = u.split('/') + + # try to format the date time + try: + speaker, date, sfx = u_parts[-3].split('-') + obj = datetime.datetime.strptime(date, '%y.%m.%d') + formatted = obj.strftime('%Y%m%d') + u_parts[-3] = '-'.join([speaker, formatted, sfx]) + except Exception as e: + pass + + if len(u_parts) < 2: + u_parts = [audio_type] + u_parts + u_parts[-2] = audio_type + u_parts[-1] += '.' + audio_type + u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:])) + + if not os.path.isfile(u): + print('Audio file missing, skip processing %s.' % u) + continue + + if os.stat(u).st_size == 0: + print('Empty audio file, skip processing %s.' 
% u) + continue + + trans = trans.strip().replace('-', ' ') + if not trans.isupper() or \ + not trans.strip().replace(' ', '').replace("'", "").isalpha(): + print("Transcript not normalized properly, skip processing %s." + % u) + continue + + audio_data, samplerate = soundfile.read(u) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': u, + 'duration': duration, + 'text': trans.lower() + })) + + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def merge_manifests(manifest_files, save_path): + lines = [] + for manifest_file in manifest_files: + line = codecs.open(manifest_file, 'r', 'utf-8').readlines() + lines += line + + with codecs.open(save_path, 'w', 'utf-8') as fout: + for line in lines: + fout.write(line) + + +def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge): + download_and_unpack(target_dir, url) + select_dialects(target_dir, dialects) + american_canadian_manifests = [] + commonwealth_manifests = [] + for dialect in dialects: + dialect_dir = os.path.join(target_dir, 'dialect', dialect) + manifest_fpath = manifest_prefix + '.' 
+ dialect + if dialect == 'american' or dialect == 'canadian': + american_canadian_manifests.append(manifest_fpath) + if dialect == 'australian' \ + or dialect == 'british' \ + or dialect == 'irish': + commonwealth_manifests.append(manifest_fpath) + generate_manifest(dialect_dir, manifest_fpath) + + if is_merge: + if len(american_canadian_manifests) > 0: + manifest_fpath = manifest_prefix + '.american-canadian' + merge_manifests(american_canadian_manifests, manifest_fpath) + if len(commonwealth_manifests) > 0: + manifest_fpath = manifest_prefix + '.commonwealth' + merge_manifests(commonwealth_manifests, manifest_fpath) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset(DATA_URL, args.dialects, args.target_dir, + args.manifest_prefix, args.is_merge_dialect) + + +if __name__ == '__main__': + main() diff --git a/data_utils/utility.py b/data_utils/utility.py index bb5cad45..2633e1b4 100644 --- a/data_utils/utility.py +++ b/data_utils/utility.py @@ -42,6 +42,25 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): return manifest +def getfile_insensitive(path): + """Get the actual file path when given insensitive filename.""" + directory, filename = os.path.split(path) + directory, filename = (directory or '.'), filename.lower() + for f in os.listdir(directory): + newpath = os.path.join(directory, f) + if os.path.isfile(newpath) and f.lower() == filename: + return newpath + + +def download_multi(url, target_dir, extra_args): + """Download multiple files from url to target_dir.""" + if not os.path.exists(target_dir): os.makedirs(target_dir) + print("Downloading %s ..." 
% url) + ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " + + target_dir) + return ret_code + + def download(url, md5sum, target_dir): """Download file from url to target_dir, and check md5sum.""" if not os.path.exists(target_dir): os.makedirs(target_dir) From b5f70d5fcf9590797f8f0bb732bca5e9d6eefbeb Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 17 Nov 2017 14:54:18 +0800 Subject: [PATCH 05/18] Refine doc. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f282a28..7ceae17f 100644 --- a/README.md +++ b/README.md @@ -506,7 +506,7 @@ VoxForge European | 31.21 | 20.47 VoxForge Indian | 56.79 | 28.15 Baidu Internal Testset  |   47.73 |   8.92 -For reproducing results on VoxForge data, we provide a script to generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updated and the generated manifest files may have difference from those we evaluated. +For reproducing benchmark results on VoxForge data, we provide a script to generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updated and the generated manifest files may have difference from those we evaluated. #### Benchmark Results for Mandarin Model (Character Error Rate) From 0dc4dddf2fb678c36624c57be10b72b1e0982116 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 17 Nov 2017 15:18:35 +0800 Subject: [PATCH 06/18] Some fix for CI. 
--- .clang_format.hook | 2 +- .pre-commit-config.yaml | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 40d70f56..4cbc972b 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.8" +readonly VERSION="3.9" version=$(clang-format -version) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ff36e09..ede1c53a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,11 +33,3 @@ entry: bash .clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ -- repo: local - hooks: - - id: convert-markdown-into-html - name: convert-markdown-into-html - description: Convert README.md into index.html - entry: python .pre-commit-hooks/convert_markdown_into_html.py - language: system - files: .+README\.md$ From 9f0c3467e57057b9fa9cf668345243da058fa1b7 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 15:50:10 +0800 Subject: [PATCH 07/18] fix decoders: force indices in FST starting from one & add version check in setup --- .clang_format.hook | 2 +- decoders/swig/path_trie.cpp | 2 +- decoders/swig/scorer.cpp | 12 ++---------- decoders/swig/scorer.h | 2 +- decoders/swig/setup.py | 2 +- setup.sh | 2 +- 6 files changed, 7 insertions(+), 15 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 40d70f56..8141fffb 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.8" +readonly VERSION="3.6" version=$(clang-format -version) diff --git a/decoders/swig/path_trie.cpp b/decoders/swig/path_trie.cpp index 40d90970..152efa82 100644 --- a/decoders/swig/path_trie.cpp +++ b/decoders/swig/path_trie.cpp @@ -52,7 +52,7 @@ PathTrie* PathTrie::get_path_trie(int new_char, bool reset) { } else { if (has_dictionary_) { matcher_->SetState(dictionary_state_); - bool found = matcher_->Find(new_char); + bool found = 
matcher_->Find(new_char + 1); if (!found) { // Adding this character causes word outside dictionary auto FSTZERO = fst::TropicalWeight::Zero(); diff --git a/decoders/swig/scorer.cpp b/decoders/swig/scorer.cpp index 686c67c7..39da13d1 100644 --- a/decoders/swig/scorer.cpp +++ b/decoders/swig/scorer.cpp @@ -152,10 +152,8 @@ void Scorer::set_char_map(const std::vector& char_list) { for (size_t i = 0; i < char_list_.size(); i++) { if (char_list_[i] == " ") { SPACE_ID_ = i; - char_map_[' '] = i; - } else if (char_list_[i].size() == 1) { - char_map_[char_list_[i][0]] = i; } + char_map_[char_list_[i]] = i + 1; // Force index starting from zero } } @@ -193,17 +191,11 @@ std::vector Scorer::make_ngram(PathTrie* prefix) { void Scorer::fill_dictionary(bool add_space) { fst::StdVectorFst dictionary; - // First reverse char_list so ints can be accessed by chars - std::unordered_map char_map; - for (size_t i = 0; i < char_list_.size(); i++) { - char_map[char_list_[i]] = i; - } - // For each unigram convert to ints and put in trie int dict_size = 0; for (const auto& word : vocabulary_) { bool added = add_word_to_dictionary( - word, char_map, add_space, SPACE_ID_, &dictionary); + word, char_map_, add_space, SPACE_ID_ + 1, &dictionary); dict_size += added ? 
1 : 0; } diff --git a/decoders/swig/scorer.h b/decoders/swig/scorer.h index 61836463..5ebc719c 100644 --- a/decoders/swig/scorer.h +++ b/decoders/swig/scorer.h @@ -104,7 +104,7 @@ private: int SPACE_ID_; std::vector char_list_; - std::unordered_map char_map_; + std::unordered_map char_map_; std::vector vocabulary_; }; diff --git a/decoders/swig/setup.py b/decoders/swig/setup.py index b6bc0ca0..a4bb2e9d 100644 --- a/decoders/swig/setup.py +++ b/decoders/swig/setup.py @@ -113,7 +113,7 @@ decoders_module = [ setup( name='swig_decoders', - version='1.0', + version='1.1', description="""CTC decoders""", ext_modules=decoders_module, py_modules=['swig_decoders'], ) diff --git a/setup.sh b/setup.sh index 7c40415d..ec5e47ec 100644 --- a/setup.sh +++ b/setup.sh @@ -27,7 +27,7 @@ if [ $? != 0 ]; then fi # install decoders -python -c "import swig_decoders" +python -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" if [ $? != 0 ]; then cd decoders/swig > /dev/null sh setup.sh From 2587ebf2f7c790195719a0fb659acec68f780e5b Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 15:50:55 +0800 Subject: [PATCH 08/18] revert clang_format version --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index 8141fffb..40d70f56 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.6" +readonly VERSION="3.8" version=$(clang-format -version) From 980b8289a350e6f93fb7b4e779461dd525e8f399 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 17 Nov 2017 15:27:53 +0800 Subject: [PATCH 09/18] Update travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0f67f656..eadcb03b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ before_install: script: - .travis/precommit.sh - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c 
- 'cd /py_unittest; sh .travis/unittest.sh' + 'cd /py_unittest; sh .travis/unittest.sh' || exit $? - | if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit 0; fi; From 48619f39e7f8e7f96820b6ff96d8e84b1888550c Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 17 Nov 2017 17:18:13 +0800 Subject: [PATCH 10/18] Fix travis. --- .travis.yml | 16 +++++++++++----- .travis/unittest.sh | 9 +-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index eadcb03b..52bfd5a1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,20 +17,26 @@ addons: - python-pip - python2.7-dev ssh_known_hosts: 52.76.173.135 + before_install: - sudo pip install -U virtualenv pre-commit pip - docker pull paddlepaddle/paddle:latest + script: - - .travis/precommit.sh - - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c - 'cd /py_unittest; sh .travis/unittest.sh' || exit $? + - exit_code=0 + - .travis/precommit.sh || exit_code=$(( exit_code | $? )) + - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c + 'cd /py_unittest; sh .travis/unittest.sh' || exit_code=$(( exit_code | $? )) - | - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit 0; fi; + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit $exit_code; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit $exit_code; fi; export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export MODELS_DIR=`pwd` cd .. 
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $MODELS_DIR + exit_code=$(( exit_code | $? )) + exit $exit_code + notifications: email: on_success: change diff --git a/.travis/unittest.sh b/.travis/unittest.sh index 4195a441..f27dc481 100755 --- a/.travis/unittest.sh +++ b/.travis/unittest.sh @@ -24,13 +24,6 @@ unittest(){ trap 'abort' 0 set -e -for proj in */ ; do - if [ -d $proj ]; then - unittest $proj - if [ $? != 0 ]; then - exit 1 - fi - fi -done +unittest . trap : 0 From 3ea19973c66a6a10320888ba47a8857bebf5abfa Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 17:19:53 +0800 Subject: [PATCH 11/18] add more comments to explain the modification --- .clang_format.hook | 2 +- decoders/swig/scorer.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 40d70f56..8141fffb 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.8" +readonly VERSION="3.6" version=$(clang-format -version) diff --git a/decoders/swig/scorer.cpp b/decoders/swig/scorer.cpp index 39da13d1..27b61cd0 100644 --- a/decoders/swig/scorer.cpp +++ b/decoders/swig/scorer.cpp @@ -149,11 +149,15 @@ void Scorer::set_char_map(const std::vector& char_list) { char_list_ = char_list; char_map_.clear(); + // Set the char map for the FST for spelling correction for (size_t i = 0; i < char_list_.size(); i++) { if (char_list_[i] == " ") { SPACE_ID_ = i; } - char_map_[char_list_[i]] = i + 1; // Force index starting from zero + // The initial state of FST is state 0, hence the index of chars in + // the FST should start from 1 to avoid the conflict with the initial + // state, otherwise wrong decoding results would be given. 
+ char_map_[char_list_[i]] = i + 1; } } From dd770948a0cc71da4f96a0fd446deec0b631a369 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 17:20:44 +0800 Subject: [PATCH 12/18] revert clang_format version --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index 8141fffb..4cbc972b 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.6" +readonly VERSION="3.9" version=$(clang-format -version) From bb637c1d395b55842bf9ecc6d8f346d7e0919e51 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 18:34:45 +0800 Subject: [PATCH 13/18] remove doc deploy in travis-ci --- .travis.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 52bfd5a1..75c2c135 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ addons: - python - python-pip - python2.7-dev - ssh_known_hosts: 52.76.173.135 before_install: - sudo pip install -U virtualenv pre-commit pip @@ -27,14 +26,6 @@ script: - .travis/precommit.sh || exit_code=$(( exit_code | $? )) - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c 'cd /py_unittest; sh .travis/unittest.sh' || exit_code=$(( exit_code | $? )) - - | - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit $exit_code; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit $exit_code; fi; - export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh - export MODELS_DIR=`pwd` - cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $MODELS_DIR - exit_code=$(( exit_code | $? 
)) exit $exit_code notifications: From adc117312f975eb8558c2d052b8446c676918cd8 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 20 Nov 2017 11:02:04 +0800 Subject: [PATCH 14/18] Refine doc and fix path for run_data.sh --- README.md | 2 +- data/voxforge/run_data.sh | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7ceae17f..eb16dd58 100644 --- a/README.md +++ b/README.md @@ -506,7 +506,7 @@ VoxForge European | 31.21 | 20.47 VoxForge Indian | 56.79 | 28.15 Baidu Internal Testset  |   47.73 |   8.92 -For reproducing benchmark results on VoxForge data, we provide a script to generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updated and the generated manifest files may have difference from those we evaluated. +For reproducing benchmark results on VoxForge data, we provide a script to download data and generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updating and the generated manifest files may have difference from those we evaluated on. #### Benchmark Results for Mandarin Model (Character Error Rate) diff --git a/data/voxforge/run_data.sh b/data/voxforge/run_data.sh index e0a9f1b3..c6ff7111 100644 --- a/data/voxforge/run_data.sh +++ b/data/voxforge/run_data.sh @@ -1,10 +1,8 @@ #! /usr/bin/env bash -cd ../.. 
> /dev/null - # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python data/voxforge/voxforge.py \ ---manifest_prefix='data/voxforge/manifest' \ +PYTHONPATH=../../:$PYTHONPATH python voxforge.py \ +--manifest_prefix='./manifest' \ --target_dir='~/.cache/paddle/dataset/speech/VoxForge' \ --is_merge_dialect=True \ --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian' From a200271ba9e436be28805d30296171d8cf7fbc90 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 20 Nov 2017 19:13:48 +0800 Subject: [PATCH 15/18] Update libri model. --- README.md | 16 ++++++++-------- models/librispeech/download_model.sh | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a05289b9..27dede48 100644 --- a/README.md +++ b/README.md @@ -479,7 +479,7 @@ python deploy/demo_client.py --help Language | Model Name | Training Data | Hours of Speech :-----------: | :------------: | :----------: | -------: -English | [LibriSpeech Model](http://cloud.dlnel.org/filepub/?uuid=17404caf-cf19-492f-9707-1fad07c19aae) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h +English | [LibriSpeech Model](http://cloud.dlnel.org/filepub/?uuid=117cde63-cd59-4948-8b80-df782555f7d6) | [LibriSpeech Dataset](http://www.openslr.org/12/) | 960 h English | [BaiduEN8k Model](http://cloud.dlnel.org/filepub/?uuid=37a1c211-ec47-494c-973c-31437a10ae90) | Baidu Internal English Dataset | 8628 h Mandarin | [Aishell Model](http://cloud.dlnel.org/filepub/?uuid=61de63b9-6904-4809-ad95-0cc5104ab973) | [Aishell Dataset](http://www.openslr.org/33/) | 151 h Mandarin | [BaiduCN1.2k Model](to-be-added) | Baidu Internal Mandarin Dataset | 1204 h @@ -498,13 +498,13 @@ Language Model | Training Data | Token-based | Size | Descriptions Test Set | LibriSpeech Model | BaiduEN8K Model :--------------------- | ---------------: | -------------------: -LibriSpeech Test-Clean | 7.77 | 6.63 -LibriSpeech Test-Other | 23.25 | 16.59 -VoxForge 
American-Canadian | 12.52 |   7.46 -VoxForge Commonwealth | 21.08 | 16.23 -VoxForge European | 31.21 | 20.47 -VoxForge Indian | 56.79 | 28.15 -Baidu Internal Testset  |   47.73 |   8.92 +LibriSpeech Test-Clean | 7.73 | 6.63 +LibriSpeech Test-Other | 23.15 | 16.59 +VoxForge American-Canadian | 12.30 |   7.46 +VoxForge Commonwealth | 20.03 | 16.23 +VoxForge European | 30.31 | 20.47 +VoxForge Indian | 55.47 | 28.15 +Baidu Internal Testset  |   44.71 |   8.92 For reproducing benchmark results on VoxForge data, we provide a script to download data and generate VoxForge dialect manifest files. Please go to ```data/voxforge``` and execute ```sh run_data.sh``` to get VoxForge dialect manifest files. Notice that VoxForge data may keep updating and the generated manifest files may have difference from those we evaluated on. diff --git a/models/librispeech/download_model.sh b/models/librispeech/download_model.sh index 9c0ec278..305c082a 100644 --- a/models/librispeech/download_model.sh +++ b/models/librispeech/download_model.sh @@ -2,8 +2,8 @@ . ../../utils/utility.sh -URL='http://cloud.dlnel.org/filepub/?uuid=6020a634-5399-4423-b021-c5ed32680fff' -MD5=2ef08f8b608a7c555592161fc14d81a6 +URL='http://cloud.dlnel.org/filepub/?uuid=117cde63-cd59-4948-8b80-df782555f7d6' +MD5=1f72d0c5591f453362f0caa09dd57618 TARGET=./librispeech_model.tar.gz From 234f2bb49d94bf42899f7b11956ef472a398faaa Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 21 Nov 2017 11:22:59 +0800 Subject: [PATCH 16/18] Adapt demo_server.py to support padding removing. 
--- deploy/demo_server.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/deploy/demo_server.py b/deploy/demo_server.py index 3e81c0c5..bb339b76 100644 --- a/deploy/demo_server.py +++ b/deploy/demo_server.py @@ -147,7 +147,8 @@ def start_server(): augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1, - keep_transcription_text=True) + keep_transcription_text=True, + num_conv_layers=args.num_conv_layers) # prepare ASR model ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, @@ -163,8 +164,20 @@ def start_server(): # prepare ASR inference handler def file_to_transcript(filename): feature = data_generator.process_utterance(filename, "") + ins = [] + conv0_h = (feature[0].shape[0] - 1) // 2 + 1 + conv0_w = (feature[0].shape[1] - 1) // 3 + 1 + ins += [feature[0], feature[1], + [0], [conv0_w], + [1, 32, 1, conv0_h, conv0_w + 1, conv0_w]] + pre_h = conv0_h + for i in xrange(args.num_conv_layers - 1): + h = (pre_h - 1) // 2 + 1 + pre_h = h + ins += [[1, 32, 1, h, conv0_w + 1, conv0_w]] + result_transcript = ds2_model.infer_batch( - infer_data=[feature], + infer_data=[ins], decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, @@ -173,7 +186,8 @@ def start_server(): cutoff_top_n=args.cutoff_top_n, vocab_list=vocab_list, language_model_path=args.lang_model_path, - num_processes=1) + num_processes=1, + feeding_dict=data_generator.feeding) return result_transcript[0] # warming up with utterrances sampled from Librispeech From 5ba0e0a00bf9afb12e6ea3ae2056d8f73d21c12b Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 21 Nov 2017 13:35:42 +0800 Subject: [PATCH 17/18] update setup in readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27dede48..0ba9b86e 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,9 @@ To avoid the trouble of environment setup, [running in docker container](#runnin ### Setup ```bash 
-git clone https://github.com/PaddlePaddle/models.git -cd models/deep_speech_2 +sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev swig +git clone https://github.com/PaddlePaddle/DeepSpeech.git +cd DeepSpeech sh setup.sh ``` From 74e00f4e15706e14d24ea5a169ee5c9eac4b30c2 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 21 Nov 2017 16:55:12 +0800 Subject: [PATCH 18/18] add more info in the setup section --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 0ba9b86e..08a3afa3 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,15 @@ To avoid the trouble of environment setup, [running in docker container](#runnin - PaddlePaddle the latest version (please refer to the [Installation Guide](https://github.com/PaddlePaddle/Paddle#installation)) ### Setup +- Make sure these libraries or tools are installed: `pkg-config`, `flac`, `ogg`, `vorbis` and `swig`, e.g. installing them via `apt-get`: ```bash sudo apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev swig +``` + +- Run the setup script for the remaining dependencies + +```bash git clone https://github.com/PaddlePaddle/DeepSpeech.git cd DeepSpeech sh setup.sh