From 104743ccccecff6b0464a8dc1cc4986856f37b82 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 26 Jul 2021 11:56:47 +0000
Subject: [PATCH] TLG build pass

---
 deepspeech/utils/log.py | 2 +-
 .../aishell/s1/local/aishell_train_lms.sh | 58 +++++++++++++
 examples/aishell/s1/local/tlg.sh | 52 +++++++++++
 examples/aishell/s1/path.sh | 17 +++-
 examples/aishell/s1/run.sh | 42 ++-------
 examples/aishell/s1/utils | 1 +
 utils/__init__.py | 13 +++
 utils/filter_scp.pl | 87 +++++++++++++++++++
 utils/fst/add_lex_disambig.pl | 0
 utils/fst/compile_lexicon_token_fst.sh | 2 +-
 utils/fst/ctc_token_fst.py | 4 +-
 utils/fst/ctc_token_fst_corrected.py | 2 -
 utils/fst/eps2disambig.pl | 0
 utils/fst/make_lexicon_fst.pl | 0
 utils/fst/make_tlg.sh | 0
 utils/fst/prepare_dict.py | 18 ++--
 utils/fst/remove_oovs.pl | 0
 utils/fst/rnnt_token_fst.py | 2 -
 utils/fst/s2eps.pl | 0
 utils/manifest_key_value.py | 64 ++++++++++++++
 utils/ngram_train.sh | 2 +-
 utils/utility.py | 82 ++++++++++++++++-
 22 files changed, 387 insertions(+), 61 deletions(-)
 create mode 100755 examples/aishell/s1/local/aishell_train_lms.sh
 create mode 100755 examples/aishell/s1/local/tlg.sh
 create mode 120000 examples/aishell/s1/utils
 create mode 100644 utils/__init__.py
 create mode 100755 utils/filter_scp.pl
 mode change 100644 => 100755 utils/fst/add_lex_disambig.pl
 mode change 100644 => 100755 utils/fst/compile_lexicon_token_fst.sh
 mode change 100644 => 100755 utils/fst/ctc_token_fst.py
 mode change 100644 => 100755 utils/fst/ctc_token_fst_corrected.py
 mode change 100644 => 100755 utils/fst/eps2disambig.pl
 mode change 100644 => 100755 utils/fst/make_lexicon_fst.pl
 mode change 100644 => 100755 utils/fst/make_tlg.sh
 mode change 100644 => 100755 utils/fst/prepare_dict.py
 mode change 100644 => 100755 utils/fst/remove_oovs.pl
 mode change 100644 => 100755 utils/fst/rnnt_token_fst.py
 mode change 100644 => 100755 utils/fst/s2eps.pl
 create mode 100755 utils/manifest_key_value.py
 mode change 100644 => 100755 utils/utility.py

diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py
index aefc8b59..e99dacec 100644
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@@ -17,7 +17,6 @@
 import os
 import socket
 import sys
-import auto_log
 from paddle import inference
 
 FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
@@ -156,6 +155,7 @@ class Autolog:
                  batch_size,
                  model_name="DeepSpeech",
                  model_precision="fp32"):
+        import auto_log
         pid = os.getpid()
         gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
         infer_config = inference.Config()

diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/s1/local/aishell_train_lms.sh
new file mode 100755
index 00000000..7fb555b4
--- /dev/null
+++ b/examples/aishell/s1/local/aishell_train_lms.sh
@@ -0,0 +1,58 @@
#!/bin/bash

# To be run from one directory above this script.
. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

for f in "$text" "$lexicon"; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

# Check SRILM tools
if ! which ngram-count > /dev/null; then
  echo "SRILM tools are not found. Please download and install them from:"
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir

cleantext=$dir/text.no_oov

cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist

heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $dir/train

ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
  -map-unk "<SPOKEN_NOISE>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file

diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/s1/local/tlg.sh
new file mode 100755
index 00000000..f5287f79
--- /dev/null
+++ b/examples/aishell/s1/local/tlg.sh
@@ -0,0 +1,52 @@
#!/bin/bash

set -eo pipefail

stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

source utils/parse_options.sh

data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file $unit_file \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p $lm
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        $text > $lm/text
    if [ $lmtype == 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "Aishell build TLG done."
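# Expected artifacts, for reference: stage 1 leaves the trained LM at
# data/local/lm/lm.arpa; stage 2 writes the decoding graph to
# data/lang_test/TLG.fst along with its words.txt.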
exit 0

diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh
index 30adb6ca..512b3c9f 100644
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
@@ -4,11 +4,24 @@
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
+export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 
-
+# model exp
 MODEL=u2
 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
+
+
+export LIBLBFGS=/workspace/zhanghui/asr/wenet-210713/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=/workspace/zhanghui/asr/wenet-210713/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+
+# Kaldi
+export KALDI_ROOT=/workspace/zhanghui/asr/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh

diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh
index cf5ed508..d55d47ea 100644
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@@ -38,43 +38,13 @@
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
 
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-    # 7.1 Prepare dict
-    unit_file=data/vocab.txt
-    mkdir -p data/local/dict
-    cp $unit_file data/local/dict/units.txt
-    utils/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
-        data/local/dict/lexicon.txt
-
-    # 7.2 Train lm
-    lm=data/local/lm
-    mkdir -p $lm
-    utils/filter_scp.pl data/train/text \
-        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
-    local/aishell_train_lms.sh
-
-    # 7.3 Build decoding TLG
-    utils/fst/compile_lexicon_token_fst.sh \
-        data/local/dict data/local/tmp data/local/lang
-    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
-
-    # # 7.4 Decoding with runtime
-    # # reverse_weight only works for u2++ model and only left to right decoder is used when it is set to 0.0.
-    # dir=exp/conformer
-    # reverse_weight=0.0
-    # chunk_size=-1
-    # ./tools/decode.sh --nj 16 \
-    #     --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
-    #     --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
-    #     --reverse_weight $reverse_weight --chunk_size $chunk_size \
-    #     --fst_path data/lang_test/TLG.fst \
-    #     data/test/wav.scp data/test/text $dir/final.zip \
-    #     data/lang_test/words.txt $dir/lm_with_runtime
-    # # See $dir/lm_with_runtime for wer
+    # train lm and build TLG
+    ./local/tlg.sh --corpus aishell --lmtype srilm
 fi

diff --git a/examples/aishell/s1/utils b/examples/aishell/s1/utils
new file mode 120000
index 00000000..973afe67
--- /dev/null
+++ b/examples/aishell/s1/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file

diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

diff --git a/utils/filter_scp.pl b/utils/filter_scp.pl
new file mode 100755
index 00000000..904db868
--- /dev/null
+++ b/utils/filter_scp.pl
@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012  Microsoft Corporation
#                      Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f switch

$exclude = 0;
$field = 1;
$shifted = 0;

do {
  $shifted=0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted=1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted=1
  }
} while ($shifted);

if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" .
      "See also: utils/filter_scps.pl .\n";
}


$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  @A = split;
  @A>=1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while(<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
\ No newline at end of file

diff --git a/utils/fst/add_lex_disambig.pl b/utils/fst/add_lex_disambig.pl
old mode 100644
new mode 100755

diff --git a/utils/fst/compile_lexicon_token_fst.sh b/utils/fst/compile_lexicon_token_fst.sh
old mode 100644
new mode 100755
index 6e5716b7..e9e8b1ec
--- a/utils/fst/compile_lexicon_token_fst.sh
+++ b/utils/fst/compile_lexicon_token_fst.sh
@@ -57,7 +57,7 @@
 cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
 
 # ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
 # so here just use simple ctc_token_fst
-utils/fst/ctc_token_fst.py $dir/tokens.txt | \
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
   fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
   fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py
old mode 100644
new mode 100755
index d41da568..2262912c
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import argparse
-import sys
 
 
 def main(args):
@@ -14,7 +13,7 @@ def main(args):
     print('2 0 <eps> <eps>')
 
     # linking `token` between node 1 and node 2
-    with open(sys.token_file, 'r') as fin:
+    with open(args.token_file, 'r') as fin:
         node = 3
         for entry in fin:
             fields = entry.strip().split(' ')
@@ -46,6 +45,5 @@ if __name__ == '__main__':
         help='e2e model token file. line: token(char/phone/spm/disambigous)')
 
     args = parser.parse_args()
-    print(args)
 
     main(args)

diff --git a/utils/fst/ctc_token_fst_corrected.py b/utils/fst/ctc_token_fst_corrected.py
old mode 100644
new mode 100755
index e88436a4..a1d476c8
--- a/utils/fst/ctc_token_fst_corrected.py
+++ b/utils/fst/ctc_token_fst_corrected.py
@@ -73,8 +73,7 @@
         '--token_file',
         required=True,
         help='e2e model token file. line: token(char/phone/spm/disambigous)')
     args = parser.parse_args()
-    print(args)
     main(args)

diff --git a/utils/fst/eps2disambig.pl b/utils/fst/eps2disambig.pl
old mode 100644
new mode 100755

diff --git a/utils/fst/make_lexicon_fst.pl b/utils/fst/make_lexicon_fst.pl
old mode 100644
new mode 100755

diff --git a/utils/fst/make_tlg.sh b/utils/fst/make_tlg.sh
old mode 100644
new mode 100755

diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
old mode 100644
new mode 100755
index 471b12ec..f59cd311
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -1,13 +1,5 @@
 #!/usr/bin/env python3
 import argparse
-import sys
-
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False
 
 
 def main(args):
@@ -18,6 +10,12 @@ def main(args):
         unit = line.strip()
         unit_table.add(unit)
 
+    def contain_oov(units):
+        for unit in units:
+            if unit not in unit_table:
+                return True
+        return False
+
     # load spm model
     bpemode = args.bpemodel
     if bpemode:
@@ -27,8 +25,8 @@
     # used to filter polyphone
     lexicon_table = set()
-    with open(sys.in_lexicon, 'r') as fin, \
-            open(sys.out_lexicon, 'w') as fout:
+    with open(args.in_lexicon, 'r') as fin, \
+            open(args.out_lexicon, 'w') as fout:
         for line in fin:
             word = line.split()[0]
             if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel

diff --git a/utils/fst/remove_oovs.pl b/utils/fst/remove_oovs.pl
old mode 100644
new mode 100755

diff --git a/utils/fst/rnnt_token_fst.py b/utils/fst/rnnt_token_fst.py
old mode 100644
new mode 100755
index 14376c3b..8f1cf493
--- a/utils/fst/rnnt_token_fst.py
+++ b/utils/fst/rnnt_token_fst.py
@@ -31,8 +31,7 @@
     '--token_file',
     required=True,
     help='e2e model token file. line: token(char/phone/spm/disambigous)')
     args = parser.parse_args()
-    print(args)
     main(args)

diff --git a/utils/fst/s2eps.pl b/utils/fst/s2eps.pl
old mode 100644
new mode 100755

diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
new file mode 100755
index 00000000..b409236f
--- /dev/null
+++ b/utils/manifest_key_value.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Convert a manifest file into key-value files (wav.scp, duration, text)."""
import argparse
import functools
from pathlib import Path

from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest


def main(args):
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
            if 'token' in line_json:
                tokens = line_json['token']
                tokenids = line_json['token_id']
                token_shape = line_json['token_shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of the manifest to convert.")
    add_arg('output_path', str,
            'data/train',
            "Directory to dump the wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)

diff --git a/utils/ngram_train.sh b/utils/ngram_train.sh
index cba74880..b56048eb 100755
--- a/utils/ngram_train.sh
+++ b/utils/ngram_train.sh
@@ -22,7 +22,7 @@
 lmbin=${2}.klm.bin
 
 # https://kheafield.com/code/kenlm/estimation/
 echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
 
 # https://kheafield.com/code/kenlm/
 echo "build binary lm."

diff --git a/utils/utility.py b/utils/utility.py
old mode 100644
new mode 100755
index 344900ef..a6b81d73
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -11,19 +11,97 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import distutils.util
+import hashlib
+import json
 import os
 import tarfile
 import zipfile
 from typing import Text
 
-from paddle.dataset.common import md5file
-
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
 ]


def read_manifest(manifest_path):
    """Load and parse manifest file.

    Args:
        manifest_path (str): Manifest file to load and parse.

    Raises:
        IOError: If failed to parse the manifest.

    Returns:
        List[dict]: Manifest parsing results.
    """
    manifest = []
    for json_line in open(manifest_path, 'r'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        manifest.append(json_data)
    return manifest


def print_arguments(args, info=None):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    filename = ""
    if info:
        filename = info["__file__"]
    filename = os.path.basename(filename)
    print(f"----------- {filename} Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("-----------------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_argument("name", str, "John", "User name.", parser)
        args = parser.parse_args()
    """
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)
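
Usage sketch (not part of the patch): with the SRILM, liblbfgs and Kaldi/OpenFst
locations exported in path.sh pointing at real installations, the pipeline added
here is driven entirely by local/tlg.sh. The final fstinfo call below is only an
illustrative sanity check, not something this patch runs.

    cd examples/aishell/s1
    . ./path.sh
    # stage 0: units + lexicon, stage 1: LM training, stage 2: TLG composition
    bash local/tlg.sh --corpus aishell --lmtype srilm
    # inspect the composed decoding graph (hypothetical verification step)
    fstinfo data/lang_test/TLG.fst | head -n 5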