From 9abe33b4bd48e3a27c76af905bdda4dae175f97c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 22 Sep 2021 12:46:24 +0000
Subject: [PATCH] add score_sclite

Add what utils/score_sclite.sh needs to score with sclite:

* tools/Makefile: targets 'sclite' / 'sctk_made' download, build and
  install SCTK (github.com/usnistgov/SCTK) under tools/sctk, and
  'sctk_cleaned' cleans the build. The rules reference SCTK_GITHASH,
  SCTK_MKENV, WGET and DOWNLOAD_DIR, which are not set in this hunk and
  are presumably provided elsewhere in tools/Makefile or on the make
  command line.
* examples/librispeech/s2/path.sh: put the SCTK install directory
  tools/sctk/bin on PATH.
* utils/filter.py: keep (or, with -v/--exclude, drop) the words of a
  text file that appear in a filter list.
* utils/score_sclite.sh: join data.*.json with cat instead of
  concatjson.py.
---
 examples/librispeech/s2/path.sh |  2 +-
 tools/Makefile                  | 36 ++++++++++++++++++
 utils/filter.py                 | 72 ++++++++++++++++++++++++++++++++++++
 utils/score_sclite.sh           |  3 +-
 4 files changed, 111 insertions(+), 2 deletions(-)
 create mode 100644 utils/filter.py

diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh
index c90e27821..9f6891cd3 100644
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
diff --git a/tools/Makefile b/tools/Makefile
index 62cf990fa..9fb1dc896 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -39,3 +39,39 @@ mfa.done:
 	test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
 	tar xvf montreal-forced-aligner_linux.tar.gz
 	touch mfa.done
+
+
+# Keep the existing target 'sclite' to avoid breaking the users who might have
+# scripted it in.
+.PHONY: sclite sctk_cleaned sctk_made
+
+sclite sctk_made: sctk/.compiled
+
+sctk/.compiled: sctk
+	rm -f sctk/.compiled
+	$(SCTK_MKENV) $(MAKE) -C sctk config
+	$(SCTK_MKENV) $(MAKE) -C sctk all doc
+	$(MAKE) -C sctk install
+	touch sctk/.compiled
+
+# The GitHub archive unpacks into SCTK-{40-character-long-hash}/
+sctk: sctk-$(SCTK_GITHASH).tar.gz
+	tar zxvf sctk-$(SCTK_GITHASH).tar.gz
+	rm -rf sctk-$(SCTK_GITHASH) sctk
+	mv SCTK-$(SCTK_GITHASH)* sctk-$(SCTK_GITHASH)
+	ln -s sctk-$(SCTK_GITHASH) sctk
+	touch sctk-$(SCTK_GITHASH).tar.gz
+
+sctk-$(SCTK_GITHASH).tar.gz:
+	if [ -d '$(DOWNLOAD_DIR)' ]; then \
+	  cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \
+	else \
+	  $(WGET) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \
+	    https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
+	fi
+
+sctk_cleaned:
+	-for d in sctk/ sctk-*/; do \
+	  [ ! -f $$d/.compiled ] || $(MAKE) -C $$d clean; \
+	  rm -f $$d/.compiled; \
+	done
diff --git a/utils/filter.py b/utils/filter.py
new file mode 100644
index 000000000..d31eab4d7
--- /dev/null
+++ b/utils/filter.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+# Apache 2.0
+
+import argparse
+import codecs
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    """Build the command-line parser: [--exclude/-v] filt infile."""
+    parser = argparse.ArgumentParser(
+        description="filter words in a text file",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--exclude",
+        "-v",
+        dest="exclude",
+        action="store_true",
+        help="exclude filter words",
+    )
+    parser.add_argument("filt", type=str, help="filter list")
+    parser.add_argument("infile", type=str, help="input file")
+    return parser
+
+
+def main(args):
+    """Parse ``args`` (argv without the program name) and run the filter."""
+    args = get_parser().parse_args(args)
+    filter_file(args.infile, args.filt, args.exclude)
+
+
+def filter_file(infile, filt, exclude):
+    """Filter the words of ``infile`` against the list in ``filt``.
+
+    Each input line is split on whitespace. A word is kept when it is in the
+    filter list (or, with ``exclude``, when it is not). Every other word is
+    replaced by an empty string before the line is re-joined with single
+    spaces, so a dropped word leaves an extra space behind. The result is
+    written to stdout as UTF-8, one output line per input line.
+
+    Args:
+        infile: path of the UTF-8 text file to filter.
+        filt: path of the UTF-8 filter list, one entry per line.
+        exclude: if true, drop the listed words instead of keeping them.
+    """
+    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
+        vocab = {line.strip() for line in vocabfile}
+    drop_listed = bool(exclude)
+
+    # Write through a local UTF-8 writer rather than rebinding sys.stdout, so
+    # the process-wide stream is left untouched and the function can be
+    # called more than once.
+    out = codecs.getwriter("utf-8")(
+        sys.stdout if is_python2 else sys.stdout.buffer
+    )
+    with codecs.open(infile, "r", encoding="utf-8") as textfile:
+        for line in textfile:
+            # A word survives when its list membership differs from the mode.
+            kept = [
+                word if (word in vocab) != drop_listed else ""
+                for word in line.strip().split()
+            ]
+            out.write(" ".join(kept) + "\n")
+    out.flush()
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/utils/score_sclite.sh b/utils/score_sclite.sh
index 3aa4cd072..7ded76eba 100755
--- a/utils/score_sclite.sh
+++ b/utils/score_sclite.sh
@@ -5,6 +5,7 @@
 
 [ -f ./path.sh ] && . ./path.sh
 
+# non language symbol
 nlsyms=""
 wer=false
 bpe=""
@@ -24,7 +25,7 @@ fi
 dir=$1
 dic=$2
 
-concatjson.py ${dir}/data.*.json > ${dir}/data.json
+cat ${dir}/data.*.json > ${dir}/data.json
 
 if [ $num_spkrs -eq 1 ]; then
   json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn