add score_sclite

4 years ago · 9abe33b4bd
parent ae87bc8c7a
commit 9abe33b4bd
4 changed files with 105 additions and 2 deletions
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`

-export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
+# SCTK is built under tools/sctk by tools/Makefile. PATH entries must be
+# directories, so add its bin/ directory rather than the sclite binary itself.
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C

 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
--- a/tools/Makefile
+++ b/tools/Makefile
@ -39,3 +39,39 @@ mfa.done:
 	test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
 	tar xvf montreal-forced-aligner_linux.tar.gz
 	touch mfa.done
+
+
+# Keep the existing target 'sclite' to avoid breaking users who might have
+# scripted it in.
+# These names are commands rather than files, hence the phony declaration.
+.PHONY: sclite sctk_cleaned sctk_made
+
+# 'sclite' and 'sctk_made' are both aliases for the sctk/.compiled stamp file.
+sclite sctk_made: sctk/.compiled
+
+# Configure, build and install SCTK inside the sctk/ tree, then create the
+# sctk/.compiled stamp. The stamp is removed first so that a build which fails
+# part-way is not mistaken for a finished one.
+sctk/.compiled: sctk
+	rm -f $@
+	$(SCTK_MKENV) $(MAKE) -C $(@D) config
+	$(SCTK_MKENV) $(MAKE) -C $(@D) all doc
+	$(MAKE) -C $(@D) install
+	touch $@
+
+# Unpack the SCTK sources and expose them through the 'sctk' symlink.
+# The GitHub archive unpacks into SCTK-{40-character-long-hash}/, which is
+# renamed to sctk-$(SCTK_GITHASH) once any previous tree and link are removed.
+# NOTE(review): the final touch refreshes the tarball's mtime; the reason is
+# not evident from this file -- confirm before changing the ordering.
+sctk: sctk-$(SCTK_GITHASH).tar.gz
+	tar zxvf $<
+	rm -rf sctk-$(SCTK_GITHASH) $@
+	mv SCTK-$(SCTK_GITHASH)* sctk-$(SCTK_GITHASH)
+	ln -s sctk-$(SCTK_GITHASH) $@
+	touch $<
+
+# Fetch the SCTK source archive: copy it from $(DOWNLOAD_DIR) when that
+# directory exists, otherwise download it from GitHub.
+# The archive is staged as $@.tmp and renamed only on success, so a failed
+# copy or download never leaves a truncated file that looks up to date.
+sctk-$(SCTK_GITHASH).tar.gz:
+	if [ -d '$(DOWNLOAD_DIR)' ]; then \
+	  cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' $@.tmp; \
+	else \
+	  $(WGET) -nv -T 10 -t 3 -O $@.tmp \
+	    https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
+	fi && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
+
+# Run SCTK's own 'clean' in every tree that carries a .compiled stamp, then
+# remove the stamp. The leading '-' lets make carry on if this recipe fails.
+sctk_cleaned:
+	-for tree in sctk/ sctk-*/; do \
+	   if [ -f $$tree/.compiled ]; then $(MAKE) -C $$tree clean; fi; \
+	   rm -f $$tree/.compiled; \
+	done
--- a/utils/filter.py
+++ b/utils/filter.py
@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Apache 2.0
+
+import argparse
+import codecs
+import sys
+
+# True when the interpreter's major version is 2; used to pick the right
+# stdout object for byte-level writing.
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="filter words in a text file",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--exclude",
+        "-v",
+        dest="exclude",
+        action="store_true",
+        help="exclude filter words",
+    )
+    parser.add_argument("filt", type=str, help="filter list")
+    parser.add_argument("infile", type=str, help="input file")
+    return parser
+
+
+def main(args):
+    args = get_parser().parse_args(args)
+    filter_file(args.infile, args.filt, args.exclude)
+
+
+def filter_file(infile, filt, exclude):
+    vocab = set()
+    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
+        for line in vocabfile:
+            vocab.add(line.strip())
+
+    sys.stdout = codecs.getwriter("utf-8")(
+        sys.stdout if is_python2 else sys.stdout.buffer
+    )
+    with codecs.open(infile, "r", encoding="utf-8") as textfile:
+        for line in textfile:
+            if exclude:
+                print(
+                    " ".join(
+                        map(
+                            lambda word: word if word not in vocab else "",
+                            line.strip().split(),
+                        )
+                    )
+                )
+            else:
+                print(
+                    " ".join(
+                        map(
+                            lambda word: word if word in vocab else "<UNK>",
+                            line.strip().split(),
+                        )
+                    )
+                )
+
+
+# Script entry point: forward the command-line arguments (minus the program
+# name) to main().
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/utils/score_sclite.sh
+++ b/utils/score_sclite.sh
@ -5,6 +5,7 @@

 [ -f ./path.sh ] && . ./path.sh

+# non-linguistic symbols (nlsyms)
 nlsyms=""
 wer=false
 bpe=""
@ -24,7 +25,7 @@ fi
 dir=$1
 dic=$2

-concatjson.py ${dir}/data.*.json > ${dir}/data.json
+cat ${dir}/data.*.json > ${dir}/data.json

 if [ $num_spkrs -eq 1 ]; then
  json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn