From 9abe33b4bd48e3a27c76af905bdda4dae175f97c Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 22 Sep 2021 12:46:24 +0000
Subject: [PATCH] add score_sclite

---
 examples/librispeech/s2/path.sh |  2 +-
 tools/Makefile                  | 36 ++++++++++++++++++
 utils/filter.py                 | 66 +++++++++++++++++++++++++++++++++
 utils/score_sclite.sh           |  3 +-
 4 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 utils/filter.py

diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh
index c90e27821..9f6891cd3 100644
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
diff --git a/tools/Makefile b/tools/Makefile
index 62cf990fa..9fb1dc896 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -39,3 +39,39 @@ mfa.done:
 	test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
 	tar xvf montreal-forced-aligner_linux.tar.gz
 	touch mfa.done
+
+# Defaults; an earlier definition, the environment or the command line wins.
+WGET ?= wget
+SCTK_GITHASH ?= 20159b5
+
+# Keep the existing target 'sclite' so users who scripted it in do not break.
+.PHONY: sclite sctk_cleaned sctk_made
+sclite sctk_made: sctk/.compiled
+
+sctk/.compiled: sctk
+	rm -f sctk/.compiled
+	$(SCTK_MKENV) $(MAKE) -C sctk config
+	$(SCTK_MKENV) $(MAKE) -C sctk all doc
+	$(MAKE) -C sctk install
+	touch sctk/.compiled
+
+# The GitHub archive unpacks into SCTK-{40-character-long-hash}/
+sctk: sctk-$(SCTK_GITHASH).tar.gz
+	tar zxvf sctk-$(SCTK_GITHASH).tar.gz
+	rm -rf sctk-$(SCTK_GITHASH) sctk
+	mv SCTK-$(SCTK_GITHASH)* sctk-$(SCTK_GITHASH)
+	ln -s sctk-$(SCTK_GITHASH) sctk
+	touch sctk-$(SCTK_GITHASH).tar.gz
+
+sctk-$(SCTK_GITHASH).tar.gz:
+	if [ -d '$(DOWNLOAD_DIR)' ]; then \
+	  cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \
+	else \
+	  $(WGET) -nv -T 10 -t 3 -O $@ https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
+	fi
+
+sctk_cleaned:
+	-for d in sctk/ sctk-*/; do \
+	   [ ! -f $$d/.compiled ] || $(MAKE) -C $$d clean; \
+	   rm -f $$d/.compiled; \
+	done
diff --git a/utils/filter.py b/utils/filter.py
new file mode 100644
index 000000000..d31eab4d7
--- /dev/null
+++ b/utils/filter.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Apache 2.0
+
+import argparse
+import codecs
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    """Build the CLI parser: optional -v/--exclude, then FILT and INFILE."""
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="filter words in a text file",
+    )
+    # Boolean switch; defaults to False when absent.
+    flag_spec = dict(dest="exclude", action="store_true", help="exclude filter words")
+    cli.add_argument("--exclude", "-v", **flag_spec)
+
+    # Positionals are registered in the order they appear on the command line.
+    positionals = (("filt", "filter list"), ("infile", "input file"))
+    for name, description in positionals:
+        cli.add_argument(name, type=str, help=description)
+    return cli
+
+
+def main(args):
+    opts = get_parser().parse_args(args)
+    filter_file(infile=opts.infile, filt=opts.filt, exclude=opts.exclude)
+
+
+def filter_file(infile, filt, exclude):
+    """Filter the words of ``infile`` against the word list in ``filt``.
+
+    Both files are read as UTF-8 and the result is written to stdout as
+    UTF-8, one output line per input line.
+
+    Args:
+        infile: path of the whitespace-tokenised text to filter.
+        filt: path of the filter list, one entry per line.
+        exclude: if true, listed words are blanked out (the separating
+            spaces are kept); otherwise unlisted words become "<UNK>".
+    """
+    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
+        vocab = {line.strip() for line in vocabfile}
+
+    if exclude:
+        replacement, keep_listed = "", False
+    else:
+        replacement, keep_listed = "<UNK>", True
+
+    # Use a local UTF-8 writer rather than rebinding sys.stdout: the rebound
+    # object has no ``.buffer``, so a second call would fail on Python 3.
+    raw_stdout = sys.stdout if is_python2 else sys.stdout.buffer
+    out = codecs.getwriter("utf-8")(raw_stdout)
+    with codecs.open(infile, "r", encoding="utf-8") as textfile:
+        for line in textfile:
+            words = line.split()
+            kept = [w if (w in vocab) == keep_listed else replacement for w in words]
+            out.write(" ".join(kept) + "\n")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/utils/score_sclite.sh b/utils/score_sclite.sh
index 3aa4cd072..7ded76eba 100755
--- a/utils/score_sclite.sh
+++ b/utils/score_sclite.sh
@@ -5,6 +5,7 @@
 
 [ -f ./path.sh ] && . ./path.sh
 
+# non-language symbols
 nlsyms=""
 wer=false
 bpe=""
@@ -24,7 +25,7 @@ fi
 dir=$1
 dic=$2
 
-concatjson.py ${dir}/data.*.json > ${dir}/data.json
+cat ${dir}/data.*.json > ${dir}/data.json
 
 if [ $num_spkrs -eq 1 ]; then
   json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn