add score_sclite

pull/852/head
Hui Zhang 3 years ago
parent ae87bc8c7a
commit 9abe33b4bd

@@ -1,6 +1,6 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
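Once this path.sh is sourced, the sclite binary built under tools/ (directory as assumed in the export above) should resolve from the shell; a minimal sketch with placeholder trn files:

	. ./path.sh
	command -v sclite                        # expect .../tools/sctk/bin/sclite
	sclite -r ref.trn trn -h hyp.trn trn -i rm -o sum stdout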

@@ -39,3 +39,39 @@ mfa.done:
	test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
	tar xvf montreal-forced-aligner_linux.tar.gz
	touch mfa.done

# Keep the existing target 'sclite' to avoid breaking users who might have
# scripted it in.
.PHONY: sclite sctk_cleaned sctk_made

sclite sctk_made: sctk/.compiled

sctk/.compiled: sctk
	rm -f sctk/.compiled
	$(SCTK_MKENV) $(MAKE) -C sctk config
	$(SCTK_MKENV) $(MAKE) -C sctk all doc
	$(MAKE) -C sctk install
	touch sctk/.compiled

# The GitHub archive unpacks into SCTK-{40-character-long-hash}/
sctk: sctk-$(SCTK_GITHASH).tar.gz
	tar zxvf sctk-$(SCTK_GITHASH).tar.gz
	rm -rf sctk-$(SCTK_GITHASH) sctk
	mv SCTK-$(SCTK_GITHASH)* sctk-$(SCTK_GITHASH)
	ln -s sctk-$(SCTK_GITHASH) sctk
	touch sctk-$(SCTK_GITHASH).tar.gz

sctk-$(SCTK_GITHASH).tar.gz:
	if [ -d '$(DOWNLOAD_DIR)' ]; then \
		cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \
	else \
		$(WGET) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \
			https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
	fi

sctk_cleaned:
	-for d in sctk/ sctk-*/; do \
		[ ! -f $$d/.compiled ] || $(MAKE) -C $$d clean; \
		rm -f $$d/.compiled; \
	done
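A sketch of how these new targets are typically exercised; SCTK_GITHASH, SCTK_MKENV, WGET and DOWNLOAD_DIR are assumed to be defined earlier in this Makefile (not shown in the hunk):

	# Fetch the SCTK tarball, build it, and put the sclite binary under sctk/bin/.
	make sclite
	# Clean the build tree but keep the downloaded tarball.
	make sctk_cleaned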

@@ -0,0 +1,66 @@
#!/usr/bin/env python3
# Apache 2.0
import argparse
import codecs
import sys

is_python2 = sys.version_info[0] == 2


def get_parser():
    parser = argparse.ArgumentParser(
        description="filter words in a text file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--exclude",
        "-v",
        dest="exclude",
        action="store_true",
        help="exclude filter words",
    )
    parser.add_argument("filt", type=str, help="filter list")
    parser.add_argument("infile", type=str, help="input file")
    return parser


def main(args):
    args = get_parser().parse_args(args)
    filter_file(args.infile, args.filt, args.exclude)


def filter_file(infile, filt, exclude):
    # Read the filter list (one token per line) into a set.
    vocab = set()
    with codecs.open(filt, "r", encoding="utf-8") as vocabfile:
        for line in vocabfile:
            vocab.add(line.strip())

    # Force UTF-8 output regardless of the locale (LC_ALL=C).
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    with codecs.open(infile, "r", encoding="utf-8") as textfile:
        for line in textfile:
            if exclude:
                # Drop tokens that appear in the filter list.
                print(
                    " ".join(
                        map(
                            lambda word: word if word not in vocab else "",
                            line.strip().split(),
                        )
                    )
                )
            else:
                # Keep in-vocabulary tokens; map everything else to <UNK>.
                print(
                    " ".join(
                        map(
                            lambda word: word if word in vocab else "<UNK>",
                            line.strip().split(),
                        )
                    )
                )


if __name__ == "__main__":
    main(sys.argv[1:])
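The hunk header does not show the new file's name; assuming it is saved as utils/filt.py, a typical use is to blank out non-language symbols before scoring, or to map out-of-vocabulary words to <UNK> (file names below are examples only):

	# -v/--exclude: drop every token listed in nlsyms.txt (e.g. <noise> tags).
	python3 utils/filt.py -v nlsyms.txt ref.trn > ref.filt.trn
	# Without -v: keep tokens found in vocab.txt, replace the rest with <UNK>.
	python3 utils/filt.py vocab.txt hyp.trn > hyp.unk.trn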

@@ -5,6 +5,7 @@
[ -f ./path.sh ] && . ./path.sh
# non-language symbols
nlsyms=""
wer=false
bpe=""
@@ -24,7 +25,7 @@ fi
dir=$1
dic=$2
concatjson.py ${dir}/data.*.json > ${dir}/data.json
cat ${dir}/data.*.json > ${dir}/data.json
if [ $num_spkrs -eq 1 ]; then
json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn
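After json2trn.py writes ${dir}/ref.trn and ${dir}/hyp.trn, the usual follow-up in sclite-based scoring scripts (not part of this hunk) looks roughly like:

	sclite -r ${dir}/ref.trn trn -h ${dir}/hyp.trn trn -i rm -o all stdout > ${dir}/result.txt
	grep -e Avg -e SPKR -m 2 ${dir}/result.txt       # print the summary lines of the report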
