parent
ad8ec177ef
commit
15f434a5d3
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash

# Train a 3-gram LM with SRILM for the aishell DeepSpeech2 recipe.
# To be run from one directory above this script.
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh. It takes as input the files
#   data/local/lm/text
#   data/local/dict/lexicon.txt

. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

# Both input files must exist before we start.
for f in "$text" "$lexicon"; do
  # BUG FIX: the original tested "$x" (always unset, so the check always
  # passed); test the loop variable "$f" instead.
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# Check SRILM tools
if ! which ngram-count > /dev/null; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

dir=data/local/lm
mkdir -p $dir

cleantext=$dir/text.no_oov

# Replace every word that is not in the lexicon with <SPOKEN_NOISE>.
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

# Word counts over the cleaned text (field 1 is the utterance id, start at $2).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# Vocabulary for ngram-count: every observed word plus sentence delimiters.
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist

heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results

# Strip the utterance ids and split into held-out and training portions.
# NOTE(review): "tail -n +$heldout_sent" starts AT line $heldout_sent, so one
# sentence appears in both sets; kept as-is for comparability with kaldi_lm.
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $dir/train

# Kneser-Ney interpolated trigram restricted to the wordlist; OOVs map to <UNK>.
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
# Report perplexity on the held-out set.
ngram -lm $dir/lm.arpa -ppl $dir/heldout
|
@ -0,0 +1,31 @@
|
|||||||
|
#!/bin/bash
# Stage 7: build an LM and a TLG decoding graph for the aishell DS2 runtime.

. ./path.sh || exit 1;

# BUG FIX: ${stage}/${stop_stage} were used in the guard below without ever
# being defined, so the test failed with "unary operator expected".  Define
# defaults BEFORE parse_options.sh so they can still be overridden on the
# command line (e.g. --stage 7 --stop_stage 7).
stage=7
stop_stage=100

. tools/parse_options.sh || exit 1;

data=/mnt/dataset/aishell

# Optionally, you can add LM and test it with runtime.
dir=./ds2_graph
dict=$dir/vocab.txt

if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
  # 7.1 Prepare dict: reuse the e2e model units, derive a lexicon from them.
  unit_file=$dict
  mkdir -p $dir/local/dict
  cp $unit_file $dir/local/dict/units.txt
  tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
    $dir/local/dict/lexicon.txt
  # Train lm on the training-set transcripts only.
  lm=$dir/local/lm
  mkdir -p $lm
  tools/filter_scp.pl data/train/text \
    $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
  local/ds2_aishell_train_lms.sh
  # Build decoding TLG
  tools/fst/compile_lexicon_token_fst.sh \
    $dir/local/dict $dir/local/tmp $dir/local/lang
  tools/fst/make_tlg.sh $dir/local/lm $dir/local/lang $dir/lang_test || exit 1;
fi
|
||||||
|
|
||||||
|
|
@ -0,0 +1,14 @@
|
|||||||
|
# This contains the locations of the binaries built that are required for
# running the examples.

SPEECHX_ROOT=$PWD/../..
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

# Warn (but do not abort) if the project has not been built yet.
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

# BUG FIX: was "export LC_AL=C" (a typo that set a meaningless variable);
# LC_ALL=C is what actually forces the C locale for stable sort/grep behavior.
export LC_ALL=C

SPEECHX_BIN=$SPEECHX_EXAMPLES/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
|
@ -0,0 +1,195 @@
|
|||||||
|
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
#           2013-2016 Johns Hopkins University (author: Daniel Povey)
#                2015 Hainan Xu
#                2015 Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;

# Parse up to three optional flags (one pass per flag, any order).
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
      "This script adds disambiguation symbols to a lexicon in order to\n" .
      "make decoding graphs determinizable; it adds pseudo-phone\n" .
      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
      "to ensure that all pronunciations are different, and that none\n" .
      "is a prefix of another.\n" .
      "It prints to the standard output the number of the largest-numbered" .
      "disambiguation symbol that was used.\n" .
      "\n" .
      "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
      " --sil-probs [should be with --pron-probs option]\n" .
      " Expect 3 extra fields after the pron-probs, for aspects of\n" .
      " the silence probability model\n" .
      " --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
      " that this script is allowed to add. By default this is\n" .
      " #1, but you can set this to a larger value using this option.\n" .
      "e.g.:\n" .
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}

$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon, normalizing whitespace on each line.
@L = ( );
while(<L>) {
  @A = split(" ", $_);
  push @L, join(" ", @A);
}

# (2) Work out the count of each phone-sequence in the
# lexicon.

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    $p = shift @A;
    if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
  }
  if ($sil_probs) {
    $silp = shift @A;
    if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
  }
  if (!(@A)) {
    # BUG FIX: was "$1" (a stale regex capture variable, usually empty);
    # report the offending lexicon line "$l" instead.
    die "Bad lexicon line $l, no phone in phone list";
  }
  $count{join(" ",@A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) { shift @A; } # remove pron-prob.
  if ($sil_probs) {
    shift @A; # Remove silprob
    shift @A; # Remove silprob
  }
  while(@A > 0) {
    pop @A; # Remove last phone
    $issubseq{join(" ",@A)} = 1;
  }
}

# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no diambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.


open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq}
      && $count{$phnseq} == 1) {
    ; # Do nothing.
  } else {
    if ($phnseq eq "") { # need disambig symbols for the empty string
      # that are not use anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++; # Get a number that has not been used yet for
                         # this phone sequence.
      }
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

print $max_disambig . "\n";
|
@ -0,0 +1,86 @@
|
|||||||
|
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
set -eo pipefail
. tools/parse_options.sh

if [ $# -ne 3 ]; then
  echo "usage: tools/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  exit 1;
fi

srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir

[ -f path.sh ] && . ./path.sh

cp $srcdir/units.txt $dir

# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
# (The regex inserts "1.0" between the word field and the phone fields.)
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
# add_lex_disambig.pl prints the largest disambig number used; +1 reserves #0 for the grammar.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$[$ndisambig+1];

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt

# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
# so here just use simple ctc_token_fst
tools/fst/ctc_token_fst.py $dir/tokens.txt | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
# <eps> gets id 0; #0, <s>, </s> are appended after the sorted word list.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`

tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;

echo "Lexicon and token FSTs compiling succeeded"
|
@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env python

import sys

# Fixed skeleton arcs: state 0 is start/final, state 1 absorbs leading
# blanks, state 2 absorbs trailing blanks before looping back to 0.
for arc in ('0 1 <eps> <eps>',
            '1 1 <blank> <eps>',
            '2 2 <blank> <eps>',
            '2 0 <eps> <eps>'):
    print(arc)

with open(sys.argv[1], 'r') as token_file:
    next_state = 3
    for raw_line in token_file:
        token = raw_line.strip().split(' ')[0]
        if token in ('<eps>', '<blank>'):
            continue
        if '#' in token:  # disambiguation symbol: self-loop on start state
            print('0 0 <eps> ' + token)
        else:
            # One dedicated state per token: emit it once from state 1,
            # absorb repeats on a self-loop, then move on to state 2.
            print('1 {} {} {}'.format(next_state, token, token))
            print('{} {} {} <eps>'.format(next_state, next_state, token))
            print('{} 2 <eps> <eps>'.format(next_state))
            next_state += 1
print('0')
|
@ -0,0 +1,21 @@
|
|||||||
|
#!/usr/bin/env python

import sys

# Blank symbol self-loops on the single start/final state 0.
print('0 0 <blank> <eps>')

with open(sys.argv[1], 'r', encoding='utf8') as token_file:
    next_state = 1
    for raw_line in token_file:
        token = raw_line.strip().split(' ')[0]
        if token in ('<eps>', '<blank>'):
            continue
        if '#' in token:  # disambiguation symbol: stays on state 0
            print('0 0 <eps> ' + token)
        else:
            # One state per token: emit once, absorb repeats, return to 0.
            print('0 {} {} {}'.format(next_state, token, token))
            print('{} {} {} <eps>'.format(next_state, next_state, token))
            print('{} 0 <eps> <eps>'.format(next_state))
            next_state += 1
print('0')
|
@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def il(n):
    """Map index ``n`` to its input-label id (``n + 1``)."""
    return 1 + n
|
||||||
|
|
||||||
|
|
||||||
|
def ol(n):
    """Map index ``n`` to its output-label id (``n + 1``)."""
    return 1 + n
|
||||||
|
|
||||||
|
|
||||||
|
def s(n):
    """Map index ``n`` to its state id (identity mapping)."""
    return n
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Build the text form of the "corrected" CTC token FST from the symbol
    # table given as argv[1] (first field of each line is the symbol).
    with open(sys.argv[1]) as f:
        lines = f.readlines()
    # Count real phones and disambiguation symbols separately; <eps> and
    # <blank> are represented implicitly by the topology below.
    phone_count = 0
    disambig_count = 0
    for line in lines:
        sp = line.split()
        phone = sp[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        if phone.startswith('#'):
            disambig_count += 1
        else:
            phone_count += 1

    # 1. add start state (blank self-loop on state 0)
    print('0 0 {} 0'.format(il(0)))

    # 2. 0 -> i, i -> i, i -> 0
    # Each phone i gets its own state: enter it (emitting the phone),
    # absorb repeats, and return to 0 on blank.
    for i in range(1, phone_count + 1):
        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
        print('{} {} {} 0'.format(s(i), s(i), il(i)))
        print('{} 0 {} 0'.format(s(i), il(0)))

    # 3. i -> other phone (direct transition without an intervening blank)
    for i in range(1, phone_count + 1):
        for j in range(1, phone_count + 1):
            if i != j:
                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))

    # 4. add disambiguous arcs on every final state
    # (disambig output labels follow the phones in the symbol table)
    for i in range(0, phone_count + 1):
        for j in range(phone_count + 2, phone_count + disambig_count + 2):
            print('{} {} {} {}'.format(s(i), s(i), 0, j))

    # 5. every i is final state
    for i in range(0, phone_count + 1):
        print(s(i))
|
@ -0,0 +1,29 @@
|
|||||||
|
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
#                2015 Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.

while(<>){
  # Refuse to proceed if the LM itself contains the word "#0": that symbol
  # is reserved as the grammar disambiguation symbol and would collide.
  if (/\s+#0\s+/) {
    print STDERR "$0: ERROR: LM has word #0, " .
      "which is reserved as disambiguation symbol\n";
    exit 1;
  }
  # On arc lines ("src dest ilabel ..."), rewrite an <eps> input label to #0;
  # other lines (final states, weights) pass through unchanged.
  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
  print;
}
|
@ -0,0 +1,155 @@
|
|||||||
|
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
#                2013 Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).

$pron_probs = 0;

if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
  $pron_probs = 1;
  shift @ARGV;
}

if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
  print STDERR " word phone1 phone2 ... phoneN;\n";
  print STDERR "if the --pron-probs option is used, each line is:\n";
  print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
  print STDERR "this is your responsibility.\n\n";
  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
  exit(1);
}

$lexfn = shift @ARGV;
# Remaining positional args select the silence model: none, (silprob,
# silphone), or (silprob, silphone, sil_disambig_sym).
if (@ARGV == 0) {
  $silprob = 0.0;
} elsif (@ARGV == 2) {
  ($silprob,$silphone) = @ARGV;
} else {
  ($silprob,$silphone,$sildisambig) = @ARGV;
}
if ($silprob != 0.0) {
  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
  # Costs are negated log-probabilities (tropical semiring weights).
  $silcost = -log($silprob);
  $nosilcost = -log(1.0 - $silprob);
}


open(L, "<$lexfn") || die "Error opening lexicon $lexfn";


if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1; # next unallocated state.
  while (<L>) {
    @A = split(" ", $_);
    @A == 0 && die "Empty lexicon line.";
    foreach $a (@A) {
      if ($a eq "<eps>") {
        die "Bad lexicon line $_ (<eps> is forbidden)";
      }
    }
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }

    # Emit a linear chain of arcs for this pronunciation; the word label
    # goes on the first arc, <eps> on the rest.
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        $ns = $nextstate++;
      } else {
        $ns = $loopstate;
      }
      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
      $word_or_eps = "<eps>";
      $pron_cost_string = ""; # so we only print it on the first arc of the word.
      $s = $ns;
    }
  }
  print "$loopstate\t0\n"; # final-cost.
} else { # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2; # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
    $nextstate = 3;
  } else {
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  while (<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        # Not the last phone: plain chain arc.
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
        $word_or_eps = "<eps>";
        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
        $s = $ns;
      } elsif (!defined($silphone) || $p ne $silphone) {
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        $local_nosilcost = $nosilcost + $pron_cost;
        $local_silcost = $silcost + $pron_cost;
        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
      }
    }
  }
  print "$loopstate\t0\n"; # final-cost.
}
|
@ -0,0 +1,38 @@
|
|||||||
|
#!/bin/bash
#
# Builds G.fst from an ARPA LM and composes T, L and G into the final
# decoding graph TLG.fst.
# Usage: make_tlg.sh <lm-dir> <src-lang-dir> <tgt-lang-dir>

if [ -f path.sh ]; then . path.sh; fi

lm_dir=$1
src_lang=$2
tgt_lang=$3

arpa_lm=${lm_dir}/lm.arpa
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;

# Start from a fresh copy of the source lang directory (T.fst, L.fst, words.txt).
rm -rf $tgt_lang
cp -r $src_lang $tgt_lang

# Compose the language model to FST
# The greps drop malformed/unwanted n-grams (<s>/<s> etc., <unk>, noise)
# before arpa2fst; eps2disambig.pl/s2eps.pl rewrite symbols for composability.
cat $arpa_lm | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  grep -v -i '<unk>' | \
  grep -v -i '<spoken_noise>' | \
  arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
  tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst


echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic $tgt_lang/G.fst

# Compose the token, lexicon and language-model FST into the final decoding graph
fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;

echo "Composing decoding graph TLG.fst succeeded"
#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
|
@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# encoding: utf-8
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# sys.argv[1]: e2e model unit file(lang_char.txt)
|
||||||
|
# sys.argv[2]: raw lexicon file
|
||||||
|
# sys.argv[3]: output lexicon file
|
||||||
|
# sys.argv[4]: bpemodel
|
||||||
|
|
||||||
|
# Load the e2e model units (first field of each line of argv[1]) into a set
# used below to detect out-of-vocabulary units.
unit_table = set()
with open(sys.argv[1], 'r', encoding='utf8') as fin:
    for line in fin:
        unit = line.split()[0]
        unit_table.add(unit)
|
||||||
|
|
||||||
|
|
||||||
|
def contain_oov(units, table=None):
    """Return True if any unit in ``units`` is not a known model unit.

    Args:
        units: iterable of unit strings (or the characters of a word).
        table: set of known units to check against; defaults to the
            module-level ``unit_table`` loaded from the unit file, which
            preserves the original call signature.
    """
    if table is None:
        table = unit_table
    # any() short-circuits on the first OOV unit, like the original loop.
    return any(unit not in table for unit in units)
|
||||||
|
|
||||||
|
|
||||||
|
# BPE mode is enabled when a sentencepiece model path is given as argv[4].
bpemode = len(sys.argv) > 4
if bpemode:
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    sp.Load(sys.argv[4])
# Tracks words already written, so each word is emitted at most once.
lexicon_table = set()
with open(sys.argv[2], 'r', encoding='utf8') as fin, \
        open(sys.argv[3], 'w', encoding='utf8') as fout:
    for line in fin:
        word = line.split()[0]
        if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
            continue
        elif word == '<SPOKEN_NOISE>':
            continue
        else:
            # each word only has one pronunciation for e2e system
            if word in lexicon_table:
                continue
            if bpemode:
                # Segment the word with sentencepiece; skip it if any piece
                # is outside the model unit set.
                pieces = sp.EncodeAsPieces(word)
                if contain_oov(pieces):
                    print(
                        'Ignoring words {}, which contains oov unit'.format(
                            ''.join(word).strip('▁'))
                    )
                    continue
                chars = ' '.join(
                    [p if p in unit_table else '<unk>' for p in pieces])
            else:
                # ignore words with OOV
                if contain_oov(word):
                    print('Ignoring words {}, which contains oov unit'.format(word))
                    continue
                # Optional, append ▁ in front of english word
                # we assume the model unit of our e2e system is char now.
                if word.encode('utf8').isalpha() and '▁' in unit_table:
                    word = '▁' + word
                chars = ' '.join(word)  # word is a char list
            fout.write('{} {}\n'.format(word, chars))
            lexicon_table.add(word)
|
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# This script removes lines that contain these OOVs on either the
|
||||||
|
# third or fourth fields of the line. It is intended to remove arcs
|
||||||
|
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
|
||||||
|
|
||||||
|
if ( @ARGV < 1 && @ARGV > 2) {
|
||||||
|
die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
$unklist = shift @ARGV;
|
||||||
|
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
|
||||||
|
while(<S>){
|
||||||
|
@A = split(" ", $_);
|
||||||
|
@A == 1 || die "Bad line in unknown-symbol list: $_";
|
||||||
|
$unk{$A[0]} = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
$num_removed = 0;
|
||||||
|
while(<>){
|
||||||
|
@A = split(" ", $_);
|
||||||
|
if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
|
||||||
|
$num_removed++;
|
||||||
|
} else {
|
||||||
|
print;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
|
||||||
|
|
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
print('0 0 <blank> <eps>')
|
||||||
|
|
||||||
|
with open(sys.argv[1], 'r', encoding='utf8') as fin:
|
||||||
|
for entry in fin:
|
||||||
|
fields = entry.strip().split(' ')
|
||||||
|
phone = fields[0]
|
||||||
|
if phone == '<eps>' or phone == '<blank>':
|
||||||
|
continue
|
||||||
|
elif '#' in phone: # disambiguous phone
|
||||||
|
print('{} {} {} {}'.format(0, 0, '<eps>', phone))
|
||||||
|
else:
|
||||||
|
print('{} {} {} {}'.format(0, 0, phone, phone))
|
||||||
|
print('0')
|
@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
# Copyright 2010-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# This script replaces <s> and </s> with <eps> (on both input and output sides),
|
||||||
|
# for the G.fst acceptor.
|
||||||
|
|
||||||
|
while(<>){
|
||||||
|
@A = split(" ", $_);
|
||||||
|
if ( @A >= 4 ) {
|
||||||
|
if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
|
||||||
|
if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
|
||||||
|
}
|
||||||
|
print join("\t", @A) . "\n";
|
||||||
|
}
|
@ -0,0 +1,97 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
|
||||||
|
# Arnab Ghoshal, Karel Vesely
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# Parse command-line options.
|
||||||
|
# To be sourced by another script (as in ". parse_options.sh").
|
||||||
|
# Option format is: --option-name arg
|
||||||
|
# and shell variable "option_name" gets set to value "arg."
|
||||||
|
# The exception is --help, which takes no arguments, but prints the
|
||||||
|
# $help_message variable (if defined).
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### The --config file options have lower priority to command line
|
||||||
|
### options, so we need to import them first...
|
||||||
|
###
|
||||||
|
|
||||||
|
# Now import all the configs specified by command-line, in left-to-right order
|
||||||
|
for ((argpos=1; argpos<$#; argpos++)); do
|
||||||
|
if [ "${!argpos}" == "--config" ]; then
|
||||||
|
argpos_plus1=$((argpos+1))
|
||||||
|
config=${!argpos_plus1}
|
||||||
|
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
|
||||||
|
. $config # source the config file.
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### No we process the command line options
|
||||||
|
###
|
||||||
|
while true; do
|
||||||
|
[ -z "${1:-}" ] && break; # break if there are no arguments
|
||||||
|
case "$1" in
|
||||||
|
# If the enclosing script is called with --help option, print the help
|
||||||
|
# message and exit. Scripts should put help messages in $help_message
|
||||||
|
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
|
||||||
|
else printf "$help_message\n" 1>&2 ; fi;
|
||||||
|
exit 0 ;;
|
||||||
|
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
|
||||||
|
exit 1 ;;
|
||||||
|
# If the first command-line argument begins with "--" (e.g. --foo-bar),
|
||||||
|
# then work out the variable name as $name, which will equal "foo_bar".
|
||||||
|
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
|
||||||
|
# Next we test whether the variable in question is undefned-- if so it's
|
||||||
|
# an invalid option and we die. Note: $0 evaluates to the name of the
|
||||||
|
# enclosing script.
|
||||||
|
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
|
||||||
|
# is undefined. We then have to wrap this test inside "eval" because
|
||||||
|
# foo_bar is itself inside a variable ($name).
|
||||||
|
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
|
||||||
|
|
||||||
|
oldval="`eval echo \\$$name`";
|
||||||
|
# Work out whether we seem to be expecting a Boolean argument.
|
||||||
|
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
|
||||||
|
was_bool=true;
|
||||||
|
else
|
||||||
|
was_bool=false;
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Set the variable to the right value-- the escaped quotes make it work if
|
||||||
|
# the option had spaces, like --cmd "queue.pl -sync y"
|
||||||
|
eval $name=\"$2\";
|
||||||
|
|
||||||
|
# Check that Boolean-valued arguments are really Boolean.
|
||||||
|
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
|
||||||
|
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
shift 2;
|
||||||
|
;;
|
||||||
|
*) break;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# Check for an empty argument to the --cmd option, which can easily occur as a
|
||||||
|
# result of scripting errors.
|
||||||
|
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
|
||||||
|
|
||||||
|
|
||||||
|
true; # so this script returns exit code 0.
|
@ -0,0 +1,100 @@
|
|||||||
|
// fstbin/fstaddselfloops.cc
|
||||||
|
|
||||||
|
// Copyright 2009-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fstext/determinize-star.h"
|
||||||
|
#include "fstext/fstext-utils.h"
|
||||||
|
#include "fstext/kaldi-fst-io.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
#include "util/simple-io-funcs.h"
|
||||||
|
|
||||||
|
/* some test examples:
|
||||||
|
pushd ~/tmpdir
|
||||||
|
( echo 3; echo 4) > in.list
|
||||||
|
( echo 5; echo 6) > out.list
|
||||||
|
( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list
|
||||||
|
| fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) |
|
||||||
|
fstcompile | fstaddselfloops in.list out.list | fstprint
|
||||||
|
*/
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
using namespace fst; // NOLINT
|
||||||
|
using kaldi::int32;
|
||||||
|
|
||||||
|
const char *usage =
|
||||||
|
"Adds self-loops to states of an FST to propagate disambiguation "
|
||||||
|
"symbols through it\n"
|
||||||
|
"They are added on each final state and each state with non-epsilon "
|
||||||
|
"output symbols\n"
|
||||||
|
"on at least one arc out of the state. Useful in conjunction with "
|
||||||
|
"predeterminize\n"
|
||||||
|
"\n"
|
||||||
|
"Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst "
|
||||||
|
"[out.fst] ]\n"
|
||||||
|
"E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"
|
||||||
|
"in.list and out.list are lists of integers, one per line, of the\n"
|
||||||
|
"same length.\n";
|
||||||
|
|
||||||
|
ParseOptions po(usage);
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (po.NumArgs() < 2 || po.NumArgs() > 4) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string disambig_in_rxfilename = po.GetArg(1),
|
||||||
|
disambig_out_rxfilename = po.GetArg(2),
|
||||||
|
fst_in_filename = po.GetOptArg(3),
|
||||||
|
fst_out_filename = po.GetOptArg(4);
|
||||||
|
|
||||||
|
VectorFst<StdArc> *fst = ReadFstKaldi(fst_in_filename);
|
||||||
|
|
||||||
|
std::vector<int32> disambig_in;
|
||||||
|
if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in))
|
||||||
|
KALDI_ERR
|
||||||
|
<< "fstaddselfloops: Could not read disambiguation symbols from "
|
||||||
|
<< kaldi::PrintableRxfilename(disambig_in_rxfilename);
|
||||||
|
|
||||||
|
std::vector<int32> disambig_out;
|
||||||
|
if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out))
|
||||||
|
KALDI_ERR
|
||||||
|
<< "fstaddselfloops: Could not read disambiguation symbols from "
|
||||||
|
<< kaldi::PrintableRxfilename(disambig_out_rxfilename);
|
||||||
|
|
||||||
|
if (disambig_in.size() != disambig_out.size())
|
||||||
|
KALDI_ERR
|
||||||
|
<< "fstaddselfloops: mismatch in size of disambiguation symbols";
|
||||||
|
|
||||||
|
AddSelfLoops(fst, disambig_in, disambig_out);
|
||||||
|
|
||||||
|
WriteFstKaldi(*fst, fst_out_filename);
|
||||||
|
|
||||||
|
delete fst;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
@ -0,0 +1,114 @@
|
|||||||
|
// fstbin/fstdeterminizestar.cc
|
||||||
|
|
||||||
|
// Copyright 2009-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fstext/determinize-star.h"
|
||||||
|
#include "fstext/fstext-utils.h"
|
||||||
|
#include "fstext/kaldi-fst-io.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
#if !defined(_MSC_VER) && !defined(__APPLE__)
|
||||||
|
#include <signal.h> // Comment this line and the call to signal below if
|
||||||
|
// it causes compilation problems. It is only to enable a debugging procedure
|
||||||
|
// when determinization does not terminate. We are disabling this code if
|
||||||
|
// compiling on Windows because signal.h is not available there, and on
|
||||||
|
// MacOS due to a problem with <signal.h> in the initial release of Sierra.
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* some test examples:
|
||||||
|
( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint
|
||||||
|
( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint
|
||||||
|
( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile |
|
||||||
|
fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0
|
||||||
|
1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint
|
||||||
|
|
||||||
|
cd ~/tmpdir
|
||||||
|
while true; do
|
||||||
|
fstrand > 1.fst
|
||||||
|
fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst
|
||||||
|
> 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n
|
||||||
|
"." done
|
||||||
|
|
||||||
|
Test of debugging [with non-determinizable input]:
|
||||||
|
( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo
|
||||||
|
"2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id
|
||||||
|
of fstdeterminizestar] # prints out a bunch of debugging output showing the
|
||||||
|
mess it got itself into.
|
||||||
|
*/
|
||||||
|
|
||||||
|
bool debug_location = false;
|
||||||
|
void signal_handler(int) { debug_location = true; }
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
using namespace fst; // NOLINT
|
||||||
|
using kaldi::int32;
|
||||||
|
|
||||||
|
const char *usage =
|
||||||
|
"Removes epsilons and determinizes in one step\n"
|
||||||
|
"\n"
|
||||||
|
"Usage: fstdeterminizestar [in.fst [out.fst] ]\n"
|
||||||
|
"\n"
|
||||||
|
"See also: fstdeterminizelog, lattice-determinize\n";
|
||||||
|
|
||||||
|
float delta = kDelta;
|
||||||
|
int max_states = -1;
|
||||||
|
bool use_log = false;
|
||||||
|
ParseOptions po(usage);
|
||||||
|
po.Register("use-log", &use_log, "Determinize in log semiring.");
|
||||||
|
po.Register("delta", &delta,
|
||||||
|
"Delta value used to determine equivalence of weights.");
|
||||||
|
po.Register(
|
||||||
|
"max-states", &max_states,
|
||||||
|
"Maximum number of states in determinized FST before it will abort.");
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (po.NumArgs() > 2) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2);
|
||||||
|
|
||||||
|
// This enables us to get traceback info from determinization that is
|
||||||
|
// not seeming to terminate.
|
||||||
|
#if !defined(_MSC_VER) && !defined(__APPLE__)
|
||||||
|
signal(SIGUSR1, signal_handler);
|
||||||
|
#endif
|
||||||
|
// Normal case: just files.
|
||||||
|
VectorFst<StdArc> *fst = ReadFstKaldi(fst_in_str);
|
||||||
|
|
||||||
|
ArcSort(fst, ILabelCompare<StdArc>()); // improves speed.
|
||||||
|
if (use_log) {
|
||||||
|
DeterminizeStarInLog(fst, delta, &debug_location, max_states);
|
||||||
|
} else {
|
||||||
|
VectorFst<StdArc> det_fst;
|
||||||
|
DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states);
|
||||||
|
*fst = det_fst; // will do shallow copy and then det_fst goes
|
||||||
|
// out of scope anyway.
|
||||||
|
}
|
||||||
|
WriteFstKaldi(*fst, fst_out_str);
|
||||||
|
delete fst;
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,91 @@
|
|||||||
|
// fstbin/fstisstochastic.cc
|
||||||
|
|
||||||
|
// Copyright 2009-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fstext/fstext-utils.h"
|
||||||
|
#include "fstext/kaldi-fst-io.h"
|
||||||
|
#include "util/kaldi-io.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
|
||||||
|
// e.g. of test:
|
||||||
|
// echo " 0 0" | fstcompile | fstisstochastic
|
||||||
|
// should return 0 and print "0 0" [meaning, min and
|
||||||
|
// max weight are one = exp(0)]
|
||||||
|
// echo " 0 1" | fstcompile | fstisstochastic
|
||||||
|
// should return 1, not stochastic, and print 1 1
|
||||||
|
// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) |
|
||||||
|
// fstcompile | fstisstochastic should return 0, stochastic; it prints "0
|
||||||
|
// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo
|
||||||
|
// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1,
|
||||||
|
// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 ";
|
||||||
|
// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic
|
||||||
|
// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0"
|
||||||
|
// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) |
|
||||||
|
// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even
|
||||||
|
// though not stochastic because we gave it an absurdly large delta.
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
using namespace fst; // NOLINT
|
||||||
|
using kaldi::int32;
|
||||||
|
|
||||||
|
const char *usage =
|
||||||
|
"Checks whether an FST is stochastic and exits with success if so.\n"
|
||||||
|
"Prints out maximum error (in log units).\n"
|
||||||
|
"\n"
|
||||||
|
"Usage: fstisstochastic [ in.fst ]\n";
|
||||||
|
|
||||||
|
float delta = 0.01;
|
||||||
|
bool test_in_log = true;
|
||||||
|
|
||||||
|
ParseOptions po(usage);
|
||||||
|
po.Register("delta", &delta, "Maximum error to accept.");
|
||||||
|
po.Register("test-in-log", &test_in_log,
|
||||||
|
"Test stochasticity in log semiring.");
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (po.NumArgs() > 1) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string fst_in_filename = po.GetOptArg(1);
|
||||||
|
|
||||||
|
Fst<StdArc> *fst = ReadFstKaldiGeneric(fst_in_filename);
|
||||||
|
|
||||||
|
bool ans;
|
||||||
|
StdArc::Weight min, max;
|
||||||
|
if (test_in_log)
|
||||||
|
ans = IsStochasticFstInLog(*fst, delta, &min, &max);
|
||||||
|
else
|
||||||
|
ans = IsStochasticFst(*fst, delta, &min, &max);
|
||||||
|
|
||||||
|
std::cout << min.Value() << " " << max.Value() << '\n';
|
||||||
|
delete fst;
|
||||||
|
if (ans)
|
||||||
|
return 0; // success;
|
||||||
|
else
|
||||||
|
return 1;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,74 @@
|
|||||||
|
// fstbin/fstminimizeencoded.cc
|
||||||
|
|
||||||
|
// Copyright 2009-2011 Microsoft Corporation
|
||||||
|
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fstext/determinize-star.h"
|
||||||
|
#include "fstext/fstext-utils.h"
|
||||||
|
#include "fstext/kaldi-fst-io.h"
|
||||||
|
#include "util/kaldi-io.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
#include "util/text-utils.h"
|
||||||
|
|
||||||
|
/* some test examples:
|
||||||
|
( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint
|
||||||
|
( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile |
|
||||||
|
fstminimizeencoded | fstprint
|
||||||
|
*/
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
using namespace fst; // NOLINT
|
||||||
|
using kaldi::int32;
|
||||||
|
|
||||||
|
const char *usage =
|
||||||
|
"Minimizes FST after encoding [similar to fstminimize, but no "
|
||||||
|
"weight-pushing]\n"
|
||||||
|
"\n"
|
||||||
|
"Usage: fstminimizeencoded [in.fst [out.fst] ]\n";
|
||||||
|
|
||||||
|
float delta = kDelta;
|
||||||
|
ParseOptions po(usage);
|
||||||
|
po.Register("delta", &delta,
|
||||||
|
"Delta likelihood used for quantization of weights");
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (po.NumArgs() > 2) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string fst_in_filename = po.GetOptArg(1),
|
||||||
|
fst_out_filename = po.GetOptArg(2);
|
||||||
|
|
||||||
|
VectorFst<StdArc> *fst = ReadFstKaldi(fst_in_filename);
|
||||||
|
|
||||||
|
MinimizeEncoded(fst, delta);
|
||||||
|
|
||||||
|
WriteFstKaldi(*fst, fst_out_filename);
|
||||||
|
|
||||||
|
delete fst;
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
@ -0,0 +1,133 @@
|
|||||||
|
// fstbin/fsttablecompose.cc
|
||||||
|
|
||||||
|
// Copyright 2009-2011 Microsoft Corporation
|
||||||
|
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||||||
|
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "base/kaldi-common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fstext/fstext-utils.h"
|
||||||
|
#include "fstext/kaldi-fst-io.h"
|
||||||
|
#include "fstext/table-matcher.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
cd ~/tmpdir
|
||||||
|
while true; do
|
||||||
|
fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort
|
||||||
|
> 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst
|
||||||
|
fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed"
|
||||||
|
echo -n "."
|
||||||
|
done
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
using namespace fst; // NOLINT
|
||||||
|
using kaldi::int32;
|
||||||
|
/*
|
||||||
|
fsttablecompose should always give equivalent results to compose,
|
||||||
|
but it is more efficient for certain kinds of inputs.
|
||||||
|
In particular, it is useful when, say, the left FST has states
|
||||||
|
that typically either have epsilon olabels, or
|
||||||
|
one transition out for each of the possible symbols (as the
|
||||||
|
olabel). The same with the input symbols of the right-hand FST
|
||||||
|
is possible.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const char *usage =
|
||||||
|
"Composition algorithm [between two FSTs of standard type, in "
|
||||||
|
"tropical\n"
|
||||||
|
"semiring] that is more efficient for certain cases-- in particular,\n"
|
||||||
|
"where one of the FSTs (the left one, if --match-side=left) has large\n"
|
||||||
|
"out-degree\n"
|
||||||
|
"\n"
|
||||||
|
"Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) "
|
||||||
|
"(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n";
|
||||||
|
|
||||||
|
ParseOptions po(usage);
|
||||||
|
|
||||||
|
TableComposeOptions opts;
|
||||||
|
std::string match_side = "left";
|
||||||
|
std::string compose_filter = "sequence";
|
||||||
|
|
||||||
|
po.Register("connect", &opts.connect, "If true, trim FST before output.");
|
||||||
|
po.Register("match-side", &match_side,
|
||||||
|
"Side of composition to do table "
|
||||||
|
"match, one of: \"left\" or \"right\".");
|
||||||
|
po.Register("compose-filter", &compose_filter,
|
||||||
|
"Composition filter to use, "
|
||||||
|
"one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\"");
|
||||||
|
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (match_side == "left") {
|
||||||
|
opts.table_match_type = MATCH_OUTPUT;
|
||||||
|
} else if (match_side == "right") {
|
||||||
|
opts.table_match_type = MATCH_INPUT;
|
||||||
|
} else {
|
||||||
|
KALDI_ERR << "Invalid match-side option: " << match_side;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (compose_filter == "alt_sequence") {
|
||||||
|
opts.filter_type = ALT_SEQUENCE_FILTER;
|
||||||
|
} else if (compose_filter == "auto") {
|
||||||
|
opts.filter_type = AUTO_FILTER;
|
||||||
|
} else if (compose_filter == "match") {
|
||||||
|
opts.filter_type = MATCH_FILTER;
|
||||||
|
} else if (compose_filter == "sequence") {
|
||||||
|
opts.filter_type = SEQUENCE_FILTER;
|
||||||
|
} else {
|
||||||
|
KALDI_ERR << "Invalid compose-filter option: " << compose_filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (po.NumArgs() < 2 || po.NumArgs() > 3) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2),
|
||||||
|
fst_out_str = po.GetOptArg(3);
|
||||||
|
|
||||||
|
VectorFst<StdArc> *fst1 = ReadFstKaldi(fst1_in_str);
|
||||||
|
|
||||||
|
VectorFst<StdArc> *fst2 = ReadFstKaldi(fst2_in_str);
|
||||||
|
|
||||||
|
// Checks if <fst1> is olabel sorted and <fst2> is ilabel sorted.
|
||||||
|
if (fst1->Properties(fst::kOLabelSorted, true) == 0) {
|
||||||
|
KALDI_WARN << "The first FST is not olabel sorted.";
|
||||||
|
}
|
||||||
|
if (fst2->Properties(fst::kILabelSorted, true) == 0) {
|
||||||
|
KALDI_WARN << "The second FST is not ilabel sorted.";
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorFst<StdArc> composed_fst;
|
||||||
|
|
||||||
|
TableCompose(*fst1, *fst2, &composed_fst, opts);
|
||||||
|
|
||||||
|
delete fst1;
|
||||||
|
delete fst2;
|
||||||
|
|
||||||
|
WriteFstKaldi(composed_fst, fst_out_str);
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,97 @@
|
|||||||
|
#!/usr/bin/env bash

# Download, patch (if needed), and build SRILM inside tools/srilm, then append
# the SRILM and PATH exports to tools/env.sh.
#
# Usage: $0 <name> <organization> <email>
#   (SRI requires this information for the download form.)
# Must be run from the tools/ directory.

current_path=`pwd`
current_dir=`basename "$current_path"`

if [ "tools" != "$current_dir" ]; then
  echo "You should run this script in tools/ directory!!"
  exit 1
fi

# libLBFGS is needed so SRILM can be built with MaxEnt LM support.
if [ ! -d liblbfgs-1.10 ]; then
  echo Installing libLBFGS library to support MaxEnt LMs
  bash extras/install_liblbfgs.sh || exit 1
fi

# SRILM's helper scripts depend on GNU awk specifically.
! command -v gawk > /dev/null && \
  echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;

if [ $# -ne 3 ]; then
  echo "SRILM download requires some information about you"
  echo
  echo "Usage: $0 <name> <organization> <email>"
  exit 1
fi

srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"

if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
  echo 'There was a problem downloading the file.'
  echo 'Check your internet connection and try again.'
  exit 1
fi

mkdir -p srilm
# BUGFIX: abort if the directory change fails; otherwise everything below
# would run (and patch/build) in tools/ itself.
cd srilm || exit 1

if [ -f ../srilm.tgz ]; then
  tar -xvzf ../srilm.tgz # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
  tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
fi

# The RELEASE file contains a dotted version string, e.g. "1.7.3".
major=`gawk -F. '{ print $1 }' RELEASE`
minor=`gawk -F. '{ print $2 }' RELEASE`
micro=`gawk -F. '{ print $3 }' RELEASE`

# Apply the compatibility patch for versions <= 1.7.1.
# BUGFIX: use a proper lexicographic comparison; the previous
# "major<=1 && minor<=7 && micro<=1" test wrongly skipped e.g. 1.6.2
# and 0.9.9, both of which are earlier than 1.7.1.
if [ $major -lt 1 ] || \
   { [ $major -eq 1 ] && \
     { [ $minor -lt 7 ] || { [ $minor -eq 7 ] && [ $micro -le 1 ]; }; }; }; then
  echo "Detected version 1.7.1 or earlier. Applying patch."
  patch -p0 < ../extras/srilm.patch
fi

# Set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf

cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
  > Makefile || exit 1
rm tmpf

mtype=`sbin/machine-type`

# Enable MaxEnt support and point the machine-specific Makefile at liblbfgs.
echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
  sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
  >> common/Makefile.machine.$mtype

grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
  sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
  >> common/Makefile.machine.$mtype

make || exit 1

cd ..
# Append SRILM environment setup to env.sh, unless it is already present.
(
  [ ! -z "${SRILM}" ] && \
    echo >&2 "SRILM variable is already defined. Undefining..." && \
    unset SRILM

  [ -f ./env.sh ] && . ./env.sh

  [ ! -z "${SRILM}" ] && \
    echo >&2 "SRILM config is already in env.sh" && exit

  wd=`pwd`
  wd=`readlink -f $wd || pwd`

  echo "export SRILM=$wd/srilm"
  dirs="\${PATH}"
  for directory in $(cd srilm && find bin -type d ) ; do
    dirs="$dirs:\${SRILM}/$directory"
  done
  echo "export PATH=$dirs"
) >> env.sh

echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
|
@ -0,0 +1,5 @@
|
|||||||
|
# Build script for the arpa2fst tool (ARPA language model -> FST converter).
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

add_executable(arpa2fst ${CMAKE_CURRENT_SOURCE_DIR}/arpa2fst.cc)
# SPEECHX_ROOT is expected to be defined by the including parent project.
target_include_directories(arpa2fst PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
# NOTE(review): the library list is empty, yet arpa2fst.cc uses kaldi lm/util
# and fst symbols — presumably the required targets are injected elsewhere
# (e.g. via parent-scope link defaults); verify the binary actually links.
target_link_libraries(arpa2fst )
|
@ -0,0 +1,145 @@
|
|||||||
|
// bin/arpa2fst.cc
|
||||||
|
//
|
||||||
|
// Copyright 2009-2011 Gilles Boulianne.
|
||||||
|
//
|
||||||
|
// See ../../COPYING for clarification regarding multiple authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
||||||
|
// See the Apache 2 License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "lm/arpa-lm-compiler.h"
|
||||||
|
#include "util/kaldi-io.h"
|
||||||
|
#include "util/parse-options.h"
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
using namespace kaldi; // NOLINT
|
||||||
|
try {
|
||||||
|
const char *usage =
|
||||||
|
"Convert an ARPA format language model into an FST\n"
|
||||||
|
"Usage: arpa2fst [opts] <input-arpa> <output-fst>\n"
|
||||||
|
" e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table="
|
||||||
|
"data/lang/words.txt lm/input.arpa G.fst\n\n"
|
||||||
|
"Note: When called without switches, the output G.fst will contain\n"
|
||||||
|
"an embedded symbol table. This is compatible with the way a previous\n"
|
||||||
|
"version of arpa2fst worked.\n";
|
||||||
|
|
||||||
|
ParseOptions po(usage);
|
||||||
|
|
||||||
|
ArpaParseOptions options;
|
||||||
|
options.Register(&po);
|
||||||
|
|
||||||
|
// Option flags.
|
||||||
|
std::string bos_symbol = "<s>";
|
||||||
|
std::string eos_symbol = "</s>";
|
||||||
|
std::string disambig_symbol;
|
||||||
|
std::string read_syms_filename;
|
||||||
|
std::string write_syms_filename;
|
||||||
|
bool keep_symbols = false;
|
||||||
|
bool ilabel_sort = true;
|
||||||
|
|
||||||
|
po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol");
|
||||||
|
po.Register("eos-symbol", &eos_symbol, "End of sentence symbol");
|
||||||
|
po.Register("disambig-symbol", &disambig_symbol,
|
||||||
|
"Disambiguator. If provided (e. g. #0), used on input side of "
|
||||||
|
"backoff links, and <s> and </s> are replaced with epsilons");
|
||||||
|
po.Register("read-symbol-table", &read_syms_filename,
|
||||||
|
"Use existing symbol table");
|
||||||
|
po.Register("write-symbol-table", &write_syms_filename,
|
||||||
|
"Write generated symbol table to a file");
|
||||||
|
po.Register("keep-symbols", &keep_symbols,
|
||||||
|
"Store symbol table with FST. Symbols always saved to FST if "
|
||||||
|
"symbol tables are neither read or written (otherwise symbols "
|
||||||
|
"would be lost entirely)");
|
||||||
|
po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST");
|
||||||
|
|
||||||
|
po.Read(argc, argv);
|
||||||
|
|
||||||
|
if (po.NumArgs() != 1 && po.NumArgs() != 2) {
|
||||||
|
po.PrintUsage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
std::string arpa_rxfilename = po.GetArg(1),
|
||||||
|
fst_wxfilename = po.GetOptArg(2);
|
||||||
|
|
||||||
|
int64 disambig_symbol_id = 0;
|
||||||
|
|
||||||
|
fst::SymbolTable *symbols;
|
||||||
|
if (!read_syms_filename.empty()) {
|
||||||
|
// Use existing symbols. Required symbols must be in the table.
|
||||||
|
kaldi::Input kisym(read_syms_filename);
|
||||||
|
symbols = fst::SymbolTable::ReadText(
|
||||||
|
kisym.Stream(), PrintableWxfilename(read_syms_filename));
|
||||||
|
if (symbols == NULL)
|
||||||
|
KALDI_ERR << "Could not read symbol table from file "
|
||||||
|
<< read_syms_filename;
|
||||||
|
|
||||||
|
options.oov_handling = ArpaParseOptions::kSkipNGram;
|
||||||
|
if (!disambig_symbol.empty()) {
|
||||||
|
disambig_symbol_id = symbols->Find(disambig_symbol);
|
||||||
|
if (disambig_symbol_id == -1) // fst::kNoSymbol
|
||||||
|
KALDI_ERR << "Symbol table " << read_syms_filename
|
||||||
|
<< " has no symbol for " << disambig_symbol;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Create a new symbol table and populate it from ARPA file.
|
||||||
|
symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename));
|
||||||
|
options.oov_handling = ArpaParseOptions::kAddToSymbols;
|
||||||
|
symbols->AddSymbol("<eps>", 0);
|
||||||
|
if (!disambig_symbol.empty()) {
|
||||||
|
disambig_symbol_id = symbols->AddSymbol(disambig_symbol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add or use existing BOS and EOS.
|
||||||
|
options.bos_symbol = symbols->AddSymbol(bos_symbol);
|
||||||
|
options.eos_symbol = symbols->AddSymbol(eos_symbol);
|
||||||
|
|
||||||
|
// If producing new (not reading existing) symbols and not saving them,
|
||||||
|
// need to keep symbols with FST, otherwise they would be lost.
|
||||||
|
if (read_syms_filename.empty() && write_syms_filename.empty())
|
||||||
|
keep_symbols = true;
|
||||||
|
|
||||||
|
// Actually compile LM.
|
||||||
|
KALDI_ASSERT(symbols != NULL);
|
||||||
|
ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols);
|
||||||
|
{
|
||||||
|
Input ki(arpa_rxfilename);
|
||||||
|
lm_compiler.Read(ki.Stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the FST in-place if requested by options.
|
||||||
|
if (ilabel_sort) {
|
||||||
|
fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write symbols if requested.
|
||||||
|
if (!write_syms_filename.empty()) {
|
||||||
|
kaldi::Output kosym(write_syms_filename, false);
|
||||||
|
symbols->WriteText(kosym.Stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write LM FST.
|
||||||
|
bool write_binary = true, write_header = false;
|
||||||
|
kaldi::Output kofst(fst_wxfilename, write_binary, write_header);
|
||||||
|
fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename));
|
||||||
|
wopts.write_isymbols = wopts.write_osymbols = keep_symbols;
|
||||||
|
lm_compiler.Fst().Write(kofst.Stream(), wopts);
|
||||||
|
|
||||||
|
delete symbols;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
std::cerr << e.what();
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in new issue