#!/usr/bin/env perl
# Copyright 2010-2011  Microsoft Corporation
#           2013-2016  Johns Hopkins University (author: Daniel Povey)
#                2015  Hainan Xu
#                2015  Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0 is
# reserved for the symbol in the grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

use strict;
use warnings;

# Returns the usage/help text printed when the command line is malformed.
sub _usage {
    return
        "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
        "This script adds disambiguation symbols to a lexicon in order to\n" .
        "make decoding graphs determinizable; it adds pseudo-phone\n" .
        "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
        "to ensure that all pronunciations are different, and that none\n" .
        "is a prefix of another.\n" .
        "It prints to the standard output the number of the largest-numbered\n" .
        "disambiguation symbol that was used.\n" .
        "\n" .
        "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
        "           --sil-probs        [should be with --pron-probs option]\n" .
        "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
        "                              the silence probability model\n" .
        "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
        "                              that this script is allowed to add.  By default this is\n" .
        "                              #1, but you can set this to a larger value using this option.\n" .
        "e.g.:\n" .
        " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
        " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
        " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}

# Splits one whitespace-normalized lexicon line into its parts.
#   $line        the lexicon line
#   $pron_probs  true if the 2nd field is a pronunciation probability
#   $sil_probs   true if three silence-probability fields follow
# Returns ($word, \@extras, \@phones), where @extras holds the numeric
# fields (pron-prob first, then the three silprob fields) that were present.
sub _split_entry {
    my ($line, $pron_probs, $sil_probs) = @_;
    my @fields = split(" ", $line);
    my $word = shift @fields;
    my $num_extras = ($pron_probs ? 1 : 0) + ($sil_probs ? 3 : 0);
    my @extras = splice(@fields, 0, $num_extras);
    return ($word, \@extras, \@fields);
}

# Dies with a "Bad lexicon line" message unless the numeric fields of an
# entry are in range and the entry has at least one phone.
sub _validate_entry {
    my ($line, $extras, $phones, $pron_probs, $sil_probs) = @_;
    my @numbers = @$extras;
    if ($pron_probs) {
        my $p = shift @numbers;
        if (!(defined $p && $p > 0.0 && $p <= 1.0)) {
            die "Bad lexicon line $line (expecting pron-prob as second field)";
        }
    }
    if ($sil_probs) {
        my $silp = shift @numbers;
        if (!(defined $silp && $silp > 0.0 && $silp <= 1.0)) {
            die "Bad lexicon line $line for silprobs";
        }
        # The two remaining fields are correction terms; both must be positive.
        for my $i (0, 1) {
            my $correction = $numbers[$i];
            if (!defined $correction || $correction <= 0.0) {
                die "Bad lexicon line $line for silprobs";
            }
        }
    }
    if (!@$phones) {
        die "Bad lexicon line $line, no phone in phone list";
    }
    return;
}

# Core algorithm: decides which pronunciations need a disambiguation symbol
# and appends it.
#   $lexicon                 array ref of whitespace-normalized lexicon lines
#   $pron_probs, $sil_probs  flags describing the line format (see above)
#   $first_allowed_disambig  lowest symbol number this script may use
# Returns (\@output_lines, $max_disambig); the output lines are tab-separated
# and carry no trailing newline.  $max_disambig is the highest-numbered symbol
# used, or $first_allowed_disambig - 1 if none was needed.
# Dies on a malformed lexicon line.
sub _add_disambig_symbols {
    my ($lexicon, $pron_probs, $sil_probs, $first_allowed_disambig) = @_;

    # Parse every line once; each entry is [line, word, \@extras, \@phones].
    my @entries = map { [ $_, _split_entry($_, $pron_probs, $sil_probs) ] } @$lexicon;

    # (2) Work out the count of each phone-sequence in the lexicon.
    my %count;
    for my $entry (@entries) {
        my ($line, undef, $extras, $phones) = @$entry;
        _validate_entry($line, $extras, $phones, $pron_probs, $sil_probs);
        $count{join(" ", @$phones)}++;
    }

    # (3) For each left sub-sequence of each phone-sequence, note down
    # that it exists (for identifying prefixes of longer strings).
    # All numeric fields have already been stripped by _split_entry, so only
    # real phones take part in the prefix computation.
    my %is_prefix;
    for my $entry (@entries) {
        my @prefix = @{ $entry->[3] };
        while (@prefix > 0) {
            pop @prefix;    # Remove last phone
            $is_prefix{join(" ", @prefix)} = 1;
        }
    }

    # (4) For each entry in the lexicon:
    #  if the phone sequence is unique and is not a
    #  prefix of another word, no disambig symbol.
    #  Else output #1, or #2, #3, ... if the same phone-seq
    #  has already been assigned a disambig symbol.

    # max_disambig will always be the highest-numbered disambiguation symbol
    # that has been used so far.
    my $max_disambig = $first_allowed_disambig - 1;
    my %last_used_disambig_symbol_of;
    my %reserved_for_the_empty_string;
    my @output;
    for my $entry (@entries) {
        my (undef, $word, $extras, $phones) = @$entry;
        my $phnseq = join(" ", @$phones);
        if (defined $is_prefix{$phnseq} || $count{$phnseq} != 1) {
            if ($phnseq eq "") {
                # Need disambig symbols for the empty string that are not
                # used anywhere else.  Validation above currently rejects
                # empty pronunciations, so this branch is defensive only.
                $max_disambig++;
                $reserved_for_the_empty_string{$max_disambig} = 1;
                $phnseq = "#$max_disambig";
            }
            else {
                my $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
                # Get a number that has not been used yet for this phone
                # sequence.
                $cur_disambig = defined $cur_disambig
                    ? $cur_disambig + 1
                    : $first_allowed_disambig;
                while (defined $reserved_for_the_empty_string{$cur_disambig}) {
                    $cur_disambig++;
                }
                if ($cur_disambig > $max_disambig) {
                    $max_disambig = $cur_disambig;
                }
                $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
                $phnseq = $phnseq . " #" . $cur_disambig;
            }
        }
        # The numeric fields are only echoed when --pron-probs is given;
        # --sil-probs on its own leaves the output as "word<TAB>phones".
        my @kept = $pron_probs ? @$extras : ();
        push @output, join("\t", $word, @kept, $phnseq);
    }
    return (\@output, $max_disambig);
}

# Opens $path with the given mode ('<' or '>'), dying with $error on failure.
# "-" selects STDIN/STDOUT, which is what the two-argument open() this script
# used to rely on did for "<-" and ">-".
sub _open_lexicon {
    my ($mode, $path, $error) = @_;
    if ($path eq '-') {
        return $mode eq '<' ? \*STDIN : \*STDOUT;
    }
    open(my $fh, $mode, $path) or die $error;
    return $fh;
}

# Script entry point: parses options, reads the input lexicon, writes the
# disambiguated lexicon and prints the largest symbol number used to STDOUT.
sub main {
    my @args = @_;

    my $pron_probs = 0;
    my $sil_probs = 0;
    my $first_allowed_disambig = 1;

    # Options may appear in any order; stop at the first non-option argument.
    while (@args > 0) {
        if ($args[0] eq "--pron-probs") {
            $pron_probs = 1;
            shift @args;
        }
        elsif ($args[0] eq "--sil-probs") {
            $sil_probs = 1;
            shift @args;
        }
        elsif ($args[0] eq "--first-allowed-disambig") {
            my $value = $args[1];
            $first_allowed_disambig = defined $value ? 0 + $value : 0;
            if ($first_allowed_disambig < 1) {
                die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
            }
            shift @args;
            shift @args;
        }
        else {
            last;
        }
    }

    if (@args != 2) {
        die _usage();
    }
    my ($lexfn, $lexoutfn) = @args;

    # (1) Read in the lexicon, normalizing whitespace on every line.
    my $in = _open_lexicon('<', $lexfn, "Error opening lexicon $lexfn");
    my @lexicon;
    while (my $line = <$in>) {
        push @lexicon, join(" ", split(" ", $line));
    }
    close($in) if $lexfn ne '-';

    my ($output, $max_disambig) =
        _add_disambig_symbols(\@lexicon, $pron_probs, $sil_probs, $first_allowed_disambig);

    my $out = _open_lexicon('>', $lexoutfn, "Opening lexicon file $lexoutfn for writing.\n");
    for my $line (@$output) {
        print {$out} "$line\n";
    }
    if ($lexoutfn ne '-') {
        # Buffered write errors only surface at close time.
        close($out) or die "Error closing lexicon file $lexoutfn: $!\n";
    }

    print $max_disambig . "\n";
    return;
}

# Run only when executed as a script, so the helpers can be loaded and
# exercised on their own.
main(@ARGV) unless caller();

1;