You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
195 lines
6.6 KiB
195 lines
6.6 KiB
4 years ago
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
#           2013-2016 Johns Hopkins University (author: Daniel Povey)
#                2015 Hainan Xu
#                2015 Guoguo Chen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.

# parse_options(\@args)
#
# Consumes the leading option flags from the front of the array referenced by
# its argument (normally \@ARGV), leaving the positional arguments in place.
# Options may appear in any order; parsing stops at the first argument that is
# not a recognised option.
#
# Returns the list (pron_probs, sil_probs, first_allowed_disambig):
#   pron_probs              1 if --pron-probs was given, else 0
#   sil_probs               1 if --sil-probs was given, else 0
#   first_allowed_disambig  value of --first-allowed-disambig <n>, default 1
#
# Dies if the value of --first-allowed-disambig is missing, is less than 1, or
# is not a whole number (a fractional value would produce symbols such as
# "#2.5").
sub parse_options {
  my ($args) = @_;
  my ($pron, $sil, $first) = (0, 0, 1);

  while (@$args > 0) {
    my $opt = $args->[0];
    if ($opt eq "--pron-probs") {
      $pron = 1;
      shift @$args;
    } elsif ($opt eq "--sil-probs") {
      $sil = 1;
      shift @$args;
    } elsif ($opt eq "--first-allowed-disambig") {
      shift @$args;
      my $value = shift @$args;
      # A missing value is treated as 0, which is rejected below.
      $first = 0 + (defined $value ? $value : 0);
      if ($first < 1 || $first != int($first)) {
        die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first\n";
      }
    } else {
      last;  # First positional argument: stop option parsing.
    }
  }
  return ($pron, $sil, $first);
}

($pron_probs, $sil_probs, $first_allowed_disambig) = parse_options(\@ARGV);

# After option parsing exactly two positional arguments must remain: the input
# lexicon and the output lexicon.  Anything else prints the usage message to
# the standard error and exits with a non-zero status (via die).
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
      "This script adds disambiguation symbols to a lexicon in order to\n" .
      "make decoding graphs determinizable; it adds pseudo-phone\n" .
      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
      "to ensure that all pronunciations are different, and that none\n" .
      "is a prefix of another.\n" .
      # The line break below was missing, which glued "largest-numbered" and
      # "disambiguation" into a single word in the printed message.
      "It prints to the standard output the number of the largest-numbered\n" .
      "disambiguation symbol that was used.\n" .
      "\n" .
      "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
      " --sil-probs [should be with --pron-probs option]\n" .
      " Expect 3 extra fields after the pron-probs, for aspects of\n" .
      " the silence probability model\n" .
      " --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
      " that this script is allowed to add. By default this is\n" .
      " #1, but you can set this to a larger value using this option.\n" .
      "e.g.:\n" .
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}


$lexfn = shift @ARGV;     # input lexicon
$lexoutfn = shift @ARGV;  # output lexicon (with disambiguation symbols)

# (1) Read in the lexicon.
# Each line is stored in @L with its whitespace normalised: leading/trailing
# whitespace removed and every run of whitespace collapsed to a single space.
# The three-argument open treats the filename literally; "-" is still read
# from the standard input, as it was with the former two-argument open.
my $lex_in;
if ($lexfn eq "-") {
  $lex_in = \*STDIN;
} else {
  open($lex_in, "<", $lexfn) || die "Error opening lexicon $lexfn: $!";
}

@L = ( );
while (my $line = <$lex_in>) {
  push @L, join(" ", split(" ", $line));
}
close($lex_in);

# (2) Work out the count of each phone-sequence in the lexicon.
#
# count_phone_sequences(\@lines, $pron_probs, $sil_probs)
#
# Each line is "<word> [<pron-prob>] [<silprob> <corr> <corr>] <phone> ...".
# The pron-prob field is expected only when $pron_probs is true, and the three
# silence-probability fields only when $sil_probs is true.
#
# Returns a reference to a hash mapping each phone sequence (phones joined by
# single spaces) to the number of lexicon entries that have it.
#
# Dies on a malformed line: pron-prob or silprob outside (0, 1], a correction
# term that is not positive, or an entry with no phones.
sub count_phone_sequences {
  my ($lines, $pron_probs, $sil_probs) = @_;
  my %count;

  foreach my $line (@$lines) {
    my @fields = split(" ", $line);
    shift @fields;  # Remove word.

    if ($pron_probs) {
      my $p = shift @fields;
      if (!($p > 0.0 && $p <= 1.0)) {
        die "Bad lexicon line $line (expecting pron-prob as second field)";
      }
    }
    if ($sil_probs) {
      my $silp = shift @fields;
      if (!($silp > 0.0 && $silp <= 1.0)) {
        die "Bad lexicon line $line for silprobs";
      }
      # Two correction terms follow the silence probability.
      foreach my $which (1, 2) {
        my $correction = shift @fields;
        if ($correction <= 0.0) {
          die "Bad lexicon line $line for silprobs";
        }
      }
    }
    if (!@fields) {
      # Report the offending line itself (this used to interpolate $1, a regex
      # capture variable, so the message never showed the line).
      die "Bad lexicon line $line, no phone in phone list";
    }
    $count{join(" ", @fields)}++;
  }
  return \%count;
}

%count = %{ count_phone_sequences(\@L, $pron_probs, $sil_probs) };

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
#
# find_proper_prefixes(\@lines, $pron_probs, $sil_probs)
#
# Returns a reference to a hash whose keys are all proper prefixes (including
# the empty sequence) of the phone sequences in the lexicon, each mapped to 1.
# The leading non-phone fields are stripped first: the word, the pron-prob if
# $pron_probs is true, and all THREE silence-probability fields if $sil_probs
# is true.  (Only two were stripped before, so the third number was treated as
# the first phone and no real prefix was ever recorded with --sil-probs.)
sub find_proper_prefixes {
  my ($lines, $pron_probs, $sil_probs) = @_;
  my %issubseq;

  foreach my $line (@$lines) {
    my @phones = split(" ", $line);
    shift @phones;                           # Remove word.
    shift @phones if $pron_probs;            # Remove pron-prob.
    splice(@phones, 0, 3) if $sil_probs;     # Remove the three silprob fields.

    while (@phones > 0) {
      pop @phones;                           # Remove last phone.
      $issubseq{join(" ", @phones)} = 1;
    }
  }
  return \%issubseq;
}

%issubseq = %{ find_proper_prefixes(\@L, $pron_probs, $sil_probs) };

# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.
#
# add_disambig_symbols(\@lines, \%count, \%issubseq,
#                      $pron_probs, $sil_probs, $first_allowed_disambig)
#
#   \@lines     whitespace-normalised lexicon lines
#   \%count     phone sequence -> number of entries having it
#   \%issubseq  phone sequences that are a proper prefix of some entry
#
# Returns (\@out_lines, $max_disambig): the tab-separated output lines (without
# trailing newline) and the highest-numbered disambiguation symbol used, which
# is $first_allowed_disambig - 1 when none was needed.
sub add_disambig_symbols {
  my ($lines, $count, $issubseq,
      $pron_probs, $sil_probs, $first_allowed_disambig) = @_;

  # max_disambig will always be the highest-numbered disambiguation symbol
  # that has been used so far.
  my $max_disambig = $first_allowed_disambig - 1;
  my %reserved_for_the_empty_string;
  my %last_used_disambig_symbol_of;
  my @out_lines;

  foreach my $line (@$lines) {
    my @fields = split(" ", $line);
    my $word = shift @fields;
    my $pron_prob;
    my ($sil_word_prob, $word_sil_correction, $prev_nonsil_correction);
    if ($pron_probs) {
      $pron_prob = shift @fields;
    }
    if ($sil_probs) {
      ($sil_word_prob, $word_sil_correction, $prev_nonsil_correction) =
        splice(@fields, 0, 3);
    }
    my $phnseq = join(" ", @fields);

    my $is_unambiguous =
      !defined $issubseq->{$phnseq} && $count->{$phnseq} == 1;
    if (!$is_unambiguous) {
      if ($phnseq eq "") {
        # Need disambig symbols for the empty string that are not used
        # anywhere else.
        $max_disambig++;
        $reserved_for_the_empty_string{$max_disambig} = 1;
        $phnseq = "#$max_disambig";
      } else {
        my $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
        if (!defined $cur_disambig) {
          $cur_disambig = $first_allowed_disambig;
        } else {
          # Get a number that has not been used yet for this phone sequence.
          $cur_disambig++;
        }
        # Skip numbers set aside for empty-pronunciation entries.
        while (defined $reserved_for_the_empty_string{$cur_disambig}) {
          $cur_disambig++;
        }
        if ($cur_disambig > $max_disambig) {
          $max_disambig = $cur_disambig;
        }
        $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
        $phnseq = $phnseq . " #" . $cur_disambig;
      }
    }

    # The silprob fields are only written when --pron-probs is also given.
    if ($pron_probs) {
      if ($sil_probs) {
        push @out_lines, join("\t", $word, $pron_prob, $sil_word_prob,
                              $word_sil_correction, $prev_nonsil_correction,
                              $phnseq);
      } else {
        push @out_lines, join("\t", $word, $pron_prob, $phnseq);
      }
    } else {
      push @out_lines, join("\t", $word, $phnseq);
    }
  }
  return (\@out_lines, $max_disambig);
}

# Open the output with a three-argument open so the filename is taken
# literally; "-" still means the standard output, as it did with the former
# two-argument open (a duplicate handle is used so it can be closed safely).
my $lex_out;
if ($lexoutfn eq "-") {
  open($lex_out, ">&", \*STDOUT)
    || die "Error opening standard output for writing: $!\n";
} else {
  open($lex_out, ">", $lexoutfn)
    || die "Error opening lexicon file $lexoutfn for writing: $!\n";
}

my $disambig_lines;
($disambig_lines, $max_disambig) =
  add_disambig_symbols(\@L, \%count, \%issubseq,
                       $pron_probs, $sil_probs, $first_allowed_disambig);

foreach my $out_line (@$disambig_lines) {
  print $lex_out "$out_line\n";
}

# Buffered write errors (e.g. a full disk) only surface at close, so check it
# before reporting success.
close($lex_out) || die "add_lex_disambig.pl: error writing $lexoutfn: $!\n";

print $max_disambig . "\n";