You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
43 lines
1.4 KiB
43 lines
1.4 KiB
#!/usr/bin/env perl
|
|
# Copyright 2010-2011 Microsoft Corporation
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
|
# MERCHANTABILITY OR NON-INFRINGEMENT.
|
|
# See the Apache 2 License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# This script removes lines whose third or fourth field is one of the
# OOV symbols listed in the unknown-symbol list file. It is intended
# to remove arcs that carry OOVs from printed FSTs (probably compiled
# from ARPAs that contain OOVs).
|
|
|
|
# Validate the command line: one or two arguments are accepted (the
# unknown-symbol list, optionally followed by a printed FST file).
# The two range tests must be joined with "||"; with "&&" the condition
# can never hold (a count cannot be both < 1 and > 2), which made the
# usage message unreachable.
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}
|
|
|
|
# Load the unknown-symbol list named by the first command-line argument
# into %unk (symbol => 1). Every line of the list must consist of exactly
# one whitespace-separated token; any other line is a fatal error.
my $unklist = shift @ARGV;

# Three-argument open() on an undefined path would silently open an
# anonymous temporary file, so reject a missing argument explicitly.
defined $unklist
    or die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";

my %unk;
my $unk_fh;
if ($unklist eq '-') {
    # The former two-argument open("<$unklist") read standard input when
    # given "-"; keep that behavior for existing callers.
    $unk_fh = \*STDIN;
}
else {
    # Three-argument open takes the file name literally (no whitespace
    # trimming, no special meaning for a leading "&").
    open($unk_fh, '<', $unklist)
        or die "Failed opening unknown-symbol list $unklist: $!\n";
}

while (my $line = <$unk_fh>) {
    my @tokens = split(" ", $line);
    @tokens == 1 or die "Bad line in unknown-symbol list: $line";
    $unk{$tokens[0]} = 1;
}

# Release the list file once it is loaded; leave STDIN open because the
# main input may still be read from it.
close($unk_fh) if $unklist ne '-';
|
|
|
|
# Copy the input (the remaining command-line file, or standard input when
# none is given) to standard output, skipping every line whose third or
# fourth whitespace-separated field is a key of %unk.
my $num_removed = 0;
while (my $line = <>) {
    my @fields = split(" ", $line);

    # Lines with fewer than four fields yield undef for the missing
    # positions; those can never match an entry of %unk.
    my $has_oov = grep { defined $_ && defined $unk{$_} } @fields[2, 3];

    if ($has_oov) {
        $num_removed++;
        next;
    }
    print $line;
}

# Report the number of dropped lines on stderr, separate from the
# filtered output written to stdout.
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
|