parent
913b2300c3
commit
c6e8a33b73
@ -0,0 +1,3 @@
|
||||
# Utils
|
||||
|
||||
* [kaldi utils](https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/egs/wsj/s5/utils)
|
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
# 2020 author Jiayu DU
# Apache 2.0

# This script reads in an Arpa format language model, and converts it into the
# KenLM format language model.
#
# Usage: <this-script> [options] <arpa-lm-path> <kenlm-path>
# Exits non-zero if KenLM's build_binary is not on PATH, if the output
# directory cannot be created, or if build_binary itself fails.

[ -f path.sh ] && . ./path.sh;

# begin configuration section
kenlm_opts="" # e.g. "-q 8 -b 8" for 8bits quantization
model_type="trie" # "trie" or "probing". trie is smaller, probing is faster.
# end configuration section

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo " $0 [options] <arpa-lm-path> <kenlm-path>"
  echo "e.g.:"
  echo " $0 data/local/lm/4gram.arpa data/lang_test/G.trie"
  echo "Options:"
  echo " --model-type can be either \"trie\" or \"probing\""
  echo " --kenlm-opts directly pass through to kenlm"
  echo " e.g. for 8bits quantization, feed \"-q 8 -b 8\""
  exit 1;
fi

export LC_ALL=C

arpa_lm=$1
kenlm=$2

if ! which build_binary >& /dev/null ; then
  echo "$0: cannot find KenLM's build_binary tool,"
  echo "check kenlm installation (tools/extras/install_kenlm_query_only.sh)."
  exit 1
fi

if ! mkdir -p "$(dirname "$kenlm")"; then
  echo "$0: cannot create output directory for $kenlm" >&2
  exit 1
fi

# $kenlm_opts is deliberately left unquoted: it holds several options
# (e.g. "-q 8 -b 8") that must be split into separate arguments.
if ! build_binary $kenlm_opts "$model_type" "$arpa_lm" "$kenlm"; then
  # Without this check the script used to report success even when the
  # conversion failed.
  echo "$0: build_binary failed to convert $arpa_lm into $kenlm" >&2
  exit 1
fi

echo "$0: Successfully built arpa into kenlm format: $kenlm"
exit 0
|
@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env perl
|
||||
use warnings; #sed replacement for -w perl parameter
|
||||
# In general, doing
|
||||
# run.pl some.log a b c is like running the command a b c in
|
||||
# the bash shell, and putting the standard error and output into some.log.
|
||||
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
|
||||
# run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
|
||||
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
|
||||
# If any of the jobs fails, this script will fail.
|
||||
|
||||
# A typical example is:
|
||||
# run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz
|
||||
# and run.pl will run something like:
|
||||
# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log
|
||||
#
|
||||
# Basically it takes the command-line arguments, quotes them
|
||||
# as necessary to preserve spaces, and evaluates them with bash.
|
||||
# In addition it puts the command line at the top of the log, and
|
||||
# the start and end times of the command at the beginning and end.
|
||||
# The reason why this is useful is so that we can create a different
|
||||
# version of this program that uses a queueing system instead.
|
||||
|
||||
#use Data::Dumper;
|
||||
|
||||
@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
|
||||
|
||||
#print STDERR "COMMAND-LINE: " . Dumper(\@ARGV) . "\n";
|
||||
$job_pick = 'all';
|
||||
$max_jobs_run = -1;
|
||||
$jobstart = 1;
|
||||
$jobend = 1;
|
||||
$ignored_opts = ""; # These will be ignored.
|
||||
|
||||
# First parse an option like JOB=1:4, and any
|
||||
# options that would normally be given to
|
||||
# queue.pl, which we will just discard.
|
||||
|
||||
for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
|
||||
# allow the JOB=1:n option to be interleaved with the
|
||||
# options to qsub.
|
||||
while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
|
||||
# parse any options that would normally go to qsub, but which will be ignored here.
|
||||
my $switch = shift @ARGV;
|
||||
if ($switch eq "-V") {
|
||||
$ignored_opts .= "-V ";
|
||||
} elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
|
||||
# we do support the option --max-jobs-run n, and its GridEngine form -tc n.
|
||||
# if the command appears multiple times uses the smallest option.
|
||||
if ( $max_jobs_run <= 0 ) {
|
||||
$max_jobs_run = shift @ARGV;
|
||||
} else {
|
||||
my $new_constraint = shift @ARGV;
|
||||
if ( ($new_constraint < $max_jobs_run) ) {
|
||||
$max_jobs_run = $new_constraint;
|
||||
}
|
||||
}
|
||||
|
||||
if (! ($max_jobs_run > 0)) {
|
||||
die "run.pl: invalid option --max-jobs-run $max_jobs_run";
|
||||
}
|
||||
} else {
|
||||
my $argument = shift @ARGV;
|
||||
if ($argument =~ m/^--/) {
|
||||
print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
|
||||
}
|
||||
if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
|
||||
$ignored_opts .= "-sync "; # Note: in the
|
||||
# corresponding code in queue.pl it says instead, just "$sync = 1;".
|
||||
} elsif ($switch eq "-pe") { # e.g. -pe smp 5
|
||||
my $argument2 = shift @ARGV;
|
||||
$ignored_opts .= "$switch $argument $argument2 ";
|
||||
} elsif ($switch eq "--gpu") {
|
||||
$using_gpu = $argument;
|
||||
} elsif ($switch eq "--pick") {
|
||||
if($argument =~ m/^(all|failed|incomplete)$/) {
|
||||
$job_pick = $argument;
|
||||
} else {
|
||||
# Report the bad --pick value on its own line; $job_pick keeps its previous value.
print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'\n";
|
||||
}
|
||||
} else {
|
||||
# Ignore option.
|
||||
$ignored_opts .= "$switch $argument ";
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
|
||||
$jobname = $1;
|
||||
$jobstart = $2;
|
||||
$jobend = $3;
|
||||
if ($jobstart > $jobend) {
|
||||
die "run.pl: invalid job range $ARGV[0]";
|
||||
}
|
||||
if ($jobstart <= 0) {
|
||||
die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
|
||||
}
|
||||
shift;
|
||||
} elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
|
||||
$jobname = $1;
|
||||
$jobstart = $2;
|
||||
$jobend = $2;
|
||||
shift;
|
||||
} elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
|
||||
print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
|
||||
}
|
||||
}
|
||||
|
||||
# Users found this message confusing so we are removing it.
|
||||
# if ($ignored_opts ne "") {
|
||||
# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
|
||||
# }
|
||||
|
||||
if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
|
||||
# then work out the number of processors if possible,
|
||||
# and set it based on that.
|
||||
$max_jobs_run = 0;
|
||||
if ($using_gpu) {
|
||||
if (open(P, "nvidia-smi -L |")) {
|
||||
$max_jobs_run++ while (<P>);
|
||||
close(P);
|
||||
}
|
||||
if ($max_jobs_run == 0) {
|
||||
$max_jobs_run = 1;
|
||||
print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
|
||||
}
|
||||
} elsif (open(P, "</proc/cpuinfo")) { # Linux
|
||||
while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
|
||||
if ($max_jobs_run == 0) {
|
||||
print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
|
||||
$max_jobs_run = 10; # reasonable default.
|
||||
}
|
||||
close(P);
|
||||
} elsif (open(P, "sysctl -a |")) { # BSD/Darwin
|
||||
while (<P>) {
|
||||
if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
|
||||
$max_jobs_run = $1;
|
||||
last;
|
||||
}
|
||||
}
|
||||
close(P);
|
||||
if ($max_jobs_run == 0) {
|
||||
print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
|
||||
$max_jobs_run = 10; # reasonable default.
|
||||
}
|
||||
} else {
|
||||
# allow at most 32 jobs at once, on non-UNIX systems; change this code
|
||||
# if you need to change this default.
|
||||
$max_jobs_run = 32;
|
||||
}
|
||||
# The just-computed value of $max_jobs_run is just the number of processors
|
||||
# (or our best guess); and if it happens that the number of jobs we need to
|
||||
# run is just slightly above $max_jobs_run, it will make sense to increase
|
||||
# $max_jobs_run to equal the number of jobs, so we don't have a small number
|
||||
# of leftover jobs.
|
||||
$num_jobs = $jobend - $jobstart + 1;
|
||||
if (!$using_gpu &&
|
||||
$num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
|
||||
$max_jobs_run = $num_jobs;
|
||||
}
|
||||
}
|
||||
|
||||
sub pick_or_exit {
  # pick_or_exit ( $logfile )
  #
  # Called in the child process just before a job is launched.  Looks at the
  # log file left by a previous attempt (if any) and either returns, meaning
  # "run the job", or terminates the calling process with exit().
  #
  # Reads the global $job_pick (set by the --pick command line switch):
  #   'all'         always return, the previous log is not even opened.
  #   otherwise     the last "# Ended (code ...)" line of the log decides:
  #     - log cannot be opened, or has no such line: return (job is incomplete
  #       and incomplete jobs are always run);
  #     - "code 0": exit(0), the job already completed successfully;
  #     - non-zero code: return if --pick is 'failed', else exit(1) so that the
  #       failure is still reported even though the job is not re-run.
  #
  # Original logic by Alexandre Felipe, (o.alexandre.felipe@gmail.com),
  # 14th of August of 2020.
  my ($log_path) = @_;

  # Default behaviour: run everything regardless of earlier attempts.
  return if $job_pick eq 'all';

  # No readable log: the job has not been executed yet.
  open(my $log_fh, "<", $log_path) or return;

  # Keep only the last line carrying the exit status.
  my $ended_line;
  while (defined(my $line = <$log_fh>)) {
    $ended_line = $line if $line =~ m/# Ended \(code .*/;
  }
  close $log_fh;

  # Log exists but the job never wrote its final status: incomplete.
  return unless defined $ended_line;

  # Finished successfully before, nothing to do.
  exit(0) if $ended_line =~ m/# Ended \(code 0\).*/;

  if ($ended_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/) {
    # Failed before; only re-run when failed jobs were asked for.
    exit(1) if $job_pick !~ m/^(failed|all)$/;
    return;
  }

  # Status line present but not in a recognised form: treat as incomplete.
  return;
}
|
||||
|
||||
|
||||
$logfile = shift @ARGV;
|
||||
|
||||
if (defined $jobname && $logfile !~ m/$jobname/ &&
|
||||
$jobend > $jobstart) {
|
||||
print STDERR "run.pl: you are trying to run a parallel job but "
|
||||
. "you are putting the output into just one log file ($logfile)\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
$cmd = "";
|
||||
|
||||
foreach $x (@ARGV) {
|
||||
if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
|
||||
elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
|
||||
else { $cmd .= "\"$x\" "; }
|
||||
}
|
||||
|
||||
#$Data::Dumper::Indent=0;
|
||||
$ret = 0;
|
||||
$numfail = 0;
|
||||
%active_pids=();
|
||||
|
||||
use POSIX ":sys_wait_h";
|
||||
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
|
||||
if (scalar(keys %active_pids) >= $max_jobs_run) {
|
||||
|
||||
# Lets wait for a change in any child's status
|
||||
# Then we have to work out which child finished
|
||||
$r = waitpid(-1, 0);
|
||||
$code = $?;
|
||||
if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
|
||||
if ( defined $active_pids{$r} ) {
|
||||
$jid=$active_pids{$r};
|
||||
$fail[$jid]=$code;
|
||||
if ($code !=0) { $numfail++;}
|
||||
delete $active_pids{$r};
|
||||
# print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n";
|
||||
} else {
|
||||
die "run.pl: Cannot find the PID of the child process that just finished.";
|
||||
}
|
||||
|
||||
# In theory we could do a non-blocking waitpid over all jobs running just
|
||||
# to find out if only one or more jobs finished during the previous waitpid()
|
||||
# However, we just omit this and will reap the next one in the next pass
|
||||
# through the for(;;) cycle
|
||||
}
|
||||
$childpid = fork();
|
||||
if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
|
||||
if ($childpid == 0) { # We're in the child... this branch
|
||||
# executes the job and returns (possibly with an error status).
|
||||
if (defined $jobname) {
|
||||
$cmd =~ s/$jobname/$jobid/g;
|
||||
$logfile =~ s/$jobname/$jobid/g;
|
||||
}
|
||||
# exit if the job does not need to be executed
|
||||
pick_or_exit( $logfile );
|
||||
|
||||
system("mkdir -p `dirname $logfile` 2>/dev/null");
|
||||
open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
|
||||
print F "# " . $cmd . "\n";
|
||||
print F "# Started at " . `date`;
|
||||
$starttime = `date +'%s'`;
|
||||
print F "#\n";
|
||||
close(F);
|
||||
|
||||
# Pipe into bash.. make sure we're not using any other shell.
|
||||
open(B, "|bash") || die "run.pl: Error opening shell command";
|
||||
print B "( " . $cmd . ") 2>>$logfile >> $logfile";
|
||||
close(B); # If there was an error, exit status is in $?
|
||||
$ret = $?;
|
||||
|
||||
$lowbits = $ret & 127;
|
||||
$highbits = $ret >> 8;
|
||||
if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
|
||||
else { $return_str = "code $highbits"; }
|
||||
|
||||
$endtime = `date +'%s'`;
|
||||
open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
|
||||
$enddate = `date`;
|
||||
chop $enddate;
|
||||
print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
|
||||
print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
|
||||
close(F);
|
||||
exit($ret == 0 ? 0 : 1);
|
||||
} else {
|
||||
$pid[$jobid] = $childpid;
|
||||
$active_pids{$childpid} = $jobid;
|
||||
# print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
# Now we have submitted all the jobs, lets wait until all the jobs finish
|
||||
foreach $child (keys %active_pids) {
|
||||
$jobid=$active_pids{$child};
|
||||
$r = waitpid($pid[$jobid], 0);
|
||||
$code = $?;
|
||||
if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
|
||||
if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
|
||||
}
|
||||
|
||||
# Some sanity checks:
|
||||
# The $fail array should not contain undefined codes
|
||||
# The number of non-zeros in that array should be equal to $numfail
|
||||
# We cannot do foreach() here, as the JOB ids do not start at zero
|
||||
$failed_jids=0;
|
||||
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
|
||||
$job_return = $fail[$jobid];
|
||||
if (not defined $job_return ) {
|
||||
# print Dumper(\@fail);
|
||||
|
||||
die "run.pl: Sanity check failed: we have indication that some jobs are running " .
|
||||
"even after we waited for all jobs to finish" ;
|
||||
}
|
||||
if ($job_return != 0 ){ $failed_jids++;}
|
||||
}
|
||||
if ($failed_jids != $numfail) {
|
||||
die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
|
||||
}
|
||||
if ($numfail > 0) { $ret = 1; }
|
||||
|
||||
if ($ret != 0) {
|
||||
$njobs = $jobend - $jobstart + 1;
|
||||
if ($njobs == 1) {
|
||||
if (defined $jobname) {
|
||||
$logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
|
||||
# that job.
|
||||
}
|
||||
print STDERR "run.pl: job failed, log is in $logfile\n";
|
||||
if ($logfile =~ m/JOB/) {
|
||||
# The log name still contains the literal JOB placeholder, so hint at the likely cause.
print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
$logfile =~ s/$jobname/*/g;
|
||||
print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
exit ($ret);
|
@ -0,0 +1 @@
|
||||
parallel/run.pl
|
@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
||||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
[ -f ./path.sh ] && . ./path.sh
|
||||
|
||||
nlsyms=""
|
||||
wer=false
|
||||
bpe=""
|
||||
bpemodel=""
|
||||
remove_blank=true
|
||||
filter=""
|
||||
num_spkrs=1
|
||||
help_message="Usage: $0 <data-dir> <dict>"
|
||||
|
||||
. utils/parse_options.sh
|
||||
|
||||
if [ $# != 2 ]; then
|
||||
echo "${help_message}"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
dir=$1
|
||||
dic=$2
|
||||
|
||||
concatjson.py ${dir}/data.*.json > ${dir}/data.json
|
||||
|
||||
if [ $num_spkrs -eq 1 ]; then
|
||||
json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${dir}/ref.trn --hyps ${dir}/hyp.trn
|
||||
|
||||
if ${remove_blank}; then
|
||||
sed -i.bak2 -r 's/<blank> //g' ${dir}/hyp.trn
|
||||
fi
|
||||
if [ -n "${nlsyms}" ]; then
|
||||
cp ${dir}/ref.trn ${dir}/ref.trn.org
|
||||
cp ${dir}/hyp.trn ${dir}/hyp.trn.org
|
||||
filt.py -v ${nlsyms} ${dir}/ref.trn.org > ${dir}/ref.trn
|
||||
filt.py -v ${nlsyms} ${dir}/hyp.trn.org > ${dir}/hyp.trn
|
||||
fi
|
||||
if [ -n "${filter}" ]; then
|
||||
sed -i.bak3 -f ${filter} ${dir}/hyp.trn
|
||||
sed -i.bak3 -f ${filter} ${dir}/ref.trn
|
||||
fi
|
||||
|
||||
sclite -r ${dir}/ref.trn trn -h ${dir}/hyp.trn trn -i rm -o all stdout > ${dir}/result.txt
|
||||
|
||||
echo "write a CER (or TER) result in ${dir}/result.txt"
|
||||
grep -e Avg -e SPKR -m 2 ${dir}/result.txt
|
||||
|
||||
if ${wer}; then
|
||||
if [ -n "$bpe" ]; then
|
||||
spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref.trn | sed -e "s/▁/ /g" > ${dir}/ref.wrd.trn
|
||||
spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp.trn | sed -e "s/▁/ /g" > ${dir}/hyp.wrd.trn
|
||||
else
|
||||
sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref.trn > ${dir}/ref.wrd.trn
|
||||
sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp.trn > ${dir}/hyp.wrd.trn
|
||||
fi
|
||||
sclite -r ${dir}/ref.wrd.trn trn -h ${dir}/hyp.wrd.trn trn -i rm -o all stdout > ${dir}/result.wrd.txt
|
||||
|
||||
echo "write a WER result in ${dir}/result.wrd.txt"
|
||||
grep -e Avg -e SPKR -m 2 ${dir}/result.wrd.txt
|
||||
fi
|
||||
elif [ ${num_spkrs} -lt 4 ]; then
|
||||
ref_trns=""
|
||||
hyp_trns=""
|
||||
for i in $(seq ${num_spkrs}); do
|
||||
ref_trns=${ref_trns}"${dir}/ref${i}.trn "
|
||||
hyp_trns=${hyp_trns}"${dir}/hyp${i}.trn "
|
||||
done
|
||||
json2trn.py ${dir}/data.json ${dic} --num-spkrs ${num_spkrs} --refs ${ref_trns} --hyps ${hyp_trns}
|
||||
|
||||
for n in $(seq ${num_spkrs}); do
|
||||
if ${remove_blank}; then
|
||||
sed -i.bak2 -r 's/<blank> //g' ${dir}/hyp${n}.trn
|
||||
fi
|
||||
if [ -n "${nlsyms}" ]; then
|
||||
cp ${dir}/ref${n}.trn ${dir}/ref${n}.trn.org
|
||||
cp ${dir}/hyp${n}.trn ${dir}/hyp${n}.trn.org
|
||||
filt.py -v ${nlsyms} ${dir}/ref${n}.trn.org > ${dir}/ref${n}.trn
|
||||
filt.py -v ${nlsyms} ${dir}/hyp${n}.trn.org > ${dir}/hyp${n}.trn
|
||||
fi
|
||||
if [ -n "${filter}" ]; then
|
||||
sed -i.bak3 -f ${filter} ${dir}/hyp${n}.trn
|
||||
sed -i.bak3 -f ${filter} ${dir}/ref${n}.trn
|
||||
fi
|
||||
done
|
||||
|
||||
results_str=""
|
||||
for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
|
||||
ind_r=$((i / num_spkrs + 1))
|
||||
ind_h=$((i % num_spkrs + 1))
|
||||
results_str=${results_str}"${dir}/result_r${ind_r}h${ind_h}.txt "
|
||||
sclite -r ${dir}/ref${ind_r}.trn trn -h ${dir}/hyp${ind_h}.trn trn -i rm -o all stdout > ${dir}/result_r${ind_r}h${ind_h}.txt
|
||||
done
|
||||
|
||||
echo "write CER (or TER) results in ${dir}/result_r*h*.txt"
|
||||
eval_perm_free_error.py --num-spkrs ${num_spkrs} \
|
||||
${results_str} > ${dir}/min_perm_result.json
|
||||
sed -n '2,4p' ${dir}/min_perm_result.json
|
||||
|
||||
if ${wer}; then
|
||||
for n in $(seq ${num_spkrs}); do
|
||||
if [ -n "$bpe" ]; then
|
||||
spm_decode --model=${bpemodel} --input_format=piece < ${dir}/ref${n}.trn | sed -e "s/▁/ /g" > ${dir}/ref${n}.wrd.trn
|
||||
spm_decode --model=${bpemodel} --input_format=piece < ${dir}/hyp${n}.trn | sed -e "s/▁/ /g" > ${dir}/hyp${n}.wrd.trn
|
||||
else
|
||||
sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/ref${n}.trn > ${dir}/ref${n}.wrd.trn
|
||||
sed -e "s/ //g" -e "s/(/ (/" -e "s/<space>/ /g" ${dir}/hyp${n}.trn > ${dir}/hyp${n}.wrd.trn
|
||||
fi
|
||||
done
|
||||
results_str=""
|
||||
for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do
|
||||
ind_r=$((i / num_spkrs + 1))
|
||||
ind_h=$((i % num_spkrs + 1))
|
||||
results_str=${results_str}"${dir}/result_r${ind_r}h${ind_h}.wrd.txt "
|
||||
sclite -r ${dir}/ref${ind_r}.wrd.trn trn -h ${dir}/hyp${ind_h}.wrd.trn trn -i rm -o all stdout > ${dir}/result_r${ind_r}h${ind_h}.wrd.txt
|
||||
done
|
||||
|
||||
echo "write WER results in ${dir}/result_r*h*.wrd.txt"
|
||||
eval_perm_free_error.py --num-spkrs ${num_spkrs} \
|
||||
${results_str} > ${dir}/min_perm_result.wrd.json
|
||||
sed -n '2,4p' ${dir}/min_perm_result.wrd.json
|
||||
fi
|
||||
fi
|
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Invert a spk2utt map read from the files named on the command line (or
# stdin): each input line "<speaker> <utt1> <utt2> ..." is expanded into one
# "<utt> <speaker>" output line per utterance.  A line with fewer than two
# whitespace-separated fields is a fatal error.
while (my $line = <>) {
  my ($speaker, @utts) = split(" ", $line);
  @utts > 0 || die "Invalid line in spk2utt file: $line";
  print "$_ $speaker\n" for @utts;
}
|
@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env bash
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Usage: split_data.sh data-dir num-to-split
#
# Splits <data-dir> into <num-to-split> pieces written to
# <data-dir>/split<num-to-split>/<n>/ for n = 1..num-to-split.
# utt2spk and feats.scp are required and always split; wav.scp and text are
# split when present; spk2utt is regenerated per piece and spk2gender is
# filtered per piece when present.
# Calls split_scp.pl, utt2spk_to_spk2utt.pl and filter_scp.pl, which must be
# reachable through PATH.

set -o errexit

if [ $# != 2 ]; then
  echo "Usage: split_data.sh data-dir num-to-split"
  exit 1
fi

data=$1
numsplit=$2

# "-gt 0" fails (with its error message silenced) for non-numeric input as
# well as for zero/negative values, so both are rejected here.
if ! [ "$numsplit" -gt 0 ] 2>/dev/null; then
  echo "Invalid num-split argument $numsplit";
  exit 1;
fi

# Both files are unconditionally split below; fail before creating any
# output directory if one of them is missing.
for f in utt2spk feats.scp; do
  if [ ! -f "$data/$f" ]; then
    echo "split_data.sh: no such file $data/$f"
    exit 1
  fi
done

nu=$(wc -l < "$data/utt2spk")
nf=$(wc -l < "$data/feats.scp")
# text is optional; a missing file counts as zero lines.
nt=0
if [ -f "$data/text" ]; then
  nt=$(wc -l < "$data/text")
fi

if [ $nu -ne $nf ]; then
  echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf);"
  echo "this script may produce incorrectly split data."
  echo "use utils/fix_data_dir.sh to fix this."
fi
if [ $nt -ne 0 -a $nu -ne $nt ]; then
  echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt);"
  echo "this script may produce incorrectly split data."
  echo "use utils/fix_data_dir.sh to fix this."
fi

# Per-piece output paths, kept in arrays so that paths containing spaces
# survive being passed to split_scp.pl.
feats=()
wavs=()
utt2spks=()
texts=()

for n in $(seq 1 "$numsplit"); do
  split_dir=$data/split$numsplit/$n
  mkdir -p "$split_dir"
  feats+=("$split_dir/feats.scp")
  wavs+=("$split_dir/wav.scp")
  texts+=("$split_dir/text")
  utt2spks+=("$split_dir/utt2spk")
done

# --utt2spk makes split_scp.pl keep each speaker's utterances in one piece.
split_scp.pl --utt2spk="$data/utt2spk" "$data/utt2spk" "${utt2spks[@]}"
split_scp.pl --utt2spk="$data/utt2spk" "$data/feats.scp" "${feats[@]}"
if [ -f "$data/wav.scp" ]; then
  split_scp.pl --utt2spk="$data/utt2spk" "$data/wav.scp" "${wavs[@]}"
fi
if [ -f "$data/text" ]; then
  split_scp.pl --utt2spk="$data/utt2spk" "$data/text" "${texts[@]}"
fi

for n in $(seq 1 "$numsplit"); do
  split_dir=$data/split$numsplit/$n
  utt2spk_to_spk2utt.pl "$split_dir/utt2spk" > "$split_dir/spk2utt"
  # for completeness, also split the spk2gender file
  if [ -f "$data/spk2gender" ]; then
    filter_scp.pl "$split_dir/spk2utt" "$data/spk2gender" \
      > "$split_dir/spk2gender"
  fi
done

exit 0
|
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
# Usage: <this-script> manifest num-to-split
#
# Splits the manifest file data/<manifest> into <num-to-split> pieces with
# roughly equal numbers of lines, written to
# data/split<num-to-split>/<n>/<manifest> for n = 1..num-to-split.
# Calls split_scp.pl, which must be reachable through PATH.

set -o errexit

if [ $# != 2 ]; then
  # Use $0 so the usage line names this script rather than split_data.sh.
  echo "Usage: $0 manifest num-to-split"
  exit 1
fi

# The manifest is always looked up, and the splits written, under ./data.
data=data

jsonfile=$1
numsplit=$2

# "-gt 0" fails (with its error message silenced) for non-numeric input as
# well as for zero/negative values, so both are rejected here.
if ! [ "$numsplit" -gt 0 ] 2>/dev/null; then
  echo "Invalid num-split argument $numsplit";
  exit 1;
fi

# Fail before creating any output directory if there is nothing to split.
if [ ! -f "$data/$jsonfile" ]; then
  echo "$0: no such file $data/$jsonfile"
  exit 1
fi

# Output paths are kept in an array so that names containing spaces survive
# being passed to split_scp.pl.
jsons=()
for n in $(seq 1 "$numsplit"); do
  mkdir -p "$data/split$numsplit/$n"
  jsons+=("$data/split$numsplit/$n/$jsonfile")
done

split_scp.pl "$data/$jsonfile" "${jsons[@]}"

exit 0
|
@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env perl
|
||||
use warnings; #sed replacement for -w perl parameter
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
|
||||
# This program splits up any kind of .scp or archive-type file.
|
||||
# If there is no utt2spk option it will work on any text file and
|
||||
# will split it up with an approximately equal number of lines in
|
||||
# each output file.
|
||||
# With the --utt2spk option it will work on anything that has the
|
||||
# utterance-id as the first entry on each line; the utt2spk file is
|
||||
# of the form "utterance speaker" (on each line).
|
||||
# It splits it into equal size chunks as far as it can. If you use
|
||||
# the utt2spk option it will make sure these chunks coincide with
|
||||
# speaker boundaries. In this case, if there are more chunks
|
||||
# than speakers (and in some other circumstances), some of the
|
||||
# resulting chunks will be empty and it
|
||||
# will print a warning.
|
||||
# You will normally call this like:
|
||||
# split_scp.pl scp scp.1 scp.2 scp.3 ...
|
||||
# or
|
||||
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
|
||||
# Note that you can use this script to split the utt2spk file itself,
|
||||
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
|
||||
|
||||
# You can also call the scripts like:
|
||||
# split_scp.pl -j 3 0 scp scp.0
|
||||
# [note: with this option, it assumes zero-based indexing of the split parts,
|
||||
# i.e. the second number must be 0 <= n < num-jobs.]
|
||||
|
||||
$num_jobs = 0;
|
||||
$job_id = 0;
|
||||
$utt2spk_file = "";
|
||||
|
||||
for ($x = 1; $x <= 2; $x++) {
|
||||
if ($ARGV[0] eq "-j") {
|
||||
shift @ARGV;
|
||||
$num_jobs = shift @ARGV;
|
||||
$job_id = shift @ARGV;
|
||||
if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
|
||||
die "Invalid num-jobs and job-id: $num_jobs and $job_id";
|
||||
}
|
||||
}
|
||||
if ($ARGV[0] =~ "--utt2spk=(.+)") {
|
||||
$utt2spk_file=$1;
|
||||
shift;
|
||||
}
|
||||
}
|
||||
|
||||
if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
|
||||
die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
|
||||
" or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
|
||||
" ... where 0 <= job-id < num-jobs.";
|
||||
}
|
||||
|
||||
$inscp = shift @ARGV;
|
||||
if ($num_jobs == 0) { # without -j option
|
||||
@OUTPUTS = @ARGV;
|
||||
} else {
|
||||
for ($j = 0; $j < $num_jobs; $j++) {
|
||||
if ($j == $job_id) {
|
||||
if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
|
||||
else { push @OUTPUTS, "-"; }
|
||||
} else {
|
||||
push @OUTPUTS, "/dev/null";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($utt2spk_file ne "") { # We have the --utt2spk option...
|
||||
open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
|
||||
while(<U>) {
|
||||
@A = split;
|
||||
@A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
|
||||
($u,$s) = @A;
|
||||
$utt2spk{$u} = $s;
|
||||
}
|
||||
open(I, "<$inscp") || die "Opening input scp file $inscp";
|
||||
@spkrs = ();
|
||||
while(<I>) {
|
||||
@A = split;
|
||||
if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
|
||||
$u = $A[0];
|
||||
$s = $utt2spk{$u};
|
||||
if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
|
||||
if(!defined $spk_count{$s}) {
|
||||
push @spkrs, $s;
|
||||
$spk_count{$s} = 0;
|
||||
$spk_data{$s} = "";
|
||||
}
|
||||
$spk_count{$s}++;
|
||||
$spk_data{$s} = $spk_data{$s} . $_;
|
||||
}
|
||||
# Now split as equally as possible ..
|
||||
# First allocate spks to files by allocating an approximately
|
||||
# equal number of speakers.
|
||||
$numspks = @spkrs; # number of speakers.
|
||||
$numscps = @OUTPUTS; # number of output files.
|
||||
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
|
||||
$scparray[$scpidx] = []; # [] is array reference.
|
||||
}
|
||||
for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
|
||||
$scpidx = int(($spkidx*$numscps) / $numspks);
|
||||
$spk = $spkrs[$spkidx];
|
||||
push @{$scparray[$scpidx]}, $spk;
|
||||
$scpcount[$scpidx] += $spk_count{$spk};
|
||||
}
|
||||
|
||||
# Now will try to reassign beginning + ending speakers
|
||||
# to different scp's and see if it gets more balanced.
|
||||
# Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
|
||||
# We can show that if considering changing just 2 scp's, we minimize
|
||||
# this by minimizing the squared difference in sizes. This is
|
||||
# equivalent to minimizing the absolute difference in sizes. This
|
||||
# shows this method is bound to converge.
|
||||
|
||||
$changed = 1;
|
||||
while($changed) {
|
||||
$changed = 0;
|
||||
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
|
||||
# First try to reassign ending spk of this scp.
|
||||
if($scpidx < $numscps-1) {
|
||||
$sz = @{$scparray[$scpidx]};
|
||||
if($sz > 0) {
|
||||
$spk = $scparray[$scpidx]->[$sz-1];
|
||||
$count = $spk_count{$spk};
|
||||
$nutt1 = $scpcount[$scpidx];
|
||||
$nutt2 = $scpcount[$scpidx+1];
|
||||
if( abs( ($nutt2+$count) - ($nutt1-$count))
|
||||
< abs($nutt2 - $nutt1)) { # Would decrease
|
||||
# size-diff by reassigning spk...
|
||||
$scpcount[$scpidx+1] += $count;
|
||||
$scpcount[$scpidx] -= $count;
|
||||
pop @{$scparray[$scpidx]};
|
||||
unshift @{$scparray[$scpidx+1]}, $spk;
|
||||
$changed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
|
||||
$spk = $scparray[$scpidx]->[0];
|
||||
$count = $spk_count{$spk};
|
||||
$nutt1 = $scpcount[$scpidx-1];
|
||||
$nutt2 = $scpcount[$scpidx];
|
||||
if( abs( ($nutt2-$count) - ($nutt1+$count))
|
||||
< abs($nutt2 - $nutt1)) { # Would decrease
|
||||
# size-diff by reassigning spk...
|
||||
$scpcount[$scpidx-1] += $count;
|
||||
$scpcount[$scpidx] -= $count;
|
||||
shift @{$scparray[$scpidx]};
|
||||
push @{$scparray[$scpidx-1]}, $spk;
|
||||
$changed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
# Now print out the files...
|
||||
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
|
||||
$scpfn = $OUTPUTS[$scpidx];
|
||||
open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
|
||||
$count = 0;
|
||||
if(@{$scparray[$scpidx]} == 0) {
|
||||
print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
|
||||
} else {
|
||||
foreach $spk ( @{$scparray[$scpidx]} ) {
|
||||
print F $spk_data{$spk};
|
||||
$count += $spk_count{$spk};
|
||||
}
|
||||
if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
|
||||
}
|
||||
close(F);
|
||||
}
|
||||
} else {
|
||||
# This block is the "normal" case where there is no --utt2spk
# option and we just break into equal size chunks: output k receives the
# k'th run of $linesperscp consecutive input lines (the last non-empty
# output may be shorter, and trailing outputs may be empty).

# 2-arg open is kept on purpose for both input and output: in that form a
# file name of "-" means STDIN / STDOUT.  NOTE(review): confirm whether
# callers rely on this before switching to 3-arg open.
open(my $in_fh, "<$inscp") || die "Opening input scp file $inscp: $!";

my $numscps = @OUTPUTS;  # size of array.
my @lines = <$in_fh>;    # slurp all input lines, newlines included
close($in_fh);

my $numlines = @lines;
if ($numlines == 0) {
  print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
}

# NOTE(review): assumes @OUTPUTS is non-empty (otherwise this divides by
# zero) -- presumably guaranteed by argument checking earlier in the script.
my $linesperscp = int( ($numlines + ($numscps - 1)) / $numscps ); # the +($numscps-1) forces rounding up.
                                                                  # [just doing int() rounds down].
for (my $scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
  my $scpfile = $OUTPUTS[$scpidx];
  open(my $out_fh, ">$scpfile") || die "Opening output scp file $scpfile: $!";
  # Half-open range [$begin, $end) of input lines for this output,
  # clipped to the number of lines actually available.
  my $begin = $linesperscp * $scpidx;
  my $end = $begin + $linesperscp;
  $end = $numlines if $end > $numlines;
  for (my $n = $begin; $n < $end; $n++) {
    print $out_fh $lines[$n];
  }
  close($out_fh) || die "Closing scp file $scpfile: $!";
}
|
||||
}
|
@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# 2020 Author Jiayu DU
|
||||
# Apache 2.0
|
||||
|
||||
# This script uses KenLM to estimate an ARPA model from plain text.
# It is a fallback for when you hit memory limits on a large corpus:
# KenLM estimates the ARPA model using on-disk structures, so
# as long as you have a big enough hard disk, memory shouldn't be a problem.
# By default, KenLM uses up to 50% of your local memory;
# you can control this through the -S option.
|
||||
|
||||
[ -f path.sh ] && . ./path.sh;

# begin configuration section
kenlm_opts="" # e.g. "-o 4 -S 50% --prune 0 5 7 7"
# end configuration section

# Let callers set the configuration variable above from the command line
# (--kenlm-opts "<opts>"), the same way the ARPA -> KenLM conversion script
# in this directory does.  Without this, kenlm_opts could never be anything
# but empty.  The guard keeps the plain 4-argument invocation working even
# when the helper is not present.
[ -f utils/parse_options.sh ] && . utils/parse_options.sh

if [ $# != 4 ]; then
  echo "$0 [--kenlm-opts <opts>] <text> <kaldi_symbol_table> <working_dir> <arpa_name>"
  echo "e.g. $0 --kenlm-opts \"-o 4\" train.txt words.txt wdir 4gram"
  exit 1
fi

text=$1
symbol_table=$2
dir=$3
arpa_name=$4

if ! command -v lmplz >/dev/null 2>&1; then
  echo "$0: cannot find training tool *lmplz*."
  echo "tools/extras/install_kenlm_query_only.sh installs kenlm at tools/kenlm"
  echo "it only supports runtime mode, to actually train an arpa using KenLM,"
  echo "you need a complete KenLM installation(depends on EIGEN and BOOST),"
  echo "follow KenLM's building instructions at (https://github.com/kpu/kenlm)"
  exit 1
fi

# The working directory receives both the vocabulary and the final arpa,
# so make sure it exists before anything is written into it.
mkdir -p "$dir" || exit 1

# The text should be properly pre-processed, e.g.:
# cleaned, normalized and possibly word-segmented.

# Get rid of irrelevant symbols: drop every symbol-table line that contains
# one of the special tokens below, then keep only the word column.
grep -v -F -e '<eps>' -e '#0' -e '<unk>' -e '<UNK>' -e '<s>' -e '</s>' "$symbol_table" \
  | awk '{print $1}' \
  > "$dir/ngram.vocab"

# To make sure that kenlm & kaldi have strictly the same vocabulary:
# 1. feed the vocabulary into kenlm via --limit_vocab_file
# 2. cat the vocabulary to the training text, so each word appears at least once
#
# TL;DR reason:
# Unlike SRILM's -limit-vocab, kenlm's --limit_vocab_file option
# specifies a *valid* set of vocabulary, whereas *valid but unseen*
# words are discarded in the final arpa.
# So the trick is:
# we explicitly add kaldi's vocab (one word per line) to the training text,
# making each word appear at least once.
# kenlm never prunes unigrams,
# so this always generates a kenlm vocabulary consistent with kaldi's.
# The effect of this is like add-one smoothing of the unigram counts;
# it shouldn't have a significant impact in practice.

# With pipefail a failure of either cat or lmplz fails the whole pipeline,
# so we never report success for a missing or truncated arpa.
set -o pipefail
# $kenlm_opts is intentionally unquoted: it carries several lmplz options
# that must be word-split.
cat "$dir/ngram.vocab" "$text" \
  | lmplz $kenlm_opts --limit_vocab_file "$dir/ngram.vocab" \
  > "$dir/${arpa_name}.arpa" \
  || { echo "$0: lmplz failed, see messages above."; exit 1; }

echo "$0: Done training arpa to: $dir/${arpa_name}.arpa"
|
@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# converts an utt2spk file to a spk2utt file.
|
||||
# Takes input from the stdin or from a file argument;
|
||||
# output goes to the standard out.
|
||||
|
||||
use strict;
use warnings;

# At most one positional argument (the utt2spk file) is accepted; with none,
# the input is read from standard input.
if ( @ARGV > 1 ) {
  die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt";
}

my @spklist;   # speakers, in order of first appearance in the input
my %utts_of;   # speaker -> reference to the list of its utterances, in input order

while (my $line = <>) {
  # Each line must be exactly "<utterance> <speaker>" (whitespace separated).
  my @fields = split(" ", $line);
  @fields == 2 || die "Invalid line in utt2spk file: $line";
  my ($utt, $spk) = @fields;
  # Remember the speaker the first time it is seen so that the output keeps
  # the input's speaker order.
  push @spklist, $spk unless exists $utts_of{$spk};
  push @{ $utts_of{$spk} }, $utt;
}

# One output line per speaker: "<speaker> <utt1> <utt2> ...".
foreach my $spk (@spklist) {
  my $utts = join(' ', @{ $utts_of{$spk} });
  print "$spk $utts\n";
}
|
Loading…
Reference in new issue