From 8e16315ada7bc487ce2bf4afe3d6090016d89dd2 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 23 Sep 2021 03:40:53 +0000
Subject: [PATCH] librispeech s1: support multi-process decoding and sclite

---
 deepspeech/frontend/utility.py        |   1 +
 examples/librispeech/s1/local/test.sh |   8 +-
 examples/librispeech/s1/path.sh       |   2 +-
 examples/librispeech/s2/local/test.sh |   3 +
 examples/librispeech/s2/path.sh       |   2 +-
 utils/run.pl                          |   0
 utils/split_json.sh                   |   4 +-
 utils/split_scp.pl                    |   0
 utils/split_scp.pl                    | 212 --------------------------
 9 files changed, 12 insertions(+), 220 deletions(-)
 mode change 100755 => 120000 utils/run.pl
 create mode 100755 utils/split_scp.pl
 delete mode 100644 utils/split_scp.pl 

diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index 926abf18..f7e2cb21 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -18,6 +18,7 @@ from typing import List
 from typing import Optional
 from typing import Text
 
+import jsonlines
 import numpy as np
 
 from deepspeech.utils.log import Log
diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/s1/local/test.sh
index 09644bb2..7f48d3d5 100755
--- a/examples/librispeech/s1/local/test.sh
+++ b/examples/librispeech/s1/local/test.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 
+set -e
+
 expdir=exp
 datadir=data
+nj=32
 
 lmtag=
 
@@ -60,13 +63,10 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
         batch_size=1
         ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
             python3 -u ${BIN_DIR}/test.py \
-            --model-name u2_kaldi \
-            --run-mode test \
             --nproc ${ngpu} \
-            --dict-path ${dict} \
             --config ${config_path} \
+            --result_file ${expdir}/${decode_dir}/data.JOB.json \
             --checkpoint_path ${ckpt_prefix} \
-            --result-file ${expdir}/${decode_dir}/data.JOB.json \
             --opts decoding.decoding_method ${dmethd} \
             --opts decoding.batch_size ${batch_size} \
             --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/s1/path.sh
index 09f5ba3f..439f71ae 100644
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sckt/bin/sclite:${PWD}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh
index 09644bb2..5eeb2d61 100755
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 
+set -e
+
 expdir=exp
 datadir=data
+nj=32
 
 lmtag=
 
diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh
index 9f6891cd..05a037af 100644
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sckt/bin/sclite:${PWD}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
diff --git a/utils/run.pl b/utils/run.pl
deleted file mode 100755
index 8b84c7f0..00000000
--- a/utils/run.pl
+++ /dev/null
@@ -1 +0,0 @@
-parallel/run.pl
\ No newline at end of file
diff --git a/utils/run.pl b/utils/run.pl
new file mode 120000
index 00000000..8b84c7f0
--- /dev/null
+++ b/utils/run.pl
@@ -0,0 +1 @@
+parallel/run.pl
\ No newline at end of file
diff --git a/utils/split_json.sh b/utils/split_json.sh
index 6c7d38b9..48f64f61 100755
--- a/utils/split_json.sh
+++ b/utils/split_json.sh
@@ -2,7 +2,7 @@
 set -o errexit
 
 if [ $# != 2 ]; then
-  echo "Usage: split_data.sh manifest num-to-split"
+  echo "Usage: split_json.sh manifest num-to-split"
   exit 1
 fi
 
@@ -28,4 +28,4 @@ done
 
 split_scp.pl $data/${jsonfile} $jsons
 
-exit 0
\ No newline at end of file
+exit 0
diff --git a/utils/split_scp.pl b/utils/split_scp.pl
new file mode 100755
index 00000000..e69de29b
diff --git a/utils/split_scp.pl  b/utils/split_scp.pl 
deleted file mode 100644
index fc28e0b6..00000000
--- a/utils/split_scp.pl 	
+++ /dev/null
@@ -1,212 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-# This program splits up any kind of .scp or archive-type file.
-# If there is no utt2spk option it will work on any text  file and
-# will split it up with an approximately equal number of lines in
-# each but.
-# With the --utt2spk option it will work on anything that has the 
-# utterance-id as the first entry on each line; the utt2spk file is
-# of the form "utterance speaker" (on each line).
-# It splits it into equal size chunks as far as it can.  If you use
-# the utt2spk option it will make sure these chunks coincide with
-# speaker boundaries.  In this case, if there are more chunks
-# than speakers (and in some other circumstances), some of the 
-# resulting  chunks will be empty and it
-# will print a warning.
-# You will normally call this like:
-# split_scp.pl scp scp.1 scp.2 scp.3 ...
-# or
-# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
-# Note that you can use this script to split the utt2spk file itself,
-# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
-
-# You can also call the scripts like:
-# split_scp.pl -j 3 0 scp scp.0
-# [note: with this option, it assumes zero-based indexing of the split parts,
-# i.e. the second number must be 0 <= n < num-jobs.]
-
-$num_jobs = 0;
-$job_id = 0;
-$utt2spk_file = "";
-
-for ($x = 1; $x <= 2; $x++) {
-    if ($ARGV[0] eq "-j") {
-        shift @ARGV;
-        $num_jobs = shift @ARGV;
-        $job_id = shift @ARGV;
-        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
-            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
-        }
-    }
-    if ($ARGV[0] =~ "--utt2spk=(.+)") {
-        $utt2spk_file=$1;
-        shift;
-    }
-}
-
-if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
-    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
-        " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
-        " ... where 0 <= job-id < num-jobs.";
-}
-   
-$inscp = shift @ARGV;
-if ($num_jobs == 0) { # without -j option
-    @OUTPUTS = @ARGV;
-} else {
-    for ($j = 0; $j < $num_jobs; $j++) {
-        if ($j == $job_id) { 
-            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
-            else { push @OUTPUTS, "-"; }
-        } else {
-            push @OUTPUTS, "/dev/null";
-        }
-    }
-} 
-
-if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
-    while(<U>) {
-        @A = split;
-        @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
-        ($u,$s) = @A;
-        $utt2spk{$u} = $s;
-    }
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
-    @spkrs = ();
-    while(<I>) {
-        @A = split;
-        if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
-        $u = $A[0];
-        $s = $utt2spk{$u};
-        if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
-        if(!defined $spk_count{$s}) { 
-            push @spkrs, $s; 
-            $spk_count{$s} = 0;
-            $spk_data{$s} = "";
-        }
-        $spk_count{$s}++;
-        $spk_data{$s} = $spk_data{$s} . $_;
-    }
-    # Now split as equally as possible ..
-    # First allocate spks to files by allocating an approximately
-    # equal number of speakers.
-    $numspks = @spkrs;  # number of speakers.
-    $numscps = @OUTPUTS; # number of output files.
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scparray[$scpidx] = []; # [] is array reference.
-    }
-    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
-        $scpidx = int(($spkidx*$numscps) / $numspks);
-        $spk = $spkrs[$spkidx];
-        push @{$scparray[$scpidx]}, $spk;
-        $scpcount[$scpidx] += $spk_count{$spk};
-    }
-
-    # Now will try to reassign beginning + ending speakers
-    # to different scp's and see if it gets more balanced.
-    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
-    # We can show that if considering changing just 2 scp's, we minimize
-    # this by minimizing the squared difference in sizes.  This is
-    # equivalent to minimizing the absolute difference in sizes.  This
-    # shows this method is bound to converge.
-
-    $changed = 1;
-    while($changed) {
-        $changed = 0;
-        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-            # First try to reassign ending spk of this scp.
-            if($scpidx < $numscps-1) {
-                $sz = @{$scparray[$scpidx]};
-                if($sz > 0) {
-                    $spk = $scparray[$scpidx]->[$sz-1];
-                    $count = $spk_count{$spk};
-                    $nutt1 = $scpcount[$scpidx];
-                    $nutt2 = $scpcount[$scpidx+1];
-                    if( abs( ($nutt2+$count) - ($nutt1-$count))
-                        < abs($nutt2 - $nutt1))  { # Would decrease
-                        # size-diff by reassigning spk...
-                        $scpcount[$scpidx+1] += $count;
-                        $scpcount[$scpidx] -= $count;
-                        pop @{$scparray[$scpidx]};
-                        unshift @{$scparray[$scpidx+1]}, $spk;
-                        $changed = 1;
-                    }
-                }
-            }
-            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
-                $spk = $scparray[$scpidx]->[0];
-                $count = $spk_count{$spk};
-                $nutt1 = $scpcount[$scpidx-1];
-                $nutt2 = $scpcount[$scpidx];
-                if( abs( ($nutt2-$count) - ($nutt1+$count))
-                    < abs($nutt2 - $nutt1))  { # Would decrease
-                    # size-diff by reassigning spk...
-                    $scpcount[$scpidx-1] += $count;
-                    $scpcount[$scpidx] -= $count;
-                    shift @{$scparray[$scpidx]};
-                    push @{$scparray[$scpidx-1]}, $spk;
-                    $changed = 1;
-                }
-            }
-        }
-    }
-    # Now print out the files...
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scpfn = $OUTPUTS[$scpidx];
-        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
-        $count = 0;
-        if(@{$scparray[$scpidx]} == 0) {
-            print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
-        } else {
-            foreach $spk ( @{$scparray[$scpidx]} ) {
-                print F $spk_data{$spk};
-                $count += $spk_count{$spk};
-            }
-            if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
-        }
-        close(F);
-    }
-} else { 
-   # This block is the "normal" case where there is no --utt2spk 
-   # option and we just break into equal size chunks.
-
-    open(I, "<$inscp") || die "Opening input scp file $inscp";
-
-    $numscps = @OUTPUTS;  # size of array.
-    @F = ();
-    while(<I>) {
-        push @F, $_;
-    }
-    $numlines = @F;
-    if($numlines == 0) {
-        print STDERR "split_scp.pl: warning: empty input scp file $inscp";
-    }
-    $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up.
-# [just doing int() rounds down].
-    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
-        $scpfile = $OUTPUTS[$scpidx];
-        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
-        for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
-            print O $F[$n];
-        }
-        close(O) || die "Closing scp file $scpfile";
-    }
-}
\ No newline at end of file