diff --git a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
index fcb1f764..fc5fe62d 100644
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@@ -28,6 +28,7 @@
 #include "path_trie.h"
 
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+constexpr char kSPACE[] = "<space>";  // vocabulary entry for ' '; keep in sync with SPACE in deepspeech/frontend/utility.py
 
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
                        "The shape of probs_seq does not match with "
                        "the shape of the vocabulary");
     }
-
-    // assign blank id
-    // size_t blank_id = vocabulary.size();
-    // size_t blank_id = 0;
-
     // assign space id
-    auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+    auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
     int space_id = it - vocabulary.begin();
     // if no space in vocabulary
     if ((size_t)space_id >= vocabulary.size()) {
diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py
index fe4ea03f..4bc501df 100644
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -16,6 +16,7 @@ import sentencepiece as spm
 
 from ..utility import EOS
 from ..utility import load_dict
+from ..utility import SPACE
 from ..utility import UNK
 
 __all__ = ["TextFeaturizer"]
@@ -53,9 +54,9 @@ class TextFeaturizer():
             self.sp = spm.SentencePieceProcessor()
             self.sp.Load(spm_model)
 
-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
         if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
         elif self.unit_type == 'word':
             tokens = self.word_tokenize(text)
         else:  # spm
@@ -107,16 +108,20 @@ class TextFeaturizer():
         text = self.detokenize(tokens)
         return text
 
-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
         """Character tokenizer.
 
         Args:
             text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.
 
         Returns:
             List[str]: tokens.
         """
-        return list(text.strip())
+        text = text.strip()
+        if replace_space:
+            text = text.replace(" ", SPACE)
+        return list(text)
 
     def char_detokenize(self, tokens):
         """Character detokenizer.
@@ -127,6 +132,7 @@ class TextFeaturizer():
         Returns:
             str: text string.
         """
+        tokens = [t.replace(SPACE, " ") for t in tokens]  # tokens is a list: map SPACE back per token
         return "".join(tokens)
 
     def word_tokenize(self, text):
@@ -193,17 +199,14 @@ class TextFeaturizer():
         """Load vocabulary from file."""
         vocab_list = load_dict(vocab_filepath, maskctc)
         assert vocab_list is not None
+        assert SPACE in vocab_list
 
         id2token = dict(
             [(idx, token) for (idx, token) in enumerate(vocab_list)])
         token2id = dict(
             [(token, idx) for (idx, token) in enumerate(vocab_list)])
-        if UNK in vocab_list:
-            unk_id = vocab_list.index(UNK)
-        else:
-            unk_id = -1
-        if EOS in vocab_list:
-            eos_id = vocab_list.index(EOS)
-        else:
-            eos_id = -1
+
+        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
+        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
+
         return token2id, id2token, vocab_list, unk_id, eos_id
diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index f7e2cb21..3a972b50 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -28,7 +28,7 @@ logger = Log(__name__).getlog()
 __all__ = [
     "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
     "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
 ]
 
 IGNORE_ID = -1
@@ -38,6 +38,7 @@ EOS = SOS
 UNK = "<unk>"
 BLANK = "<blank>"
 MASKCTC = "<mask>"
+SPACE = "<space>"
 
 
 def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index 151d52f8..67c22fbb 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -25,6 +25,7 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.frontend.utility import BLANK
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.frontend.utility import SOS
+from deepspeech.frontend.utility import SPACE
 from deepspeech.frontend.utility import UNK
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
@@ -60,7 +61,7 @@ args = parser.parse_args()
 def count_manifest(counter, text_feature, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'])
+        line = text_feature.tokenize(line_json['text'], replace_space=False)
         counter.update(line)
 
 def dump_text_manifest(fileobj, manifest_path, key='text'):
@@ -109,6 +110,8 @@ def main():
     for token, count in count_sorted:
         if count < args.count_threshold:
             break
+        # replace space by `<space>`
+        token = SPACE if token == ' ' else token
         tokens.append(token)
 
     tokens = sorted(tokens)
diff --git a/utils/split_scp.pl b/utils/split_scp.pl
index e69de29b..fc28e0b6 100755
--- a/utils/split_scp.pl
+++ b/utils/split_scp.pl
@@ -0,0 +1,212 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each but.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can.  If you use
+# the utt2spk option it will make sure these chunks coincide with
+# speaker boundaries.  In this case, if there are more chunks
+# than speakers (and in some other circumstances), some of the
+# resulting chunks will be empty and it
+# will print a warning.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+
+for ($x = 1; $x <= 2; $x++) {
+  if (@ARGV > 0 && $ARGV[0] eq "-j") { # guard: @ARGV may be empty here
+    shift @ARGV;
+    $num_jobs = shift @ARGV;
+    $job_id = shift @ARGV;
+    if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
+      die "Invalid num-jobs and job-id: $num_jobs and $job_id";
+    }
+  }
+  if (@ARGV > 0 && $ARGV[0] =~ "--utt2spk=(.+)") { # guard: @ARGV may be empty here
+    $utt2spk_file=$1;
+    shift;
+  }
+}
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+  die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
+      " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
+      " ... where 0 <= job-id < num-jobs.";
+}
+
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+  @OUTPUTS = @ARGV;
+} else {
+  for ($j = 0; $j < $num_jobs; $j++) {
+    if ($j == $job_id) {
+      if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+      else { push @OUTPUTS, "-"; }
+    } else {
+      push @OUTPUTS, "/dev/null";
+    }
+  }
+}
+
+if ($utt2spk_file ne "") {  # We have the --utt2spk option...
+  open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
+  while(<U>) { # read the utt2spk map line by line
+    @A = split;
+    @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
+    ($u,$s) = @A;
+    $utt2spk{$u} = $s;
+  }
+  open(I, "<$inscp") || die "Opening input scp file $inscp";
+  @spkrs = ();
+  while(<I>) { # read the input scp line by line, grouping lines per speaker
+    @A = split;
+    if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
+    $u = $A[0];
+    $s = $utt2spk{$u};
+    if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
+    if(!defined $spk_count{$s}) {
+      push @spkrs, $s;
+      $spk_count{$s} = 0;
+      $spk_data{$s} = "";
+    }
+    $spk_count{$s}++;
+    $spk_data{$s} = $spk_data{$s} . $_;
+  }
+  # Now split as equally as possible ..
+  # First allocate spks to files by allocating an approximately
+  # equal number of speakers.
+  $numspks = @spkrs;  # number of speakers.
+  $numscps = @OUTPUTS; # number of output files.
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scparray[$scpidx] = []; # [] is array reference.
+  }
+  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+    $scpidx = int(($spkidx*$numscps) / $numspks);
+    $spk = $spkrs[$spkidx];
+    push @{$scparray[$scpidx]}, $spk;
+    $scpcount[$scpidx] += $spk_count{$spk};
+  }
+
+  # Now will try to reassign beginning + ending speakers
+  # to different scp's and see if it gets more balanced.
+  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+  # We can show that if considering changing just 2 scp's, we minimize
+  # this by minimizing the squared difference in sizes.  This is
+  # equivalent to minimizing the absolute difference in sizes.  This
+  # shows this method is bound to converge.
+
+  $changed = 1;
+  while($changed) {
+    $changed = 0;
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+      # First try to reassign ending spk of this scp.
+      if($scpidx < $numscps-1) {
+        $sz = @{$scparray[$scpidx]};
+        if($sz > 0) {
+          $spk = $scparray[$scpidx]->[$sz-1];
+          $count = $spk_count{$spk};
+          $nutt1 = $scpcount[$scpidx];
+          $nutt2 = $scpcount[$scpidx+1];
+          if( abs( ($nutt2+$count) - ($nutt1-$count))
+              < abs($nutt2 - $nutt1))  { # Would decrease
+            # size-diff by reassigning spk...
+            $scpcount[$scpidx+1] += $count;
+            $scpcount[$scpidx] -= $count;
+            pop @{$scparray[$scpidx]};
+            unshift @{$scparray[$scpidx+1]}, $spk;
+            $changed = 1;
+          }
+        }
+      }
+      if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+        $spk = $scparray[$scpidx]->[0];
+        $count = $spk_count{$spk};
+        $nutt1 = $scpcount[$scpidx-1];
+        $nutt2 = $scpcount[$scpidx];
+        if( abs( ($nutt2-$count) - ($nutt1+$count))
+            < abs($nutt2 - $nutt1))  { # Would decrease
+          # size-diff by reassigning spk...
+          $scpcount[$scpidx-1] += $count;
+          $scpcount[$scpidx] -= $count;
+          shift @{$scparray[$scpidx]};
+          push @{$scparray[$scpidx-1]}, $spk;
+          $changed = 1;
+        }
+      }
+    }
+  }
+  # Now print out the files...
+  for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+    $scpfn = $OUTPUTS[$scpidx];
+    open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+    $count = 0;
+    if(@{$scparray[$scpidx]} == 0) {
+      print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
+    } else {
+      foreach $spk ( @{$scparray[$scpidx]} ) {
+        print F $spk_data{$spk};
+        $count += $spk_count{$spk};
+      }
+      if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
+    }
+    close(F);
+  }
+} else {
+  # This block is the "normal" case where there is no --utt2spk
+  # option and we just break into equal size chunks.
+
+  open(I, "<$inscp") || die "Opening input scp file $inscp";
+
+  $numscps = @OUTPUTS;  # size of array.
+  @F = ();
+  while(<I>) { # slurp every input line so it can be indexed below
+    push @F, $_;
+  }
+  $numlines = @F;
+  if($numlines == 0) {
+    print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
+  }
+  $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up.
+# [just doing int() rounds down].
+  for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+    $scpfile = $OUTPUTS[$scpidx];
+    open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+    for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
+      print O $F[$n];
+    }
+    close(O) || die "Closing scp file $scpfile";
+  }
+}
\ No newline at end of file