Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into release_model

pull/851/head
huangyuxin 3 years ago
commit f6c9976189

@ -28,6 +28,7 @@
#include "path_trie.h" #include "path_trie.h"
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>; using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
// Vocabulary token that stands for the space character.
constexpr char kSPACE[] = "<space>";
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder( std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq, const std::vector<std::vector<double>> &probs_seq,
@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
"The shape of probs_seq does not match with " "The shape of probs_seq does not match with "
"the shape of the vocabulary"); "the shape of the vocabulary");
} }
// assign blank id
// size_t blank_id = vocabulary.size();
// size_t blank_id = 0;
// assign space id // assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
int space_id = it - vocabulary.begin(); int space_id = it - vocabulary.begin();
// if no space in vocabulary // if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) { if ((size_t)space_id >= vocabulary.size()) {

@ -16,6 +16,7 @@ import sentencepiece as spm
from ..utility import EOS from ..utility import EOS
from ..utility import load_dict from ..utility import load_dict
from ..utility import SPACE
from ..utility import UNK from ..utility import UNK
__all__ = ["TextFeaturizer"] __all__ = ["TextFeaturizer"]
@ -53,9 +54,9 @@ class TextFeaturizer():
self.sp = spm.SentencePieceProcessor() self.sp = spm.SentencePieceProcessor()
self.sp.Load(spm_model) self.sp.Load(spm_model)
def tokenize(self, text): def tokenize(self, text, replace_space=True):
if self.unit_type == 'char': if self.unit_type == 'char':
tokens = self.char_tokenize(text) tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word': elif self.unit_type == 'word':
tokens = self.word_tokenize(text) tokens = self.word_tokenize(text)
else: # spm else: # spm
@ -107,16 +108,20 @@ class TextFeaturizer():
text = self.detokenize(tokens) text = self.detokenize(tokens)
return text return text
def char_tokenize(self, text): def char_tokenize(self, text, replace_space=True):
"""Character tokenizer. """Character tokenizer.
Args: Args:
text (str): text string. text (str): text string.
replace_space (bool): False only used by build_vocab.py.
Returns: Returns:
List[str]: tokens. List[str]: tokens.
""" """
return list(text.strip()) text = text.strip()
if replace_space:
text = text.replace(" ", SPACE)
return list(text)
def char_detokenize(self, tokens): def char_detokenize(self, tokens):
"""Character detokenizer. """Character detokenizer.
@ -127,6 +132,7 @@ class TextFeaturizer():
Returns: Returns:
str: text string. str: text string.
""" """
tokens = tokens.replace(SPACE, " ")
return "".join(tokens) return "".join(tokens)
def word_tokenize(self, text): def word_tokenize(self, text):
@ -193,17 +199,14 @@ class TextFeaturizer():
"""Load vocabulary from file.""" """Load vocabulary from file."""
vocab_list = load_dict(vocab_filepath, maskctc) vocab_list = load_dict(vocab_filepath, maskctc)
assert vocab_list is not None assert vocab_list is not None
assert SPACE in vocab_list
id2token = dict( id2token = dict(
[(idx, token) for (idx, token) in enumerate(vocab_list)]) [(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict( token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)]) [(token, idx) for (idx, token) in enumerate(vocab_list)])
if UNK in vocab_list:
unk_id = vocab_list.index(UNK) unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
else: eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
unk_id = -1
if EOS in vocab_list:
eos_id = vocab_list.index(EOS)
else:
eos_id = -1
return token2id, id2token, vocab_list, unk_id, eos_id return token2id, id2token, vocab_list, unk_id, eos_id

@ -28,7 +28,7 @@ logger = Log(__name__).getlog()
__all__ = [ __all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC" "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
] ]
IGNORE_ID = -1 IGNORE_ID = -1
@ -38,6 +38,7 @@ EOS = SOS
UNK = "<unk>" UNK = "<unk>"
BLANK = "<blank>" BLANK = "<blank>"
MASKCTC = "<mask>" MASKCTC = "<mask>"
SPACE = "<space>"
def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:

@ -25,6 +25,7 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.utility import BLANK from deepspeech.frontend.utility import BLANK
from deepspeech.frontend.utility import read_manifest from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.utility import SOS from deepspeech.frontend.utility import SOS
from deepspeech.frontend.utility import SPACE
from deepspeech.frontend.utility import UNK from deepspeech.frontend.utility import UNK
from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments from deepspeech.utils.utility import print_arguments
@ -60,7 +61,7 @@ args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path): def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = read_manifest(manifest_path) manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons: for line_json in manifest_jsons:
line = text_feature.tokenize(line_json['text']) line = text_feature.tokenize(line_json['text'], replace_space=False)
counter.update(line) counter.update(line)
def dump_text_manifest(fileobj, manifest_path, key='text'): def dump_text_manifest(fileobj, manifest_path, key='text'):
@ -109,6 +110,8 @@ def main():
for token, count in count_sorted: for token, count in count_sorted:
if count < args.count_threshold: if count < args.count_threshold:
break break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token) tokens.append(token)
tokens = sorted(tokens) tokens = sorted(tokens)

@ -0,0 +1,212 @@
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each output file.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can. If you use
# the utt2spk option it will make sure these chunks coincide with
# speaker boundaries. In this case, if there are more chunks
# than speakers (and in some other circumstances), some of the
# resulting chunks will be empty and it
# will print a warning.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]
# Command-line option parsing.
#   $num_jobs     - number of jobs given with -j (0 means -j was not used).
#   $job_id       - zero-based index of this job, 0 <= job_id < num_jobs.
#   $utt2spk_file - path given with --utt2spk=<file> ("" means not given).
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";

# Two passes so that "-j" and "--utt2spk=" are accepted in either order.
# Every look at $ARGV[0] is guarded by an emptiness check so that running
# with too few arguments reaches the usage message instead of emitting
# "uninitialized value" warnings.
for ($x = 1; $x <= 2 && @ARGV > 0; $x++) {
    if ($ARGV[0] eq "-j") {
        shift @ARGV;
        $num_jobs = shift @ARGV;
        $job_id = shift @ARGV;
        if (!defined $num_jobs || !defined $job_id) {
            die "Option -j requires two arguments: num-jobs and job-id";
        }
        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
        }
    }
    if (@ARGV > 0 && $ARGV[0] =~ /--utt2spk=(.+)/) {
        $utt2spk_file = $1;
        shift @ARGV;
    }
}
# Validate the number of remaining positional arguments:
#   without -j: the input file plus at least one output file;
#   with -j:    the input file plus at most one output file.
my $num_positional = scalar @ARGV;
my $bad_plain_call = ($num_jobs == 0 && $num_positional < 2);
my $bad_job_call = ($num_jobs > 0 && ($num_positional < 1 || $num_positional > 2));
if ($bad_plain_call || $bad_job_call) {
    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
        " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
        " ... where 0 <= job-id < num-jobs.";
}
# First positional argument is the input file; what remains names the outputs.
$inscp = shift @ARGV;

if ($num_jobs != 0) {
    # -j mode: build one output slot per job. Only this job's slot is a real
    # destination (the given file, or "-" when none was given); every other
    # slot is sent to /dev/null.
    my $own_output = (@ARGV > 0) ? $ARGV[0] : "-";
    for (my $slot = 0; $slot < $num_jobs; $slot++) {
        push @OUTPUTS, ($slot == $job_id ? $own_output : "/dev/null");
    }
} else {
    # No -j option: every remaining argument is an output file.
    @OUTPUTS = @ARGV;
}
# Main splitting logic. Reads $inscp and writes one chunk to each entry of
# @OUTPUTS. Output files are opened with the two-argument form of open()
# because the -j mode may place "-" in @OUTPUTS as a destination.
if ($utt2spk_file ne "") {  # We have the --utt2spk option...
    # Load the utterance -> speaker map.
    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
    while(<U>) {
        @A = split;
        @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
        ($u,$s) = @A;
        $utt2spk{$u} = $s;
    }
    close(U);

    # Group the input lines by speaker, remembering the order in which
    # speakers first appear.
    open(I, "<$inscp") || die "Opening input scp file $inscp";
    @spkrs = ();
    while(<I>) {
        @A = split;
        if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
        $u = $A[0];
        $s = $utt2spk{$u};
        if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
        if(!defined $spk_count{$s}) {
            push @spkrs, $s;
            $spk_count{$s} = 0;
            $spk_data{$s} = "";
        }
        $spk_count{$s}++;
        $spk_data{$s} = $spk_data{$s} . $_;
    }
    close(I);

    # Now split as equally as possible ..
    # First allocate spks to files by allocating an approximately
    # equal number of speakers.
    $numspks = @spkrs;    # number of speakers.
    $numscps = @OUTPUTS;  # number of output files.
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scparray[$scpidx] = [];  # [] is array reference.
        # Start every count at 0 so that outputs which receive no speakers
        # do not feed undefined values into the balancing arithmetic below.
        $scpcount[$scpidx] = 0;
    }
    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
        $scpidx = int(($spkidx*$numscps) / $numspks);
        $spk = $spkrs[$spkidx];
        push @{$scparray[$scpidx]}, $spk;
        $scpcount[$scpidx] += $spk_count{$spk};
    }

    # Now will try to reassign beginning + ending speakers
    # to different scp's and see if it gets more balanced.
    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
    # We can show that if considering changing just 2 scp's, we minimize
    # this by minimizing the squared difference in sizes.  This is
    # equivalent to minimizing the absolute difference in sizes.  This
    # shows this method is bound to converge.
    $changed = 1;
    while($changed) {
        $changed = 0;
        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
            # First try to reassign ending spk of this scp.
            if($scpidx < $numscps-1) {
                $sz = @{$scparray[$scpidx]};
                if($sz > 0) {
                    $spk = $scparray[$scpidx]->[$sz-1];
                    $count = $spk_count{$spk};
                    $nutt1 = $scpcount[$scpidx];
                    $nutt2 = $scpcount[$scpidx+1];
                    if( abs( ($nutt2+$count) - ($nutt1-$count))
                        < abs($nutt2 - $nutt1)) {  # Would decrease
                        # size-diff by reassigning spk...
                        $scpcount[$scpidx+1] += $count;
                        $scpcount[$scpidx] -= $count;
                        pop @{$scparray[$scpidx]};
                        unshift @{$scparray[$scpidx+1]}, $spk;
                        $changed = 1;
                    }
                }
            }
            # Then try to reassign the first spk of this scp to the previous one.
            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
                $spk = $scparray[$scpidx]->[0];
                $count = $spk_count{$spk};
                $nutt1 = $scpcount[$scpidx-1];
                $nutt2 = $scpcount[$scpidx];
                if( abs( ($nutt2-$count) - ($nutt1+$count))
                    < abs($nutt2 - $nutt1)) {  # Would decrease
                    # size-diff by reassigning spk...
                    $scpcount[$scpidx-1] += $count;
                    $scpcount[$scpidx] -= $count;
                    shift @{$scparray[$scpidx]};
                    push @{$scparray[$scpidx-1]}, $spk;
                    $changed = 1;
                }
            }
        }
    }

    # Now print out the files...
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scpfn = $OUTPUTS[$scpidx];
        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
        $count = 0;
        if(@{$scparray[$scpidx]} == 0) {
            print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
        } else {
            foreach $spk ( @{$scparray[$scpidx]} ) {
                print F $spk_data{$spk};
                $count += $spk_count{$spk};
            }
            if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
        }
        # Check close() so a failed flush is reported, as the
        # non-utt2spk branch already does.
        close(F) || die "Closing scp file $scpfn";
    }
} else {
    # This block is the "normal" case where there is no --utt2spk
    # option and we just break into equal size chunks.
    open(I, "<$inscp") || die "Opening input scp file $inscp";
    $numscps = @OUTPUTS;  # size of array.
    @F = <I>;             # all input lines, in order.
    close(I);
    $numlines = @F;
    if($numlines == 0) {
        print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
    }
    $linesperscp = int( ($numlines+($numscps-1)) / $numscps);  # the +$(numscps-1) forces rounding up.
    # [just doing int() rounds down].
    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
        # Output $scpidx receives lines [linesperscp*scpidx, linesperscp*(scpidx+1)),
        # clipped to the number of input lines.
        for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
            print O $F[$n];
        }
        close(O) || die "Closing scp file $scpfile";
    }
}
Loading…
Cancel
Save