refactor build vocab

5 years ago · ed793b30b7
parent af453e0234
commit ed793b30b7
4 changed files with 59 additions and 134 deletions
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -14,7 +14,6 @@
 """Contains the text featurizer class."""
 import os
 import codecs
 import sentencepiece as spm
 from deepspeech.frontend.utility import UNK
@ -42,7 +41,7 @@ class TextFeaturizer(object):
        if unit_type == 'spm':
            spm_model = spm_model_prefix + '.model'
            self.sp = spm.SentencePieceProcessor()
-            self.sp.Load(self.spm_model)
+            self.sp.Load(spm_model)
    def featurize(self, text):
        """Convert text string to a list of token indices in char-level.Note
@ -51,14 +50,14 @@ class TextFeaturizer(object):
        :param text: Text to process.
        :type text: str
        :return: List of char-level token indices.
-        :rtype: list
+        :rtype: List[int]
        """
-        if unit_type == 'char':
+        if self.unit_type == 'char':
-            tokens = self._char_tokenize(text)
+            tokens = self.char_tokenize(text)
-        elif unit_type == 'word':
+        elif self.unit_type == 'word':
-            tokens = self._word_tokenize(text)
+            tokens = self.word_tokenize(text)
        else:
-            tokens = self._spm_tokenize(text)
+            tokens = self.spm_tokenize(text)
        ids = []
        for token in tokens:
@ -84,15 +83,15 @@ class TextFeaturizer(object):
        """
        return self._vocab_list
-    def _char_tokenize(self, text):
+    def char_tokenize(self, text):
        """Character tokenizer."""
        return list(text.strip())
-    def _word_tokenize(self, text):
+    def word_tokenize(self, text):
        """Word tokenizer, spearte by <space>."""
        return text.strip().split()
-    def _spm_tokenize(self, text):
+    def spm_tokenize(self, text):
        """spm tokenize.
        Args:
@ -127,7 +126,7 @@ class TextFeaturizer(object):
    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
-        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
+        with open(vocab_filepath, 'r', encoding='utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@ -23,10 +23,10 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 # build vocabulary
 python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type "bpe" \
+--unit_type "spm" \
 --count_threshold=${nbpe} \
--bpe_mode ${bpemode} \
+--spm_mode ${bpemode} \
--bpe_model_prefix ${bpeprefix} \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_paths="data/manifest.tiny.raw"
@ -53,8 +53,8 @@ fi
 python3 ${MAIN_ROOT}/utils/format_data.py \
 --feat_type "raw" \
 --cmvn_path "data/mean_std.npz" \
--unit_type "bpe" \
+--unit_type "spm" \
--bpe_model_prefix ${bpeprefix} \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_path="data/manifest.tiny.raw" \
 --output_path="data/manifest.tiny"
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@ -29,12 +29,13 @@ from deepspeech.frontend.utility import BLANK
 from deepspeech.frontend.utility import SOS
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
 from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
-add_arg('count_threshold',  int,    0,  "Truncation threshold for char/word/bpe counts.")
+add_arg('count_threshold',  int,    0,  "Truncation threshold for char/word/spm counts.")
 add_arg('vocab_path',       str,
        'examples/librispeech/data/vocab.txt',
        "Filepath to write the vocabulary.")
@ -45,10 +46,10 @@ add_arg('manifest_paths',   str,
        nargs='+',
        required=True)
 # bpe
-add_arg('bpe_mode', str, 'unigram',
+add_arg('spm_mode', str, 'unigram',
-    "bpe model type, e.g. unigram, bpe, char, word. only need when `unit_type` is bpe")
+    "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
+add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
-    "bpe model prefix, only need when `unit_type` is bpe")
+    "spm model prefix, only need when `unit_type` is spm")
 # yapf: disable
 args = parser.parse_args()
@ -56,7 +57,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
    manifest_jsons = read_manifest(manifest_path)
    for line_json in manifest_jsons:
-        if args.unit_type == 'character':
+        if args.unit_type == 'char':
            for char in line_json['text']:
                counter.update(char)
        elif args.unit_type == 'word':
@ -75,7 +76,7 @@ def main():
    fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
    fout.write(UNK + '\n')   # <unk> must be 1
-    if args.unit_type != 'bpe':
+    if args.unit_type != 'spm':
        counter = Counter()
        for manifest_path in args.manifest_paths:
            count_manifest(counter, manifest_path)
@ -98,41 +99,21 @@ def main():
        spm.SentencePieceTrainer.Train(
            input=fp.name,
            vocab_size=args.count_threshold,
-            model_type=args.bpe_mode,
+            model_type=args.spm_mode,
-            model_prefix=args.bpe_model_prefix,
+            model_prefix=args.spm_model_prefix,
            input_sentence_size=100000000,
            character_coverage=0.9995)
        os.unlink(fp.name)
        # encode
-        sp = spm.SentencePieceProcessor()
+        text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
        sp.Load(args.bpe_model_prefix + '.model')
        stats = {"num_empty": 0, "num_filtered": 0}
        def valid(line):
            return True
        def encode(l):
            return sp.EncodeAsPieces(l)
        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None
        vocabs = set()
        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                line = line_json['text']
-                enc_line = encode_line(line)
+                enc_line = text_feature.spm_tokenize(line)
                for code in enc_line:
                    vocabs.add(code)
                #print(" ".join(enc_line))
@ -140,9 +121,7 @@ def main():
        for unit in vocabs_sorted:
            fout.write(unit + "\n")
-        print(f"bpe vocab size: {len(vocabs_sorted)}")
+        print(f"spm vocab size: {len(vocabs_sorted)}")
        print(f"skip {stats['num_empty']} empty lines")
        print(f"filter {stats['num_filtered']} invalid lines")
    fout.write(SOS + "\n") # <sos/eos>
    fout.close()
--- a/utils/format_data.py
+++ b/utils/format_data.py
@ -27,6 +27,7 @@ from deepspeech.frontend.utility import SOS
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
 from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
@ -35,7 +36,7 @@ add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kald
 add_arg('cmvn_path',       str,
        'examples/librispeech/data/mean_std.npz',
        "Filepath of cmvn.")
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
 add_arg('vocab_path',       str,
        'examples/librispeech/data/vocab.txt',
        "Filepath of the vocabulary.")
@ -46,7 +47,8 @@ add_arg('manifest_paths',   str,
        nargs='+',
        required=True)
 # bpe
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)", "bpe model prefix, only need when `unit_type` is bpe")
+add_arg('spm_model_prefix', str, None,
     "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
 add_arg('output_path',  str, None, "filepath of formated manifest.", required=True)
 # yapf: disable
 args = parser.parse_args()
@ -54,83 +56,28 @@ args = parser.parse_args()
 def main():
    print_arguments(args)
    fout = open(args.output_path, 'w', encoding='utf-8')
    # get feat dim
    mean, std = load_cmvn(args.cmvn_path, filetype='npz')
    feat_dim = mean.shape[0]
    print(f"Feature dim: {feat_dim}")
-    # read vocab
+    text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
-    vocab = dict()
+    vocab_size = text_feature.vocab_size
    with open(args.vocab_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            token = line.strip()
            vocab[token] = len(vocab)
    vocab_size = len(vocab)
    print(f"Vocab size: {vocab_size}")
    fout = open(args.output_path, 'w', encoding='utf-8')
    if args.unit_type != 'bpe':
        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                tokens = []
                tokenids = []
                if args.unit_type == 'character':
                    for char in line_json['text']:
                        tokens.append(char)
                        tokenids.append(vocab[char])
                elif args.unit_type == 'word':
                    for word in line_json['text'].split():
                        tokens.append(word)
                        tokenids.append(vocab[word])
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                feat_shape = line_json['feat_shape']
                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
                if args.feat_type == 'raw':
                    feat_shape.append(feat_dim)
                else: # kaldi
                    raise NotImplemented('no support kaldi feat now!')
                fout.write(json.dumps(line_json) + '\n')
    else:
        import sentencepiece as spm
        # encode
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_prefix + '.model')
        def valid(line):
            return True
        def encode(l):
            return sp.EncodeAsPieces(l)
        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None
    for manifest_path in args.manifest_paths:
        manifest_jsons = read_manifest(manifest_path)
        for line_json in manifest_jsons:
            line = line_json['text']
-                tokens = []
+            if args.unit_type == 'char':
-                tokenids = []
+                tokens = text_feature.char_tokenize(line)
-                enc_line = encode_line(line)
+            elif args.unit_type == 'word':
-                for code in enc_line:
+                tokens = text_feature.word_tokenize(line)
-                    tokens.append(code)
+            else: #spm
-                    tokenids.append(vocab[code])
+                tokens = text_feature.spm_tokenize(line)
-                    #print(code, vocab[code])
+            tokenids = text_feature.featurize(line)
            line_json['token'] = tokens
            line_json['token_id'] = tokenids
            line_json['token_shape'] = (len(tokenids), vocab_size)