refactor build vocab

pull/578/head
Hui Zhang 5 years ago
parent af453e0234
commit ed793b30b7

deepspeech/frontend/featurizer/text_featurizer.py

@@ -14,7 +14,6 @@
 """Contains the text featurizer class."""
 import os
-import codecs
 import sentencepiece as spm
 from deepspeech.frontend.utility import UNK
@@ -42,7 +41,7 @@ class TextFeaturizer(object):
         if unit_type == 'spm':
             spm_model = spm_model_prefix + '.model'
             self.sp = spm.SentencePieceProcessor()
-            self.sp.Load(self.spm_model)
+            self.sp.Load(spm_model)

     def featurize(self, text):
         """Convert text string to a list of token indices in char-level.Note
@@ -51,14 +50,14 @@ class TextFeaturizer(object):
         :param text: Text to process.
         :type text: str
         :return: List of char-level token indices.
-        :rtype: list
+        :rtype: List[int]
         """
-        if unit_type == 'char':
-            tokens = self._char_tokenize(text)
-        elif unit_type == 'word':
-            tokens = self._word_tokenize(text)
+        if self.unit_type == 'char':
+            tokens = self.char_tokenize(text)
+        elif self.unit_type == 'word':
+            tokens = self.word_tokenize(text)
         else:
-            tokens = self._spm_tokenize(text)
+            tokens = self.spm_tokenize(text)
         ids = []
         for token in tokens:
@@ -84,15 +83,15 @@ class TextFeaturizer(object):
         """
         return self._vocab_list

-    def _char_tokenize(self, text):
+    def char_tokenize(self, text):
         """Character tokenizer."""
         return list(text.strip())

-    def _word_tokenize(self, text):
+    def word_tokenize(self, text):
         """Word tokenizer, spearte by <space>."""
         return text.strip().split()

-    def _spm_tokenize(self, text):
+    def spm_tokenize(self, text):
         """spm tokenize.

         Args:
@@ -127,7 +126,7 @@ class TextFeaturizer(object):
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
-        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
+        with open(vocab_filepath, 'r', encoding='utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
         vocab_dict = dict(
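Note: this hunk makes the tokenize helpers public and loads the SentencePiece model from the local spm_model variable; the old self.sp.Load(self.spm_model) read an attribute that is never assigned above, so the 'spm' path appears to have crashed before this fix. A minimal usage sketch of the refactored class, assuming the hypothetical model prefix data/bpe_unigram_200 (the constructor argument order is taken from the call sites later in this commit):

    from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

    # unit_type is one of 'char', 'word', 'spm'; the model prefix is only
    # read for 'spm' and must point at <prefix>.model from SentencePieceTrainer.
    feature = TextFeaturizer('spm', 'data/vocab.txt', 'data/bpe_unigram_200')
    tokens = feature.spm_tokenize("HELLO WORLD")  # list of subword pieces
    ids = feature.featurize("HELLO WORLD")        # List[int] of vocabulary indices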

@@ -23,10 +23,10 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"

 # build vocabulary
 python3 ${MAIN_ROOT}/utils/build_vocab.py \
---unit_type "bpe" \
+--unit_type "spm" \
 --count_threshold=${nbpe} \
---bpe_mode ${bpemode} \
---bpe_model_prefix ${bpeprefix} \
+--spm_mode ${bpemode} \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_paths="data/manifest.tiny.raw"
@@ -53,8 +53,8 @@ fi
 python3 ${MAIN_ROOT}/utils/format_data.py \
 --feat_type "raw" \
 --cmvn_path "data/mean_std.npz" \
---unit_type "bpe" \
---bpe_model_prefix ${bpeprefix} \
+--unit_type "spm" \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_path="data/manifest.tiny.raw" \
 --output_path="data/manifest.tiny"
@@ -66,4 +66,4 @@ if [ $? -ne 0 ]; then
 fi

 echo "LibriSpeech Data preparation done."
 exit 0

utils/build_vocab.py

@@ -29,12 +29,13 @@ from deepspeech.frontend.utility import BLANK
 from deepspeech.frontend.utility import SOS
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
-add_arg('count_threshold', int, 0, "Truncation threshold for char/word/bpe counts.")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
+add_arg('count_threshold', int, 0, "Truncation threshold for char/word/spm counts.")
 add_arg('vocab_path', str,
         'examples/librispeech/data/vocab.txt',
         "Filepath to write the vocabulary.")
@@ -45,10 +46,10 @@ add_arg('manifest_paths', str,
         nargs='+',
         required=True)
 # bpe
-add_arg('bpe_mode', str, 'unigram',
-        "bpe model type, e.g. unigram, bpe, char, word. only need when `unit_type` is bpe")
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
-        "bpe model prefix, only need when `unit_type` is bpe")
+add_arg('spm_mode', str, 'unigram',
+        "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
+add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
+        "spm model prefix, only need when `unit_type` is spm")
 # yapf: disable
 args = parser.parse_args()
@@ -56,7 +57,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        if args.unit_type == 'character':
+        if args.unit_type == 'char':
             for char in line_json['text']:
                 counter.update(char)
         elif args.unit_type == 'word':
@@ -75,7 +76,7 @@ def main():
     fout.write(BLANK + "\n")  # 0 will be used for "blank" in CTC
     fout.write(UNK + '\n')  # <unk> must be 1
-    if args.unit_type != 'bpe':
+    if args.unit_type != 'spm':
         counter = Counter()
         for manifest_path in args.manifest_paths:
             count_manifest(counter, manifest_path)
@@ -98,41 +99,21 @@ def main():
         spm.SentencePieceTrainer.Train(
             input=fp.name,
             vocab_size=args.count_threshold,
-            model_type=args.bpe_mode,
-            model_prefix=args.bpe_model_prefix,
+            model_type=args.spm_mode,
+            model_prefix=args.spm_model_prefix,
             input_sentence_size=100000000,
             character_coverage=0.9995)
         os.unlink(fp.name)

         # encode
-        sp = spm.SentencePieceProcessor()
-        sp.Load(args.bpe_model_prefix + '.model')
-        stats = {"num_empty": 0, "num_filtered": 0}
-        def valid(line):
-            return True
-        def encode(l):
-            return sp.EncodeAsPieces(l)
-        def encode_line(line):
-            line = line.strip()
-            if len(line) > 0:
-                line = encode(line)
-                if valid(line):
-                    return line
-                else:
-                    stats["num_filtered"] += 1
-            else:
-                stats["num_empty"] += 1
-            return None
+        text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
         vocabs = set()
         for manifest_path in args.manifest_paths:
             manifest_jsons = read_manifest(manifest_path)
             for line_json in manifest_jsons:
                 line = line_json['text']
-                enc_line = encode_line(line)
+                enc_line = text_feature.spm_tokenize(line)
                 for code in enc_line:
                     vocabs.add(code)
                 #print(" ".join(enc_line))
@@ -140,9 +121,7 @@ def main():
         for unit in vocabs_sorted:
             fout.write(unit + "\n")
-        print(f"bpe vocab size: {len(vocabs_sorted)}")
-        print(f"skip {stats['num_empty']} empty lines")
-        print(f"filter {stats['num_filtered']} invalid lines")
+        print(f"spm vocab size: {len(vocabs_sorted)}")
     fout.write(SOS + "\n")  # <sos/eos>
     fout.close()
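The rewritten 'spm' branch delegates encoding to TextFeaturizer instead of the local valid/encode/encode_line helpers, dropping the empty/filtered line statistics along the way. A minimal sketch of the train-then-encode flow, with hypothetical file names ('text.txt' stands in for the temp file fp.name, and 200/'unigram' for the script arguments); the Train() keywords mirror the call in the hunk above:

    import sentencepiece as spm

    # Train a subword model on one transcript per line.
    spm.SentencePieceTrainer.Train(
        input='text.txt',
        vocab_size=200,                  # --count_threshold
        model_type='unigram',            # --spm_mode
        model_prefix='bpe_unigram_200',  # --spm_model_prefix
        input_sentence_size=100000000,
        character_coverage=0.9995)

    # Encode a transcript into pieces, roughly what
    # TextFeaturizer.spm_tokenize does internally.
    sp = spm.SentencePieceProcessor()
    sp.Load('bpe_unigram_200.model')
    print(sp.EncodeAsPieces('HELLO WORLD'))  # e.g. ['▁HE', 'LLO', '▁WORLD']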

utils/format_data.py

@@ -27,6 +27,7 @@ from deepspeech.frontend.utility import SOS
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
@@ -35,7 +36,7 @@ add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kald
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.npz',
         "Filepath of cmvn.")
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
 add_arg('vocab_path', str,
         'examples/librispeech/data/vocab.txt',
         "Filepath of the vocabulary.")
@@ -46,7 +47,8 @@ add_arg('manifest_paths', str,
         nargs='+',
         required=True)
 # bpe
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)", "bpe model prefix, only need when `unit_type` is bpe")
+add_arg('spm_model_prefix', str, None,
+        "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
 add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
 # yapf: disable
 args = parser.parse_args()
@@ -54,93 +56,38 @@ args = parser.parse_args()
 def main():
     print_arguments(args)
+    fout = open(args.output_path, 'w', encoding='utf-8')

     # get feat dim
     mean, std = load_cmvn(args.cmvn_path, filetype='npz')
     feat_dim = mean.shape[0]
     print(f"Feature dim: {feat_dim}")

-    # read vocab
-    vocab = dict()
-    with open(args.vocab_path, 'r', encoding='utf-8') as fin:
-        for line in fin:
-            token = line.strip()
-            vocab[token] = len(vocab)
-    vocab_size = len(vocab)
+    text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
+    vocab_size = text_feature.vocab_size
     print(f"Vocab size: {vocab_size}")

-    fout = open(args.output_path, 'w', encoding='utf-8')
-    if args.unit_type != 'bpe':
-        for manifest_path in args.manifest_paths:
-            manifest_jsons = read_manifest(manifest_path)
-            for line_json in manifest_jsons:
-                tokens = []
-                tokenids = []
-                if args.unit_type == 'character':
-                    for char in line_json['text']:
-                        tokens.append(char)
-                        tokenids.append(vocab[char])
-                elif args.unit_type == 'word':
-                    for word in line_json['text'].split():
-                        tokens.append(word)
-                        tokenids.append(vocab[word])
-                line_json['token'] = tokens
-                line_json['token_id'] = tokenids
-                line_json['token_shape'] = (len(tokenids), vocab_size)
-                feat_shape = line_json['feat_shape']
-                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-                if args.feat_type == 'raw':
-                    feat_shape.append(feat_dim)
-                else: # kaldi
-                    raise NotImplemented('no support kaldi feat now!')
-                fout.write(json.dumps(line_json) + '\n')
-    else:
-        import sentencepiece as spm
-        # encode
-        sp = spm.SentencePieceProcessor()
-        sp.Load(args.bpe_model_prefix + '.model')
-        def valid(line):
-            return True
-        def encode(l):
-            return sp.EncodeAsPieces(l)
-        def encode_line(line):
-            line = line.strip()
-            if len(line) > 0:
-                line = encode(line)
-                if valid(line):
-                    return line
-                else:
-                    stats["num_filtered"] += 1
-            else:
-                stats["num_empty"] += 1
-            return None
-        for manifest_path in args.manifest_paths:
-            manifest_jsons = read_manifest(manifest_path)
-            for line_json in manifest_jsons:
-                line = line_json['text']
-                tokens = []
-                tokenids = []
-                enc_line = encode_line(line)
-                for code in enc_line:
-                    tokens.append(code)
-                    tokenids.append(vocab[code])
-                    #print(code, vocab[code])
-                line_json['token'] = tokens
-                line_json['token_id'] = tokenids
-                line_json['token_shape'] = (len(tokenids), vocab_size)
-                feat_shape = line_json['feat_shape']
-                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-                if args.feat_type == 'raw':
-                    feat_shape.append(feat_dim)
-                else: # kaldi
-                    raise NotImplemented('no support kaldi feat now!')
-                fout.write(json.dumps(line_json) + '\n')
+    for manifest_path in args.manifest_paths:
+        manifest_jsons = read_manifest(manifest_path)
+        for line_json in manifest_jsons:
+            line = line_json['text']
+            if args.unit_type == 'char':
+                tokens = text_feature.char_tokenize(line)
+            elif args.unit_type == 'word':
+                tokens = text_feature.word_tokenize(line)
+            else: #spm
+                tokens = text_feature.spm_tokenize(line)
+            tokenids = text_feature.featurize(line)
+            line_json['token'] = tokens
+            line_json['token_id'] = tokenids
+            line_json['token_shape'] = (len(tokenids), vocab_size)
+            feat_shape = line_json['feat_shape']
+            assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+            if args.feat_type == 'raw':
+                feat_shape.append(feat_dim)
+            else: # kaldi
+                raise NotImplemented('no support kaldi feat now!')
+            fout.write(json.dumps(line_json) + '\n')
     fout.close()
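The refactor collapses the duplicated character/word and bpe branches into one loop; the fields written per manifest entry should be unchanged. A sketch of the effect on one entry, with made-up values (feat_dim=161, vocab_size=200, and hypothetical tokens/ids):

    # before formatting:
    #   {"text": "HELLO WORLD", "feat_shape": [168]}
    # after (json.dumps serializes the token_shape tuple as a list):
    #   {"text": "HELLO WORLD",
    #    "token": ["▁HE", "LLO", "▁WORLD"],
    #    "token_id": [34, 78, 102],
    #    "token_shape": [3, 200],
    #    "feat_shape": [168, 161]}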
