|
|
@ -3,7 +3,8 @@ import argparse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(args):
|
|
|
|
def main(args):
|
|
|
|
# load `unit` or `vocab` file
|
|
|
|
# load vocab file
|
|
|
|
|
|
|
|
# line: token
|
|
|
|
unit_table = set()
|
|
|
|
unit_table = set()
|
|
|
|
with open(args.unit_file, 'r') as fin:
|
|
|
|
with open(args.unit_file, 'r') as fin:
|
|
|
|
for line in fin:
|
|
|
|
for line in fin:
|
|
|
@ -11,27 +12,41 @@ def main(args):
|
|
|
|
unit_table.add(unit)
|
|
|
|
unit_table.add(unit)
|
|
|
|
|
|
|
|
|
|
|
|
def contain_oov(units):
    """Check whether any token in `units` is out of vocabulary.

    Args:
        units (iterable of str): tokens (chars or BPE pieces) to test
            against the previously loaded unit table.

    Returns:
        bool: True if at least one token is NOT in `unit_table`
            (i.e. the word contains an OOV unit), False otherwise.
    """
    # NOTE: the original docstring claimed the opposite polarity
    # ("True token in voca"); the code returns True on an OOV hit.
    return any(unit not in unit_table for unit in units)
|
|
|
|
|
|
|
|
|
|
|
|
# load spm model, for English
bpemode = args.bpemodel
if bpemode:
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    # BUG FIX: was `sp.Load(sys.bpemodel)` — the `sys` module has no
    # `bpemodel` attribute (AttributeError whenever a BPE model is
    # supplied); the model path comes from the parsed arguments.
    sp.Load(bpemode)
|
|
|
|
|
|
|
|
|
|
|
|
# used to filter polyphone
|
|
|
|
# used to filter polyphone and invalid word
|
|
|
|
lexicon_table = set()
|
|
|
|
lexicon_table = set()
|
|
|
|
|
|
|
|
in_n = 0 # in lexicon word count
|
|
|
|
|
|
|
|
out_n = 0 # out lexicon word count
|
|
|
|
with open(args.in_lexicon, 'r') as fin, \
|
|
|
|
with open(args.in_lexicon, 'r') as fin, \
|
|
|
|
open(args.out_lexicon, 'w') as fout:
|
|
|
|
open(args.out_lexicon, 'w') as fout:
|
|
|
|
for line in fin:
|
|
|
|
for line in fin:
|
|
|
|
word = line.split()[0]
|
|
|
|
word = line.split()[0]
|
|
|
|
|
|
|
|
in_n += 1
|
|
|
|
|
|
|
|
|
|
|
|
if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
|
|
|
|
if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
|
|
|
|
|
|
|
|
# filter 'SIL' for mandarin, keep it in English
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
elif word == '<SPOKEN_NOISE>':
|
|
|
|
elif word == '<SPOKEN_NOISE>':
|
|
|
|
|
|
|
|
# filter <SPOKEN_NOISE>
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
# each word only has one pronunciation for e2e system
|
|
|
|
# each word only has one pronunciation for e2e system
|
|
|
@ -39,12 +54,14 @@ def main(args):
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
if bpemode:
|
|
|
|
if bpemode:
|
|
|
|
|
|
|
|
# for english
|
|
|
|
pieces = sp.EncodeAsPieces(word)
|
|
|
|
pieces = sp.EncodeAsPieces(word)
|
|
|
|
if contain_oov(pieces):
|
|
|
|
if contain_oov(pieces):
|
|
|
|
print('Ignoring words {}, which contains oov unit'.
|
|
|
|
print('Ignoring words {}, which contains oov unit'.
|
|
|
|
format(''.join(word).strip('▁')))
|
|
|
|
format(''.join(word).strip('▁')))
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# word is piece list, which not have <unk> piece, filter out by `contain_oov(pieces)`
|
|
|
|
chars = ' '.join(
|
|
|
|
chars = ' '.join(
|
|
|
|
[p if p in unit_table else '<unk>' for p in pieces])
|
|
|
|
[p if p in unit_table else '<unk>' for p in pieces])
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -58,11 +75,14 @@ def main(args):
|
|
|
|
# we assume the model unit of our e2e system is char now.
|
|
|
|
# we assume the model unit of our e2e system is char now.
|
|
|
|
if word.encode('utf8').isalpha() and '▁' in unit_table:
|
|
|
|
if word.encode('utf8').isalpha() and '▁' in unit_table:
|
|
|
|
word = '▁' + word
|
|
|
|
word = '▁' + word
|
|
|
|
|
|
|
|
|
|
|
|
chars = ' '.join(word) # word is a char list
|
|
|
|
chars = ' '.join(word) # word is a char list
|
|
|
|
|
|
|
|
|
|
|
|
fout.write('{} {}\n'.format(word, chars))
|
|
|
|
fout.write('{} {}\n'.format(word, chars))
|
|
|
|
lexicon_table.add(word)
|
|
|
|
lexicon_table.add(word)
|
|
|
|
|
|
|
|
out_n += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|