You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/utils/fst/prepare_dict.py

109 lines
3.5 KiB

#!/usr/bin/env python3
import argparse
def main(args):
# load vocab file
# line: token
unit_table = set()
with open(args.unit_file, 'r') as fin:
for line in fin:
unit = line.strip()
unit_table.add(unit)
def contain_oov(units):
"""token not in vocab
Args:
units (str): token
Returns:
bool: True token in voca, else False.
"""
for unit in units:
if unit not in unit_table:
return True
return False
# load spm model, for English
bpemode = args.bpemodel
if bpemode:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load(sys.bpemodel)
# used to filter polyphone and invalid word
lexicon_table = set()
in_n = 0 # in lexicon word count
out_n = 0 # out lexicon word cout
with open(args.in_lexicon, 'r') as fin, \
open(args.out_lexicon, 'w') as fout:
for line in fin:
word = line.split()[0]
in_n += 1
if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
# filter 'SIL' for mandarin, keep it in English
continue
elif word == '<SPOKEN_NOISE>':
# filter <SPOKEN_NOISE>
continue
else:
# each word only has one pronunciation for e2e system
if word in lexicon_table:
continue
if bpemode:
# for english
pieces = sp.EncodeAsPieces(word)
if contain_oov(pieces):
print('Ignoring words {}, which contains oov unit'.
format(''.join(word).strip('')))
continue
# word is piece list, which not have <unk> piece, filter out by `contain_oov(pieces)`
chars = ' '.join(
[p if p in unit_table else '<unk>' for p in pieces])
else:
# ignore words with OOV
if contain_oov(word):
print('Ignoring words {}, which contains oov unit'.
format(word))
continue
# Optional, append ▁ in front of english word
# we assume the model unit of our e2e system is char now.
if word.encode('utf8').isalpha() and '' in unit_table:
word = '' + word
chars = ' '.join(word) # word is a char list
fout.write('{} {}\n'.format(word, chars))
lexicon_table.add(word)
out_n += 1
print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='FST: preprae e2e(char/spm) dict')
parser.add_argument(
'--unit_file',
required=True,
help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices'
)
parser.add_argument(
'--in_lexicon',
required=True,
help='raw lexicon file. line: word ph0 ... phn')
parser.add_argument(
'--out_lexicon',
required=True,
help='output lexicon file. line: word char0 ... charn')
parser.add_argument('--bpemodel', default=None, help='bpemodel')
args = parser.parse_args()
print(args)
main(args)