PaddleSpeech/utils/fst/prepare_dict.py

#!/usr/bin/env python3
import argparse


def main(args):
    """Filter a raw lexicon down to entries spellable with the e2e model units.

    Every accepted word is written to the output lexicon as
    ``word unit0 ... unitn``. Words are dropped when they are ``SIL`` (char
    mode only), ``<SPOKEN_NOISE>``, already emitted once, or contain a unit
    that is missing from the unit file. A summary line is printed at the end.

    Args:
        args: object with the attributes ``unit_file`` (one model unit per
            line), ``in_lexicon`` (first whitespace-separated field of each
            line is the word), ``out_lexicon`` (path to write) and
            ``bpemodel`` (sentencepiece model path, or a falsy value to use
            character units).
    """
    # load vocab file
    # line: token
    # The data holds '▁' and non-ASCII characters, so the encoding is pinned
    # instead of depending on the process locale.
    with open(args.unit_file, 'r', encoding='utf-8') as fin:
        unit_table = {line.strip() for line in fin}

    def contain_oov(units):
        """Check whether any unit is missing from the vocab.

        Args:
            units (Iterable[str]): units of one word (chars of a str, or a
                list of spm pieces).

        Returns:
            bool: True if at least one unit is not in the vocab, else False.
        """
        return any(unit not in unit_table for unit in units)

    # load spm model, for English
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        # The model path lives on the parsed arguments; the previous
        # `sys.bpemodel` raised NameError (`sys` is not imported here).
        sp.Load(args.bpemodel)

    # used to filter polyphone and invalid word
    lexicon_table = set()
    in_n = 0  # in lexicon word count
    out_n = 0  # out lexicon word count
    with open(args.in_lexicon, 'r', encoding='utf-8') as fin, \
            open(args.out_lexicon, 'w', encoding='utf-8') as fout:
        for line in fin:
            fields = line.split()
            if not fields:
                # blank line: there is no word to process
                continue
            word = fields[0]
            in_n += 1

            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
                # filter 'SIL' for mandarin, keep it in English
                continue
            if word == '<SPOKEN_NOISE>':
                # filter <SPOKEN_NOISE>
                continue
            # each word only has one pronunciation for e2e system
            if word in lexicon_table:
                continue

            entry = word  # the word exactly as it is written to the output
            if bpemode:
                # for english
                pieces = sp.EncodeAsPieces(word)
                if contain_oov(pieces):
                    print('Ignoring words {}, which contains oov unit'.
                          format(word.strip('▁')))
                    continue

                # every piece is known to be in the vocab at this point,
                # so no <unk> substitution is required
                chars = ' '.join(pieces)
            else:
                # ignore words with OOV
                if contain_oov(word):
                    print('Ignoring words {}, which contains oov unit'.
                          format(word))
                    continue

                # Optional, append ▁ in front of english word
                # we assume the model unit of our e2e system is char now.
                # (bytes.isalpha() is ASCII-only, so this matches pure
                # ASCII-letter words.)
                if word.encode('utf8').isalpha() and '▁' in unit_table:
                    entry = '▁' + word

                chars = ' '.join(entry)  # entry is a char list

            fout.write('{} {}\n'.format(entry, chars))
            # Remember the raw word as well as the written form: the
            # duplicate check above looks up the raw word, so storing only
            # the '▁'-prefixed form let repeated entries through.
            lexicon_table.add(word)
            lexicon_table.add(entry)
            out_n += 1

    print(
        f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}"
    )


if __name__ == '__main__':
    # Command-line entry point: parse the four options, echo them, run main().
    cli = argparse.ArgumentParser(
        description='FST: preprae e2e(char/spm) dict')

    # The three mandatory path options share the same shape, so they are
    # registered from a table of (flag, help text) pairs.
    required_options = (
        ('--unit_file',
         'e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices'),
        ('--in_lexicon', 'raw lexicon file. line: word ph0 ... phn'),
        ('--out_lexicon', 'output lexicon file. line: word char0 ... charn'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    # Optional sentencepiece model; when omitted, character units are used.
    cli.add_argument('--bpemodel', default=None, help='bpemodel')

    args = cli.parse_args()
    print(args)

    main(args)
add TLG utils 3 years ago			`#!/usr/bin/env python3`
			`import argparse`


			`def main(args):`
train lm 3 years ago			`# load vocab file`
			`# line: token`
add TLG utils 3 years ago			`unit_table = set()`
			`with open(args.unit_file, 'r') as fin:`
			`for line in fin:`
			`unit = line.strip()`
			`unit_table.add(unit)`

TLG build pass 3 years ago			`def contain_oov(units):`
train lm 3 years ago			`"""token not in vocab`

			`Args:`
			`units (str): token`

			`Returns:`
			`bool: True token in voca, else False.`
			`"""`
TLG build pass 3 years ago			`for unit in units:`
			`if unit not in unit_table:`
			`return True`
			`return False`

train lm 3 years ago			`# load spm model, for English`
add TLG utils 3 years ago			`bpemode = args.bpemodel`
			`if bpemode:`
			`import sentencepiece as spm`
			`sp = spm.SentencePieceProcessor()`
			`sp.Load(sys.bpemodel)`

train lm 3 years ago			`# used to filter polyphone and invalid word`
add TLG utils 3 years ago			`lexicon_table = set()`
train lm 3 years ago			`in_n = 0 # in lexicon word count`
format 3 years ago			`out_n = 0 # out lexicon word cout`
TLG build pass 3 years ago			`with open(args.in_lexicon, 'r') as fin, \`
			`open(args.out_lexicon, 'w') as fout:`
add TLG utils 3 years ago			`for line in fin:`
			`word = line.split()[0]`
train lm 3 years ago			`in_n += 1`

add TLG utils 3 years ago			if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
train lm 3 years ago			`# filter 'SIL' for mandarin, keep it in English`
add TLG utils 3 years ago			`continue`
			`elif word == '<SPOKEN_NOISE>':`
train lm 3 years ago			`# filter <SPOKEN_NOISE>`
add TLG utils 3 years ago			`continue`
			`else:`
			`# each word only has one pronunciation for e2e system`
			`if word in lexicon_table:`
			`continue`

			`if bpemode:`
train lm 3 years ago			`# for english`
add TLG utils 3 years ago			`pieces = sp.EncodeAsPieces(word)`
			`if contain_oov(pieces):`
			`print('Ignoring words {}, which contains oov unit'.`
			`format(''.join(word).strip('▁')))`
			`continue`

train lm 3 years ago			# word is piece list, which not have <unk> piece, filter out by `contain_oov(pieces)`
add TLG utils 3 years ago			`chars = ' '.join(`
			`[p if p in unit_table else '<unk>' for p in pieces])`
			`else:`
			`# ignore words with OOV`
			`if contain_oov(word):`
			`print('Ignoring words {}, which contains oov unit'.`
			`format(word))`
			`continue`

			`# Optional, append ▁ in front of english word`
			`# we assume the model unit of our e2e system is char now.`
			`if word.encode('utf8').isalpha() and '▁' in unit_table:`
			`word = '▁' + word`
train lm 3 years ago
add TLG utils 3 years ago			`chars = ' '.join(word) # word is a char list`

			`fout.write('{} {}\n'.format(word, chars))`
			`lexicon_table.add(word)`
train lm 3 years ago			`out_n += 1`
add TLG utils 3 years ago
format 3 years ago			`print(`
			`f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}"`
			`)`

add TLG utils 3 years ago
			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser(`
			`description='FST: preprae e2e(char/spm) dict')`
			`parser.add_argument(`
			`'--unit_file',`
			`required=True,`
			`help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices'`
			`)`
			`parser.add_argument(`
			`'--in_lexicon',`
			`required=True,`
			`help='raw lexicon file. line: word ph0 ... phn')`
			`parser.add_argument(`
			`'--out_lexicon',`
			`required=True,`
			`help='output lexicon file. line: word char0 ... charn')`
			`parser.add_argument('--bpemodel', default=None, help='bpemodel')`

			`args = parser.parse_args()`
			`print(args)`

			`main(args)`