parent 12c01f39aa
commit 64f0bad5ca
@@ -0,0 +1,127 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Format manifest with more metadata."""
import argparse
import functools
import json
from collections import Counter

from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw (wav, flac), kaldi")
add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
add_arg('vocab_path', str,
        'examples/librispeech/data/vocab.txt',
        "Filepath of the vocabulary file to read.")
add_arg('manifest_paths', str,
        None,
        "Filepaths of manifests for building vocabulary. "
        "You can provide multiple manifest files.",
        nargs='+',
        required=True)
# bpe
add_arg('bpe_model_prefix', str,
        "bpe_model_%(bpe_mode)_%(count_threshold)",
        "BPE model prefix, only needed when `unit_type` is bpe.")
add_arg('output_path', str, None, "Filepath of the formatted manifest.", required=True)
# yapf: enable
args = parser.parse_args()
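
# A minimal usage sketch (the script filename and the manifest/output paths
# below are assumptions for illustration, not taken from this commit):
#
#   python format_data.py \
#       --feat_type raw \
#       --unit_type character \
#       --vocab_path examples/librispeech/data/vocab.txt \
#       --manifest_paths examples/librispeech/data/manifest.train.raw \
#       --output_path examples/librispeech/data/manifest.train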


def main():
    print_arguments(args)

    # Read the vocabulary: one token per line; a token's id is its line index.
    vocab = dict()
    with open(args.vocab_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            token = line.strip()
            vocab[token] = len(vocab)
    vocab_size = len(vocab)

    fout = open(args.output_path, 'w', encoding='utf-8')
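
    # Illustrative record flow, assuming unit_type=character (field values
    # are made up for this comment; the ids depend on the vocab file):
    #   input : {"audio_filepath": "a.wav", "duration": 2.1, "text": "hi"}
    #   output: {"audio_filepath": "a.wav", "duration": 2.1, "text": "hi",
    #            "token": ["h", "i"], "token_id": [8, 9],
    #            "token_shape": [2, vocab_size]}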
    if args.unit_type != 'bpe':
        # Tokenize by character or by whitespace-separated word, and map
        # every token to its id in the vocabulary.
        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                tokens = []
                tokenids = []
                if args.unit_type == 'character':
                    for char in line_json['text']:
                        tokens.append(char)
                        tokenids.append(vocab[char])
                elif args.unit_type == 'word':
                    for word in line_json['text'].split():
                        tokens.append(word)
                        tokenids.append(vocab[word])
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                fout.write(json.dumps(line_json) + '\n')
    else:
        import sentencepiece as spm

        # Load the trained BPE model for encoding.
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_prefix + '.model')

        # Track how many lines get filtered out or are empty.
        stats = Counter()

        def valid(line):
            # Placeholder filter: keep every encoded line.
            return True

        def encode(text):
            return sp.EncodeAsPieces(text)

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None
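
        # Illustrative encoding, assuming a trained BPE model (the pieces
        # below are made up; real output depends on the model):
        #   encode_line("HELLO WORLD")  ->  ['▁HE', 'LLO', '▁WOR', 'LD']
        # Each returned piece must exist in the vocab file built alongside
        # the BPE model, since it is looked up in `vocab` below.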
        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                line = line_json['text']
                tokens = []
                tokenids = []
                enc_line = encode_line(line)
                if enc_line is None:
                    # Skip empty or filtered-out transcripts.
                    continue
                for code in enc_line:
                    tokens.append(code)
                    tokenids.append(vocab[code])
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                fout.write(json.dumps(line_json) + '\n')

    fout.close()


if __name__ == '__main__':
    main()