You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
101 lines
3.7 KiB
101 lines
3.7 KiB
#!/usr/bin/env python3
|
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""format manifest with more metadata."""
|
|
import argparse
|
|
import functools
|
|
import json
|
|
|
|
import jsonlines
|
|
|
|
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
|
|
from paddlespeech.s2t.frontend.utility import load_cmvn
|
|
from paddlespeech.s2t.io.utility import feat_type
|
|
from paddlespeech.utils.argparse import add_arguments
|
|
from paddlespeech.utils.argparse import print_arguments
|
|
|
|
# Command-line interface for the script. `add_arg` is the project helper
# `add_arguments` with this parser pre-bound; each call below passes
# (name, type, default, help text) plus optional extra keyword arguments.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('cmvn_path', str,
        'examples/librispeech/data/mean_std.json',
        "Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
        'examples/librispeech/data/vocab.txt',
        "Filepath of the vocabulary.")
add_arg('manifest_paths', str,
        None,
        "Filepaths of manifests for building vocabulary. "
        "You can provide multiple manifest files.",
        nargs='+',
        required=True)
# bpe
# NOTE: argparse applies %-formatting to help strings, so the literal
# "%(...)" placeholders in this text are escaped as "%%(...)"; unescaped they
# make `--help` fail while rendering. This assumes `add_arguments` forwards
# the help text to `parser.add_argument` -- TODO confirm against the helper.
add_arg('spm_model_prefix', str, None,
        "spm model prefix, spm_model_%%(bpe_mode)_%%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
# yapf: enable
# Parsed at import time; `main()` reads the result from this module-level name.
args = parser.parse_args()
|
|
|
|
|
|
def main():
    """Write every input manifest record, enriched with metadata, to one file.

    Reads the module-level ``args``. For each JSON record in each manifest
    listed in ``args.manifest_paths`` this adds token, token-id and
    token-shape fields for both the ``text`` and ``text1`` entries, appends
    the feature dimension taken from the CMVN file to ``feat_shape``, and
    writes the record as one JSON line to ``args.output_path``.

    Raises:
        NotImplementedError: if ``feat_type`` reports anything other than
            ``'sound'`` for a record's ``feat`` entry.
    """
    print_arguments(args, globals())

    # The output file is opened first (as before, so it is created/truncated
    # even if a later step fails), but via `with` so it is always closed,
    # including when an exception is raised part-way through.
    with open(args.output_path, 'w', encoding='utf-8') as fout:
        # get feat dim
        mean, _ = load_cmvn(args.cmvn_path, filetype='json')
        feat_dim = mean.shape[0]  # (D)
        print(f"Feature dim: {feat_dim}")

        text_feature = TextFeaturizer(args.unit_type, args.vocab_path,
                                      args.spm_model_prefix)
        vocab_size = text_feature.vocab_size
        print(f"Vocab size: {vocab_size}")

        count = 0
        for manifest_path in args.manifest_paths:
            with jsonlines.open(str(manifest_path), 'r') as reader:
                # Records are handled one at a time instead of first being
                # copied into a list.
                for line_json in reader:
                    # text: translation text, text1: transcript text.
                    # Currently only support joint-vocab, will add separate
                    # vocabs setting.
                    _add_token_fields(line_json, text_feature, vocab_size,
                                      'text', '')
                    _add_token_fields(line_json, text_feature, vocab_size,
                                      'text1', '1')

                    feat_shape = line_json['feat_shape']
                    assert isinstance(feat_shape,
                                      (list, tuple)), type(feat_shape)
                    filetype = feat_type(line_json['feat'])
                    if filetype != 'sound':  # kaldi
                        raise NotImplementedError('no support kaldi feat now!')
                    # Build a new list rather than calling `.append`, so a
                    # tuple (which the assert above allows) also works.
                    line_json['feat_shape'] = [*feat_shape, feat_dim]

                    fout.write(json.dumps(line_json) + '\n')
                    count += 1

        print(f"Examples number: {count}")


def _add_token_fields(line_json, text_feature, vocab_size, text_key, suffix):
    """Add token, token_id and token_shape fields for one text entry, in place.

    Args:
        line_json: manifest record (a dict) to update.
        text_feature: object providing ``tokenize`` and ``featurize``.
        vocab_size: second element of the stored token shape.
        text_key: key of the text entry to process (``'text'``/``'text1'``).
        suffix: string appended to each output key (``''`` or ``'1'``).
    """
    line = line_json[text_key]
    tokens = text_feature.tokenize(line)
    tokenids = text_feature.featurize(line)
    line_json['token' + suffix] = tokens
    line_json['token_id' + suffix] = tokenids
    line_json['token_shape' + suffix] = (len(tokenids), vocab_size)
|
|
|
|
|
|
# Entry point: call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
|