# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json

import jsonlines

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments


def define_argparse():
    """Declare and parse the command-line options of this script.

    Returns:
        argparse.Namespace: the parsed options. The option names match the
        keyword parameters of ``format_data`` so that ``vars(args)`` can be
        splatted into it.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_paths', str,
            None,
            "Filepaths of manifests for building vocabulary. "
            "You can provide multiple manifest files.",
            nargs='+',
            required=True)
    add_arg('output_path', str, None,
            "filepath of formated manifest.", required=True)
    add_arg('cmvn_path', str,
            'examples/librispeech/data/mean_std.json',
            "Filepath of cmvn.")
    add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
    add_arg('vocab_path', str,
            'examples/librispeech/data/vocab.txt',
            "Filepath of the vocabulary.")
    # bpe
    # NOTE(review): argparse %-formats help text, so a literal '%' has to be
    # doubled or `--help` fails on the unknown `bpe_mode` key. This assumes
    # `add_arguments` forwards the help string to argparse -- confirm.
    add_arg('spm_model_prefix', str, None,
            "spm model prefix, spm_model_%%(bpe_mode)_%%(count_threshold), "
            "only need when `unit_type` is spm")
    # yapf: enable
    return parser.parse_args()


def _format_targets(text, text_feature, vocab_size):
    """Build the ``output`` entries (one per target text) of a record.

    Args:
        text: a single transcript string, or an iterable of transcripts that
            all share the same vocabulary.
        text_feature: featurizer providing ``tokenize`` and ``featurize``.
        vocab_size: vocabulary size stored as the second dim of ``shape``.

    Returns:
        list[dict]: entries named ``target1``, ``target2``, ... in input order.
    """
    # A lone string is the single-target case; normalise it to a list so both
    # cases share one code path (a single target is still named "target1").
    texts = [text] if isinstance(text, str) else text
    targets = []
    for index, item in enumerate(texts, 1):
        tokens = text_feature.tokenize(item)
        tokenids = text_feature.featurize(item)
        targets.append({
            'name': f'target{index}',
            'shape': (len(tokenids), vocab_size),
            'text': item,
            'token': ' '.join(tokens),
            'tokenid': ' '.join(map(str, tokenids)),
        })
    return targets


def _format_input(line_json, feat_dim):
    """Build the single ``input`` entry of a record.

    Args:
        line_json: one manifest record; reads its ``feat`` and ``feat_shape``.
        feat_dim: feature dimension appended to the stored shape.

    Returns:
        dict: the ``input1`` entry.

    Raises:
        NotImplementedError: if ``feat`` is not a single string (multi input),
            or if ``feat_type`` reports anything other than ``'sound'``.
    """
    feat = line_json['feat']
    if not isinstance(feat, str):
        # isinstance(feat, list), multi input
        raise NotImplementedError("not support multi input now!")

    feat_shape = line_json['feat_shape']
    assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
    filetype = feat_type(feat)
    if filetype != 'sound':
        # kaldi
        raise NotImplementedError('no support kaldi feat now!')

    return {
        "name": "input1",
        # Build a new list instead of appending in place: tuples have no
        # `append`, and the manifest record should not be mutated.
        "shape": [*feat_shape, feat_dim],
        "feat": feat,
        "filetype": filetype,
    }


def format_data(
        manifest_paths="",
        output_path="",
        cmvn_path="examples/librispeech/data/mean_std.json",
        unit_type="char",
        vocab_path="examples/librispeech/data/vocab.txt",
        spm_model_prefix=""):
    """Rewrite manifests into one jsonlines file with input/output metadata.

    Each output line looks like:
        {
          "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
          "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
          "utt2spk": "111-2222",
          "utt": "111-2222-333"
        }

    Args:
        manifest_paths: one manifest path, or a list of manifest paths.
        output_path: file that receives the formatted records (overwritten).
        cmvn_path: CMVN statistics file; its extension is passed to
            ``load_cmvn`` as the file type.
        unit_type: unit type handed to ``TextFeaturizer``.
        vocab_path: vocabulary file handed to ``TextFeaturizer``.
        spm_model_prefix: spm model prefix handed to ``TextFeaturizer``.

    Raises:
        NotImplementedError: for multi-input records or non-'sound' features.
    """
    if isinstance(manifest_paths, str):
        manifest_paths = [manifest_paths]

    # get feat dim
    cmvn_filetype = cmvn_path.split(".")[-1]
    mean, _ = load_cmvn(cmvn_path, filetype=cmvn_filetype)
    feat_dim = mean.shape[0]  # (D)
    print(f"Feature dim: {feat_dim}")

    text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
    vocab_size = text_feature.vocab_size
    print(f"Vocab size: {vocab_size}")

    count = 0
    # The context manager closes the output file even when a record raises.
    with open(output_path, 'w', encoding='utf-8') as fout:
        for manifest_path in manifest_paths:
            with jsonlines.open(str(manifest_path), 'r') as reader:
                # Stream records; there is no need to hold a whole manifest.
                for line_json in reader:
                    output_json = {
                        "input": [],
                        "output": [],
                        'utt': line_json['utt'],
                        'utt2spk': line_json.get('utt2spk', 'global'),
                    }
                    output_json['output'].extend(
                        _format_targets(line_json['text'], text_feature,
                                        vocab_size))
                    output_json['input'].append(
                        _format_input(line_json, feat_dim))

                    fout.write(json.dumps(output_json) + '\n')
                    count += 1
    print(f"{manifest_paths} Examples number: {count}")


def main():
    """Script entry point: parse the options, echo them, format the manifests."""
    args = define_argparse()
    print_arguments(args, globals())
    format_data(**vars(args))


if __name__ == '__main__':
    main()