#!/usr/bin/env python3
"""Remove too-long and too-short examples from a formatted manifest."""
import argparse
import logging

import jsonlines

from paddlespeech.s2t.utils.cli_utils import get_commandline_args

# Manifest after formatting: a jsonlines file with one example per line, e.g.
# {
#     "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
#     "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
#     "utt2spk": "111-2222",
#     "utt": "111-2222-333"
# }

# Extensions of feature files; for these, shape[0] is used as the frame
# count without conversion.
_FEATURE_EXTS = frozenset(("ark", "scp"))


def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser with the length limits, the axis
        selectors and the input/output manifest paths.
    """
    parser = argparse.ArgumentParser(
        description="remove longshort data from format manifest",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--iaxis",
        default=0,
        type=int,
        help="multi inputs index, 0 is the first")
    parser.add_argument(
        "--oaxis",
        default=0,
        type=int,
        help="multi outputs index, 0 is the first")
    parser.add_argument("--maxframes", default=2000, type=int, help="maxframes")
    parser.add_argument("--minframes", default=10, type=int, help="minframes")
    parser.add_argument("--maxchars", default=200, type=int, help="max tokens")
    parser.add_argument("--minchars", default=0, type=int, help="min tokens")
    parser.add_argument(
        "--stride_ms", default=10, type=int, help="stride in ms unit.")
    parser.add_argument(
        "rspecifier",
        type=str,
        help="jsonl format manifest. e.g. manifest.jsonl")
    parser.add_argument(
        "wspecifier_or_wxfilename",
        type=str,
        help="Write specifier. e.g. manifest.jsonl")
    return parser


def _is_sound(feat):
    """Return True unless *feat* names an ark/scp feature file.

    The extension is the text after the last '.', with any ':offset'
    suffix removed so that "xxx.ark:123" is recognised as an ark file.
    """
    ext = feat.split('.')[-1].split(':', 1)[0]
    return ext not in _FEATURE_EXTS


def filter_input(args, line):
    """Tell whether an example must be dropped because of its input length.

    Args:
        args: namespace providing ``iaxis``, ``minframes``, ``maxframes``,
            ``stride_ms`` and, optionally, ``sound`` (treated as False when
            the attribute is missing).
        line (dict): one manifest entry.

    Returns:
        bool: True when the frame count is below ``minframes`` or above
        ``maxframes``.
    """
    length = line['input'][args.iaxis]['shape'][0]
    if getattr(args, 'sound', False):
        # second to frame
        nframe = length * 1000 / args.stride_ms
    else:
        nframe = length
    return nframe < args.minframes or nframe > args.maxframes


def filter_output(args, line):
    """Tell whether an example must be dropped because of its text length.

    Args:
        args: namespace providing ``oaxis``, ``minchars`` and ``maxchars``.
        line (dict): one manifest entry.

    Returns:
        bool: True when ``len(text)`` of the selected output is below
        ``minchars`` or above ``maxchars``.
    """
    # The output list is indexed with the *output* axis, not the input one.
    nchars = len(line['output'][args.oaxis]['text'])
    return nchars < args.minchars or nchars > args.maxchars


def main():
    """Read the manifest, drop out-of-range examples and write the rest."""
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    with jsonlines.open(args.rspecifier, 'r') as reader:
        lines = list(reader)
    logging.info(f"Example: {len(lines)}")

    # The first example decides how shape[0] is interpreted for the whole
    # manifest. An empty manifest has nothing to probe, and the loop below
    # then simply writes an empty output file.
    args.sound = bool(lines) and _is_sound(
        lines[0]['input'][args.iaxis]['feat'])

    kept = 0
    dropped = 0
    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
        for line in lines:
            if filter_input(args, line) or filter_output(args, line):
                dropped += 1
                continue
            writer.write(line)
            kept += 1
    # The backslash separator is part of the existing log format.
    logging.info(f"Example after filter: {kept}\\{dropped}")


if __name__ == '__main__':
    main()