parent
18d9abc7a0
commit
44743622d4
@ -0,0 +1,29 @@
|
||||
process:
|
||||
# extract kaldi fbank from PCM
|
||||
- type: fbank_kaldi
|
||||
fs: 16000
|
||||
n_mels: 80
|
||||
n_shift: 160
|
||||
win_length: 400
|
||||
dither: true
|
||||
- type: cmvn_json
|
||||
cmvn_path: data/mean_std.json
|
||||
# these three processes together are known as SpecAugment
|
||||
- type: time_warp
|
||||
max_time_warp: 5
|
||||
inplace: true
|
||||
mode: PIL
|
||||
- type: freq_mask
|
||||
F: 30
|
||||
n_mask: 2
|
||||
inplace: true
|
||||
replace_with_zero: false
|
||||
- type: time_mask
|
||||
T: 40
|
||||
n_mask: 2
|
||||
inplace: true
|
||||
replace_with_zero: false
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""remove longshort data from manifest"""
|
||||
import logging
|
||||
import argparse
|
||||
import jsonlines
|
||||
|
||||
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
|
||||
|
||||
# manifest after format
|
||||
# jsonlines like this
|
||||
# {
|
||||
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
|
||||
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
|
||||
# "utt2spk": "111-2222",
|
||||
# "utt": "111-2222-333"
|
||||
# }
|
||||
|
||||
|
||||
def get_parser():
    """Build the command-line parser for the manifest length filter.

    Returns:
        argparse.ArgumentParser: parser exposing the verbosity option, the
        input/output indices, the frame and character limits, the stride,
        and the two positional manifest paths.
    """
    cli = argparse.ArgumentParser(
        description="remove longshort data from format manifest",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    cli.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")

    # (flag, default, help text) for the plain integer options, kept in the
    # order they should appear in ``--help``.
    int_options = (
        ("--iaxis", 0, "multi inputs index, 0 is the first"),
        ("--oaxis", 0, "multi outputs index, 0 is the first"),
        ("--maxframes", 2000, "maxframes"),
        ("--minframes", 10, "minframes"),
        ("--maxchars", 200, "max tokens"),
        ("--minchars", 0, "min tokens"),
        ("--stride_ms", 10, "stride in ms unit."),
    )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, default=default_value, type=int, help=help_text)

    # Positional arguments: manifest to read, then manifest to write.
    positionals = (
        ("rspecifier", "jsonl format manifest. e.g. manifest.jsonl"),
        ("wspecifier_or_wxfilename", "Write specifier. e.g. manifest.jsonl"),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=str, help=help_text)

    return cli
|
||||
|
||||
|
||||
def filter_input(args, line):
    """Tell whether an utterance's input length falls outside the frame limits.

    Args:
        args: parsed options. Reads ``iaxis``, ``minframes``, ``maxframes``,
            ``stride_ms`` and the optional boolean ``sound``.
        line: one manifest entry (dict); the length is taken from
            ``line['input'][args.iaxis]['shape'][0]``.

    Returns:
        bool: True when the entry must be dropped (shorter than
        ``minframes`` or longer than ``maxframes``), False otherwise.
    """
    length = line['input'][args.iaxis]['shape'][0]

    # ``sound`` is attached to ``args`` by the caller and is not a CLI option;
    # treat it as False when absent instead of raising AttributeError.
    if getattr(args, 'sound', False):
        # second to frame
        nframe = length * 1000 / args.stride_ms
    else:
        nframe = length

    return nframe < args.minframes or nframe > args.maxframes
|
||||
|
||||
|
||||
def filter_output(args, line):
    """Tell whether an utterance's transcript length is outside the limits.

    Args:
        args: parsed options. Reads ``oaxis``, ``minchars`` and ``maxchars``.
        line: one manifest entry (dict); the transcript is taken from
            ``line['output'][args.oaxis]['text']``.

    Returns:
        bool: True when the entry must be dropped (``len(text)`` is below
        ``minchars`` or above ``maxchars``), False otherwise.
    """
    # Outputs are selected with --oaxis; the input index (--iaxis) addresses
    # the 'input' list and must not be used here.
    nchars = len(line['output'][args.oaxis]['text'])
    return nchars < args.minchars or nchars > args.maxchars
|
||||
|
||||
|
||||
def _is_sound_feat(feat):
    """Return True when ``feat`` does not point at a Kaldi ark/scp file."""
    # A feat entry may carry an offset after the extension, e.g.
    # "xxx.ark:123", so drop everything from the first ':' of the suffix.
    ext = feat.rsplit('.', 1)[-1].split(':', 1)[0]
    return ext not in ('ark', 'scp')


def main():
    """Filter a jsonlines manifest by input length and transcript length.

    Reads every entry of ``args.rspecifier``, drops those rejected by
    ``filter_input`` or ``filter_output`` and writes the remaining entries,
    unchanged and in order, to ``args.wspecifier_or_wxfilename``.

    The first entry's ``feat`` value decides how lengths are interpreted:
    anything that is not an ark/scp file sets ``args.sound`` to True, which
    makes ``filter_input`` convert ``shape[0]`` from seconds to frames.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)
    logging.info(get_commandline_args())

    with jsonlines.open(args.rspecifier, 'r') as reader:
        lines = list(reader)
    logging.info(f"Example: {len(lines)}")

    # Always define args.sound (the original assigned the default to a
    # misspelled attribute), and tolerate an empty manifest, which has no
    # first entry to inspect.
    args.sound = bool(lines) and _is_sound_feat(
        lines[0]['input'][args.iaxis]['feat'])

    kept = 0
    dropped = 0
    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
        for line in lines:
            if filter_input(args, line) or filter_output(args, line):
                dropped += 1
                continue
            writer.write(line)
            kept += 1

    # Same text as before ("<kept>\<dropped>"), with the backslash escaped
    # so the literal no longer contains an invalid escape sequence.
    logging.info(f"Example after filter: {kept}\\{dropped}")
|
||||
|
||||
# Run the filter only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
Loading…
Reference in new issue