#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Convert raw text lines into space-separated tokens.

Each input line is split on whitespace.  The first ``--skip-ncols`` columns
are echoed unchanged, and the remaining text is cut into tokens of
``--nchar`` units (characters, non-linguistic symbols, or phones).
"""

import argparse
import codecs
import re
import sys

is_python2 = sys.version_info[0] == 2


def exist_or_not(i, match_pos):
    """Return the first matched span that covers index ``i``.

    Args:
        i: Index into the text being tokenized.
        match_pos: Sequence of ``[start, end]`` pairs; ``end`` is exclusive.

    Returns:
        ``(start, end)`` of the first pair with ``start <= i < end``, or
        ``(None, None)`` when no pair covers ``i``.
    """
    for pos in match_pos:
        if pos[0] <= i < pos[1]:
            return pos[0], pos[1]
    return None, None


def get_parser():
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--nchar",
        "-n",
        default=1,
        type=int,
        help="number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
    )
    parser.add_argument("--space", default="", type=str, help="space symbol")
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
        help="list of non-linguistic symobles, e.g., etc.",
    )
    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
    parser.add_argument(
        "--trans_type",
        "-t",
        type=str,
        default="char",
        choices=["char", "phn"],
        help="""Transcript type. char/phn.
                        e.g., for TIMIT FADG0_SI1279 -
                        If trans_type is char,
                        read from SI1279.WRD file -> "bricks are an alternative"
                        Else if trans_type is phn,
                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
                        sil t er n ih sil t ih v sil" """,
    )
    return parser


def _load_non_lang_syms(path):
    """Read one symbol per line from ``path`` and compile each as a literal regex."""
    with codecs.open(path, "r", encoding="utf-8") as f:
        symbols = [entry.rstrip() for entry in f.readlines()]
    # A blank line would yield a pattern that matches the empty string at
    # every position; it can never protect any text, so it is skipped.
    return [re.compile(re.escape(symbol)) for symbol in symbols if symbol]


def _find_matches(text, patterns):
    """Collect ``[start, end]`` spans of every pattern match, pattern by pattern."""
    match_pos = []
    for pattern in patterns:
        # finditer yields the same left-to-right, non-overlapping matches as
        # repeatedly searching from the previous match end, and it always
        # advances even if a pattern can match the empty string.
        for m in pattern.finditer(text):
            match_pos.append([m.start(), m.end()])
    return match_pos


def _tokenize(text, patterns, nchar, space, trans_type):
    """Split ``text`` into a list of tokens of ``nchar`` units each.

    For ``trans_type == "phn"`` the units are the space-separated items of
    ``text`` and every ``"sil"`` in the resulting tokens is replaced by
    ``space``.  Otherwise the units are single characters, except that any
    span matched by ``patterns`` is kept together as one unit.  In both modes
    a literal blank inside a token is replaced by ``space``.
    """
    if trans_type == "phn":
        units = text.split(" ")
    else:
        match_pos = _find_matches(text, patterns)
        if match_pos:
            units = []
            i = 0
            while i < len(text):
                start_pos, end_pos = exist_or_not(i, match_pos)
                if start_pos is not None:
                    # Inside a non-linguistic symbol: emit it whole.
                    units.append(text[start_pos:end_pos])
                    i = end_pos
                else:
                    units.append(text[i])
                    i += 1
        else:
            # No protected spans: slicing the string directly is equivalent.
            units = text

    tokens = ["".join(units[j:j + nchar]) for j in range(0, len(units), nchar)]
    tokens = [token.replace(" ", space) for token in tokens]
    if trans_type == "phn":
        tokens = [token.replace("sil", space) for token in tokens]
    return tokens


def main():
    """Tokenize the input text (file argument or stdin) and print it to stdout.

    For every input line, prints the first ``--skip-ncols`` columns joined by
    blanks, a single blank, and then the blank-joined tokens of the rest of
    the line.  Input is decoded and output encoded as UTF-8.
    """
    parser = get_parser()
    args = parser.parse_args()
    if args.nchar < 1:
        # range() with a zero step raises, and a negative step would
        # silently drop all text.
        parser.error("--nchar must be a positive integer")

    rs = []
    if args.non_lang_syms is not None:
        rs = _load_non_lang_syms(args.non_lang_syms)

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )

    try:
        for line in f:
            x = line.split()
            print(" ".join(x[:args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols:])
            tokens = _tokenize(a, rs, args.nchar, args.space, args.trans_type)
            print(" ".join(tokens))
    finally:
        # Only close what this function opened; stdin is left alone.
        if args.text:
            f.close()


if __name__ == "__main__":
    main()