|
|
|
#!/usr/bin/env python3
|
|
|
|
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
|
|
|
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
|
|
|
import argparse
|
|
|
|
import codecs
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
|
|
|
|
# True when running under a Python 2 interpreter; main() uses this to decide
# whether the UTF-8 codecs wrap sys.stdin/sys.stdout directly or their
# underlying binary ``.buffer`` objects.
is_python2 = sys.version_info[:1] == (2,)
|
|
|
|
|
|
|
|
|
|
|
|
def exist_or_not(i, match_pos):
    """Look up the matched span that contains index ``i``.

    Args:
        i: Character index to test.
        match_pos: Sequence of ``[start, end]`` pairs, where ``start`` is
            inclusive and ``end`` is exclusive.

    Returns:
        A ``(start, end)`` tuple taken from the first pair in ``match_pos``
        with ``start <= i < end``, or ``(None, None)`` when no pair covers
        ``i``.
    """
    # Only the first covering span matters, so stop at the first hit.
    covering = next(
        (span for span in match_pos if span[0] <= i < span[1]),
        None,
    )
    if covering is None:
        return None, None
    return covering[0], covering[1]
|
|
|
|
|
|
|
|
|
|
|
|
def get_parser():
    """Build the command-line parser for the text-to-token converter.

    Returns:
        argparse.ArgumentParser: Parser exposing ``--nchar``/``-n``,
        ``--skip-ncols``/``-s``, ``--space``, ``--non-lang-syms``/``-l``,
        ``--trans_type``/``-t`` and an optional positional ``text``
        argument (the input path; falsy when omitted).
    """
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--nchar",
        "-n",
        default=1,
        type=int,
        # Adjacent literals rather than a backslash continuation inside the
        # string, so source indentation never leaks into the help text.
        help="number of characters to split, i.e., "
        "aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
    )
    parser.add_argument(
        "--space", default="<space>", type=str, help="space symbol"
    )
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
        help="list of non-linguistic symbols, e.g., <NOISE> etc.",
    )
    # The default stays False (not None) so existing callers that inspect
    # ``args.text`` keep seeing the same value when no input is given.
    parser.add_argument(
        "text", type=str, default=False, nargs="?", help="input text"
    )
    parser.add_argument(
        "--trans_type",
        "-t",
        type=str,
        default="char",
        choices=["char", "phn"],
        help="Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 - "
        "If trans_type is char, "
        'read from SI1279.WRD file -> "bricks are an alternative" '
        "Else if trans_type is phn, "
        'read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l '
        'sil t er n ih sil t ih v sil"',
    )
    return parser
|
|
|
|
|
|
|
|
|
|
|
|
def _load_non_lang_patterns(path):
    """Read one symbol per line from ``path`` and compile each as a literal regex."""
    with codecs.open(path, "r", encoding="utf-8") as sym_file:
        symbols = [entry.rstrip() for entry in sym_file]
    # Blank lines are skipped: an empty pattern matches with zero width at
    # every position, which can never cover a character and previously made
    # the position-scanning loop spin forever.
    return [re.compile(re.escape(symbol)) for symbol in symbols if symbol]


def _find_match_spans(text, patterns):
    """Return ``[start, end]`` for every non-overlapping pattern hit in ``text``."""
    spans = []
    for pattern in patterns:
        spans.extend([m.start(), m.end()] for m in pattern.finditer(text))
    return spans


def _split_keeping_spans(text, match_pos):
    """Split ``text`` into single characters, keeping each matched span whole."""
    pieces = []
    i = 0
    while i < len(text):
        start_pos, end_pos = exist_or_not(i, match_pos)
        if start_pos is not None:
            # Inside a non-linguistic symbol: emit it as one unit.
            pieces.append(text[start_pos:end_pos])
            i = end_pos
        else:
            pieces.append(text[i])
            i += 1
    return pieces


def main():
    """Tokenize each input line and print the result to stdout as UTF-8.

    Input is read from the ``text`` argument when given, otherwise from
    stdin. For every line, the first ``--skip-ncols`` whitespace-separated
    columns are echoed unchanged; the rest is split into units (characters,
    or space-separated tokens when ``--trans_type phn``), grouped ``--nchar``
    at a time, and printed space-separated. Spaces inside a group are
    replaced with the ``--space`` symbol, and in ``phn`` mode every
    occurrence of ``sil`` is replaced with it as well. Symbols listed in the
    ``--non-lang-syms`` file are kept as single units in ``char`` mode.

    Side effect: ``sys.stdout`` is rebound to a UTF-8 codec writer.
    """
    parser = get_parser()
    args = parser.parse_args()

    rs = []
    if args.non_lang_syms is not None:
        rs = _load_non_lang_patterns(args.non_lang_syms)

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer
        )

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )

    n = args.nchar
    phn_mode = args.trans_type == "phn"
    try:
        for line in f:
            x = line.split()
            # NOTE: with --skip-ncols 0 this still emits the separator space,
            # so output lines start with " "; kept as the existing format.
            print(" ".join(x[:args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols:])

            if phn_mode:
                # Phone transcripts are already tokenized by spaces; the
                # non-linguistic symbol spans are not consulted here.
                a = a.split(" ")
            else:
                # get all matched positions
                match_pos = _find_match_spans(a, rs)
                if match_pos:
                    a = _split_keeping_spans(a, match_pos)

            # Group n consecutive units (characters or tokens) per output token.
            a_flat = ["".join(a[j:j + n]) for j in range(0, len(a), n)]

            a_chars = [z.replace(" ", args.space) for z in a_flat]
            if phn_mode:
                a_chars = [z.replace("sil", args.space) for z in a_chars]
            print(" ".join(a_chars))
    finally:
        # Only close what this function opened; leave stdin alone.
        if args.text:
            f.close()
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the converter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|