PaddleSpeech/utils/text2token.py

#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import codecs
import re
import sys

# True when the interpreter's major version is 2. main() uses this flag to
# decide whether to wrap sys.stdin/sys.stdout directly or their ``.buffer``
# attribute when installing the UTF-8 reader and writer.
is_python2 = sys.version_info[0] == 2


def exist_or_not(i, match_pos):
    """Look up the first span in ``match_pos`` that covers index ``i``.

    Args:
        i: Position to test.
        match_pos: Iterable of spans; element 0 of each span is the
            inclusive start and element 1 is the exclusive end.

    Returns:
        ``(start, end)`` of the first span with ``start <= i < end``, or
        ``(None, None)`` when no span covers ``i``.
    """
    covering = next(
        (span for span in match_pos if span[0] <= i < span[1]), None)
    if covering is None:
        return None, None
    return covering[0], covering[1]


def get_parser():
    """Build the command-line parser for the text-to-token converter.

    Returns:
        argparse.ArgumentParser: parser exposing ``--nchar``/``-n``,
        ``--skip-ncols``/``-s``, ``--space``, ``--non-lang-syms``/``-l``,
        ``--trans_type``/``-t`` and the optional positional ``text``.
        ``text`` defaults to ``False`` when omitted.
    """
    parser = argparse.ArgumentParser(
        description="convert raw text to tokenized text",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        "--nchar",
        "-n",
        default=1,
        type=int,
        help="number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2", )
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns")
    parser.add_argument(
        "--space", default="<space>", type=str, help="space symbol")
    parser.add_argument(
        "--non-lang-syms",
        "-l",
        default=None,
        type=str,
        # Help text previously misspelled "symbols" as "symobles".
        help="list of non-linguistic symbols, e.g., <NOISE> etc.", )
    parser.add_argument(
        "text", type=str, default=False, nargs="?", help="input text")
    parser.add_argument(
        "--trans_type",
        "-t",
        type=str,
        default="char",
        choices=["char", "phn"],
        help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
                        If trans_type is char,
                        read from SI1279.WRD file -> "bricks are an alternative"
                        Else if trans_type is phn,
                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
                        sil t er n ih sil t ih v sil" """, )
    return parser


def _tokenize_text(text, patterns, nchar, trans_type, space):
    """Split one transcript string into the list of output tokens.

    Args:
        text: Transcript with the skipped columns already removed and the
            remaining words joined by single spaces.
        patterns: Compiled regexes for the non-linguistic symbols; each
            match is kept as one indivisible unit in char mode.
        nchar: Number of units grouped into one token in char mode.
        trans_type: ``"phn"`` to split on spaces, anything else for
            character splitting.
        space: Symbol substituted for a space (and for ``sil`` in phn mode).

    Returns:
        list of str: the tokens for this transcript.
    """
    if trans_type == "phn":
        units = text.split(" ")
    else:
        # Collect the span of every non-linguistic symbol, pattern by
        # pattern. finditer always advances, so it cannot spin in place the
        # way a manual search-from-m.end() loop does on a zero-width match.
        match_pos = [[m.start(), m.end()]
                     for r in patterns for m in r.finditer(text)]

        if match_pos:
            units = []
            i = 0
            while i < len(text):
                start_pos, end_pos = exist_or_not(i, match_pos)
                if start_pos is not None:
                    # Inside a protected symbol: emit it whole and jump past.
                    units.append(text[start_pos:end_pos])
                    i = end_pos
                else:
                    units.append(text[i])
                    i += 1
        else:
            # No protected symbols: slicing the string directly yields
            # the same character groups.
            units = text

        units = [
            "".join(units[j:j + nchar]) for j in range(0, len(units), nchar)
        ]

    tokens = [z.replace(" ", space) for z in units]
    if trans_type == "phn":
        tokens = [z.replace("sil", space) for z in tokens]
    return tokens


def main():
    """Read transcript lines and print their tokenized form to stdout.

    Input comes from the ``text`` file argument when given, otherwise from
    standard input, decoded as UTF-8. For every line the first
    ``--skip-ncols`` whitespace-separated columns are echoed unchanged,
    followed by a space and the tokenized remainder of the line.

    Side effects:
        Rebinds ``sys.stdout`` to a UTF-8 stream writer.

    Raises:
        SystemExit: via ``parser.error`` when ``--nchar`` is smaller than 1
            in char mode.
    """
    parser = get_parser()
    args = parser.parse_args()

    # A zero step makes range() raise and a negative one silently produces
    # empty transcripts, so reject both up front. phn mode never uses nchar.
    if args.trans_type == "char" and args.nchar < 1:
        parser.error("--nchar must be a positive integer")

    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
        # Blank lines would compile to an empty pattern that matches at every
        # position without consuming input; drop them.
        rs = [re.compile(re.escape(x)) for x in nls if x]

    if args.text:
        in_stream = codecs.open(args.text, encoding="utf-8")
    else:
        in_stream = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer)

    try:
        line = in_stream.readline()
        while line:
            x = line.split()
            print(" ".join(x[:args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols:])
            a_chars = _tokenize_text(a, rs, args.nchar, args.trans_type,
                                     args.space)
            print(" ".join(a_chars))
            line = in_stream.readline()
    finally:
        # Only close what this function opened; stdin stays open.
        if args.text:
            in_stream.close()

# Run the converter only when this file is executed as a script, so that
# importing it does not parse arguments or rebind sys.stdout.
if __name__ == "__main__":
    main()
more utils to support kaldi/espnet data preocess 3 years ago			`#!/usr/bin/env python3`
			`# Copyright 2017 Johns Hopkins University (Shinji Watanabe)`
			`# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)`
			`import argparse`
			`import codecs`
			`import re`
			`import sys`

			`is_python2 = sys.version_info[0] == 2`


			`def exist_or_not(i, match_pos):`
			`start_pos = None`
			`end_pos = None`
			`for pos in match_pos:`
			`if pos[0] <= i < pos[1]:`
			`start_pos = pos[0]`
			`end_pos = pos[1]`
			`break`

			`return start_pos, end_pos`


			`def get_parser():`
			`parser = argparse.ArgumentParser(`
			`description="convert raw text to tokenized text",`
lm embed and format code 3 years ago			`formatter_class=argparse.ArgumentDefaultsHelpFormatter, )`
more utils to support kaldi/espnet data preocess 3 years ago			`parser.add_argument(`
			`"--nchar",`
			`"-n",`
			`default=1,`
			`type=int,`
			`help="number of characters to split, i.e., \`
lm embed and format code 3 years ago			`aabb -> a a b b with -n 1 and aa bb with -n 2", )`
more utils to support kaldi/espnet data preocess 3 years ago			`parser.add_argument(`
lm embed and format code 3 years ago			`"--skip-ncols", "-s", default=0, type=int, help="skip first n columns")`
			`parser.add_argument(`
			`"--space", default="<space>", type=str, help="space symbol")`
more utils to support kaldi/espnet data preocess 3 years ago			`parser.add_argument(`
			`"--non-lang-syms",`
			`"-l",`
			`default=None,`
			`type=str,`
lm embed and format code 3 years ago			`help="list of non-linguistic symobles, e.g., <NOISE> etc.", )`
			`parser.add_argument(`
			`"text", type=str, default=False, nargs="?", help="input text")`
more utils to support kaldi/espnet data preocess 3 years ago			`parser.add_argument(`
			`"--trans_type",`
			`"-t",`
			`type=str,`
			`default="char",`
			`choices=["char", "phn"],`
			`help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -`
			`If trans_type is char,`
			`read from SI1279.WRD file -> "bricks are an alternative"`
			`Else if trans_type is phn,`
			`read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l`
lm embed and format code 3 years ago			`sil t er n ih sil t ih v sil" """, )`
more utils to support kaldi/espnet data preocess 3 years ago			`return parser`


			`def main():`
			`parser = get_parser()`
			`args = parser.parse_args()`

			`rs = []`
			`if args.non_lang_syms is not None:`
			`with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:`
			`nls = [x.rstrip() for x in f.readlines()]`
			`rs = [re.compile(re.escape(x)) for x in nls]`

			`if args.text:`
			`f = codecs.open(args.text, encoding="utf-8")`
			`else:`
lm embed and format code 3 years ago			`f = codecs.getreader("utf-8")(sys.stdin`
			`if is_python2 else sys.stdin.buffer)`
more utils to support kaldi/espnet data preocess 3 years ago
lm embed and format code 3 years ago			`sys.stdout = codecs.getwriter("utf-8")(sys.stdout`
			`if is_python2 else sys.stdout.buffer)`
more utils to support kaldi/espnet data preocess 3 years ago			`line = f.readline()`
			`n = args.nchar`
			`while line:`
			`x = line.split()`
lm embed and format code 3 years ago			`print(" ".join(x[:args.skip_ncols]), end=" ")`
			`a = " ".join(x[args.skip_ncols:])`
more utils to support kaldi/espnet data preocess 3 years ago
			`# get all matched positions`
			`match_pos = []`
			`for r in rs:`
			`i = 0`
			`while i >= 0:`
			`m = r.search(a, i)`
			`if m:`
			`match_pos.append([m.start(), m.end()])`
			`i = m.end()`
			`else:`
			`break`

			`if args.trans_type == "phn":`
			`a = a.split(" ")`
			`else:`
			`if len(match_pos) > 0:`
			`chars = []`
			`i = 0`
			`while i < len(a):`
			`start_pos, end_pos = exist_or_not(i, match_pos)`
			`if start_pos is not None:`
			`chars.append(a[start_pos:end_pos])`
			`i = end_pos`
			`else:`
			`chars.append(a[i])`
			`i += 1`
			`a = chars`

lm embed and format code 3 years ago			`a = [a[j:j + n] for j in range(0, len(a), n)]`
more utils to support kaldi/espnet data preocess 3 years ago
			`a_flat = []`
			`for z in a:`
			`a_flat.append("".join(z))`

			`a_chars = [z.replace(" ", args.space) for z in a_flat]`
			`if args.trans_type == "phn":`
			`a_chars = [z.replace("sil", args.space) for z in a_chars]`
			`print(" ".join(a_chars))`
			`line = f.readline()`


			`if __name__ == "__main__":`
			`main()`