From ea35558ee03527b57cfacccf272f405ca427d0b2 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:31:45 -0800 Subject: [PATCH] add utils --- utils/addjson.py | 155 +++++++++++ utils/scp2json.py | 48 ++++ utils/tokenizer.perl | 596 +++++++++++++++++++++++++++++++++++++++++++ utils/update_json.sh | 88 +++++++ 4 files changed, 887 insertions(+) create mode 100755 utils/addjson.py create mode 100755 utils/scp2json.py create mode 100644 utils/tokenizer.perl create mode 100755 utils/update_json.sh diff --git a/utils/addjson.py b/utils/addjson.py new file mode 100755 index 00000000..7fabe625 --- /dev/null +++ b/utils/addjson.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2018 Nagoya University (Tomoki Hayashi) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import logging +import sys + +from distutils.util import strtobool + +from espnet.utils.cli_utils import get_commandline_args + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="add multiple json values to an input or output value", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("jsons", type=str, nargs="+", help="json files") + parser.add_argument( + "-i", + "--is-input", + default=True, + type=strtobool, + help="If true, add to input. If false, add to output", + ) + parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + # logging info + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + # make intersection set for utterance keys + js = [] + intersec_ks = [] + for x in args.jsons: + with codecs.open(x, "r", encoding="utf-8") as f: + j = json.load(f) + ks = j["utts"].keys() + logging.info(x + ": has " + str(len(ks)) + " utterances") + if len(intersec_ks) > 0: + intersec_ks = intersec_ks.intersection(set(ks)) + if len(intersec_ks) == 0: + logging.warning("Empty intersection") + break + else: + intersec_ks = set(ks) + js.append(j) + logging.info("new json has " + str(len(intersec_ks)) + " utterances") + + # updated original dict to keep intersection + intersec_org_dic = dict() + for k in intersec_ks: + v = js[0]["utts"][k] + intersec_org_dic[k] = v + + intersec_add_dic = dict() + for k in intersec_ks: + v = js[1]["utts"][k] + for j in js[2:]: + v.update(j["utts"][k]) + intersec_add_dic[k] = v + + new_dic = dict() + for key_id in intersec_org_dic: + orgdic = intersec_org_dic[key_id] + adddic = intersec_add_dic[key_id] + + if "utt2spk" not in orgdic: + orgdic["utt2spk"] = "" + # NOTE: for machine translation + + # add as input + if args.is_input: + # original input + input_list = orgdic["input"] + # additional input + in_add_dic = {} + if "idim" in adddic and "ilen" in adddic: + in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])] + elif "idim" in adddic: + in_add_dic["shape"] = [int(adddic["idim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["idim", "ilen"]: + continue + in_add_dic[key] = value + # add name + in_add_dic["name"] = "input%d" % (len(input_list) + 1) + + 
input_list.append(in_add_dic) + new_dic[key_id] = { + "input": input_list, + "output": orgdic["output"], + "utt2spk": orgdic["utt2spk"], + } + # add as output + else: + # original output + output_list = orgdic["output"] + # additional output + out_add_dic = {} + # add shape + if "odim" in adddic and "olen" in adddic: + out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + elif "odim" in adddic: + out_add_dic["shape"] = [int(adddic["odim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["odim", "olen"]: + continue + out_add_dic[key] = value + # add name + out_add_dic["name"] = "target%d" % (len(output_list) + 1) + + output_list.append(out_add_dic) + new_dic[key_id] = { + "input": orgdic["input"], + "output": output_list, + "utt2spk": orgdic["utt2spk"], + } + if "lang" in orgdic.keys(): + new_dic[key_id]["lang"] = orgdic["lang"] + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + {"utts": new_dic}, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": "), + ) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py new file mode 100755 index 00000000..8e8de3e0 --- /dev/null +++ b/utils/scp2json.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import sys + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="convert scp to json", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--key", "-k", type=str, help="key") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + new_line = {} + sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + line = sys.stdin.readline() + while line: + x = line.rstrip().split() + v = {args.key: " ".join(x[1:])} + new_line[x[0]] = v + line = sys.stdin.readline() + + all_l = {"utts": new_line} + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") + ) + print(jsonstring) diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl new file mode 100644 index 00000000..ae97d658 --- /dev/null +++ b/utils/tokenizer.perl @@ -0,0 +1,596 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); +### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use FindBin qw($RealBin); +use strict; +use Time::HiRes; + +if (eval {require Thread;1;}) { + #module loaded + Thread->import(); +} + +my $mydir = "$RealBin/../share/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my @protected_patterns = (); +my $protected_patterns_file = ""; +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; +my $PENN = 0; +my $NO_ESCAPING = 0; +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 1, next); + /^-time$/ && ($TIMING = 1, next); + # Option to add list of regexps to be protected + /^-protected/ && ($protected_patterns_file = shift, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); + /^-penn$/ && ($PENN = 1, next); + /^-no-escape/ && ($NO_ESCAPING = 1, next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + print " -penn ... use Penn treebank-like tokenization.\n"; + print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; + print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +# Load protected patterns +if ($protected_patterns_file) +{ + open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; + while() { + chomp; + push @protected_patterns, $_; + } +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while() + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while() + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array containing a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + + if ($PENN) { + return tokenize_penn($text); + } + + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # Find protected patterns + my @protected = (); + foreach my $protected_pattern (@protected_patterns) { + my $t = $text; + while ($t =~ /(?$protected_pattern)(?.*)$/) { + push @protected, $+{PATTERN}; + $t = $+{TAIL}; + } + } + + for (my $i = 0; $i < scalar(@protected); ++$i) { + my $subst = sprintf("THISISPROTECTED%.3d", $i); + $text =~ s,\Q$protected[$i], $subst ,g; + } + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # separate out all "other" special characters + if (($language eq "fi") or ($language eq "sv")) { + # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character: + # USA:n, 20:een, EU:ssa, USA:s, S:t + $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g; + # if a colon is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; + } + elsif ($language eq "tdt") { + # in Tetun, the apostrophe can be used inside words as an apostrophe-like character: + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g; + } + elsif (($language eq "ca")) { + # in Catalan, the middle dot can be used inside words: + # il�lusio + $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g; + # if a middot is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g; + } + else { + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + } + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # seperate out "," except if within numbers (5,300) + #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + + # separate out "," except if within numbers (5,300) + # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E + # first application uses up B so rule can't see B,C + # two-step version here may create extra spaces but these are removed later + # will also space digit,letter or letter,digit forms (redundant with next section) + $text =~ s/([^\p{IsN}])[,]/$1 , /g; + $text =~ s/[,]([^\p{IsN}])/ , $1/g; + + # separate "," after a number if it's the end of a sentence + $text =~ s/([\p{IsN}])[,]$/$1 ,/g; + + # separate , pre and post number + #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + #$text =~ 
s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn `into ' + #$text =~ s/\`/\'/g; + + #turn '' into " + #$text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + elsif (($language eq "so") or ($language eq "tdt")) + { + # Don't split glottals + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if ($i == scalar(@words)-1) { + # split last words independently as they are unlikely to be non-breaking prefixes + $word = $pre." ."; + } + elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + } + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub tokenize_penn +{ + # Improved compatibility with Penn Treebank tokenization. Useful if + # the text is to later be parsed with a PTB-trained parser. + # + # Adapted from Robert MacIntyre's sed script: + # http://www.cis.upenn.edu/~treebank/tokenizer.sed + + my($text) = @_; + chomp($text); + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # attempt to get correct directional quotes + $text =~ s/^``/`` /g; + $text =~ s/^"/`` /g; + $text =~ s/^`([^`])/` $1/g; + $text =~ s/^'/` /g; + $text =~ s/([ ([{<])"/$1 `` /g; + $text =~ s/([ ([{<])``/$1 `` /g; + $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; + $text =~ s/([ ([{<])'/$1 ` /g; + # close quotes handled at end + + $text =~ s=\.\.\.= _ELLIPSIS_ =g; + + # separate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; +$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; + + # Separate out intra-token slashes. PTB tokenization doesn't do this, so + # the tokens should be merged prior to parsing with a PTB-trained parser + # (see syntax-hyphen-splitting.perl). + $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; + + # Assume sentence tokenization has been done first, so split FINAL periods + # only. + $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; + # however, we may as well split ALL question marks and exclamation points, + # since they shouldn't have the abbrev.-marker ambiguity problem + $text =~ s=([?!])= $1 =g; + + # parentheses, brackets, etc. 
+ $text =~ s=([\]\[\(\){}<>])= $1 =g; + $text =~ s/\(/-LRB-/g; + $text =~ s/\)/-RRB-/g; + $text =~ s/\[/-LSB-/g; + $text =~ s/\]/-RSB-/g; + $text =~ s/{/-LCB-/g; + $text =~ s/}/-RCB-/g; + + $text =~ s=--= -- =g; + + # First off, add a space to the beginning and end of each line, to reduce + # necessary number of regexps. + $text =~ s=$= =; + $text =~ s=^= =; + + $text =~ s="= '' =g; + # possessive or close-single-quote + $text =~ s=([^'])' =$1 ' =g; + # as in it's, I'm, we'd + $text =~ s='([sSmMdD]) = '$1 =g; + $text =~ s='ll = 'll =g; + $text =~ s='re = 're =g; + $text =~ s='ve = 've =g; + $text =~ s=n't = n't =g; + $text =~ s='LL = 'LL =g; + $text =~ s='RE = 'RE =g; + $text =~ s='VE = 'VE =g; + $text =~ s=N'T = N'T =g; + + $text =~ s= ([Cc])annot = $1an not =g; + $text =~ s= ([Dd])'ye = $1' ye =g; + $text =~ s= ([Gg])imme = $1im me =g; + $text =~ s= ([Gg])onna = $1on na =g; + $text =~ s= ([Gg])otta = $1ot ta =g; + $text =~ s= ([Ll])emme = $1em me =g; + $text =~ s= ([Mm])ore'n = $1ore 'n =g; + $text =~ s= '([Tt])is = '$1 is =g; + $text =~ s= '([Tt])was = '$1 was =g; + $text =~ s= ([Ww])anna = $1an na =g; + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while () + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} \ No newline at end of file diff --git a/utils/update_json.sh b/utils/update_json.sh new file mode 100755 index 00000000..bf697475 --- /dev/null +++ b/utils/update_json.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Copyright 2020 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +echo "$0 $*" >&2 # Print the command line for logging +. ./path.sh + +nlsyms="" +oov="" +bpecode="" +verbose=0 + +text="" +multilingual=false + +help_message=$(cat << EOF +Usage: $0 +e.g. $0 data/train data/lang_1char/train_units.txt +Options: + --oov # Default: + --verbose # Default: 0 +EOF +) +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "${help_message}" 1>&2 + exit 1; +fi + +set -euo pipefail + +json=$1 +dir=$2 +dic=$3 +json_dir=$(dirname ${json}) +tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) +trap 'rm -rf ${tmpdir}' EXIT + +if [ -z ${text} ]; then + text=${dir}/text +fi + +# 2. 
Create scp files for outputs +mkdir -p ${tmpdir}/output +if [ -n "${bpecode}" ]; then + if [ ${multilingual} = true ]; then + # remove a space before the language ID + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ + > ${tmpdir}/output/token.scp + else + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece) \ + > ${tmpdir}/output/token.scp + fi +elif [ -n "${nlsyms}" ]; then + text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp +else + text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp +fi +< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp +awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp +# +2 comes from CTC blank and EOS +vocsize=$(tail -n 1 ${dic} | awk '{print $2}') +odim=$(echo "$vocsize + 2" | bc) +awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp + +cat ${text} > ${tmpdir}/output/text.scp + + +# 4. Create JSON files from each scp files +rm -f ${tmpdir}/*/*.json +for x in "${tmpdir}"/output/*.scp; do + k=$(basename ${x} .scp) + < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json +done + +# add to json +addjson.py --verbose ${verbose} -i false \ + ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json +mkdir -p ${json_dir}/.backup +echo "json updated. original json is kept in ${json_dir}/.backup." +cp ${json} ${json_dir}/.backup/"$(basename ${json})" +cp ${tmpdir}/data.json ${json} + +rm -fr ${tmpdir}
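
A minimal usage sketch of how the added utilities chain together, assuming an ESPnet-style recipe environment (path.sh, utils/parse_options.sh, text2token.py, sym2int.pl and the espnet Python package on the path); the data paths, the "<unk>" OOV symbol and the output file names below are illustrative placeholders, not files touched by this patch:

    # Re-tokenize data/train/text and attach it to an existing data.json as an
    # additional output entry (-i false in addjson.py selects the output side):
    update_json.sh --text data/train/text --oov "<unk>" \
        dump/train/deltafalse/data.json data/train data/lang_1char/train_units.txt

    # A reduced version of the same flow with the lower-level helpers
    # (update_json.sh additionally adds text, tokenid, olen and odim entries):
    text2token.py -s 1 -n 1 data/train/text > token.scp
    scp2json.py --key token < token.scp > token.json
    addjson.py -i false dump/train/deltafalse/data.json token.json > data.retok.json

update_json.sh backs up the original JSON under <json_dir>/.backup before overwriting it, so the second, manual variant is mainly useful when the merged result should be written to a separate file, as sketched above.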