pull/1050/head
Junkun 3 years ago
parent 8f3280af8e
commit ea35558ee0

@ -0,0 +1,155 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import logging
import sys
from distutils.util import strtobool
from espnet.utils.cli_utils import get_commandline_args
is_python2 = sys.version_info[0] == 2
def get_parser():
parser = argparse.ArgumentParser(
description="add multiple json values to an input or output value",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("jsons", type=str, nargs="+", help="json files")
parser.add_argument(
"-i",
"--is-input",
default=True,
type=strtobool,
help="If true, add to input. If false, add to output",
)
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
return parser
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
# logging info
logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
if args.verbose > 0:
logging.basicConfig(level=logging.INFO, format=logfmt)
else:
logging.basicConfig(level=logging.WARN, format=logfmt)
logging.info(get_commandline_args())
# make intersection set for utterance keys
js = []
intersec_ks = []
for x in args.jsons:
with codecs.open(x, "r", encoding="utf-8") as f:
j = json.load(f)
ks = j["utts"].keys()
logging.info(x + ": has " + str(len(ks)) + " utterances")
if len(intersec_ks) > 0:
intersec_ks = intersec_ks.intersection(set(ks))
if len(intersec_ks) == 0:
logging.warning("Empty intersection")
break
else:
intersec_ks = set(ks)
js.append(j)
logging.info("new json has " + str(len(intersec_ks)) + " utterances")
# updated original dict to keep intersection
intersec_org_dic = dict()
for k in intersec_ks:
v = js[0]["utts"][k]
intersec_org_dic[k] = v
intersec_add_dic = dict()
for k in intersec_ks:
v = js[1]["utts"][k]
for j in js[2:]:
v.update(j["utts"][k])
intersec_add_dic[k] = v
new_dic = dict()
for key_id in intersec_org_dic:
orgdic = intersec_org_dic[key_id]
adddic = intersec_add_dic[key_id]
if "utt2spk" not in orgdic:
orgdic["utt2spk"] = ""
# NOTE: for machine translation
# add as input
if args.is_input:
# original input
input_list = orgdic["input"]
# additional input
in_add_dic = {}
if "idim" in adddic and "ilen" in adddic:
in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
elif "idim" in adddic:
in_add_dic["shape"] = [int(adddic["idim"])]
# add all other key value
for key, value in adddic.items():
if key in ["idim", "ilen"]:
continue
in_add_dic[key] = value
# add name
in_add_dic["name"] = "input%d" % (len(input_list) + 1)
input_list.append(in_add_dic)
new_dic[key_id] = {
"input": input_list,
"output": orgdic["output"],
"utt2spk": orgdic["utt2spk"],
}
# add as output
else:
# original output
output_list = orgdic["output"]
# additional output
out_add_dic = {}
# add shape
if "odim" in adddic and "olen" in adddic:
out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])]
elif "odim" in adddic:
out_add_dic["shape"] = [int(adddic["odim"])]
# add all other key value
for key, value in adddic.items():
if key in ["odim", "olen"]:
continue
out_add_dic[key] = value
# add name
out_add_dic["name"] = "target%d" % (len(output_list) + 1)
output_list.append(out_add_dic)
new_dic[key_id] = {
"input": orgdic["input"],
"output": output_list,
"utt2spk": orgdic["utt2spk"],
}
if "lang" in orgdic.keys():
new_dic[key_id]["lang"] = orgdic["lang"]
# ensure "ensure_ascii=False", which is a bug
jsonstring = json.dumps(
{"utts": new_dic},
indent=4,
ensure_ascii=False,
sort_keys=True,
separators=(",", ": "),
)
sys.stdout = codecs.getwriter("utf-8")(
sys.stdout if is_python2 else sys.stdout.buffer
)
print(jsonstring)

@ -0,0 +1,48 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import sys
is_python2 = sys.version_info[0] == 2
def get_parser():
parser = argparse.ArgumentParser(
description="convert scp to json",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("--key", "-k", type=str, help="key")
return parser
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
new_line = {}
sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
sys.stdout = codecs.getwriter("utf-8")(
sys.stdout if is_python2 else sys.stdout.buffer
)
line = sys.stdin.readline()
while line:
x = line.rstrip().split()
v = {args.key: " ".join(x[1:])}
new_line[x[0]] = v
line = sys.stdin.readline()
all_l = {"utts": new_line}
# ensure "ensure_ascii=False", which is a bug
jsonstring = json.dumps(
all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ")
)
print(jsonstring)

@ -0,0 +1,596 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
# Sample Tokenizer
### Version 1.1
# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
# Version 1.1 updates:
# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
### Version 1.0
# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
# written by Josh Schroeder, based on code by Philipp Koehn
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
if (eval {require Thread;1;}) {
#module loaded
Thread->import();
}
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
my %NONBREAKING_PREFIX = ();
my @protected_patterns = ();
my $protected_patterns_file = "";
my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
my $NO_ESCAPING = 0;
while (@ARGV)
{
$_ = shift;
/^-b$/ && ($| = 1, next);
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
/^-time$/ && ($TIMING = 1, next);
# Option to add list of regexps to be protected
/^-protected/ && ($protected_patterns_file = shift, next);
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
/^-no-escape/ && ($NO_ESCAPING = 1, next);
}
# for time calculation
my $start_time;
if ($TIMING)
{
$start_time = [ Time::HiRes::gettimeofday( ) ];
}
# print help message
if ($HELP)
{
print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
print "Options:\n";
print " -q ... quiet.\n";
print " -a ... aggressive hyphen splitting.\n";
print " -b ... disable Perl buffering.\n";
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
print " -protected FILE ... specify file with patters to be protected in tokenisation.\n";
print " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
exit;
}
if (!$QUIET)
{
print STDERR "Tokenizer Version 1.1\n";
print STDERR "Language: $language\n";
print STDERR "Number of threads: $NUM_THREADS\n";
}
# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
load_prefixes($language,\%NONBREAKING_PREFIX);
if (scalar(%NONBREAKING_PREFIX) eq 0)
{
print STDERR "Warning: No known abbreviations for language '$language'\n";
}
# Load protected patterns
if ($protected_patterns_file)
{
open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
while(<PP>) {
chomp;
push @protected_patterns, $_;
}
}
my @batch_sentences = ();
my @thread_list = ();
my $count_sentences = 0;
if ($NUM_THREADS > 1)
{# multi-threading tokenization
while(<STDIN>)
{
$count_sentences = $count_sentences + 1;
push(@batch_sentences, $_);
if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
{
# assign each thread work
for (my $i=0; $i<$NUM_THREADS; $i++)
{
my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
push(@thread_list, $new_thread);
}
foreach (@thread_list)
{
my $tokenized_list = $_->join;
foreach (@$tokenized_list)
{
print $_;
}
}
# reset for the new run
@thread_list = ();
@batch_sentences = ();
}
}
# the last batch
if (scalar(@batch_sentences)>0)
{
# assign each thread work
for (my $i=0; $i<$NUM_THREADS; $i++)
{
my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
if ($start_index >= scalar(@batch_sentences))
{
last;
}
my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
if ($end_index >= scalar(@batch_sentences))
{
$end_index = scalar(@batch_sentences)-1;
}
my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
push(@thread_list, $new_thread);
}
foreach (@thread_list)
{
my $tokenized_list = $_->join;
foreach (@$tokenized_list)
{
print $_;
}
}
}
}
else
{# single thread only
while(<STDIN>)
{
if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
{
#don't try to tokenize XML/HTML tag lines
print $_;
}
else
{
print &tokenize($_);
}
}
}
if ($TIMING)
{
my $duration = Time::HiRes::tv_interval( $start_time );
print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
}
#####################################################################################
# subroutines afterward
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;
my(@tokenized_list) = ();
foreach (@text_list)
{
if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
{
#don't try to tokenize XML/HTML tag lines
push(@tokenized_list, $_);
}
else
{
push(@tokenized_list, &tokenize($_));
}
}
return \@tokenized_list;
}
# the actual tokenize function which tokenizes one input string
# input: one string
# return: the tokenized string for the input string
sub tokenize
{
my($text) = @_;
if ($PENN) {
return tokenize_penn($text);
}
chomp($text);
$text = " $text ";
# remove ASCII junk
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# Find protected patterns
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
my $t = $text;
while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
push @protected, $+{PATTERN};
$t = $+{TAIL};
}
}
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s,\Q$protected[$i], $subst ,g;
}
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# separate out all "other" special characters
if (($language eq "fi") or ($language eq "sv")) {
# in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
# USA:n, 20:een, EU:ssa, USA:s, S:t
$text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
elsif ($language eq "tdt") {
# in Tetun, the apostrophe can be used inside words as an apostrophe-like character:
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
# if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
}
elsif (($language eq "ca")) {
# in Catalan, the middle dot can be used inside words:
# il<69>lusio
$text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
# if a middot is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
}
else {
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
}
# aggressive hyphen splitting
if ($AGGRESSIVE)
{
$text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
}
#multi-dots stay together
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
while($text =~ /DOTMULTI\./)
{
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
# seperate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)
# previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
# first application uses up B so rule can't see B,C
# two-step version here may create extra spaces but these are removed later
# will also space digit,letter or letter,digit forms (redundant with next section)
$text =~ s/([^\p{IsN}])[,]/$1 , /g;
$text =~ s/[,]([^\p{IsN}])/ , $1/g;
# separate "," after a number if it's the end of a sentence
$text =~ s/([\p{IsN}])[,]$/$1 ,/g;
# separate , pre and post number
#$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
#$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
# turn `into '
#$text =~ s/\`/\'/g;
#turn '' into "
#$text =~ s/\'\'/ \" /g;
if ($language eq "en")
{
#split contractions right
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
#special case for "1990's"
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
}
elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
{
#split contractions left
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
}
elsif (($language eq "so") or ($language eq "tdt"))
{
# Don't split glottals
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
}
else
{
$text =~ s/\'/ \' /g;
}
#word token method
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++)
{
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/)
{
my $pre = $1;
if ($i == scalar(@words)-1) {
# split last words independently as they are unlikely to be non-breaking prefixes
$word = $pre." .";
}
elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
{
#no change
}
elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
{
#no change
}
else
{
$word = $pre." .";
}
}
$text .= $word." ";
}
# clean up extraneous spaces
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# .' at end of sentence is missed
$text =~ s/\.\' ?$/ . ' /;
# restore protected
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s/$subst/$protected[$i]/g;
}
#restore multi-dots
while($text =~ /DOTDOTMULTI/)
{
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#escape special chars
if (!$NO_ESCAPING)
{
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
}
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub tokenize_penn
{
# Improved compatibility with Penn Treebank tokenization. Useful if
# the text is to later be parsed with a PTB-trained parser.
#
# Adapted from Robert MacIntyre's sed script:
# http://www.cis.upenn.edu/~treebank/tokenizer.sed
my($text) = @_;
chomp($text);
# remove ASCII junk
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# attempt to get correct directional quotes
$text =~ s/^``/`` /g;
$text =~ s/^"/`` /g;
$text =~ s/^`([^`])/` $1/g;
$text =~ s/^'/` /g;
$text =~ s/([ ([{<])"/$1 `` /g;
$text =~ s/([ ([{<])``/$1 `` /g;
$text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
$text =~ s/([ ([{<])'/$1 ` /g;
# close quotes handled at end
$text =~ s=\.\.\.= _ELLIPSIS_ =g;
# separate out "," except if within numbers (5,300)
$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate , pre and post number
$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
#$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
# Separate out intra-token slashes. PTB tokenization doesn't do this, so
# the tokens should be merged prior to parsing with a PTB-trained parser
# (see syntax-hyphen-splitting.perl).
$text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
$text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
$text =~ s=([?!])= $1 =g;
# parentheses, brackets, etc.
$text =~ s=([\]\[\(\){}<>])= $1 =g;
$text =~ s/\(/-LRB-/g;
$text =~ s/\)/-RRB-/g;
$text =~ s/\[/-LSB-/g;
$text =~ s/\]/-RSB-/g;
$text =~ s/{/-LCB-/g;
$text =~ s/}/-RCB-/g;
$text =~ s=--= -- =g;
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
$text =~ s=$= =;
$text =~ s=^= =;
$text =~ s="= '' =g;
# possessive or close-single-quote
$text =~ s=([^'])' =$1 ' =g;
# as in it's, I'm, we'd
$text =~ s='([sSmMdD]) = '$1 =g;
$text =~ s='ll = 'll =g;
$text =~ s='re = 're =g;
$text =~ s='ve = 've =g;
$text =~ s=n't = n't =g;
$text =~ s='LL = 'LL =g;
$text =~ s='RE = 'RE =g;
$text =~ s='VE = 'VE =g;
$text =~ s=N'T = N'T =g;
$text =~ s= ([Cc])annot = $1an not =g;
$text =~ s= ([Dd])'ye = $1' ye =g;
$text =~ s= ([Gg])imme = $1im me =g;
$text =~ s= ([Gg])onna = $1on na =g;
$text =~ s= ([Gg])otta = $1ot ta =g;
$text =~ s= ([Ll])emme = $1em me =g;
$text =~ s= ([Mm])ore'n = $1ore 'n =g;
$text =~ s= '([Tt])is = '$1 is =g;
$text =~ s= '([Tt])was = '$1 was =g;
$text =~ s= ([Ww])anna = $1an na =g;
#word token method
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++)
{
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/)
{
my $pre = $1;
if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
{
#no change
}
elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
{
#no change
}
else
{
$word = $pre." .";
}
}
$text .= $word." ";
}
# restore ellipses
$text =~ s=_ELLIPSIS_=\.\.\.=g;
# clean out extra spaces
$text =~ s= *= =g;
$text =~ s=^ *==g;
$text =~ s= *$==g;
#escape special chars
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub load_prefixes
{
my ($language, $PREFIX_REF) = @_;
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
#default back to English if we don't have a language-specific prefix file
if (!(-e $prefixfile))
{
$prefixfile = "$mydir/nonbreaking_prefix.en";
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
}
if (-e "$prefixfile")
{
open(PREFIX, "<:utf8", "$prefixfile");
while (<PREFIX>)
{
my $item = $_;
chomp($item);
if (($item) && (substr($item,0,1) ne "#"))
{
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
{
$PREFIX_REF->{$1} = 2;
}
else
{
$PREFIX_REF->{$item} = 1;
}
}
}
close(PREFIX);
}
}

@ -0,0 +1,88 @@
#!/bin/bash
# Copyright 2020 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh
nlsyms=""
oov="<unk>"
bpecode=""
verbose=0
text=""
multilingual=false
help_message=$(cat << EOF
Usage: $0 <json> <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
--oov <oov-word> # Default: <unk>
--verbose <num> # Default: 0
EOF
)
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
json=$1
dir=$2
dic=$3
json_dir=$(dirname ${json})
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT
if [ -z ${text} ]; then
text=${dir}/text
fi
# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
if [ ${multilingual} = true ]; then
# remove a space before the language ID
paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
| spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
> ${tmpdir}/output/token.scp
else
paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
| spm_encode --model=${bpecode} --output_format=piece) \
> ${tmpdir}/output/token.scp
fi
elif [ -n "${nlsyms}" ]; then
text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp
else
text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
# +2 comes from CTC blank and EOS
vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
odim=$(echo "$vocsize + 2" | bc)
awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp
cat ${text} > ${tmpdir}/output/text.scp
# 4. Create JSON files from each scp files
rm -f ${tmpdir}/*/*.json
for x in "${tmpdir}"/output/*.scp; do
k=$(basename ${x} .scp)
< ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
done
# add to json
addjson.py --verbose ${verbose} -i false \
${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
mkdir -p ${json_dir}/.backup
echo "json updated. original json is kept in ${json_dir}/.backup."
cp ${json} ${json_dir}/.backup/"$(basename ${json})"
cp ${tmpdir}/data.json ${json}
rm -fr ${tmpdir}
Loading…
Cancel
Save