Merge pull request #1050 from LittleChenCc/develop

[ST] add script for data process (st1) for Ted-En-Zh
5 years ago · 100fdf2403
parent 2de7bc14b0 aea1e92a3d
commit 100fdf2403
23 changed files with 1394 additions and 80 deletions
--- a/dataset/ted_en_zh/ted_en_zh.py
+++ b/dataset/ted_en_zh/ted_en_zh.py
@ -28,7 +28,7 @@ import soundfile

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    "--src_dir",
+    "--src-dir",
    default="",
    type=str,
    help="Directory to kaldi splited data. (default: %(default)s)")
--- a/examples/ted_en_zh/st1/cmd.sh
+++ b/examples/ted_en_zh/st1/cmd.sh
@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Pick the job-dispatch backend for run.sh: "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+case "${cmd_backend}" in
+    # Local machine, without any job scheduling system
+    local)
+        # Everything that is neither training nor decoding
+        export train_cmd="run.pl"
+        # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+        export cuda_cmd="run.pl"
+        # Used for "*_recog.py"
+        export decode_cmd="run.pl"
+        ;;
+
+    # "qsub" (SGE, Torque, PBS, etc.)
+    sge)
+        # Defaults come from conf/queue.conf.
+        # Change "-q g.q" there to the "queue" of your environment;
+        # "qhost -q" lists the available queue names.
+        # To use "--gpu *", "complex_value" has to be set up for the system scheduler.
+        export train_cmd="queue.pl"
+        export cuda_cmd="queue.pl"
+        export decode_cmd="queue.pl"
+        ;;
+
+    # "sbatch" (Slurm)
+    slurm)
+        # Defaults come from conf/slurm.conf.
+        # Change "-p cpu" and "-p gpu" there to the "partition" of your environment;
+        # "sinfo" lists the available partition names.
+        # "--gpu * " works by default for slurm and is interpreted as "--gres gpu:*".
+        # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+        export train_cmd="slurm.pl"
+        export cuda_cmd="slurm.pl"
+        export decode_cmd="slurm.pl"
+        ;;
+
+    ssh)
+        # Create ".queue/machines" listing the hosts that execute jobs,
+        # one host per line, e.g.
+        #   host1
+        #   host2
+        #   host3
+        # Password-less login (ssh keys) to every host is assumed.
+        export train_cmd="ssh.pl"
+        export cuda_cmd="ssh.pl"
+        export decode_cmd="ssh.pl"
+        ;;
+
+    # Example with several cluster-specific options (JHU CLSP cluster setup).
+    # Users can modify/add their own command options according to their cluster environments.
+    jhu)
+        export train_cmd="queue.pl --mem 2G"
+        export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+        export decode_cmd="queue.pl --mem 4G"
+        ;;
+
+    *)
+        echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+        return 1
+        ;;
+esac
--- a/examples/ted_en_zh/st1/conf/fbank.conf
+++ b/examples/ted_en_zh/st1/conf/fbank.conf
@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
--- a/examples/ted_en_zh/st1/conf/pitch.conf
+++ b/examples/ted_en_zh/st1/conf/pitch.conf
@ -0,0 +1 @@
+--sample-frequency=16000
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@ -11,9 +11,9 @@ data:
  max_output_input_ratio: 20.0

 collator:
-  vocab_filepath: data/lang_char/vocab.txt
+  vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
  unit_type: 'spm'
-  spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
+  spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
  mean_std_filepath: ""
  # augmentation_config: conf/augmentation.json
  batch_size: 10
--- a/examples/ted_en_zh/st1/local/data.sh
+++ b/examples/ted_en_zh/st1/local/data.sh
@ -8,10 +8,13 @@ dict_dir=data/lang_char

 # bpemode (unigram or bpe)
 nbpe=8000
-bpemode=unigram
+bpemode=bpe
 bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 data_dir=./TED_EnZh
-
+target_dir=data/ted_en_zh
+dumpdir=data/dump
+do_delta=false
+nj=20

 source ${MAIN_ROOT}/utils/parse_options.sh

@ -38,75 +41,167 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi

-    # generate manifests
-    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
-    --manifest_prefix="data/manifest" \
-    --src_dir="${data_dir}"
+    # extract data 
+    echo "data Extraction"
+    python3 local/ted_en_zh.py \
+    --tgt-dir=${target_dir} \
+    --src-dir=${data_dir}

-    echo "Complete raw data pre-process."
 fi
-
+prep_dir=${target_dir}/data_prep 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # compute mean and stddev for normalizer
-    num_workers=$(nproc)
-    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
-    --manifest_path="data/manifest.train.raw" \
-    --num_samples=-1 \
-    --spectrum_type="fbank" \
-    --feat_dim=80 \
-    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
-    --use_dB_normalization=False \
-    --num_workers=${num_workers} \
-    --output_path="data/mean_std.json"
-
-    if [ $? -ne 0 ]; then
-        echo "Compute mean and stddev failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to write the following data preparation part by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 0: Data preparation"
+    for set in train dev test; do
+        # Per-split directory; this stage reads wav.scp.org, utt2spk.org,
+        # en.org, zh.org and .yaml (one utterance id per line) from it.
+        dst=${target_dir}/${set}
+        for lang in en zh; do
+            if [ "${lang}" = 'en' ]; then
+                echo "remove punctuation $lang"
+                # Only the English side goes through punctuation removal.
+                local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
+            else
+                cp ${dst}/${lang}.org ${dst}/${lang}.raw
+            fi
+
+            # Glue utterance ids and sentences together line by line.
+            paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
+        done
+
+        # error check: the id list and both transcripts must have the same
+        # number of lines, otherwise the paste above paired ids with the
+        # wrong sentences.
+        n=$(wc -l < ${dst}/.yaml)
+        n_en=$(wc -l < ${dst}/en.raw)
+        n_tgt=$(wc -l < ${dst}/zh.raw)
+        if [ ${n} -ne ${n_en} ]; then
+            echo "Error: expected ${n} lines in ${dst}/en.raw, found ${n_en}" 1>&2
+            exit 1
+        fi
+        if [ ${n} -ne ${n_tgt} ]; then
+            echo "Error: expected ${n} lines in ${dst}/zh.raw, found ${n_tgt}" 1>&2
+            exit 1
+        fi
+
+        echo "done text processing"
+        # Keep one entry per utterance id.
+        sort -k1,1 -u ${dst}/wav.scp.org > ${dst}/wav.scp
+        sort -k1,1 -u ${dst}/utt2spk.org > ${dst}/utt2spk
+
+        utils/utt2spk_to_spk2utt.pl < ${dst}/utt2spk | sort -k1,1 -u > ${dst}/spk2utt
+        rm -rf ${prep_dir}/${set}.en-zh
+        mkdir -p ${prep_dir}/${set}.en-zh
+        echo "remove duplicate lines..."
+        # "uniq -c" prints "<count> <utt-id>". Select on the count field itself:
+        # matching the text '1 ted-en-zh' would also accept counts such as 11
+        # or 21 and treat those repeated ids as unique.
+        # duplicate_lines: "<count> <utt-id>" for ids seen more than once.
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r \
+            | awk '$1 != 1 {print $1, $2}' > ${dst}/duplicate_lines
+        # reclist: ids seen exactly once; only these are kept below.
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r \
+            | awk '$1 == 1 {print $2}' > ${dst}/reclist
+        reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
+        echo "done wav processing"
+        for l in en zh; do
+            cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
+        done
+        utils/fix_data_dir.sh --utt_extra_files \
+        "text.en text.zh" \
+        ${prep_dir}/${set}.en-zh
+    done
 fi

+feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir}
+feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
+feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="${dict_dir}/vocab.txt" \
-    --text_keys 'text' 'text1' \
-    --manifest_paths="data/manifest.train.raw"
-
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to design training and dev sets by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 1: Feature Generation"
+    fbankdir=data/fbank
+    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
+    for x in train dev test; do
+        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
+            ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
+    done
+    
+    echo "speed perturbation"
+    utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
+    utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
+    utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
+
+    utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
+    ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
+    rm -r ${prep_dir}/temp*.en-zh 
+    utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
+
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
+        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}
+
+    for lang in en zh; do
+        cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
+        for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
+            awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
+            utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang}
+        done
+    done
+
+    for x in train_sp dev test; do
+        local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
+    done
+
+    for x in train_sp dev; do
+        # remove utt having more than 3000 frames
+        # remove utt having more than 400 characters
+        for lang in zh en; do
+            remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp
+        done
+
+        # Keep only the utterances that survived the filter on BOTH sides.
+        # The directories are spelled out on purpose: after the loop above
+        # ${lang} still holds "en", so using it here would intersect the
+        # English list with itself and ignore what was removed on the zh side.
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.zh.tmp/reclist1
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.zh.tmp/text > ${prep_dir}/${x}.en-zh.zh.tmp/reclist2
+        comm -12 ${prep_dir}/${x}.en-zh.zh.tmp/reclist1 ${prep_dir}/${x}.en-zh.zh.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist
+
+        # Rebuild both monolingual directories from the shared id list.
+        for lang in zh en; do
+            reduce_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang}
+            utils/fix_data_dir.sh  ${prep_dir}/${x}.en-zh.${lang}
+        done
+        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
+    done
+
+    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark
+
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/dev.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir}
 fi

+dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt
+nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
+bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "spm" \
-        --spm_model_prefix ${bpeprefix} \
-        --vocab_path="${dict_dir}/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Formt mnaifest failed. Terminated."
-            exit 1
-        fi
-    }&
+    echo "stage 2: Dictionary and Json Data Preparation"
+
+    echo "make a joint source and target dictionary"
+    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
+    # Number of entries already in the dictionary; new ids continue after it.
+    offset=$(wc -l < ${dict})
+    # SentencePiece training text: lines containing "sp1.0" from the text files
+    # of every train_sp.en-zh.* directory, with the first space-separated
+    # field removed and blank lines dropped.
+    grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
+    spm_train  --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
+    # Append every distinct piece produced on the training text, one per line,
+    # numbered from offset+1 in sorted order.
+    spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
+    wc -l ${dict}
+
+    echo "make json files"
+    # One json per split, built from the zh directory of that split and written
+    # next to the dumped features. Only the training call passes --nj.
+    data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
+    echo "update json (add source references)"
+    # update json (add source references)
+    # Only train_sp and dev are updated with the English text; test is not.
+    for x in train_sp dev; do
+        feat_dir=${dumpdir}/${x}/delta${do_delta}
+        # The cut on "." leaves the two names used here unchanged.
+        # NOTE(review): this reassigns data_dir, which is set to the raw
+        # corpus directory at the top of the script.
+        data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en
+        update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
+            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
    done
-    wait
 fi

+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Format the Json Data"
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
+fi
 echo "Ted En-Zh Data preparation done."
 exit 0
--- a/examples/ted_en_zh/st1/local/divide_lang.sh
+++ b/examples/ted_en_zh/st1/local/divide_lang.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#           2021 PaddlePaddle
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+# Split a bilingual data directory <set> into two monolingual ones,
+# <set>.en and <set>.<lang>. Each receives sorted copies of the shared
+# Kaldi files that exist in <set>, plus a "text" file taken from
+# <set>/text.en or <set>/text.<lang> respectively.
+
+. ./path.sh
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <set> <lang>"
+    echo "e.g.: $0 data/ted_en_zh/data_prep/dev.en-zh zh"
+    exit 1
+fi
+
+set=$1
+lang=$2
+export LC_ALL=en_US.UTF-8
+
+# Build ${set}.<code> from ${set} for one language code ($1), then fix and
+# validate it. Exits the script if validation fails.
+make_lang_dir() {
+    local code=$1
+    local out=${set}.${code}
+    local f
+
+    # Copy stuff into its final locations [this has been moved from the format_data script]
+    mkdir -p "${out}"
+    for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+        if [ -f "${set}/${f}" ]; then
+            sort "${set}/${f}" > "${out}/${f}"
+        fi
+    done
+    # Transcripts: sorted, with every non-printable character removed.
+    sort "${set}/text.${code}" | sed $'s/[^[:print:]]//g' > "${out}/text"
+
+    utils/fix_data_dir.sh "${out}"
+    # Without feats.scp the feature and wav checks are skipped.
+    if [ -f "${out}/feats.scp" ]; then
+        utils/validate_data_dir.sh "${out}" || exit 1;
+    else
+        utils/validate_data_dir.sh --no-feats --no-wav "${out}" || exit 1;
+    fi
+}
+
+# for En
+make_lang_dir en
+
+# for target language
+make_lang_dir "${lang}"
--- a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
+++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    """Convert an ESPnet-style data json into a json-lines manifest.
+
+    Reads ``args.json_file``, takes the mapping stored under its top-level
+    ``"utts"`` key and writes one JSON object per line to
+    ``args.manifest_file``. Each object is the utterance's entry with the
+    utterance id added under the key ``"utt"``.
+
+    Args:
+        args: namespace with the attributes ``json_file`` (input path) and
+            ``manifest_file`` (output path).
+    """
+    # Pin UTF-8 instead of relying on the locale default: the entries are
+    # dumped with ensure_ascii=False, so non-ASCII text is written as-is.
+    with open(args.json_file, 'r', encoding='utf-8') as fin:
+        data_json = json.load(fin)
+
+    with open(args.manifest_file, 'w', encoding='utf-8') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    # The module has no docstring, so the description is given explicitly.
+    parser = argparse.ArgumentParser(
+        description="Convert an espnet data json file to a json-lines manifest.")
+    # Required: without an input path there is nothing to open.
+    parser.add_argument(
+        '--json-file', type=str, required=True, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
--- a/examples/ted_en_zh/st1/local/remove_punctuation.pl
+++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl
@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
+# Filter: reads text from STDIN and prints each line with punctuation removed
+# and whitespace normalized. Apostrophes and the literal token "<space>" are
+# kept.
+
+use warnings;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+while(<STDIN>) {
+  $_ = " $_ ";
+
+  # remove punctuation except apostrophe; "<space>" is kept for scoring.
+  # Done in one pass instead of swapping in placeholder words, so a line that
+  # contains the word "apostrophe" or "spacemark" is not rewritten:
+  #   - "<space>" is tried first and put back unchanged,
+  #   - otherwise any punctuation character that is not "'" is deleted.
+  s/(<space>)|(?!')[[:punct:]]/defined($1) ? $1 : ""/ge;
+
+  # remove whitespace
+  s/\s+/ /g;
+  s/^\s+//;
+  s/\s+$//;
+
+  print "$_\n";
+}
--- a/examples/ted_en_zh/st1/local/ted_en_zh.py
+++ b/examples/ted_en_zh/st1/local/ted_en_zh.py
@ -0,0 +1,104 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import codecs
+import os
+
+
+# org_split = 'train-split/train-segment'
+# text_file = 'En-Zh/train.en-zh'
+# data_split = 'train'
+def _format_utt(file_name):
+    """Return the utterance id for a wav file name.
+
+    The part after the first ``.`` is dropped, the part before the first
+    ``_`` is left-padded with zeros to six characters, and ``ted-en-zh-`` is
+    prepended, e.g. ``123_4.wav`` -> ``ted-en-zh-000123_4``.
+    """
+    utt = file_name.split('.')[0]
+    audio_name = utt.split('_')[0]
+    return 'ted-en-zh-' + '0' * max(0, 6 - len(audio_name)) + utt
+
+
+def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
+                 data_split_list):
+    """Write the per-split source files used by the data preparation script.
+
+    The three lists are walked in parallel. For every split the directory
+    ``tgt_dir/<data_split>`` is created and filled with:
+
+    * ``wav.scp.org``: ``<utt> cat <absolute wav path> |`` for every file in
+      ``src_dir/<wav dir>`` larger than 30000 bytes,
+    * ``utt2spk.org``: ``<utt> <spk>`` for the same utterances, sorted,
+    * ``.yaml``, ``en.org``, ``zh.org``: utterance id, lower-cased English
+      text and Chinese text for every line of ``src_dir/<text file>``.
+
+    Args:
+        src_dir: root directory of the corpus.
+        tgt_dir: root directory for the generated files.
+        wav_dir_list: wav directories, relative to ``src_dir``.
+        text_file_list: tab-separated ``wav<TAB>en<TAB>zh`` files, relative
+            to ``src_dir``.
+        data_split_list: split names used as sub-directories of ``tgt_dir``.
+
+    Raises:
+        AssertionError: if a speaker id is not 16 characters long, a text
+            line does not have exactly three fields, or its first field does
+            not end with ``wav``.
+    """
+    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
+                                                data_split_list):
+        local_data_split_dir = os.path.join(tgt_dir, data_split)
+
+        os.makedirs(local_data_split_dir, exist_ok=True)
+        utts = []
+        utt2spk = {}
+        # All outputs are opened as UTF-8 explicitly so that the result does
+        # not depend on the locale the script happens to run under.
+        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w',
+                  encoding='utf-8') as wav_wf, \
+            open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w',
+                 encoding='utf-8') as utt2spk_wf:
+            for files in os.listdir(os.path.join(src_dir, org_split)):
+                files = files.strip()
+                file_path = os.path.join(src_dir, org_split, files)
+                size = os.path.getsize(file_path)
+                # Skip files of at most 30000 bytes.
+                if size <= 30000:
+                    continue
+                utt = _format_utt(files)
+                utts.append(utt)
+                # Speaker id = prefix plus the zero-padded name part.
+                spk = utt.split('_')[0]
+                utt2spk[utt] = spk
+                assert len(spk) == 16, "%r" % spk
+                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
+            for utt in sorted(utts):
+                print(utt, utt2spk[utt], file=utt2spk_wf)
+
+        with open(os.path.join(local_data_split_dir, 'en.org'), 'w',
+                  encoding='utf-8') as en_wf, \
+            open(os.path.join(local_data_split_dir, 'zh.org'), 'w',
+                 encoding='utf-8') as zh_wf, \
+            open(os.path.join(local_data_split_dir, '.yaml'), 'w',
+                 encoding='utf-8') as yaml_wf, \
+            codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
+                        errors='ignore') as rf:
+            count = 0
+            for line in rf:
+                line = line.strip()
+                line_spl = line.split('\t')
+                assert len(line_spl) == 3, "%r" % line
+                wav, en, zh = line_spl
+                assert wav.endswith('wav'), "%r" % wav[-3:]
+                # Same id scheme as for the wav files above, so the text
+                # lines can be matched to wav.scp entries.
+                utt = _format_utt(wav)
+                print(utt, file=yaml_wf)
+                print(en.lower(), file=en_wf)
+                print(zh, file=zh_wf)
+                count += 1
+            print('%s set lines count: %d' % (data_split, count))
+
+
+# Command-line entry point: parse the two directories and run data_process
+# over the three fixed splits.
+if __name__ == '__main__':
+    # NOTE(review): no module docstring is visible in this file, so __doc__
+    # is presumably None and the parser has no description -- confirm.
+    parser = argparse.ArgumentParser(description=__doc__)
+
+    parser.add_argument(
+        "--src-dir",
+        default="",
+        type=str,
+        help="Directory to kaldi splited data. (default: %(default)s)")
+    parser.add_argument(
+        "--tgt-dir",
+        default="local/ted_en_zh",
+        type=str,
+        help="Directory to save processed data. (default: %(default)s)")
+    args = parser.parse_args()
+
+    # The three lists below are parallel: entry i of each describes one split.
+    # Wav directories, relative to --src-dir.
+    wav_dir_list = [
+        'train-split/train-segment', 'test-segment/tst2014',
+        'test-segment/tst2015'
+    ]
+    # Text files, relative to --src-dir.
+    text_file_list = [
+        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
+    ]
+    # Output split names: tst2014 is written as 'dev', tst2015 as 'test'.
+    data_split_list = ['train', 'dev', 'test']
+    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
+                 data_split_list)
--- a/examples/ted_en_zh/st1/local/train_finetune.sh
+++ b/examples/ted_en_zh/st1/local/train_finetune.sh
@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
--checkpoint_path ${ckpt_path} \
+--checkpoint_path "${ckpt_path}" \
 --seed ${seed}

 if [ ${seed} != 0 ]; then
--- a/examples/ted_en_zh/st1/path.sh
+++ b/examples/ted_en_zh/st1/path.sh
@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`

-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
 export LC_ALL=C

 export PYTHONDONTWRITEBYTECODE=1
@ -13,3 +13,10 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

 MODEL=u2_st
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
+
+# Kaldi (optional): pick up its environment when it is installed under
+# tools/kaldi.
+# if/else is used instead of "[ test ] && cmd" because this file is sourced
+# with ". ./path.sh || exit 1": a failing test on the last line would become
+# the exit status of the whole file and abort the caller, although a missing
+# Kaldi is only meant to be reported.
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+if [ -f "$KALDI_ROOT/tools/env.sh" ]; then
+    . "$KALDI_ROOT/tools/env.sh"
+fi
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+if [ -f "$KALDI_ROOT/tools/config/common_path.sh" ]; then
+    . "$KALDI_ROOT/tools/config/common_path.sh"
+else
+    echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+fi
--- a/examples/ted_en_zh/st1/run.sh
+++ b/examples/ted_en_zh/st1/run.sh
@ -1,12 +1,13 @@
 #!/bin/bash
 set -e
-source path.sh
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;

 gpus=0,1,2,3
 stage=1
 stop_stage=4
 conf_path=conf/transformer_mtl_noam.yaml
-ckpt_path=paddle.98
+ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -22,21 +23,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # download pretrained
-    bash ./local/download_pretrain.sh || exit -1
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path}  ${ckpt} ${ckpt_path}
+    if [ -n "${ckpt_path}" ]; then
+        echo "Finetune from Pretrained Model" ${ckpt_path}
+        ./local/download_pretrain.sh || exit -1
+    fi 
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
 fi

-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi

-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
--- a/examples/ted_en_zh/st1/steps
+++ b/examples/ted_en_zh/st1/steps
@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
--- a/examples/ted_en_zh/st1/utils
+++ b/examples/ted_en_zh/st1/utils
@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@ -26,8 +26,10 @@ from paddle import distributed as dist
 from paddle.io import DataLoader
 from yacs.config import CfgNode

+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.collator import TripletSpeechCollator
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.io.dataset import ManifestDataset
 from paddlespeech.s2t.io.sampler import SortagradBatchSampler
 from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
@ -423,6 +425,30 @@ class U2STTester(U2STTrainer):
            trans.append(''.join([chr(i) for i in ids]))
        return trans

+    def translate(self, audio, audio_len):
+        """E2E translation from extracted audio feature.
+
+        Forwards ``audio`` and ``audio_len`` to ``self.model.decode`` together
+        with the text featurizer taken from the test loader's collate
+        function and the options stored under ``self.config.decoding``.
+
+        Args:
+            audio: extracted audio features, passed through to ``decode``
+                unchanged (shape and dtype are not visible here -- TODO
+                confirm against ``decode``).
+            audio_len: lengths belonging to ``audio``, passed through
+                unchanged.
+
+        Returns:
+            The value returned by ``self.model.decode`` (the hypotheses).
+        """
+        cfg = self.config.decoding
+        text_feature = self.test_loader.collate_fn.text_feature
+
+        hyps = self.model.decode(
+            audio,
+            audio_len,
+            text_feature=text_feature,
+            decoding_method=cfg.decoding_method,
+            lang_model_path=cfg.lang_model_path,
+            beam_alpha=cfg.alpha,
+            beam_beta=cfg.beta,
+            beam_size=cfg.beam_size,
+            cutoff_prob=cfg.cutoff_prob,
+            cutoff_top_n=cfg.cutoff_top_n,
+            num_processes=cfg.num_proc_bsearch,
+            ctc_weight=cfg.ctc_weight,
+            word_reward=cfg.word_reward,
+            decoding_chunk_size=cfg.decoding_chunk_size,
+            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
+            simulate_streaming=cfg.simulate_streaming)
+        return hyps
+
    def compute_translation_metrics(self,
                                    utts,
                                    audio,
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
@ -102,10 +102,10 @@ def read_manifest(
    manifest = []
    with jsonlines.open(manifest_path, 'r') as reader:
        for json_data in reader:
-            feat_len = json_data["feat_shape"][
-                0] if 'feat_shape' in json_data else 1.0
-            token_len = json_data["token_shape"][
-                0] if 'token_shape' in json_data else 1.0
+            feat_len = json_data["input"][0]["shape"][
+                0] if 'shape' in json_data["input"][0] else 1.0
+            token_len = json_data["output"][0]["shape"][
+                0] if 'shape' in json_data["output"][0] else 1.0
            conditions = [
                feat_len >= min_input_len,
                feat_len <= max_input_len,
--- a/paddlespeech/s2t/io/sampler.py
+++ b/paddlespeech/s2t/io/sampler.py
@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
    """
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
-    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
+    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batch_indices)
    batch_indices = [item for batch in batch_indices for item in batch]
    assert clipped is False
--- a/paddlespeech/s2t/utils/checkpoint.py
+++ b/paddlespeech/s2t/utils/checkpoint.py
@ -94,6 +94,9 @@ class Checkpoint():
        """
        configs = {}

+        # Treat "" and the literal string "None" as "no explicit checkpoint".
+        # A truthiness test is used instead of len(): checkpoint_path may
+        # already be None (the branch below checks for it), and len(None)
+        # raises TypeError.
+        if not checkpoint_path or checkpoint_path == "None":
+            checkpoint_path = None
+
        if checkpoint_path is not None:
            pass
        elif checkpoint_dir is not None and record_file is not None:
--- a/utils/addjson.py
+++ b/utils/addjson.py
@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2018 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import json
+import logging
+import sys
+from distutils.util import strtobool
+
+from espnet.utils.cli_utils import get_commandline_args
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="add multiple json values to an input or output value",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument("jsons", type=str, nargs="+", help="json files")
+    parser.add_argument(
+        "-i",
+        "--is-input",
+        default=True,
+        type=strtobool,
+        help="If true, add to input. If false, add to output", )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+
+    # logging info
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose > 0:
+        logging.basicConfig(level=logging.INFO, format=logfmt)
+    else:
+        logging.basicConfig(level=logging.WARN, format=logfmt)
+    logging.info(get_commandline_args())
+
+    # make intersection set for utterance keys
+    js = []
+    intersec_ks = []
+    for x in args.jsons:
+        with codecs.open(x, "r", encoding="utf-8") as f:
+            j = json.load(f)
+        ks = j["utts"].keys()
+        logging.info(x + ": has " + str(len(ks)) + " utterances")
+        if len(intersec_ks) > 0:
+            intersec_ks = intersec_ks.intersection(set(ks))
+            if len(intersec_ks) == 0:
+                logging.warning("Empty intersection")
+                break
+        else:
+            intersec_ks = set(ks)
+        js.append(j)
+    logging.info("new json has " + str(len(intersec_ks)) + " utterances")
+
+    # updated original dict to keep intersection
+    intersec_org_dic = dict()
+    for k in intersec_ks:
+        v = js[0]["utts"][k]
+        intersec_org_dic[k] = v
+
+    intersec_add_dic = dict()
+    for k in intersec_ks:
+        v = js[1]["utts"][k]
+        for j in js[2:]:
+            v.update(j["utts"][k])
+        intersec_add_dic[k] = v
+
+    new_dic = dict()
+    for key_id in intersec_org_dic:
+        orgdic = intersec_org_dic[key_id]
+        adddic = intersec_add_dic[key_id]
+
+        if "utt2spk" not in orgdic:
+            orgdic["utt2spk"] = ""
+        # NOTE: for machine translation
+
+        # add as input
+        if args.is_input:
+            # original input
+            input_list = orgdic["input"]
+            # additional input
+            in_add_dic = {}
+            if "idim" in adddic and "ilen" in adddic:
+                in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
+            elif "idim" in adddic:
+                in_add_dic["shape"] = [int(adddic["idim"])]
+            # add all other key value
+            for key, value in adddic.items():
+                if key in ["idim", "ilen"]:
+                    continue
+                in_add_dic[key] = value
+            # add name
+            in_add_dic["name"] = "input%d" % (len(input_list) + 1)
+
+            input_list.append(in_add_dic)
+            new_dic[key_id] = {
+                "input": input_list,
+                "output": orgdic["output"],
+                "utt2spk": orgdic["utt2spk"],
+            }
+        # add as output
+        else:
+            # original output
+            output_list = orgdic["output"]
+            # additional output
+            out_add_dic = {}
+            # add shape
+            if "odim" in adddic and "olen" in adddic:
+                out_add_dic[
+                    "shape"] = [int(adddic["olen"]), int(adddic["odim"])]
+            elif "odim" in adddic:
+                out_add_dic["shape"] = [int(adddic["odim"])]
+            # add all other key value
+            for key, value in adddic.items():
+                if key in ["odim", "olen"]:
+                    continue
+                out_add_dic[key] = value
+            # add name
+            out_add_dic["name"] = "target%d" % (len(output_list) + 1)
+
+            output_list.append(out_add_dic)
+            new_dic[key_id] = {
+                "input": orgdic["input"],
+                "output": output_list,
+                "utt2spk": orgdic["utt2spk"],
+            }
+            if "lang" in orgdic.keys():
+                new_dic[key_id]["lang"] = orgdic["lang"]
+
+    # ensure "ensure_ascii=False", which is a bug
+    jsonstring = json.dumps(
+        {
+            "utts": new_dic
+        },
+        indent=4,
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ": "), )
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
+                                           if is_python2 else sys.stdout.buffer)
+    print(jsonstring)
--- a/utils/scp2json.py
+++ b/utils/scp2json.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import json
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="convert scp to json",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument("--key", "-k", type=str, help="key")
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+
+    new_line = {}
+    sys.stdin = codecs.getreader("utf-8")(sys.stdin
+                                          if is_python2 else sys.stdin.buffer)
+    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
+                                           if is_python2 else sys.stdout.buffer)
+    line = sys.stdin.readline()
+    while line:
+        x = line.rstrip().split()
+        v = {args.key: " ".join(x[1:])}
+        new_line[x[0]] = v
+        line = sys.stdin.readline()
+
+    all_l = {"utts": new_line}
+
+    # ensure "ensure_ascii=False", which is a bug
+    jsonstring = json.dumps(
+        all_l,
+        indent=4,
+        ensure_ascii=False,
+        sort_keys=True,
+        separators=(",", ": "))
+    print(jsonstring)
--- a/utils/tokenizer.perl
+++ b/utils/tokenizer.perl
@ -0,0 +1,596 @@
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+use warnings;
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+#       (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+#       (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+#       (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use warnings;
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+
+if  (eval {require Thread;1;}) {
+  #module loaded
+  Thread->import();
+}
+
+# Directory searched by load_prefixes() for "nonbreaking_prefix.<language>" files.
+my $mydir = "$RealBin/../share/nonbreaking_prefixes";
+
+# prefix => 1 (always non-breaking) or 2 (non-breaking only before a number);
+# filled by load_prefixes()
+my %NONBREAKING_PREFIX = ();
+# regular expressions whose matches tokenize() shields from tokenization
+my @protected_patterns = ();
+my $protected_patterns_file = "";
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
+my $NO_ESCAPING = 0;
+# Command-line parsing: each argument is shifted into $_ and tested against the
+# option patterns below; options that take a value shift once more.
+# Arguments matching no pattern are consumed silently.
+# "-protected" and "-no-escape" are not anchored at the end, so any argument
+# that merely starts with one of them is accepted as that option.
+while (@ARGV)
+{
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-x$/ && ($SKIP_XML = 1, next);
+	/^-a$/ && ($AGGRESSIVE = 1, next);
+	/^-time$/ && ($TIMING = 1, next);
+  # Option to add list of regexps to be protected
+  /^-protected/ && ($protected_patterns_file = shift, next);
+	/^-threads$/ && ($NUM_THREADS = int(shift), next);
+	/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+	/^-penn$/ && ($PENN = 1, next);
+	/^-no-escape/ && ($NO_ESCAPING = 1, next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+    $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+# (written to STDOUT; the script exits right after, before reading any input)
+if ($HELP)
+{
+	print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+        print "Options:\n";
+        print "  -q     ... quiet.\n";
+        print "  -a     ... aggressive hyphen splitting.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -time  ... enable processing time calculation.\n";
+        print "  -penn  ... use Penn treebank-like tokenization.\n";
+        print "  -protected FILE  ... specify file with patters to be protected in tokenisation.\n";
+	print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
+	exit;
+}
+
+# startup banner on STDERR, suppressed by -q
+if (!$QUIET)
+{
+	print STDERR "Tokenizer Version 1.1\n";
+	print STDERR "Language: $language\n";
+	print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+# warn when the prefix table is still empty after loading
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+# Load protected patterns
+# One regular expression per line; tokenize() keeps every span matching one of
+# them untouched.
+if ($protected_patterns_file)
+{
+  open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
+  # decode as UTF-8 like STDIN, so non-ASCII patterns can match the decoded text
+  binmode(PP, ":utf8");
+  while(<PP>) {
+    chomp;
+    # an empty pattern matches the empty string at every position, which would
+    # make the pattern-collecting loop in tokenize() run forever
+    next if $_ eq "";
+    push @protected_patterns, $_;
+  }
+  close(PP);
+}
+
+# Main loop: read STDIN line by line, tokenize, and print to STDOUT in input
+# order. With -threads N (N > 1) lines are buffered into batches of
+# N * NUM_SENTENCES_PER_THREAD and each thread tokenizes one slice of a batch.
+my @batch_sentences = ();
+my @thread_list = ();
+# number of input lines read; used for the -time speed report
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            # assign each thread work
+            for (my $i=0; $i<$NUM_THREADS; $i++)
+            {
+                my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+                my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+                my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+                my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+                push(@thread_list, $new_thread);
+            }
+            # join in creation order so the output keeps the input order
+            foreach (@thread_list)
+            {
+                my $tokenized_list = $_->join;
+                foreach (@$tokenized_list)
+                {
+                    print $_;
+                }
+            }
+            # reset for the new run
+            @thread_list = ();
+            @batch_sentences = ();
+        }
+    }
+    # the last batch
+    if (scalar(@batch_sentences)>0)
+    {
+        # assign each thread work
+        for (my $i=0; $i<$NUM_THREADS; $i++)
+        {
+            my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+            if ($start_index >= scalar(@batch_sentences))
+            {
+                last;
+            }
+            # the final slice may be shorter than a full per-thread share
+            my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+            if ($end_index >= scalar(@batch_sentences))
+            {
+                $end_index = scalar(@batch_sentences)-1;
+            }
+            my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+            my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+            push(@thread_list, $new_thread);
+        }
+        foreach (@thread_list)
+        {
+            my $tokenized_list = $_->join;
+            foreach (@$tokenized_list)
+            {
+                print $_;
+            }
+        }
+    }
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        # count lines here too, so the -time report also works in this mode
+        $count_sentences = $count_sentences + 1;
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print &tokenize($_);
+        }
+    }
+}
+
+if ($TIMING)
+{
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    # no per-line speed can be computed for empty input
+    if ($count_sentences > 0)
+    {
+        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+    }
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: another array containing a batch of tokenized texts for the input array
+sub tokenize_batch
+{
+    # Tokenizes every line passed in and returns a reference to the array of
+    # results, in the same order. Blank lines, and XML/HTML tag lines when
+    # $SKIP_XML is set, are copied through unchanged.
+    my @results;
+    for my $line (@_)
+    {
+        my $passthrough = ($SKIP_XML && $line =~ /^<.+>$/) || $line =~ /^\s*$/;
+        push @results, $passthrough ? $line : tokenize($line);
+    }
+    return \@results;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+    # Tokenizes one line of text and returns it terminated by "\n".
+    # Delegates to tokenize_penn() when $PENN is set. Otherwise the steps are:
+    #   1. normalise whitespace and drop ASCII control characters
+    #   2. swap every match of a protected pattern for a THISISPROTECTEDnnn
+    #      placeholder
+    #   3. space out special characters (with language-specific exceptions),
+    #      optionally split hyphens, and handle multi-dots, commas and
+    #      apostrophes
+    #   4. split sentence-final periods from words, except after non-breaking
+    #      prefixes
+    #   5. restore the protected spans and multi-dots, then escape special
+    #      characters unless $NO_ESCAPING is set
+    my($text) = @_;
+
+    if ($PENN) {
+      return tokenize_penn($text);
+    }
+
+    chomp($text);
+    $text = " $text ";
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # Find protected patterns
+    # For each pattern, collect every match in order: after a match the search
+    # continues on the remainder of the line (TAIL).
+    my @protected = ();
+    foreach my $protected_pattern (@protected_patterns) {
+      my $t = $text;
+      while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
+        push @protected, $+{PATTERN};
+        $t = $+{TAIL};
+      }
+    }
+
+    # replace each protected span by a numbered placeholder surrounded by spaces
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s,\Q$protected[$i], $subst ,g;
+    }
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # separate out all "other" special characters
+    if (($language eq "fi") or ($language eq "sv")) {
+        # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
+        # USA:n, 20:een, EU:ssa, USA:s, S:t
+        $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
+        # if a colon is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    elsif ($language eq "tdt") {
+        # in Tetun, the apostrophe can be used inside words as an apostrophe-like character:
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+        # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    elsif (($language eq "ca")) {
+        # in Catalan, the middle dot can be used inside words:
+        # il·lusio
+        $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
+        # if a middot is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
+    }   
+    else {
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+    }
+
+    # aggressive hyphen splitting
+    if ($AGGRESSIVE)
+    {
+        $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
+    }
+
+    #multi-dots stay together
+    # (each dot of a run is encoded in a DOTMULTI/DOTDOTMULTI marker word so
+    # the period handling below cannot split it; decoded again near the end)
+    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+    while($text =~ /DOTMULTI\./)
+    {
+        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+    }
+
+    # separate out "," except if within numbers (5,300)
+    #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+
+    # separate out "," except if within numbers (5,300)
+    # previous "global" application skips some:  A,B,C,D,E > A , B,C , D,E
+    # first application uses up B so rule can't see B,C
+    # two-step version here may create extra spaces but these are removed later
+    # will also space digit,letter or letter,digit forms (redundant with next section)
+    $text =~ s/([^\p{IsN}])[,]/$1 , /g;
+    $text =~ s/[,]([^\p{IsN}])/ , $1/g;
+    
+    # separate "," after a number if it's the end of a sentence
+    $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
+
+    # separate , pre and post number
+    #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    # turn `into '
+    #$text =~ s/\`/\'/g;
+
+    #turn '' into "
+    #$text =~ s/\'\'/ \" /g;
+
+    if ($language eq "en")
+    {
+        #split contractions right
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+        #special case for "1990's"
+        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+    }
+    elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
+    {
+        #split contractions left
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+    }
+    elsif (($language eq "so")  or ($language eq "tdt"))
+    {
+        # Don't split glottals
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+    }
+    else
+    {
+        $text =~ s/\'/ \' /g;
+    }
+
+    #word token method
+    # Decide for each word ending in "." whether the period is split off.
+    # %NONBREAKING_PREFIX values: 1 = never split, 2 = keep attached only when
+    # the next word starts with a digit.
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            if ($i == scalar(@words)-1) {
+                # split last words independently as they are unlikely to be non-breaking prefixes
+                $word = $pre." .";
+            }
+            elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # clean up extraneous spaces
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # .' at end of sentence is missed
+    $text =~ s/\.\' ?$/ . ' /;
+
+    # restore protected
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s/$subst/$protected[$i]/g;
+    }
+
+    #restore multi-dots
+    while($text =~ /DOTDOTMULTI/)
+    {
+        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+    }
+    $text =~ s/DOTMULTI/./g;
+
+    #escape special chars
+    if (!$NO_ESCAPING)
+      {
+	$text =~ s/\&/\&amp;/g;   # escape escape
+	$text =~ s/\|/\&#124;/g;  # factor separator
+	$text =~ s/\</\&lt;/g;    # xml
+	$text =~ s/\>/\&gt;/g;    # xml
+	$text =~ s/\'/\&apos;/g;  # xml
+	$text =~ s/\"/\&quot;/g;  # xml
+	$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+	$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+      }
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub tokenize_penn
+{
+    # Improved compatibility with Penn Treebank tokenization.  Useful if
+    # the text is to later be parsed with a PTB-trained parser.
+    #
+    # Adapted from Robert MacIntyre's sed script:
+    #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+    #
+    # Input: one line of text. Returns the tokenized line terminated by "\n".
+    # Unlike tokenize(), this routine does not consult @protected_patterns and
+    # always escapes special characters, regardless of $NO_ESCAPING.
+
+    my($text) = @_;
+    chomp($text);
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # attempt to get correct directional quotes
+    $text =~ s/^``/`` /g;
+    $text =~ s/^"/`` /g;
+    $text =~ s/^`([^`])/` $1/g;
+    $text =~ s/^'/`  /g;
+    $text =~ s/([ ([{<])"/$1 `` /g;
+    $text =~ s/([ ([{<])``/$1 `` /g;
+    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+    $text =~ s/([ ([{<])'/$1 ` /g;
+    # close quotes handled at end
+
+    # shield "..." behind a placeholder; it is restored after the word loop
+    $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+
+    # separate out "," except if within numbers (5,300)
+    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate , pre and post number
+    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+
+    # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+    # the tokens should be merged prior to parsing with a PTB-trained parser
+    # (see syntax-hyphen-splitting.perl).
+    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+
+    # Assume sentence tokenization has been done first, so split FINAL periods
+    # only.
+    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+    # however, we may as well split ALL question marks and exclamation points,
+    # since they shouldn't have the abbrev.-marker ambiguity problem
+    $text =~ s=([?!])= $1 =g;
+
+    # parentheses, brackets, etc.
+    # (spaced out first, then renamed to the -LRB-/-RRB- style tokens)
+    $text =~ s=([\]\[\(\){}<>])= $1 =g;
+    $text =~ s/\(/-LRB-/g;
+    $text =~ s/\)/-RRB-/g;
+    $text =~ s/\[/-LSB-/g;
+    $text =~ s/\]/-RSB-/g;
+    $text =~ s/{/-LCB-/g;
+    $text =~ s/}/-RCB-/g;
+
+    $text =~ s=--= -- =g;
+
+    # First off, add a space to the beginning and end of each line, to reduce
+    # necessary number of regexps.
+    $text =~ s=$= =;
+    $text =~ s=^= =;
+
+    $text =~ s="= '' =g;
+    # possessive or close-single-quote
+    $text =~ s=([^'])' =$1 ' =g;
+    # as in it's, I'm, we'd
+    $text =~ s='([sSmMdD]) = '$1 =g;
+    $text =~ s='ll = 'll =g;
+    $text =~ s='re = 're =g;
+    $text =~ s='ve = 've =g;
+    $text =~ s=n't = n't =g;
+    $text =~ s='LL = 'LL =g;
+    $text =~ s='RE = 'RE =g;
+    $text =~ s='VE = 'VE =g;
+    $text =~ s=N'T = N'T =g;
+
+    # split fused colloquial forms into two tokens
+    $text =~ s= ([Cc])annot = $1an not =g;
+    $text =~ s= ([Dd])'ye = $1' ye =g;
+    $text =~ s= ([Gg])imme = $1im me =g;
+    $text =~ s= ([Gg])onna = $1on na =g;
+    $text =~ s= ([Gg])otta = $1ot ta =g;
+    $text =~ s= ([Ll])emme = $1em me =g;
+    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+    $text =~ s= '([Tt])is = '$1 is =g;
+    $text =~ s= '([Tt])was = '$1 was =g;
+    $text =~ s= ([Ww])anna = $1an na =g;
+
+    #word token method
+    # Same period handling as in tokenize(), except that the last word gets no
+    # special treatment here. %NONBREAKING_PREFIX values: 1 = never split,
+    # 2 = keep attached only when the next word starts with a digit.
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # restore ellipses
+    $text =~ s=_ELLIPSIS_=\.\.\.=g;
+
+    # clean out extra spaces
+    $text =~ s=  *= =g;
+    $text =~ s=^ *==g;
+    $text =~ s= *$==g;
+
+    #escape special chars
+    $text =~ s/\&/\&amp;/g;   # escape escape
+    $text =~ s/\|/\&#124;/g;  # factor separator
+    $text =~ s/\</\&lt;/g;    # xml
+    $text =~ s/\>/\&gt;/g;    # xml
+    $text =~ s/\'/\&apos;/g;  # xml
+    $text =~ s/\"/\&quot;/g;  # xml
+    $text =~ s/\[/\&#91;/g;   # syntax non-terminal
+    $text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub load_prefixes
+{
+    # Fills the hash referenced by $PREFIX_REF from the file
+    # "$mydir/nonbreaking_prefix.<language>", falling back to the English file
+    # when no language-specific one exists (dies if that is missing too).
+    # Each non-empty line not starting with "#" is a prefix: it is stored with
+    # value 2 when tagged "#NUMERIC_ONLY#", otherwise with value 1.
+    my ($language, $PREFIX_REF) = @_;
+
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    # file vanished in the meantime: keep the table as it is, as before
+    return unless (-e "$prefixfile");
+
+    # lexical handle with a checked open; an unreadable file is reported and
+    # leaves the table untouched instead of reading from an unopened handle
+    my $prefix_fh;
+    if (!open($prefix_fh, "<:utf8", "$prefixfile"))
+    {
+        print STDERR "WARNING: Cannot open $prefixfile: $!\n";
+        return;
+    }
+    while (my $item = <$prefix_fh>)
+    {
+        chomp($item);
+        # skip empty lines and comment lines
+        next unless (($item) && (substr($item,0,1) ne "#"));
+        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+        {
+            $PREFIX_REF->{$1} = 2;
+        }
+        else
+        {
+            $PREFIX_REF->{$item} = 1;
+        }
+    }
+    close($prefix_fh);
+}
--- a/utils/update_json.sh
+++ b/utils/update_json.sh
@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Copyright 2020 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+# Tokenizes a transcription file (<data-dir>/text unless "text" is set),
+# maps the tokens to ids with <dict>, and appends the result to every
+# utterance of <json> as an additional output entry. The previous <json> is
+# copied to <json-dir>/.backup before it is overwritten.
+
+echo "$0 $*" >&2 # Print the command line for logging
+. ./path.sh
+
+nlsyms=""
+oov="<unk>"
+bpecode=""
+verbose=0
+
+text=""
+multilingual=false
+
+help_message=$(cat << EOF
+Usage: $0 <json> <data-dir> <dict>
+e.g. $0 data/train/data.json data/train data/lang_1char/train_units.txt
+Options:
+  --oov <oov-word>                                 # Default: <unk>
+  --verbose <num>                                  # Default: 0
+EOF
+)
+. utils/parse_options.sh
+
+if [ $# != 3 ]; then
+    echo "${help_message}" 1>&2
+    exit 1;
+fi
+
+set -euo pipefail
+
+json=$1
+dir=$2
+dic=$3
+json_dir=$(dirname "${json}")
+# scratch directory inside <data-dir>; removed on every exit path by the trap
+tmpdir=$(mktemp -d "${dir}/tmp-XXXXX")
+trap 'rm -rf "${tmpdir}"' EXIT
+
+if [ -z "${text}" ]; then
+    text=${dir}/text
+fi
+
+# 2. Create scp files for outputs
+mkdir -p "${tmpdir}/output"
+if [ -n "${bpecode}" ]; then
+    if [ "${multilingual}" = true ]; then
+        # remove a space before the language ID
+        paste -d " " <(awk '{print $1}' "${text}") <(cut -f 2- -d" " "${text}" \
+            | spm_encode --model="${bpecode}" --output_format=piece | cut -f 2- -d" ") \
+            > "${tmpdir}/output/token.scp"
+    else
+        paste -d " " <(awk '{print $1}' "${text}") <(cut -f 2- -d" " "${text}" \
+            | spm_encode --model="${bpecode}" --output_format=piece) \
+            > "${tmpdir}/output/token.scp"
+    fi
+elif [ -n "${nlsyms}" ]; then
+    text2token.py -s 1 -n 1 -l "${nlsyms}" "${text}" > "${tmpdir}/output/token.scp"
+else
+    text2token.py -s 1 -n 1 "${text}" > "${tmpdir}/output/token.scp"
+fi
+< "${tmpdir}/output/token.scp" utils/sym2int.pl --map-oov "${oov}" -f 2- "${dic}" > "${tmpdir}/output/tokenid.scp"
+# output length = number of token ids per utterance (fields minus the id)
+awk '{print $1 " " NF-1}' "${tmpdir}/output/tokenid.scp" > "${tmpdir}/output/olen.scp"
+# +2 comes from CTC blank and EOS
+vocsize=$(tail -n 1 "${dic}" | awk '{print $2}')
+odim=$(echo "$vocsize + 2" | bc)
+awk -v odim="${odim}" '{print $1 " " odim}' "${text}" > "${tmpdir}/output/odim.scp"
+
+cat "${text}" > "${tmpdir}/output/text.scp"
+
+
+# 4. Create JSON files from each scp files
+rm -f "${tmpdir}"/*/*.json
+for x in "${tmpdir}"/output/*.scp; do
+    k=$(basename "${x}" .scp)
+    < "${x}" scp2json.py --key "${k}" > "${tmpdir}/output/${k}.json"
+done
+
+# add to json
+addjson.py --verbose "${verbose}" -i false \
+  "${json}" "${tmpdir}/output/text.json" "${tmpdir}/output/token.json" "${tmpdir}/output/tokenid.json" "${tmpdir}/output/olen.json" "${tmpdir}/output/odim.json" > "${tmpdir}/data.json"
+# back up the original first, then replace it; report only once both succeeded
+mkdir -p "${json_dir}/.backup"
+cp "${json}" "${json_dir}/.backup/$(basename "${json}")"
+cp "${tmpdir}/data.json" "${json}"
+echo "json updated. original json is kept in ${json_dir}/.backup."
+
+rm -fr "${tmpdir}"