Merge pull request #1050 from LittleChenCc/develop

[ST] add script for data process (st1) for Ted-En-Zh
pull/1055/head
Hui Zhang 3 years ago committed by GitHub
commit 100fdf2403
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -28,7 +28,7 @@ import soundfile
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--src_dir",
"--src-dir",
default="",
type=str,
help="Directory to kaldi splited data. (default: %(default)s)")

@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
# These options are mapping to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================~
# Choose the job-launch backend used by run.sh.
# Supported values: "local", "sge", "slurm", "ssh" (plus the "jhu" example).
cmd_backend='local'

# Three commands are exported for every backend:
#   train_cmd  - general-purpose usage
#   cuda_cmd   - used for "*_train.py"; "--gpu" is appended optionally by run.sh
#   decode_cmd - used for "*_recog.py"
case "${cmd_backend}" in
  local)
    # Local machine, without any job scheduling system.
    export train_cmd="run.pl"
    export cuda_cmd="run.pl"
    export decode_cmd="run.pl"
    ;;
  sge)
    # "qsub" (SGE, Torque, PBS, etc.)
    # Defaults live in conf/queue.conf.
    # Change "-q g.q" to the queue of your environment ("qhost -q" lists the queue names).
    # Using "--gpu *" requires "complex_value" to be set up for the system scheduler.
    export train_cmd="queue.pl"
    export cuda_cmd="queue.pl"
    export decode_cmd="queue.pl"
    ;;
  slurm)
    # "sbatch" (Slurm)
    # Defaults live in conf/slurm.conf.
    # Change "-p cpu" and "-p gpu" to the partitions of your environment ("sinfo" lists them).
    # "--gpu *" works by default for slurm and is interpreted as "--gres gpu:*".
    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
    export train_cmd="slurm.pl"
    export cuda_cmd="slurm.pl"
    export decode_cmd="slurm.pl"
    ;;
  ssh)
    # Create ".queue/machines" listing the hosts that execute jobs, one per line, e.g.
    #   host1
    #   host2
    #   host3
    # Password-less login (ssh keys) to every host is assumed.
    export train_cmd="ssh.pl"
    export cuda_cmd="ssh.pl"
    export decode_cmd="ssh.pl"
    ;;
  jhu)
    # Example of cluster-specific options (JHU CLSP cluster setup).
    # Adapt or add options to match your own cluster environment.
    export train_cmd="queue.pl --mem 2G"
    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
    export decode_cmd="queue.pl --mem 4G"
    ;;
  *)
    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
    return 1
    ;;
esac

@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80

@ -0,0 +1 @@
--sample-frequency=16000

@ -11,9 +11,9 @@ data:
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/lang_char/vocab.txt
vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
unit_type: 'spm'
spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10

@ -8,10 +8,13 @@ dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=8000
bpemode=unigram
bpemode=bpe
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh
target_dir=data/ted_en_zh
dumpdir=data/dump
do_delta=false
nj=20
source ${MAIN_ROOT}/utils/parse_options.sh
@ -38,75 +41,167 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
# generate manifests
python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
--manifest_prefix="data/manifest" \
--src_dir="${data_dir}"
# extract data
echo "data Extraction"
python3 local/ted_en_zh.py \
--tgt-dir=${target_dir} \
--src-dir=${data_dir}
echo "Complete raw data pre-process."
fi
prep_dir=${target_dir}/data_prep
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: Data preparation"
for set in train dev test; do
# for set in train; do
dst=${target_dir}/${set}
for lang in en zh; do
if [ ${lang} = 'en' ]; then
echo "remove punctuation $lang"
# remove punctuation
local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
else
cp ${dst}/${lang}.org ${dst}/${lang}.raw
fi
paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
done
# error check
n=$(cat ${dst}/.yaml | wc -l)
n_en=$(cat ${dst}/en.raw | wc -l)
n_tgt=$(cat ${dst}/zh.raw | wc -l)
[ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;
echo "done text processing"
cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt
rm -rf ${prep_dir}/${set}.en-zh
mkdir -p ${prep_dir}/${set}.en-zh
echo "remove duplicate lines..."
cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
| sed 's/^[ \t]*//' > ${dst}/duplicate_lines
cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
| cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
echo "done wav processing"
for l in en zh; do
cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
done
utils/fix_data_dir.sh --utt_extra_files \
"text.en text.zh" \
${prep_dir}/${set}.en-zh
done
fi
feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type "spm" \
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="${dict_dir}/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
fbankdir=data/fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for x in train dev test; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
done
echo "speed perturbation"
utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
rm -r ${prep_dir}/temp*.en-zh
utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}
for lang in en zh; do
cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang}
done
done
for x in train_sp dev test; do
local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
done
for x in train_sp dev; do
    # Remove utterances having more than 3000 frames or more than 400
    # characters, independently for the target (zh) and source (en) side.
    for lang in zh en; do
        remove_longshortdata.sh --maxframes 3000 --maxchars 400 \
            "${prep_dir}/${x}.en-zh.${lang}" "${prep_dir}/${x}.en-zh.${lang}.tmp"
    done

    # Keep only the utterance ids that survived the filter on BOTH sides.
    # The target language is spelled out here on purpose: after the loop above
    # ${lang} is left at "en", so using it made both lists come from the en
    # text and the zh-side filtering was silently ignored.
    cut -f 1 -d " " "${prep_dir}/${x}.en-zh.en.tmp/text" > "${prep_dir}/${x}.en-zh.zh.tmp/reclist1"
    cut -f 1 -d " " "${prep_dir}/${x}.en-zh.zh.tmp/text" > "${prep_dir}/${x}.en-zh.zh.tmp/reclist2"
    comm -12 "${prep_dir}/${x}.en-zh.zh.tmp/reclist1" "${prep_dir}/${x}.en-zh.zh.tmp/reclist2" \
        > "${prep_dir}/${x}.en-zh.en.tmp/reclist"

    # Rebuild both language-specific data dirs from the common id list.
    for lang in zh en; do
        reduce_data_dir.sh "${prep_dir}/${x}.en-zh.${lang}.tmp" \
            "${prep_dir}/${x}.en-zh.en.tmp/reclist" "${prep_dir}/${x}.en-zh.${lang}"
        utils/fix_data_dir.sh "${prep_dir}/${x}.en-zh.${lang}"
    done

    # The glob is intentionally left unquoted so it expands to every *.tmp dir.
    rm -rf "${prep_dir}/${x}".en-zh.*.tmp
done
compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark
dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir}
dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
${prep_dir}/dev.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir}
dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir}
fi
dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt
nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
}&
echo "stage 2: Dictionary and Json Data Preparation"
echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
wc -l ${dict}
echo "make json files"
data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
echo "update json (add source references)"
# update json (add source references)
for x in train_sp dev; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en
update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
done
wait
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Format the Json Data"
python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
fi
echo "Ted En-Zh Data preparation done."
exit 0

@ -0,0 +1,48 @@
#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#           2021 PaddlePaddle
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Split a bilingual data directory <set> (containing text.en and text.<lang>)
# into two monolingual directories, <set>.en and <set>.<lang>, each with its
# own "text" file plus sorted copies of the shared Kaldi-style files.

. ./path.sh

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <set> <lang>"
    echo "e.g.: $0 dev zh"
    exit 1
fi

set=$1
lang=$2

export LC_ALL=en_US.UTF-8

#######################################
# Create the monolingual directory ${set}.<l> from ${set}.
# Globals:   set (read)
# Arguments: $1 - language suffix, e.g. "en" or "zh"
# Returns:   exits the script with status 1 if validation fails
#######################################
make_lang_dir() {
    local l=$1
    local dst="${set}.${l}"
    local f

    # Copy stuff into its final locations [this has been moved from the format_data script]
    mkdir -p "${dst}"
    for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
        if [ -f "${set}/${f}" ]; then
            sort "${set}/${f}" > "${dst}/${f}"
        fi
    done

    # Strip non-printable characters from the transcript of this language.
    sort "${set}/text.${l}" | sed $'s/[^[:print:]]//g' > "${dst}/text"

    utils/fix_data_dir.sh "${dst}"
    if [ -f "${dst}/feats.scp" ]; then
        utils/validate_data_dir.sh "${dst}" || exit 1
    else
        utils/validate_data_dir.sh --no-feats --no-wav "${dst}" || exit 1
    fi
}

# for En
make_lang_dir en

# for target language
make_lang_dir "${lang}"

@ -0,0 +1,27 @@
#!/usr/bin/env python
"""Convert an ESPnet-style data json file into a json-lines manifest file."""
import argparse
import json


def main(args):
    """Write one json line per utterance found under the "utts" key.

    Args:
        args: namespace with ``json_file`` (path of the input json, which must
            hold a top-level "utts" mapping of utterance id -> entry) and
            ``manifest_file`` (path of the json-lines file to write).

    Each entry is written with an extra "utt" field set to its utterance id.
    """
    # Both files are handled as UTF-8 explicitly: the manifest is written with
    # ensure_ascii=False, so it must not depend on the locale's default encoding.
    with open(args.json_file, 'r', encoding='utf-8') as fin:
        data_json = json.load(fin)

    with open(args.manifest_file, 'w', encoding='utf-8') as fout:
        for key, value in data_json['utts'].items():
            value['utt'] = key
            fout.write(json.dumps(value, ensure_ascii=False))
            fout.write("\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    # The input path is mandatory; without it open() would fail on None.
    parser.add_argument(
        '--json-file',
        type=str,
        default=None,
        required=True,
        help="espnet data json file.")
    parser.add_argument(
        '--manifest-file',
        type=str,
        default='manifest.train',
        help='manifest data json line file.')
    args = parser.parse_args()
    main(args)

@ -0,0 +1,25 @@
#!/usr/bin/perl

use warnings;
use strict;

binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");

# Filter STDIN to STDOUT line by line: strip punctuation (apostrophes and the
# "<space>" marker are kept) and squeeze/trim whitespace.
while (defined(my $line = <STDIN>)) {
    $line = " $line ";

    # remove punctuation except apostrophe
    # "<space>" and "'" are swapped for plain-word placeholders first so the
    # punctuation removal below leaves them alone, then swapped back.
    $line =~ s/<space>/spacemark/g;  # for scoring
    $line =~ s/'/apostrophe/g;
    $line =~ s/[[:punct:]]//g;
    $line =~ s/apostrophe/'/g;
    $line =~ s/spacemark/<space>/g;  # for scoring

    # remove whitespace
    $line =~ s/\s+/ /g;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;

    print "$line\n";
}

@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import codecs
import os
# org_split = 'train-split/train-segment'
# text_file = 'En-Zh/train.en-zh'
# data_split = 'train'
def _format_utt_id(file_name):
    """Return the utterance id for a wav file name such as ``123_4.wav``.

    The part before the first '.' is used; its leading field (before the first
    '_') is left-padded with zeros to at least 6 characters, and the result is
    prefixed with ``ted-en-zh-``.
    """
    utt = file_name.split('.')[0]
    audio_name = utt.split('_')[0]
    padding = '0' * max(0, 6 - len(audio_name))
    return 'ted-en-zh-' + padding + utt


def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
                 data_split_list):
    """Write the raw per-split files for every (wav dir, text file, split) triple.

    For each split, the following files are created under ``tgt_dir/<split>``:
    ``wav.scp.org``, ``utt2spk.org``, ``en.org``, ``zh.org`` and ``.yaml``
    (one utterance id per transcript line).

    Args:
        src_dir: root directory of the source data.
        tgt_dir: root directory for the generated files.
        wav_dir_list: audio directories, relative to ``src_dir``.
        text_file_list: tab-separated ``wav<TAB>en<TAB>zh`` files, relative to
            ``src_dir``.
        data_split_list: output split names, e.g. ``train``/``dev``/``test``.

    Raises:
        AssertionError: if a speaker id is not 16 characters long, a transcript
            line does not have exactly 3 tab-separated fields, or its first
            field does not end with ``wav``.
    """
    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
                                                data_split_list):
        local_data_split_dir = os.path.join(tgt_dir, data_split)
        os.makedirs(local_data_split_dir, exist_ok=True)

        utts = []
        utt2spk = {}
        # Output files are written as UTF-8 explicitly so the result does not
        # depend on the locale's default encoding.
        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w',
                  encoding='utf-8') as wav_wf, \
                open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w',
                     encoding='utf-8') as utt2spk_wf:
            for files in os.listdir(os.path.join(src_dir, org_split)):
                files = files.strip()
                file_path = os.path.join(src_dir, org_split, files)
                size = os.path.getsize(file_path)
                # Files of at most 30000 bytes are skipped.
                if size <= 30000:
                    continue
                # format the name of utterance
                utt = _format_utt_id(files)
                utts.append(utt)
                spk = utt.split('_')[0]
                utt2spk[utt] = spk
                assert len(spk) == 16, "%r" % spk
                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
            for utt in sorted(utts):
                print(utt, utt2spk[utt], file=utt2spk_wf)

        with open(os.path.join(local_data_split_dir, 'en.org'), 'w',
                  encoding='utf-8') as en_wf, \
                open(os.path.join(local_data_split_dir, 'zh.org'), 'w',
                     encoding='utf-8') as zh_wf, \
                open(os.path.join(local_data_split_dir, '.yaml'), 'w',
                     encoding='utf-8') as yaml_wf, \
                codecs.open(os.path.join(src_dir, text_file), 'r',
                            encoding='utf-8', errors='ignore') as rf:
            count = 0
            for line in rf:
                line = line.strip()
                line_spl = line.split('\t')
                assert len(line_spl) == 3, "%r" % line
                wav, en, zh = line_spl
                assert wav.endswith('wav'), "%r" % wav[-3:]
                utt = _format_utt_id(wav)
                print(utt, file=yaml_wf)
                # The English side is lower-cased; the Chinese side is kept as is.
                print(en.lower(), file=en_wf)
                print(zh, file=zh_wf)
                count += 1
            print('%s set lines count: %d' % (data_split, count))
if __name__ == '__main__':
    # Command-line entry point: read the source/target directories and run
    # data_process over the fixed train/dev/test layout.
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "--src-dir",
        type=str,
        default="",
        help="Directory to kaldi splited data. (default: %(default)s)")
    arg_parser.add_argument(
        "--tgt-dir",
        type=str,
        default="local/ted_en_zh",
        help="Directory to save processed data. (default: %(default)s)")
    cli_args = arg_parser.parse_args()

    # One row per split: (audio directory, transcript file, split name).
    split_table = [
        ('train-split/train-segment', 'En-Zh/train.en-zh', 'train'),
        ('test-segment/tst2014', 'En-Zh/tst2014.en-zh', 'dev'),
        ('test-segment/tst2015', 'En-Zh/tst2015.en-zh', 'test'),
    ]
    wav_dirs = [row[0] for row in split_table]
    text_files = [row[1] for row in split_table]
    split_names = [row[2] for row in split_table]

    data_process(cli_args.src_dir, cli_args.tgt_dir, wav_dirs, text_files,
                 split_names)

@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path ${ckpt_path} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
if [ ${seed} != 0 ]; then

@ -1,6 +1,6 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
@ -13,3 +13,10 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@ -1,12 +1,13 @@
#!/bin/bash
set -e
source path.sh
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=0,1,2,3
stage=1
stop_stage=4
conf_path=conf/transformer_mtl_noam.yaml
ckpt_path=paddle.98
ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
avg_num=5
data_path=./TED_EnZh # path to unzipped data
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -22,21 +23,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained
bash ./local/download_pretrain.sh || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path} ${ckpt} ${ckpt_path}
if [ -n "${ckpt_path}" ]; then
echo "Finetune from Pretrained Model" ${ckpt_path}
./local/download_pretrain.sh || exit -1
fi
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -26,8 +26,10 @@ from paddle import distributed as dist
from paddle.io import DataLoader
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.collator import TripletSpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
@ -423,6 +425,30 @@ class U2STTester(U2STTrainer):
trans.append(''.join([chr(i) for i in ids]))
return trans
def translate(self, audio, audio_len):
    """E2E translation from extracted audio feature.

    Forwards ``audio`` and ``audio_len`` to ``self.model.decode`` together with
    the text featurizer taken from the test loader's collate function and the
    decoding options read from ``self.config.decoding``.

    Args:
        audio: extracted audio features, passed through to ``model.decode``.
        audio_len: lengths matching ``audio``, passed through to ``model.decode``.

    Returns:
        Whatever ``self.model.decode`` returns (the hypotheses).
    """
    cfg = self.config.decoding
    text_feature = self.test_loader.collate_fn.text_feature

    hyps = self.model.decode(
        audio,
        audio_len,
        text_feature=text_feature,
        decoding_method=cfg.decoding_method,
        lang_model_path=cfg.lang_model_path,
        beam_alpha=cfg.alpha,
        beam_beta=cfg.beta,
        beam_size=cfg.beam_size,
        cutoff_prob=cfg.cutoff_prob,
        cutoff_top_n=cfg.cutoff_top_n,
        num_processes=cfg.num_proc_bsearch,
        ctc_weight=cfg.ctc_weight,
        word_reward=cfg.word_reward,
        decoding_chunk_size=cfg.decoding_chunk_size,
        num_decoding_left_chunks=cfg.num_decoding_left_chunks,
        simulate_streaming=cfg.simulate_streaming)
    return hyps
def compute_translation_metrics(self,
utts,
audio,

@ -102,10 +102,10 @@ def read_manifest(
manifest = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][
0] if 'token_shape' in json_data else 1.0
feat_len = json_data["input"][0]["shape"][
0] if 'shape' in json_data["input"][0] else 1.0
token_len = json_data["output"][0]["shape"][
0] if 'shape' in json_data["output"][0] else 1.0
conditions = [
feat_len >= min_input_len,
feat_len <= max_input_len,

@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
"""
rng = np.random.RandomState(epoch)
shift_len = rng.randint(0, batch_size - 1)
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
rng.shuffle(batch_indices)
batch_indices = [item for batch in batch_indices for item in batch]
assert clipped is False

@ -94,6 +94,9 @@ class Checkpoint():
"""
configs = {}
if len(checkpoint_path) == 0 or checkpoint_path == "None":
checkpoint_path = None
if checkpoint_path is not None:
pass
elif checkpoint_dir is not None and record_file is not None:

@ -0,0 +1,152 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import logging
import sys
from distutils.util import strtobool
from espnet.utils.cli_utils import get_commandline_args
is_python2 = sys.version_info[0] == 2
def get_parser():
    """Build the command-line parser.

    Positional ``jsons`` takes one or more json files; ``-i/--is-input``
    (parsed with ``strtobool``, default True) selects whether values are added
    to the input or the output; ``--verbose/-V`` is an integer level.
    """
    cli = argparse.ArgumentParser(
        description="add multiple json values to an input or output value",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    cli.add_argument("jsons", nargs="+", type=str, help="json files")
    cli.add_argument(
        "-i",
        "--is-input",
        type=strtobool,
        default=True,
        help="If true, add to input. If false, add to output")
    cli.add_argument(
        "--verbose", "-V", type=int, default=0, help="Verbose option")

    return cli
if __name__ == "__main__":
    # Merge the entries of the 2nd..Nth json files into the utterances of the
    # first one, as an extra input or output, and print the result to stdout.
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # make intersection set for utterance keys
    js = []
    # None means "no file seen yet". An empty collection must not be used as
    # that marker: an empty first json would otherwise let the keys of the
    # next file become the "intersection" and break the lookups below.
    intersec_ks = None
    for x in args.jsons:
        with codecs.open(x, "r", encoding="utf-8") as f:
            j = json.load(f)
        ks = j["utts"].keys()
        logging.info(x + ": has " + str(len(ks)) + " utterances")
        if intersec_ks is None:
            intersec_ks = set(ks)
        else:
            intersec_ks = intersec_ks.intersection(set(ks))
            if len(intersec_ks) == 0:
                logging.warning("Empty intersection")
                break
        js.append(j)
    logging.info("new json has " + str(len(intersec_ks)) + " utterances")

    # updated original dict to keep intersection
    intersec_org_dic = dict()
    for k in intersec_ks:
        v = js[0]["utts"][k]
        intersec_org_dic[k] = v

    # entries to add: the second json, updated with any further jsons
    intersec_add_dic = dict()
    for k in intersec_ks:
        v = js[1]["utts"][k]
        for j in js[2:]:
            v.update(j["utts"][k])
        intersec_add_dic[k] = v

    new_dic = dict()
    for key_id in intersec_org_dic:
        orgdic = intersec_org_dic[key_id]
        adddic = intersec_add_dic[key_id]

        if "utt2spk" not in orgdic:
            orgdic["utt2spk"] = ""
        # NOTE: for machine translation

        # add as input
        if args.is_input:
            # original input
            input_list = orgdic["input"]
            # additional input
            in_add_dic = {}
            if "idim" in adddic and "ilen" in adddic:
                in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
            elif "idim" in adddic:
                in_add_dic["shape"] = [int(adddic["idim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["idim", "ilen"]:
                    continue
                in_add_dic[key] = value
            # add name
            in_add_dic["name"] = "input%d" % (len(input_list) + 1)

            input_list.append(in_add_dic)
            new_dic[key_id] = {
                "input": input_list,
                "output": orgdic["output"],
                "utt2spk": orgdic["utt2spk"],
            }
        # add as output
        else:
            # original output
            output_list = orgdic["output"]
            # additional output
            out_add_dic = {}
            # add shape
            if "odim" in adddic and "olen" in adddic:
                out_add_dic[
                    "shape"] = [int(adddic["olen"]), int(adddic["odim"])]
            elif "odim" in adddic:
                out_add_dic["shape"] = [int(adddic["odim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["odim", "olen"]:
                    continue
                out_add_dic[key] = value
            # add name
            out_add_dic["name"] = "target%d" % (len(output_list) + 1)

            output_list.append(out_add_dic)
            new_dic[key_id] = {
                "input": orgdic["input"],
                "output": output_list,
                "utt2spk": orgdic["utt2spk"],
            }
        if "lang" in orgdic.keys():
            new_dic[key_id]["lang"] = orgdic["lang"]

    # ensure "ensure_ascii=False", which is a bug
    jsonstring = json.dumps(
        {
            "utts": new_dic
        },
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "), )
    # Emit UTF-8 bytes regardless of the terminal/locale encoding.
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
                                           if is_python2 else sys.stdout.buffer)
    print(jsonstring)

@ -0,0 +1,49 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import sys
is_python2 = sys.version_info[0] == 2
def get_parser():
    """Build the command-line parser; ``--key/-k`` names the json field to fill."""
    cli = argparse.ArgumentParser(
        description="convert scp to json",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("--key", "-k", help="key", type=str)
    return cli
if __name__ == "__main__":
    # Read "<utt-id> <value ...>" lines from stdin and print
    # {"utts": {<utt-id>: {<key>: "<value ...>"}}} as json on stdout.
    parser = get_parser()
    args = parser.parse_args()

    new_line = {}
    # Force UTF-8 on both streams regardless of the locale.
    sys.stdin = codecs.getreader("utf-8")(sys.stdin
                                          if is_python2 else sys.stdin.buffer)
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
                                           if is_python2 else sys.stdout.buffer)
    line = sys.stdin.readline()
    while line:
        x = line.rstrip().split()
        # A blank (or whitespace-only) line has no utterance id; skip it
        # instead of failing on x[0].
        if x:
            v = {args.key: " ".join(x[1:])}
            new_line[x[0]] = v
        line = sys.stdin.readline()

    all_l = {"utts": new_line}

    # ensure "ensure_ascii=False", which is a bug
    jsonstring = json.dumps(
        all_l,
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "))
    print(jsonstring)

@ -0,0 +1,596 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
# Sample Tokenizer
### Version 1.1
# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
# Version 1.1 updates:
# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
### Version 1.0
# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
# written by Josh Schroeder, based on code by Philipp Koehn
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
if (eval {require Thread;1;}) {
#module loaded
Thread->import();
}
# Directory that holds the nonbreaking_prefix.<lang> files, relative to this script.
my $mydir = "$RealBin/../share/nonbreaking_prefixes";

# State filled in during start-up and read by the tokenize routines.
my %NONBREAKING_PREFIX = ();
my @protected_patterns = ();

# Command-line settings with their defaults.
my $protected_patterns_file = "";
my $language = "en";
my ($QUIET, $HELP, $AGGRESSIVE, $SKIP_XML, $TIMING) = (0, 0, 0, 0, 0);
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my ($PENN, $NO_ESCAPING) = (0, 0);

# Walk @ARGV left to right. Options that take a value consume the next
# argument; anything unrecognised is silently dropped.
while (@ARGV)
{
    my $arg = shift(@ARGV);
    if    ($arg =~ /^-b$/)        { $| = 1; }
    elsif ($arg =~ /^-l$/)        { $language = shift(@ARGV); }
    elsif ($arg =~ /^-q$/)        { $QUIET = 1; }
    elsif ($arg =~ /^-h$/)        { $HELP = 1; }
    elsif ($arg =~ /^-x$/)        { $SKIP_XML = 1; }
    elsif ($arg =~ /^-a$/)        { $AGGRESSIVE = 1; }
    elsif ($arg =~ /^-time$/)     { $TIMING = 1; }
    # Option to add list of regexps to be protected
    elsif ($arg =~ /^-protected/) { $protected_patterns_file = shift(@ARGV); }
    elsif ($arg =~ /^-threads$/)  { $NUM_THREADS = int(shift(@ARGV)); }
    elsif ($arg =~ /^-lines$/)    { $NUM_SENTENCES_PER_THREAD = int(shift(@ARGV)); }
    elsif ($arg =~ /^-penn$/)     { $PENN = 1; }
    elsif ($arg =~ /^-no-escape/) { $NO_ESCAPING = 1; }
}
# for time calculation: the start stamp is only taken when -time was given
my $start_time;
$start_time = [ Time::HiRes::gettimeofday( ) ] if ($TIMING);

# -h: print the usage text to STDOUT and stop before any input is read
if ($HELP)
{
    print for (
        "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n",
        "Options:\n",
        " -q ... quiet.\n",
        " -a ... aggressive hyphen splitting.\n",
        " -b ... disable Perl buffering.\n",
        " -time ... enable processing time calculation.\n",
        " -penn ... use Penn treebank-like tokenization.\n",
        " -protected FILE ... specify file with patters to be protected in tokenisation.\n",
        " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n",
    );
    exit;
}

# start-up banner on STDERR unless -q was given
unless ($QUIET)
{
    print STDERR "Tokenizer Version 1.1\n";
    print STDERR "Language: $language\n";
    print STDERR "Number of threads: $NUM_THREADS\n";
}

# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
load_prefixes($language,\%NONBREAKING_PREFIX);

# an empty table means the prefix file contributed no entries
print STDERR "Warning: No known abbreviations for language '$language'\n"
    unless (%NONBREAKING_PREFIX);
# Load protected patterns: one regular expression per line of the file given
# with -protected. Text matching any of them is shielded from tokenization.
if ($protected_patterns_file)
{
    # Read through the :utf8 layer (the same layer load_prefixes uses) so that
    # non-ASCII patterns are character strings, like the decoded STDIN text
    # they are matched against.
    open(my $pp_fh, "<:utf8", $protected_patterns_file) || die "Unable to open $protected_patterns_file";
    while (my $pattern = <$pp_fh>) {
        chomp($pattern);
        # An empty pattern matches the empty string without consuming input,
        # which would make the pattern scan in tokenize() loop forever.
        next if ($pattern eq "");
        push @protected_patterns, $pattern;
    }
    close($pp_fh);
}
my @batch_sentences = ();
my @thread_list = ();
# Number of input lines read; used for the -time speed report.
my $count_sentences = 0;

# Tokenize one batch in parallel and print the result in input order.
# The batch (array ref) is cut into consecutive slices of at most
# $NUM_SENTENCES_PER_THREAD lines, one thread per slice and at most
# $NUM_THREADS slices; threads are joined in creation order.
sub _tokenize_batch_in_threads
{
    my ($batch) = @_;
    my $batch_size = scalar(@$batch);
    my @workers = ();
    for (my $i=0; $i<$NUM_THREADS; $i++)
    {
        my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
        # a short (final) batch may not need every thread
        last if ($start_index >= $batch_size);
        my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
        $end_index = $batch_size-1 if ($end_index >= $batch_size);
        my @subbatch_sentences = @{$batch}[$start_index..$end_index];
        push(@workers, Thread->new(\&tokenize_batch, @subbatch_sentences));
    }
    foreach my $worker (@workers)
    {
        my $tokenized_list = $worker->join;
        foreach (@$tokenized_list)
        {
            print $_;
        }
    }
}

if ($NUM_THREADS > 1)
{# multi-threading tokenization
    while(<STDIN>)
    {
        $count_sentences = $count_sentences + 1;
        push(@batch_sentences, $_);
        # a batch is full once every thread can get its quota of lines
        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
        {
            _tokenize_batch_in_threads(\@batch_sentences);
            # reset for the new run
            @batch_sentences = ();
        }
    }
    # the last batch
    if (scalar(@batch_sentences)>0)
    {
        _tokenize_batch_in_threads(\@batch_sentences);
    }
}
else
{# single thread only
    while(<STDIN>)
    {
        # count here as well, otherwise the -time report divides by zero
        $count_sentences = $count_sentences + 1;
        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
        {
            #don't try to tokenize XML/HTML tag lines
            print $_;
        }
        else
        {
            print &tokenize($_);
        }
    }
}

if ($TIMING)
{
    my $duration = Time::HiRes::tv_interval( $start_time );
    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
    if ($count_sentences > 0)
    {
        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
    }
    else
    {
        # empty input: there is no per-line speed to report
        print STDERR ("TOKENIZATION SPEED: n/a (no input lines)\n");
    }
}
#####################################################################################
# subroutines afterward
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
    # Tokenize every line of the argument list and hand back a reference to
    # the list of results, in the same order.
    my @lines = @_;
    my @results = map {
        # XML/HTML tag lines (only with -x) and blank lines pass through untouched
        (($SKIP_XML && /^<.+>$/) || /^\s*$/) ? $_ : tokenize($_)
    } @lines;
    return \@results;
}
# the actual tokenize function which tokenizes one input string
# input: one string
# return: the tokenized string for the input string
sub tokenize
{
# Tokenize a single line.
# Argument: one string (a trailing newline is chomped).
# Returns: the tokenized string, always terminated by "\n".
# Reads the file-level settings $PENN, $language, $AGGRESSIVE, $NO_ESCAPING,
# @protected_patterns and %NONBREAKING_PREFIX.
my($text) = @_;
# -penn switches to the Penn-Treebank-style tokenizer for the whole line
if ($PENN) {
return tokenize_penn($text);
}
chomp($text);
# pad with spaces so the rules below can rely on a boundary on both sides
$text = " $text ";
# remove ASCII junk: collapse whitespace runs, then drop control characters
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# Find protected patterns: collect every match of every pattern, scanning
# left to right by re-matching on the text after the previous match
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
my $t = $text;
while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
push @protected, $+{PATTERN};
$t = $+{TAIL};
}
}
# swap each protected string for a numbered placeholder (restored near the
# end of this sub) so the rules below cannot split it
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s,\Q$protected[$i], $subst ,g;
}
# squeeze spaces and trim both ends
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# separate out all "other" special characters
if (($language eq "fi") or ($language eq "sv")) {
# in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
# USA:n, 20:een, EU:ssa, USA:s, S:t
$text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
elsif ($language eq "tdt") {
# in Tetun, the apostrophe can be used inside words as an apostrophe-like character:
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
# if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
}
elsif (($language eq "ca")) {
# in Catalan, the middle dot can be used inside words:
# il·lusio
$text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
# if a middot is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
}
else {
# default: everything except alphanumerics, whitespace and . ' ` , - becomes its own token
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
}
# aggressive hyphen splitting (-a): a hyphen between two alphanumerics becomes the token @-@
if ($AGGRESSIVE)
{
$text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
}
#multi-dots stay together: a run of dots is encoded as DOTMULTI / DOTDOTMULTI
#markers here and decoded back to dots near the end of this sub
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
while($text =~ /DOTMULTI\./)
{
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
# separate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)
# previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
# first application uses up B so rule can't see B,C
# two-step version here may create extra spaces but these are removed later
# will also space digit,letter or letter,digit forms (redundant with next section)
$text =~ s/([^\p{IsN}])[,]/$1 , /g;
$text =~ s/[,]([^\p{IsN}])/ , $1/g;
# separate "," after a number if it's the end of a sentence
$text =~ s/([\p{IsN}])[,]$/$1 ,/g;
# separate , pre and post number
#$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
#$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
# turn `into '
#$text =~ s/\`/\'/g;
#turn '' into "
#$text =~ s/\'\'/ \" /g;
if ($language eq "en")
{
#split contractions right: letter'letter keeps the apostrophe with the right-hand part
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
#special case for "1990's"
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
}
elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
{
#split contractions left: letter'letter keeps the apostrophe with the left-hand part
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
}
elsif (($language eq "so") or ($language eq "tdt"))
{
# Don't split glottals: an apostrophe between two letters is left untouched
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
}
else
{
# any other language: every apostrophe becomes a separate token
$text =~ s/\'/ \' /g;
}
#word token method: decide word by word whether a trailing period is split off
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++)
{
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/)
{
my $pre = $1;
if ($i == scalar(@words)-1) {
# split last words independently as they are unlikely to be non-breaking prefixes
$word = $pre." .";
}
# keep the period when the word has an inner period and a letter, when it is
# a known prefix of kind 1, or when the next word starts with a lower-case letter
elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
{
#no change
}
# prefixes of kind 2 (#NUMERIC_ONLY#) keep the period only before a number
elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
{
#no change
}
else
{
$word = $pre." .";
}
}
$text .= $word." ";
}
# clean up extraneous spaces
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# .' at end of sentence is missed
$text =~ s/\.\' ?$/ . ' /;
# restore protected: put the original strings back in place of the placeholders
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s/$subst/$protected[$i]/g;
}
#restore multi-dots: each DOTDOTMULTI step yields one dot, the final DOTMULTI the last one
while($text =~ /DOTDOTMULTI/)
{
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#escape special chars (skipped with -no-escape); & goes first so the entities
#produced by the following rules are not escaped a second time
if (!$NO_ESCAPING)
{
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
}
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub tokenize_penn
{
    # Improved compatibility with Penn Treebank tokenization. Useful if
    # the text is to later be parsed with a PTB-trained parser.
    #
    # Adapted from Robert MacIntyre's sed script:
    # http://www.cis.upenn.edu/~treebank/tokenizer.sed
    #
    # Argument: one line of text (a trailing newline is chomped).
    # Returns: the tokenized line, always terminated by "\n".
    # Reads the file-level %NONBREAKING_PREFIX table.
    # NOTE: special characters are always escaped here; $NO_ESCAPING is not
    # consulted by this routine.
    my($text) = @_;
    chomp($text);

    # remove ASCII junk
    $text =~ s/\s+/ /g;
    $text =~ s/[\000-\037]//g;

    # attempt to get correct directional quotes
    $text =~ s/^``/`` /g;
    $text =~ s/^"/`` /g;
    $text =~ s/^`([^`])/` $1/g;
    $text =~ s/^'/` /g;
    $text =~ s/([ ([{<])"/$1 `` /g;
    $text =~ s/([ ([{<])``/$1 `` /g;
    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
    $text =~ s/([ ([{<])'/$1 ` /g;
    # close quotes handled at end

    # shield "..." behind a marker so the period rules leave it alone
    $text =~ s=\.\.\.= _ELLIPSIS_ =g;

    # separate out "," except if within numbers (5,300)
    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    # separate , pre and post number
    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
    $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;

    # Separate out intra-token slashes. PTB tokenization doesn't do this, so
    # the tokens should be merged prior to parsing with a PTB-trained parser
    # (see syntax-hyphen-splitting.perl).
    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;

    # Assume sentence tokenization has been done first, so split FINAL periods
    # only.
    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
    # however, we may as well split ALL question marks and exclamation points,
    # since they shouldn't have the abbrev.-marker ambiguity problem
    $text =~ s=([?!])= $1 =g;

    # parentheses, brackets, etc.
    $text =~ s=([\]\[\(\){}<>])= $1 =g;
    $text =~ s/\(/-LRB-/g;
    $text =~ s/\)/-RRB-/g;
    $text =~ s/\[/-LSB-/g;
    $text =~ s/\]/-RSB-/g;
    $text =~ s/{/-LCB-/g;
    $text =~ s/}/-RCB-/g;

    $text =~ s=--= -- =g;

    # First off, add a space to the beginning and end of each line, to reduce
    # necessary number of regexps.
    $text =~ s=$= =;
    $text =~ s=^= =;

    $text =~ s="= '' =g;
    # possessive or close-single-quote
    $text =~ s=([^'])' =$1 ' =g;
    # as in it's, I'm, we'd
    $text =~ s='([sSmMdD]) = '$1 =g;
    $text =~ s='ll = 'll =g;
    $text =~ s='re = 're =g;
    $text =~ s='ve = 've =g;
    $text =~ s=n't = n't =g;
    $text =~ s='LL = 'LL =g;
    $text =~ s='RE = 'RE =g;
    $text =~ s='VE = 'VE =g;
    $text =~ s=N'T = N'T =g;

    # fixed list of fused word forms that are split into two tokens
    $text =~ s= ([Cc])annot = $1an not =g;
    $text =~ s= ([Dd])'ye = $1' ye =g;
    $text =~ s= ([Gg])imme = $1im me =g;
    $text =~ s= ([Gg])onna = $1on na =g;
    $text =~ s= ([Gg])otta = $1ot ta =g;
    $text =~ s= ([Ll])emme = $1em me =g;
    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
    $text =~ s= '([Tt])is = '$1 is =g;
    $text =~ s= '([Tt])was = '$1 was =g;
    $text =~ s= ([Ww])anna = $1an na =g;

    #word token method: decide word by word whether a trailing period is split off
    my @words = split(/\s/,$text);
    $text = "";
    for (my $i=0;$i<(scalar(@words));$i++)
    {
        my $word = $words[$i];
        if ( $word =~ /^(\S+)\.$/)
        {
            my $pre = $1;
            # keep the period when the word has an inner period and a letter,
            # is a known prefix of kind 1, or precedes a lower-case word
            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
            {
                #no change
            }
            # prefixes of kind 2 keep the period only before a number
            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
            {
                #no change
            }
            else
            {
                $word = $pre." .";
            }
        }
        $text .= $word." ";
    }

    # restore ellipses
    $text =~ s=_ELLIPSIS_=\.\.\.=g;

    # clean out extra spaces: collapse runs of one or more spaces into a single
    # space. The quantifier must not allow zero spaces, otherwise the empty
    # match at every position puts a space between all characters.
    $text =~ s= += =g;
    $text =~ s=^ *==g;
    $text =~ s= *$==g;

    #escape special chars; & goes first so later entities are not re-escaped
    $text =~ s/\&/\&amp;/g; # escape escape
    $text =~ s/\|/\&#124;/g; # factor separator
    $text =~ s/\</\&lt;/g; # xml
    $text =~ s/\>/\&gt;/g; # xml
    $text =~ s/\'/\&apos;/g; # xml
    $text =~ s/\"/\&quot;/g; # xml
    $text =~ s/\[/\&#91;/g; # syntax non-terminal
    $text =~ s/\]/\&#93;/g; # syntax non-terminal

    #ensure final line break
    $text .= "\n" unless $text =~ /\n$/;
    return $text;
}
sub load_prefixes
{
    # Fill a hash with the non-breaking prefixes of one language.
    # Arguments: language code; reference to the hash to fill.
    # Reads $mydir/nonbreaking_prefix.<language>, falling back to the English
    # file (with a warning) when that file does not exist.
    # Stored values: 2 for entries tagged "#NUMERIC_ONLY#", 1 for all others.
    # Dies when neither file exists or the chosen file cannot be opened.
    my ($language, $PREFIX_REF) = @_;
    my $prefixfile = "$mydir/nonbreaking_prefix.$language";

    #default back to English if we don't have a language-specific prefix file
    if (!(-e $prefixfile))
    {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    # Fail loudly if the file exists but cannot be read; continuing would
    # silently tokenize without any abbreviation list.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die ("ERROR: Cannot open abbreviations file $prefixfile: $!\n");
    # lexical loop variable so the caller's $_ is left alone
    while (my $item = <$prefix_fh>)
    {
        chomp($item);
        # skip empty lines and "#" comment lines
        next unless (($item) && (substr($item,0,1) ne "#"));
        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
        {
            # prefix that only counts as non-breaking in front of a number
            $PREFIX_REF->{$1} = 2;
        }
        else
        {
            $PREFIX_REF->{$item} = 1;
        }
    }
    close($prefix_fh);
}

@ -0,0 +1,88 @@
#!/bin/bash

# Copyright 2020 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

# Add target-side entries (text, token, tokenid, olen, odim) to an existing
# data JSON. The entries are built from a "<utt-id> <words...>" text file and a
# dictionary, merged into the JSON with addjson.py, and the previous JSON is
# kept under <json-dir>/.backup.
#
# Relies on ./path.sh and utils/parse_options.sh in the working directory and
# on text2token.py, spm_encode, utils/sym2int.pl, scp2json.py and addjson.py.

echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh

# Defaults; each can be overridden on the command line via parse_options.sh.
nlsyms=""
oov="<unk>"
bpecode=""
verbose=0
text=""
multilingual=false

help_message=$(cat << EOF
Usage: $0 <json> <data-dir> <dict>
e.g. $0 data/train/data.json data/train data/lang_1char/train_units.txt
Options:
--oov <oov-word> # Default: <unk>
--verbose <num> # Default: 0
EOF
)
. utils/parse_options.sh

if [ $# != 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

# Strict mode is enabled only after option parsing.
set -euo pipefail

json=$1
dir=$2
dic=$3
json_dir=$(dirname "${json}")
tmpdir=$(mktemp -d "${dir}"/tmp-XXXXX)
# Remove the scratch directory on every exit path.
trap 'rm -rf "${tmpdir}"' EXIT

# Default to the text file inside the data directory.
if [ -z "${text}" ]; then
    text=${dir}/text
fi

# 2. Create scp files for outputs
mkdir -p "${tmpdir}"/output
if [ -n "${bpecode}" ]; then
    if [ "${multilingual}" = true ]; then
        # remove a space before the language ID
        paste -d " " <(awk '{print $1}' "${text}") <(cut -f 2- -d" " "${text}" \
            | spm_encode --model="${bpecode}" --output_format=piece | cut -f 2- -d" ") \
            > "${tmpdir}"/output/token.scp
    else
        paste -d " " <(awk '{print $1}' "${text}") <(cut -f 2- -d" " "${text}" \
            | spm_encode --model="${bpecode}" --output_format=piece) \
            > "${tmpdir}"/output/token.scp
    fi
elif [ -n "${nlsyms}" ]; then
    text2token.py -s 1 -n 1 -l "${nlsyms}" "${text}" > "${tmpdir}"/output/token.scp
else
    text2token.py -s 1 -n 1 "${text}" > "${tmpdir}"/output/token.scp
fi
# Map tokens to ids with the dictionary; unknown tokens map to ${oov}.
< "${tmpdir}"/output/token.scp utils/sym2int.pl --map-oov "${oov}" -f 2- "${dic}" > "${tmpdir}"/output/tokenid.scp
# Output length = number of ids on the line (fields minus the utterance id).
awk '{print $1 " " NF-1}' "${tmpdir}"/output/tokenid.scp > "${tmpdir}"/output/olen.scp
# +2 comes from CTC blank and EOS
vocsize=$(tail -n 1 "${dic}" | awk '{print $2}')
odim=$((vocsize + 2))
awk -v odim="${odim}" '{print $1 " " odim}' "${text}" > "${tmpdir}"/output/odim.scp

cat "${text}" > "${tmpdir}"/output/text.scp

# 4. Create JSON files from each scp files
rm -f "${tmpdir}"/*/*.json
for x in "${tmpdir}"/output/*.scp; do
    k=$(basename "${x}" .scp)
    < "${x}" scp2json.py --key "${k}" > "${tmpdir}/output/${k}.json"
done

# add to json
addjson.py --verbose "${verbose}" -i false \
    "${json}" "${tmpdir}"/output/text.json "${tmpdir}"/output/token.json "${tmpdir}"/output/tokenid.json "${tmpdir}"/output/olen.json "${tmpdir}"/output/odim.json > "${tmpdir}"/data.json
mkdir -p "${json_dir}"/.backup
echo "json updated. original json is kept in ${json_dir}/.backup."
cp "${json}" "${json_dir}/.backup/$(basename "${json}")"
cp "${tmpdir}"/data.json "${json}"

rm -fr "${tmpdir}"
Loading…
Cancel
Save