update mustc v1


@@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the N-th job) in the command and the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job.
# Note that the range must start from a positive number, so you can't use "JOB=0:10", for example.
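# For example (illustrative log path): "run.pl JOB=1:3 exp/log/echo.JOB.log echo JOB"
# launches three jobs whose outputs are captured in exp/log/echo.1.log, echo.2.log, and echo.3.log.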
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, and the mapping is
# configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, the configuration may not match your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# Used for everything else (CPU jobs)
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to a queue that exists in your environment.
# To list the queue names, type "qhost -q".
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" to partitions that exist in your environment.
# To list the partition names, type "sinfo".
# "--gpu *" can be used by default for Slurm and is interpreted as "--gres gpu:*".
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e. you have set up SSH keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi

@@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80

@@ -0,0 +1 @@
--sample-frequency=16000

@@ -0,0 +1,19 @@
[
{
"type": "specaug",
"params": {
"W": 5,
"warp_mode": "PIL",
"F": 30,
"n_freq_masks": 2,
"T": 40,
"n_time_masks": 2,
"p": 1.0,
"adaptive_number_ratio": 0,
"adaptive_size_ratio": 0,
"max_n_time_masks": 20,
"replace_with_zero": false
},
"prob": 1.0
}
]

@@ -0,0 +1,201 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# 2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
set -e
set -u
stage=-1
stop_stage=10
tgt_lang=
# bpemode (unigram or bpe)
nbpe=8000
bpemode=bpe
must_c=
dumpdir=data/dump
do_delta=false
tgt_case=tc
src_case=lc.rm
source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}
trans_set=""
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
done
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${must_c} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Link of Must-c v1, https://ict.fbk.eu/must-c/."
exit 1
fi
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data Preparation"
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
local/data_prep.sh ${must_c} ${lang}
done
fi
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
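# (steps/make_fbank_pitch.sh reads conf/fbank.conf and conf/pitch.conf by default,
# i.e. the 16 kHz, 80-mel-bin settings added above.)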
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
for x in train.en-${lang} dev.en-${lang} tst-COMMON.en-${lang} tst-HE.en-${lang}; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/${x} data/make_fbank/${x} ${fbankdir}
done
done
# speed-perturbed
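# utils/perturb_data_dir_speed.sh prefixes the utterance ids with "sp0.9-", "sp1.0-" and
# "sp1.1-"; the utt_map / apply_map.pl calls below use those prefixes to regenerate the
# tc/lc/lc.rm text files of the speed-perturbed set from the original transcripts.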
utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
for lang in en ${tgt_lang}; do
awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
done
# Divide into source and target languages
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
local/divide_lang.sh ${x} ${tgt_lang}
done
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
# remove utterances having more than 3000 frames
# remove utterances having more than 400 characters
for lang in ${tgt_lang} en; do
remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
done
# Match the number of utterances between source and target languages
# extract common lines
cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist
for lang in ${tgt_lang} en; do
reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
done
rm -rf data/${x}.*.tmp
done
# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
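# cmvn.ark holds the global mean/variance statistics of the training features;
# the dump.sh calls below apply them when dumping features for every set.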
# dump features for training
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
${feat_tr_dir}/storage
fi
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
${feat_dt_dir}/storage
fi
dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
${feat_trans_dir}
done
fi
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_1spm/
export LC_ALL=C.UTF-8
echo "make a non-linguistic symbol list for all languages"
grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
cat ${nlsyms}
echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
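# offset is 1 here (the dictionary contains only <unk> so far), so the BPE pieces
# appended below get ids starting from 2; id 0 stays reserved for the CTC blank.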
grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt
grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt
spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
wc -l ${dict}
echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
done
echo "update json (add source references)"
# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
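# Convert the ESPnet-style json files into manifest files for the train/dev/test sets.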
x=(${train_set} ${train_dev} ${trans_set})
y=(train dev test)
for (( i=0; i<${#x[*]}; ++i)); do
echo ${x[$i]} ${y[$i]}
feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
data_dir=data/$(echo ${x[$i]} | cut -f 1 -d ".").en-${tgt_lang}.en
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--manifest-file data/manifest.${tgt_lang}.${y[$i]}
echo "Process done for ${y[$i]} set from ${x[$i]}"
done
fi
echo "MuST-C ${tgt_lang} Data preparation done."
exit 0

@@ -0,0 +1,163 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
export LC_ALL=C
source ${MAIN_ROOT}/utils/parse_options.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir>"
echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
exit 1;
fi
tgt_lang=$2
for set in train dev tst-COMMON tst-HE; do
src=$1/en-${tgt_lang}/data/${set}
dst=data/local/en-${tgt_lang}/${set}
[ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;
wav_dir=${src}/wav
trans_dir=${src}/txt
yml=${trans_dir}/${set}.yaml
en=${trans_dir}/${set}.en
tgt=${trans_dir}/${set}.${tgt_lang}
mkdir -p ${dst} || exit 1;
[ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
[ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
[ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
[ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
[ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;
wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}
trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}
# error check
n=$(cat ${yml} | grep duration | wc -l)
n_en=$(cat ${en} | wc -l)
n_tgt=$(cat ${tgt} | wc -l)
[ ${n} -ne ${n_en} ] && echo "Error: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Error: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1a) Transcriptions and translations preparation
# make basic transcription file (add segments info)
cp ${yml} ${dst}/.yaml0
grep duration ${dst}/.yaml0 > ${dst}/.yaml1
awk '{
duration=$3; offset=$5; spkid=$7;
gsub(",","",duration);
gsub(",","",offset);
gsub(",","",spkid);
gsub("spk.","",spkid);
duration=sprintf("%.7f", duration);
if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
else extendt=0;
offset=sprintf("%.7f", offset);
startt=offset-extendt;
endt=offset+duration+extendt;
printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
}' ${dst}/.yaml1 > ${dst}/.yaml2
# NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them
cp ${en} ${dst}/en.org
cp ${tgt} ${dst}/${tgt_lang}.org
for lang in en ${tgt_lang}; do
# normalize punctuation
normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm
# lowercasing
lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc
# remove punctuation
local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm
# tokenization
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}
# save original and cleaned punctuation
lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
done
# error check
n=$(cat ${dst}/.yaml2 | wc -l)
n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
[ ${n} -ne ${n_en} ] && echo "Error: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Error: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1c) Make segments files from transcript
# segments file format is: <utt-id> <recording-id> <start-time> <end-time>, e.g.:
# ted_00001_0003501_0003684 ted_00001 3.50 3.68
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments
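# wav.scp entries are piped commands, e.g. "ted_00767 cat ${wav_dir}/ted_767.wav |"
# (the recording id keeps the zero padding from the segment id; the wav file name does not).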
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2];
printf("%s cat '${wav_dir}'/%s_%d.wav |\n", spkid, S[1], S[2]);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; print $1 " " spkid
}' ${dst}/segments | uniq | sort > ${dst}/utt2spk
cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt
# error check
n_en=$(cat ${dst}/text.tc.en | wc -l)
n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
[ ${n_en} -ne ${n_tgt} ] && echo "Error: expected ${n_en} lines, found ${n_tgt}" && exit 1;
# Copy stuff into its final locations [this has been moved from the format_data script]
mkdir -p data/${set}.en-${tgt_lang}
# remove duplicated utterances (the same offset)
echo "remove duplicate lines..."
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted' \
| sed 's/^[ \t]*//' > ${dst}/duplicate_lines
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted' \
| cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}
for l in en ${tgt_lang}; do
for case in tc lc lc.rm; do
cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
done
done
utils/fix_data_dir.sh --utt_extra_files \
"text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
data/${set}.en-${tgt_lang}
# error check
n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
[ ${n_seg} -ne ${n_text} ] && echo "Error: expected ${n_seg} lines, found ${n_text}" && exit 1;
echo "$0: successfully prepared data in ${dst}"
done

@@ -0,0 +1,52 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <set> <lang>>"
echo "e.g.: $0 dev"
exit 1
fi
set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
mkdir -p data/${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.en/${f}
fi
done
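# "text" is a placeholder copy (text.lc.rm for English) so that fix_data_dir.sh and
# validate_data_dir.sh below can find a text file.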
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text # dummy
sort data/${set}/text.tc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.tc
sort data/${set}/text.lc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.en
if [ -f data/${set}.en/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.en || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.en || exit 1;
fi
# for target language
mkdir -p data/${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.${lang}/${f}
fi
done
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text # dummy
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.tc
sort data/${set}/text.lc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc
sort data/${set}/text.lc.rm.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
if [ -f data/${set}.${lang}/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
fi

@@ -0,0 +1,25 @@
#!/usr/bin/perl
use warnings;
use strict;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
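# Illustrative usage: echo "Hello, world! It's fine." | local/remove_punctuation.pl
# prints "Hello world It's fine" (punctuation removed, apostrophes kept).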
while(<STDIN>) {
$_ = " $_ ";
# remove punctuation except apostrophe
s/<space>/spacemark/g; # for scoring
s/'/apostrophe/g;
s/[[:punct:]]//g;
s/apostrophe/'/g;
s/spacemark/<space>/g; # for scoring
# remove whitespace
s/\s+/ /g;
s/^\s+//;
s/\s+$//;
print "$_\n";
}

@@ -0,0 +1,48 @@
#! /usr/bin/env bash
if [ $# != 4 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
tgt_lang=$4
for type in fullsentence; do
echo "decoding ${type}"
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
echo $PATH
python3 ${MAIN_ROOT}/utils/rsl2trn.py --rsl ${ckpt_prefix}.${type}.rsl \
--hyp ${ckpt_prefix}.${type}.hyp \
--ref ${ckpt_prefix}.${type}.ref
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
exit 1
fi
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.hyp > ${ckpt_prefix}.${type}.hyp.detok
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.ref > ${ckpt_prefix}.${type}.ref.detok
echo "Detokenized BLEU:"
sacrebleu ${ckpt_prefix}.${type}.ref.detok -i ${ckpt_prefix}.${type}.hyp.detok
done
exit 0

@@ -0,0 +1,40 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ckpt_path=$3
mkdir -p exp
# Note: a nonzero seed enables deterministic cuDNN, which may affect model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
# capture the training exit status before the "unset" below overwrites $?
status=$?
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ ${status} -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0

@@ -0,0 +1,29 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
if ! which tokenizer.perl > /dev/null; then
echo "Error: moses is required in this example." >&2
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
return 1
fi
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@@ -0,0 +1,39 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=0,1,2,3
stage=0
stop_stage=1
conf_path=conf/transformer_joint_noam_es.yaml
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
avg_num=5
ckpt_path= # (finetune from FAT-ST or ASR pretrained model)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh --tgt_lang ${lang} --must_c ${must_c_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train the model; all checkpoints are saved under the `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# average the n best checkpoints
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
fi
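# Example invocation (paths are placeholders):
#   bash run.sh --must_c_path /path/to/MUSTC_v1.0 --lang es --stage 0 --stop_stage 3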

@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils