parent
9179983f8f
commit
af2b20650e
@ -0,0 +1,2 @@
|
||||
--sample-frequency=16000
|
||||
--num-mel-bins=80
|
@ -0,0 +1 @@
|
||||
--sample-frequency=16000
|
@ -0,0 +1,19 @@
|
||||
[
|
||||
{
|
||||
"type": "specaug",
|
||||
"params": {
|
||||
"W": 5,
|
||||
"warp_mode": "PIL",
|
||||
"F": 30,
|
||||
"n_freq_masks": 2,
|
||||
"T": 40,
|
||||
"n_time_masks": 2,
|
||||
"p": 1.0,
|
||||
"adaptive_number_ratio": 0,
|
||||
"adaptive_size_ratio": 0,
|
||||
"max_n_time_masks": 20,
|
||||
"replace_with_zero": false
|
||||
},
|
||||
"prob": 1.0
|
||||
}
|
||||
]
|
@ -0,0 +1,201 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
||||
# 2021 PaddlePaddle
|
||||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
# Stop at the first failing command and on any reference to an unset variable.
set -eu

# Stage window: stages numbered stage..stop_stage (inclusive) are executed.
stage=-1
stop_stage=10

# Defaults; values may be replaced by parse_options.sh (sourced below).
tgt_lang=          # target language code(s); several codes are joined by '_'
nbpe=8000          # vocabulary size handed to spm_train
bpemode=bpe        # subword model type (unigram or bpe)
must_c=            # location of the unpacked MuST-C corpus
dumpdir=data/dump  # root directory of the dumped features
do_delta=false
tgt_case=tc        # text variant used on the target side
src_case=lc.rm     # text variant used on the source side
. ${MAIN_ROOT}/utils/parse_options.sh

TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data

# Names of the training / development data directories.
train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}

# One tst-COMMON evaluation set per target language code.
trans_set=""
for lang in ${tgt_lang//_/ }; do
  trans_set+=" tst-COMMON.en-${lang}.${lang}"
done
|
||||
|
||||
|
||||
# Stage -1: refuse to continue unless the MuST-C corpus is present.
# ${must_c} is quoted and checked for emptiness on purpose: when the value is
# empty an unquoted expansion reduces the test to `[ ! -e ]`, which evaluates
# to false, so a missing --must_c would silently pass this check.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  if [ -z "${must_c}" ] || [ ! -e "${must_c}" ]; then
    echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
    echo "Link of Must-c v1, https://ict.fbk.eu/must-c/."
    exit 1
  fi
fi
|
||||
|
||||
# Stage 0: run the corpus preparation script once per target language code.
if [[ ${stage} -le 0 && ${stop_stage} -ge 0 ]]; then
  printf '%s\n' "stage 0: Data Preparation"
  # tgt_lang may carry several codes joined by '_'; split it into words.
  for lang in ${tgt_lang//_/ }; do
    local/data_prep.sh ${must_c} ${lang}
  done
fi
|
||||
|
||||
# Directories that receive the dumped train / dev features (see dump.sh calls).
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}
mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}
mkdir -p ${feat_dt_dir}

if [[ ${stage} -le 1 && ${stop_stage} -ge 1 ]]; then
  ### Task dependent. You have to design training and dev sets by yourself.
  ### But you can utilize Kaldi recipes in most cases
  echo "stage 1: Feature Generation"
  fbankdir=fbank
  pair=en-${tgt_lang}
  train_dir=data/train.${pair}
  sp_dir=data/train_sp.${pair}

  # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
  for lang in ${tgt_lang//_/ }; do
    for x in train.${pair} dev.${pair} tst-COMMON.${pair} tst-HE.${pair}; do
      steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
        data/${x} data/make_fbank/${x} ${fbankdir}
    done
  done

  # speed-perturbed copies of the training set (temp1/2/3 <- 0.9/1.0/1.1)
  k=0
  for speed in 0.9 1.0 1.1; do
    k=$((k + 1))
    utils/perturb_data_dir_speed.sh ${speed} ${train_dir} data/temp${k}.${tgt_lang}
  done
  utils/combine_data.sh --extra-files utt2uniq ${sp_dir} \
    data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
  rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
  utils/fix_data_dir.sh ${sp_dir}
  steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
    ${sp_dir} data/make_fbank/train_sp.${pair} ${fbankdir}

  # Replicate every text variant for each speed prefix: the utterance ids of
  # the original training text are mapped to "<prefix><id>" through utt_map.
  for lang in en ${tgt_lang}; do
    # start from empty files so that each prefix below can simply append
    for variant in tc lc lc.rm; do
      : > ${sp_dir}/text.${variant}.${lang}
    done
    for prefix in sp0.9- sp1.0- sp1.1-; do
      awk -v p="${prefix}" '{printf("%s %s%s\n", $1, p, $1);}' ${train_dir}/utt2spk > ${sp_dir}/utt_map
      for variant in tc lc lc.rm; do
        utils/apply_map.pl -f 1 ${sp_dir}/utt_map <${train_dir}/text.${variant}.${lang} >>${sp_dir}/text.${variant}.${lang}
      done
    done
  done

  # Divide into source and target languages
  for x in train_sp.${pair} dev.${pair} tst-COMMON.${pair} tst-HE.${pair}; do
    local/divide_lang.sh ${x} ${tgt_lang}
  done

  for x in train_sp.${pair} dev.${pair}; do
    en_tmp=data/${x}.en.tmp
    tgt_tmp=data/${x}.${tgt_lang}.tmp

    # remove utt having more than 3000 frames
    # remove utt having more than 400 characters
    for lang in ${tgt_lang} en; do
      remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
    done

    # Match the number of utterances between source and target languages
    # extract common lines
    cut -f 1 -d " " ${en_tmp}/text > ${tgt_tmp}/reclist1
    cut -f 1 -d " " ${tgt_tmp}/text > ${tgt_tmp}/reclist2
    comm -12 ${tgt_tmp}/reclist1 ${tgt_tmp}/reclist2 > ${en_tmp}/reclist

    for lang in ${tgt_lang} en; do
      reduce_data_dir.sh data/${x}.${lang}.tmp ${en_tmp}/reclist data/${x}.${lang}
      utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
    done
    rm -rf data/${x}.*.tmp
  done

  # compute global CMVN
  compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

  # dump features for training
  # (on *.clsp.jhu.edu hosts the storage is first spread over several disks)
  if [[ $(hostname -f) == *.clsp.jhu.edu && ! -d ${feat_tr_dir}/storage ]]; then
    utils/create_split_dir.pl \
      /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
      ${feat_tr_dir}/storage
  fi
  if [[ $(hostname -f) == *.clsp.jhu.edu && ! -d ${feat_dt_dir}/storage ]]; then
    utils/create_split_dir.pl \
      /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
      ${feat_dt_dir}/storage
  fi

  # every set is dumped with the CMVN statistics of the training set
  cmvn=data/${train_set}/cmvn.ark
  dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
    data/${train_set}/feats.scp ${cmvn} data/dump_feats/${train_set} ${feat_tr_dir}
  dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
    data/${train_dev}/feats.scp ${cmvn} data/dump_feats/${train_dev} ${feat_dt_dir}
  for ttask in ${trans_set}; do
    feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
    mkdir -p ${feat_trans_dir}
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
      data/${ttask}/feats.scp ${cmvn} data/dump_feats/trans/${ttask} \
      ${feat_trans_dir}
  done
fi
|
||||
|
||||
# Files produced by the subword (sentencepiece) step.
spm_dir=data/lang_1spm
dict=${spm_dir}/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=${spm_dir}/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=${spm_dir}/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"

if [[ ${stage} -le 2 && ${stop_stage} -ge 2 ]]; then
  ### Task dependent. You have to check non-linguistic symbols used in the corpus.
  echo "stage 2: Dictionary and Json Data Preparation"
  mkdir -p ${spm_dir}/
  export LC_ALL=C.UTF-8

  sp_prefix=data/train_sp.en-${tgt_lang}
  spm_input=${spm_dir}/input_${tgt_lang}.txt
  json=data_${bpemode}${nbpe}.${tgt_case}.json

  echo "make a non-linguistic symbol list for all languages"
  # collect every "&...;" token found in the sp1.0 transcripts
  grep sp1.0 ${sp_prefix}.*/text.${tgt_case} \
    | cut -f 2- -d' ' \
    | grep -o -P '&[^;]*;' \
    | sort \
    | uniq > ${nlsyms}
  cat ${nlsyms}

  echo "make a joint source and target dictionary"
  echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
  offset=$(wc -l < ${dict})

  # training text for the subword model: target side first, then source side
  grep sp1.0 ${sp_prefix}.${tgt_lang}/text.${tgt_case} \
    | cut -f 2- -d' ' \
    | grep -v -e '^\s*$' > ${spm_input}
  grep sp1.0 ${sp_prefix}.en/text.${src_case} \
    | cut -f 2- -d' ' \
    | grep -v -e '^\s*$' >> ${spm_input}

  spm_train \
    --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" \
    --input=${spm_input} \
    --vocab_size=${nbpe} \
    --model_type=${bpemode} \
    --model_prefix=${bpemodel} \
    --input_sentence_size=100000000 \
    --character_coverage=1.0

  # number the distinct pieces, continuing after the entries already in dict
  spm_encode --model=${bpemodel}.model --output_format=piece < ${spm_input} \
    | tr ' ' '\n' \
    | sort \
    | uniq \
    | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
  wc -l ${dict}

  echo "make json files"
  data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
    data/${train_set} ${dict} > ${feat_tr_dir}/${json}
  data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
    data/${train_dev} ${dict} > ${feat_dt_dir}/${json}
  for ttask in ${trans_set}; do
    feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
    data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
      data/${ttask} ${dict} > ${feat_trans_dir}/${json}
  done

  echo "update json (add source references)"
  # update json (add source references)
  for x in ${train_set} ${train_dev}; do
    feat_dir=${dumpdir}/${x}/delta${do_delta}
    # "<set>.en-<tgt>.<tgt>" -> English counterpart "<set>.en-<tgt>.en"
    data_dir=data/${x%%.*}.en-${tgt_lang}.en
    update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
      ${feat_dir}/${json} ${data_dir} ${dict}
  done
fi
|
||||
|
||||
# Stage 3: convert the json files written in stage 2 into manifest files
# data/manifest.<tgt_lang>.{train,dev,test}.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  # x[i] is the data set, y[i] the manifest suffix it is written to.
  # ${trans_set} is left unquoted on purpose: it is a space-separated list.
  x=(${train_set} ${train_dev} ${trans_set})
  y=(train dev test)

  # Only three manifest names exist. With more data sets (several target
  # languages) the loop used to die half-way with an "unbound variable"
  # error under `set -u`; fail up front with an explicit message instead.
  if [ ${#x[@]} -gt ${#y[@]} ]; then
    echo "Error: ${#x[@]} data sets (${x[*]}) but only ${#y[@]} manifest names (${y[*]})" >&2
    exit 1
  fi

  for (( i = 0; i < ${#x[@]}; ++i )); do
    echo ${x[$i]} ${y[$i]}
    feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
    python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
      --json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
      --manifest-file data/manifest.${tgt_lang}.${y[$i]}
    echo "Process done for ${y[$i]} set from ${x[$i]}"
  done
fi

echo "MuST-C ${tgt_lang} Data preparation done."
exit 0
|
@ -0,0 +1,163 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
||||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
# Byte-wise collation for sort/uniq/comm so that results are reproducible.
export LC_ALL=C

source ${MAIN_ROOT}/utils/parse_options.sh

# Two positional arguments are required:
#   $1 - root directory of the unpacked MuST-C corpus
#   $2 - target language code (the corpus sub-directory is en-<tgt-lang>)
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <src-dir> <tgt-lang>"
    echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
    exit 1;
fi

tgt_lang=$2
|
||||
|
||||
# For every MuST-C split: validate the corpus layout, derive utterance ids
# from the yaml segment list, normalise the transcripts/translations, write
# the Kaldi-style files (text.*, segments, wav.scp, utt2spk, spk2utt) into
# data/local/en-<tgt_lang>/<set>, then copy the de-duplicated result to
# data/<set>.en-<tgt_lang>.
for set in train dev tst-COMMON tst-HE; do
    src=$1/en-${tgt_lang}/data/${set}
    dst=data/local/en-${tgt_lang}/${set}

    [ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;

    wav_dir=${src}/wav
    trans_dir=${src}/txt
    yml=${trans_dir}/${set}.yaml
    en=${trans_dir}/${set}.en
    tgt=${trans_dir}/${set}.${tgt_lang}

    mkdir -p ${dst} || exit 1;

    [ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
    [ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
    [ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
    [ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
    [ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;

    # Remove the outputs of a previous run.
    wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}
    trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
    trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
    utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
    spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
    segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}

    # error check: one yaml "duration" entry per line of each text file
    n=$(cat ${yml} | grep duration | wc -l)
    n_en=$(cat ${en} | wc -l)
    n_tgt=$(cat ${tgt} | wc -l)
    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;

    # (1a) Transcriptions and translations preparation
    # make basic transcription file (add segments info)
    # Fields 3, 5 and 7 of each "duration" line are read as duration, offset
    # and speaker id; the utterance id is ted_<spk>_<start ms>_<end ms>.
    cp ${yml} ${dst}/.yaml0
    grep duration ${dst}/.yaml0 > ${dst}/.yaml1
    awk '{
        duration=$3; offset=$5; spkid=$7;
        gsub(",","",duration);
        gsub(",","",offset);
        gsub(",","",spkid);
        gsub("spk.","",spkid);
        duration=sprintf("%.7f", duration);
        if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
        else extendt=0;
        offset=sprintf("%.7f", offset);
        startt=offset-extendt;
        endt=offset+duration+extendt;
        printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
    }' ${dst}/.yaml1 > ${dst}/.yaml2
    # NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them

    cp ${en} ${dst}/en.org
    cp ${tgt} ${dst}/${tgt_lang}.org

    for lang in en ${tgt_lang}; do
        # normalize punctuation
        normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm

        # lowercasing
        lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
        cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc

        # remove punctuation
        local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm

        # tokenization
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok

        # prefix every line with its utterance id
        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}
        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}

        # save original and cleaned punctuation
        lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
        lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
    done


    # error check
    n=$(cat ${dst}/.yaml2 | wc -l)
    n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
    n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;


    # (1c) Make segments files from transcript
    # segments file format is: utt-id recording-id start-time end-time, e.g.:
    # ted_00001_0003501_0003684 ted_00001 3.50 3.68
    awk '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
        printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments

    # wav.scp: "<recording-id> cat <wav_dir>/ted_<n>.wav |"
    # The directory is passed with -v instead of being spliced into the awk
    # program text, so blanks or '%' in the path cannot break the command.
    awk -v wav_dir="${wav_dir}" '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2];
        printf("%s cat %s/%s_%d.wav |\n", spkid, wav_dir, S[1], S[2]);
    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp

    awk '{
        segment=$1; split(segment,S,"[_]");
        spkid=S[1] "_" S[2]; print $1 " " spkid
    }' ${dst}/segments | uniq | sort > ${dst}/utt2spk

    cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt

    # error check
    n_en=$(cat ${dst}/text.tc.en | wc -l)
    n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
    [ ${n_en} -ne ${n_tgt} ] && echo "Warning: expected ${n_en} data data files, found ${n_tgt}" && exit 1;

    # Copy stuff into its final locations [this has been moved from the format_data script]
    mkdir -p data/${set}.en-${tgt_lang}

    # remove duplicated utterances (the same offset)
    # The occurrence count is compared numerically. Matching the text
    # "1 ted" would also accept counts such as 11 or 21 as "unique".
    echo "remove duplicate lines..."
    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r \
        | awk '$1 > 1 {print $1 " " $2}' > ${dst}/duplicate_lines
    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r \
        | awk '$1 == 1 {print $2}' > ${dst}/reclist
    reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}
    for l in en ${tgt_lang}; do
        for case in tc lc lc.rm; do
            cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
        done
    done
    utils/fix_data_dir.sh --utt_extra_files \
        "text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
        data/${set}.en-${tgt_lang}

    # error check
    n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
    n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
    [ ${n_seg} -ne ${n_text} ] && echo "Warning: expected ${n_seg} data data files, found ${n_text}" && exit 1;

    echo "$0: successfully prepared data in ${dst}"
done
|
@ -0,0 +1,52 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
||||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
. ./path.sh

# Two positional arguments are required:
#   $1 - name of the bilingual data directory below data/ (e.g. dev.en-de)
#   $2 - target language code
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <set> <lang>"
    echo "e.g.: $0 dev.en-de de"
    exit 1
fi

set=$1
lang=$2
export LC_ALL=en_US.UTF-8

#######################################
# Create the monolingual directory data/<set>.<code> from data/<set>.
# Copies (sorted) whichever of the common Kaldi files exist, writes the three
# text variants with non-printable characters removed, then fixes and
# validates the directory. Exits the script if validation fails.
# Globals:   set (read)
# Arguments: $1 - language code; selects the text.*.<code> inputs
#            $2 - text variant (tc, lc or lc.rm) also written to plain `text`
#######################################
make_lang_dir() {
    local code=$1
    local dummy_variant=$2
    local out=data/${set}.${code}
    local f variant

    # Copy stuff into its final locations [this has been moved from the format_data script]
    mkdir -p "${out}"
    for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
        if [ -f "data/${set}/${f}" ]; then
            sort "data/${set}/${f}" > "${out}/${f}"
        fi
    done

    sort "data/${set}/text.${dummy_variant}.${code}" | sed $'s/[^[:print:]]//g' > "${out}/text"  # dummy
    for variant in tc lc lc.rm; do
        sort "data/${set}/text.${variant}.${code}" | sed $'s/[^[:print:]]//g' > "${out}/text.${variant}"
    done

    utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" "${out}"
    if [ -f "${out}/feats.scp" ]; then
        utils/validate_data_dir.sh "${out}" || exit 1;
    else
        utils/validate_data_dir.sh --no-feats --no-wav "${out}" || exit 1;
    fi
}

# for En: the plain text file is the lower-cased, punctuation-free variant
make_lang_dir en lc.rm

# for target language: the plain text file is the true-cased variant
make_lang_dir "${lang}" tc
|
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
binmode(STDIN,":utf8");
|
||||
binmode(STDOUT,":utf8");
|
||||
|
||||
# Filter STDIN to STDOUT one line at a time: delete punctuation (keeping
# apostrophes and the literal token <space>), squeeze runs of whitespace to a
# single blank and trim both ends of the line.
while (defined(my $line = <STDIN>)) {

    # remove punctuation except apostrophe
    # "<space>" (used for scoring) and "'" are matched first and written back
    # unchanged; any other punctuation character is replaced by nothing.
    # No placeholder words are involved, so transcripts that contain the
    # ordinary words "apostrophe" or "spacemark" are left untouched.
    $line =~ s/(<space>|')|[[:punct:]]/defined $1 ? $1 : ""/ge;

    # remove whitespace (this also drops the trailing newline)
    $line =~ s/\s+/ /g;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;

    print "$line\n";
}
|
@ -0,0 +1,48 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
# Exactly four positional arguments are expected.
if [[ $# -ne 4 ]]; then
  echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
  exit -1
fi

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
tgt_lang=$4

# GPU count = number of comma-separated fields in CUDA_VISIBLE_DEVICES.
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{ print NF }')
echo "using $ngpu gpus..."
|
||||
|
||||
# Decode with each decoding method, turn the result file into hypothesis /
# reference text, detokenize both with moses and report BLEU via sacrebleu.
for type in fullsentence; do
    echo "decoding ${type}"
    if ! python3 -u "${BIN_DIR}/test.py" \
        --ngpu "${ngpu}" \
        --config "${config_path}" \
        --decode_cfg "${decode_config_path}" \
        --result_file "${ckpt_prefix}.${type}.rsl" \
        --checkpoint_path "${ckpt_prefix}" \
        --opts decode.decoding_method "${type}"; then
        echo "Failed in evaluation!"
        exit 1
    fi
    echo $PATH
    python3 "${MAIN_ROOT}/utils/rsl2trn.py" --rsl "${ckpt_prefix}.${type}.rsl" \
        --hyp "${ckpt_prefix}.${type}.hyp" \
        --ref "${ckpt_prefix}.${type}.ref"
    if ! command -v tokenizer.perl > /dev/null; then
        echo "Error: it seems that moses is not installed." >&2
        echo "Error: please install moses as follows." >&2
        echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
        # This script is executed, not sourced: `return` would only print an
        # error and let the loop continue, so leave with `exit`.
        exit 1
    fi
    detokenizer.perl -l "${tgt_lang}" -q < "${ckpt_prefix}.${type}.hyp" > "${ckpt_prefix}.${type}.hyp.detok"
    detokenizer.perl -l "${tgt_lang}" -q < "${ckpt_prefix}.${type}.ref" > "${ckpt_prefix}.${type}.ref.detok"
    echo "Detokenized BLEU:"
    sacrebleu "${ckpt_prefix}.${type}.ref.detok" -i "${ckpt_prefix}.${type}.hyp.detok"
done

exit 0
|
@ -0,0 +1,40 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Exactly three positional arguments are expected.
if [[ $# -ne 3 ]]; then
  echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
  exit -1
fi

config_path=$1
ckpt_name=$2
ckpt_path=$3

# GPU count = number of comma-separated fields in CUDA_VISIBLE_DEVICES.
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{ print NF }')
echo "using $ngpu gpus..."

# training output is written below exp/
mkdir -p exp
|
||||
|
||||
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

# Record the exit status of the training command right away. Reading $?
# after the `if ... fi` below would return the status of that `if` statement
# (always 0), so a failed training run would be reported as a success.
train_status=0
python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --checkpoint_path "${ckpt_path}" \
    --seed ${seed} || train_status=$?

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ ${train_status} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
|
@ -0,0 +1,29 @@
|
||||
# Environment setup for this example. The file is meant to be sourced
# (`. ./path.sh`), which is why the moses check uses `return`, not `exit`.
export MAIN_ROOT=$(realpath "${PWD}/../../../")

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Add the ':' separator only when a previous value exists, so that an unset
# variable does not leave an empty entry in the search path.
export PYTHONPATH=${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/

if ! command -v tokenizer.perl > /dev/null; then
    echo "Error: moses is required in this example." >&2
    echo "Error: it seems that moses is not installed." >&2
    echo "Error: please install moses as follows." >&2
    echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
    return 1
fi

MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
# NOTE: as the last command, this line also determines the status seen by the
# script that sources this file (non-zero when common_path.sh is missing).
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
|
@ -0,0 +1,39 @@
|
||||
#!/bin/bash
|
||||
# Top-level driver: stage 0 prepares the data, stage 1 trains, stage 2
# averages the best checkpoints and stage 3 decodes and scores.
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# Defaults; values may be replaced by parse_options.sh (sourced below).
gpus=0,1,2,3
stage=0
stop_stage=1
conf_path=conf/transformer_joint_noam_es.yaml
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
avg_num=5
ckpt_path= # (finetune from FAT-ST or ASR pretrained model)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

avg_ckpt=avg_${avg_num}
# checkpoint name = config file name up to its first '.'
ckpt=$(basename "${conf_path}" | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    # The values are quoted so that an empty one is still passed as an
    # argument; unquoted, an empty must_c_path would disappear and leave
    # --must_c without a value.
    bash ./local/data.sh --tgt_lang "${lang}" --must_c "${must_c_path}" || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${ckpt}" "${ckpt_path}"
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best "exp/${ckpt}/checkpoints" "${avg_num}"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh "${conf_path}" "${decode_conf_path}" "exp/${ckpt}/checkpoints/${avg_ckpt}" "${lang}" || exit -1
fi
|
@ -0,0 +1 @@
|
||||
../../../tools/kaldi/egs/wsj/s5/steps
|
@ -0,0 +1 @@
|
||||
../../../tools/kaldi/egs/wsj/s5/utils
|
Loading…
Reference in new issue