PaddleSpeech/examples/mustc/st1/local/data.sh

#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#           2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
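
# Illustrative usage (argument values are examples, not defaults):
#   bash local/data.sh --tgt_lang de --must_c /path/to/MUST_C_v1.0
# MAIN_ROOT and $train_cmd are expected from the calling environment
# (typically set up by the recipe's path.sh and cmd.sh).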
set -e
set -u
stage=-1
stop_stage=10
tgt_lang=
nbpe=8000
# bpemode (unigram or bpe)
bpemode=bpe
must_c=
dumpdir=data/dump
do_delta=false
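# transcript case variants: tc = truecased, lc.rm = lowercased with punctuation removed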
tgt_case=tc
src_case=lc.rm
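# parse_options.sh allows overriding any variable above from the command line, e.g. --nbpe 5000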
source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}
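# e.g. tgt_lang=de -> train_set=train_sp.en-de.de, train_dev=dev.en-de.de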
trans_set=""
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
    trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
done
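# e.g. tgt_lang="de_fr" -> trans_set="tst-COMMON.en-de.de tst-COMMON.en-fr.fr"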
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    if [ ! -e ${must_c} ]; then
        echo "Error: dataset is not available. Please download and unzip the dataset."
        echo "MuST-C v1 download link: https://ict.fbk.eu/must-c/."
        exit 1
    fi
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data Preparation"
    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
        local/data_prep.sh ${must_c} ${lang}
    done
fi
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases.
    echo "stage 1: Feature Generation"
    fbankdir=fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
        for x in train.en-${lang} dev.en-${lang} tst-COMMON.en-${lang} tst-HE.en-${lang}; do
            steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
                data/${x} data/make_fbank/${x} ${fbankdir}
        done
    done
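    # NOTE (assumption): the remaining steps in this stage handle a single language pair;
    # a multi-target tgt_lang would need these steps looped per language as well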
    # speed perturbation: generate 0.9x/1.0x/1.1x copies of the training set and merge them
    utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
    utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
    utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
    utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
        data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
    rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
    utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
        data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
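    # perturb_data_dir_speed.sh prefixes utterance IDs with sp0.9-/sp1.0-/sp1.1-,
    # so remap the extra transcript files (text.tc/text.lc/text.lc.rm) to the new IDs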
    for lang in en ${tgt_lang}; do
        awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
        awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
        awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
    done
    # Divide into source and target languages
    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
        local/divide_lang.sh ${x} ${tgt_lang}
    done
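    # divide_lang.sh splits each dir into per-language copies, e.g. data/dev.en-de.en and data/dev.en-de.de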
    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
        # remove utterances with more than 3000 frames or more than 400 characters
        for lang in ${tgt_lang} en; do
            remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
        done
        # Match the number of utterances between source and target languages:
        # keep only the utterance IDs common to both
        cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
        cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
        comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist
        for lang in ${tgt_lang} en; do
            reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
            utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
        done
        rm -rf data/${x}.*.tmp
    done
    # compute global CMVN on the speed-perturbed training set; the same stats are reused
    # below when dumping dev/test features
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
    # dump features for training
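    # on the JHU CLSP grid, spread the dump directory over several disks via create_split_dir.pl;
    # this is a no-op on other hosts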
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
            ${feat_tr_dir}/storage
    fi
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
            ${feat_dt_dir}/storage
    fi
    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
        data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
    for ttask in ${trans_set}; do
        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
        dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
            data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
            ${feat_trans_dir}
    done
fi
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1spm/
    export LC_ALL=C.UTF-8
    echo "make a non-linguistic symbol list for all languages"
    grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;' | sort | uniq > ${nlsyms}
    cat ${nlsyms}
echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt
grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt
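    # train a single joint BPE model over the concatenated target (tgt_case) and
    # source (src_case) text, so both languages share one vocabulary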
    spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" \
        --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} \
        --model_type=${bpemode} --model_prefix=${bpemodel} \
        --input_sentence_size=100000000 --character_coverage=1.0
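    # encode the training text with the new model and number the unique pieces;
    # IDs start at offset+1 (= 2) since 0 is reserved for the CTC blank and 1 for <unk>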
    spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt \
        | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
    wc -l ${dict}
    echo "make json files"
    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
        data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
        data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    for ttask in ${trans_set}; do
        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
        data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
            data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
    done
echo "update json (add source references)"
# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Manifest Preparation"
    x=(${train_set} ${train_dev} ${trans_set})
    y=(train dev test)
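    # x[i] pairs with split name y[i]; this assumes ${trans_set} holds a single test set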
    for (( i=0; i<${#x[*]}; ++i )); do
        echo ${x[$i]} ${y[$i]}
        feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
        data_dir=data/$(echo ${x[$i]} | cut -f 1 -d ".").en-${tgt_lang}.en
        python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
            --json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
            --manifest-file data/manifest.${tgt_lang}.${y[$i]}
        echo "Process done for ${y[$i]} set from ${x[$i]}"
    done
fi
echo "MuST-C ${tgt_lang} Data preparation done."
exit 0