@ -44,27 +44,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Complete raw data pre-process."
echo "Complete raw data pre-process."
fi
fi
if [ ${ stage } -le 0 ] && [ ${ stop_stage } -ge 0 ] ; then
if [ ${ stage } -le 0 ] && [ ${ stop_stage } -ge 0 ] ; then
# build vocabulary
python3 ${ MAIN_ROOT } /utils/build_vocab.py \
--unit_type "spm" \
--spm_vocab_size= ${ nbpe } \
--spm_mode ${ bpemode } \
--spm_model_prefix ${ bpeprefix } \
--vocab_path= "data/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths= "data/manifest.train.raw"
if [ $? -ne 0 ] ; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${ stage } -le 1 ] && [ ${ stop_stage } -ge 1 ] ; then
# compute mean and stddev for normalizer
# compute mean and stddev for normalizer
num_workers = $( nproc)
num_workers = $( nproc)
python3 ${ MAIN_ROOT } /utils/compute_mean_std.py \
python3 ${ MAIN_ROOT } /utils/compute_mean_std.py \
@ -86,6 +66,23 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
fi
fi
fi
# Stage 1: build the vocabulary.
# Runs only when stage <= 1 <= stop_stage. Relies on stage, stop_stage,
# MAIN_ROOT, nbpe, bpemode and bpeprefix being set outside this section.
# Exits the script with status 1 if build_vocab.py reports a failure.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # build vocabulary
    # Values are attached to their "--opt=" flag and every expansion is quoted,
    # so each option reaches build_vocab.py as exactly one argument.
    python3 "${MAIN_ROOT}/utils/build_vocab.py" \
        --unit_type "spm" \
        --spm_vocab_size="${nbpe}" \
        --spm_mode "${bpemode}" \
        --spm_model_prefix "${bpeprefix}" \
        --vocab_path="data/vocab.txt" \
        --text_keys 'text' 'text1' \
        --manifest_paths="data/manifest.train.raw"

    # Must stay directly after the python3 call: $? is overwritten by any
    # intervening command.
    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi
if [ ${ stage } -le 2 ] && [ ${ stop_stage } -ge 2 ] ; then
if [ ${ stage } -le 2 ] && [ ${ stop_stage } -ge 2 ] ; then
# format manifest with tokenids, vocab size
# format manifest with tokenids, vocab size