|
|
|
@ -27,8 +27,20 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
|
|
|
|
|
exit 1
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
|
|
|
|
mv data/manifest.${set} data/manifest.${set}.raw
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
for set in train-clean-100 train-clean-360 train-other-500; do
|
|
|
|
|
cat data/manifest.${set} >> data/manifest.train.raw
|
|
|
|
|
cat data/manifest.${set}.raw >> data/manifest.train.raw
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
for set in dev-clean dev-other; do
|
|
|
|
|
cat data/manifest.${set}.raw >> data/manifest.dev.raw
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
for set in test-clean test-other; do
|
|
|
|
|
cat data/manifest.${set}.raw >> data/manifest.test.raw
|
|
|
|
|
done
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
@ -73,20 +85,24 @@ fi
|
|
|
|
|
|
|
|
|
|
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
|
|
|
|
# format manifest with tokenids, vocab size
|
|
|
|
|
python3 ${MAIN_ROOT}/utils/format_data.py \
|
|
|
|
|
--feat_type "raw" \
|
|
|
|
|
--cmvn_path "data/mean_std.json" \
|
|
|
|
|
--unit_type "spm" \
|
|
|
|
|
--spm_model_prefix ${bpeprefix} \
|
|
|
|
|
--vocab_path="data/vocab.txt" \
|
|
|
|
|
--manifest_path="data/manifest.train.raw" \
|
|
|
|
|
--output_path="data/manifest.train"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if [ $? -ne 0 ]; then
|
|
|
|
|
echo "Formt mnaifest failed. Terminated."
|
|
|
|
|
exit 1
|
|
|
|
|
fi
|
|
|
|
|
for set in train dev test dev-clean dev-other test-clean test-other; do
|
|
|
|
|
{
|
|
|
|
|
python3 ${MAIN_ROOT}/utils/format_data.py \
|
|
|
|
|
--feat_type "raw" \
|
|
|
|
|
--cmvn_path "data/mean_std.json" \
|
|
|
|
|
--unit_type "spm" \
|
|
|
|
|
--spm_model_prefix ${bpeprefix} \
|
|
|
|
|
--vocab_path="data/vocab.txt" \
|
|
|
|
|
--manifest_path="data/manifest.${set}.raw" \
|
|
|
|
|
--output_path="data/manifest.${set}"
|
|
|
|
|
|
|
|
|
|
if [ $? -ne 0 ]; then
|
|
|
|
|
echo "Formt mnaifest failed. Terminated."
|
|
|
|
|
exit 1
|
|
|
|
|
fi
|
|
|
|
|
}&
|
|
|
|
|
done
|
|
|
|
|
wait
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
echo "LibriSpeech Data preparation done."
|
|
|
|
|