#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#           2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

# Pull in the environment defined by path.sh in the current directory.
. ./path.sh

# Exactly two positional arguments are required:
#   <set>   prefix of the source data directory (read as <set>/...)
#   <lang>  suffix of the target-language transcript (read as <set>/text.<lang>)
if [ "$#" -ne 2 ]; then
    # Usage goes to stderr so it never mixes with regular output.
    echo "Usage: $0 <set> <lang>" >&2
    echo "e.g.: $0 dev zh" >&2
    exit 1
fi

# Positional arguments: source data directory prefix and target language tag.
set="${1}"
lang="${2}"

# Exported so that every command started below inherits this locale.
LC_ALL=en_US.UTF-8
export LC_ALL

# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
# Builds ${set}.en from ${set}: sorted copies of the metadata files that
# exist, plus the English transcript as "text". All expansions are quoted so
# that directory names containing spaces or glob characters are handled.
mkdir -p "${set}.en"
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    # Files missing from the source directory are simply skipped.
    if [ -f "${set}/${f}" ]; then
        sort "${set}/${f}" > "${set}.en/${f}"
    fi
done
# Sort the transcript and delete every character outside the [:print:] class.
sort "${set}/text.en" | sed $'s/[^[:print:]]//g' > "${set}.en/text"

utils/fix_data_dir.sh "${set}.en"
# Without feats.scp, validation is invoked with --no-feats --no-wav.
if [ -f "${set}.en/feats.scp" ]; then
    utils/validate_data_dir.sh "${set}.en" || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav "${set}.en" || exit 1;
fi

# for target language
# Builds ${set}.${lang} from ${set}: sorted copies of the metadata files that
# exist, plus the ${lang} transcript as "text". All expansions are quoted so
# that names containing spaces or glob characters are handled.
mkdir -p "${set}.${lang}"
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    # Files missing from the source directory are simply skipped.
    if [ -f "${set}/${f}" ]; then
        sort "${set}/${f}" > "${set}.${lang}/${f}"
    fi
done
# Sort the transcript and delete every character outside the [:print:] class.
sort "${set}/text.${lang}" | sed $'s/[^[:print:]]//g' > "${set}.${lang}/text"
utils/fix_data_dir.sh "${set}.${lang}"
# Without feats.scp, validation is invoked with --no-feats --no-wav.
if [ -f "${set}.${lang}/feats.scp" ]; then
    utils/validate_data_dir.sh "${set}.${lang}" || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav "${set}.${lang}" || exit 1;
fi