You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/examples/wenetspeech/asr1/local/data.sh

103 lines
3.1 KiB

#!/bin/bash
# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
#
# Modified from wenet(https://github.com/wenet-e2e/wenet)
# ---- Option defaults ----------------------------------------------------------
# NOTE(review): parse_options.sh is expected to map "--name value" command-line
# arguments onto the variables declared below (Kaldi-style) — confirm against
# utils/parse_options.sh.

# First and last stage to run (inclusive); each stage below checks both.
stage=-1
stop_stage=100
# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=./wenetspeech
# Make sure you have 1.2T for ${shards_dir}
shards_dir=./wenetspeech_shards
# WenetSpeech training subset; forwarded to the data-prep script as --train-subset.
set=L
# Name of the training-set directory under data/. Left empty on purpose: it is
# derived from ${set} after option parsing (train_l for set=L) unless it was
# overridden explicitly.
train_set=
dev_set=dev
test_sets="test_net test_meeting"
cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
# Training config passed to utils/compute_cmvn_stats.py in stage 2. It is declared
# here so that it is a known option and never unbound under `set -u`; a value
# already present in the environment is kept. No default path is assumed.
train_config=${train_config:-}

# Fail with a readable message instead of trying to source "/utils/parse_options.sh".
: "${MAIN_ROOT:?MAIN_ROOT must be set to the directory containing utils/parse_options.sh}"
. "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

# Derive the training-set name only now, so that an overridden ${set} is reflected
# in the directory name that stages 1-3 read from.
if [ -z "${train_set}" ]; then
  train_set=train_$(printf '%s\n' "${set}" | tr 'A-Z' 'a-z')
fi

set -u
set -o pipefail

# Output directories used by the stages below.
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p "${TARGET_DIR}"
# Stage -1: the dataset cannot be fetched automatically, so only print where to
# get it and stop successfully. Nothing after this stage runs in the same call.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  printf '%s\n' "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
  exit 0
fi
# Stage 0: run the recipe's data-preparation script on the downloaded corpus,
# writing its output under ./data. Aborts the whole script if preparation fails.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  echo "Data preparation"
  # The expansions are quoted so that a dataset directory containing spaces or
  # glob characters is passed to the script as a single argument.
  local/wenetspeech_data_prep.sh \
    --train-subset "${set}" \
    "${wenetspeech_data_dir}" \
    data || exit 1
fi
# Stage 1: build the character vocabulary from the training transcripts.
# Layout of the resulting file, one token per line:
#   <blank>, <unk>, "▁", the sorted unique tokens of the training text, <eos>
dict=data/lang_char/vocab.txt
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  printf '%s\n' "Make a dictionary" "dictionary: ${dict}"
  mkdir -p "$(dirname "${dict}")"

  # Reserved symbols come first: line 0 is the CTC "blank", <unk> must be 1,
  # and "▁" stands for space.
  printf '%s\n' "<blank>" "<unk>" "▁" > "${dict}"

  # Tokenize the transcripts, drop the first (space-separated) field of every
  # line, put one token per line, de-duplicate, and discard blank lines as well
  # as the space symbol that was already written above.
  utils/text2token.py -s 1 -n 1 --space "▁" "data/${train_set}/text" \
    | cut -d" " -f 2- \
    | tr " " "\n" \
    | sort \
    | uniq \
    | grep -a -v -e '^\s*$' \
    | grep -v "▁" \
    | awk '{print $0}' >> "${dict}" \
    || exit 1

  # The end-of-sentence symbol takes the last position.
  printf '%s\n' "<eos>" >> "${dict}"
fi
# Stage 2: estimate global CMVN statistics.
# A random subset of data/${train_set}/wav.scp holding 1/cmvn_sampling_divisor of
# its lines (at least one line) is written to wav.scp.sampled and handed to
# utils/compute_cmvn_stats.py, which writes data/${train_set}/mean_std.json.
# Skipped entirely when cmvn is not "true".
# Known issue noted by the original authors: the segmented data should be used
# for CMVN instead of the unsegmented wav.scp entries.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "Compute cmvn"
  if [ "${cmvn}" = "true" ]; then
    # Report a readable error instead of an opaque "unbound variable" abort.
    : "${train_config:?train_config is required for CMVN: set it to the training config file path}"

    wav_scp=data/${train_set}/wav.scp
    sampled_scp=${wav_scp}.sampled

    if [ ! -s "${wav_scp}" ]; then
      echo "$0: ${wav_scp} is missing or empty; run stage 0 first" >&2
      exit 1
    fi
    # Reject zero, negative and non-numeric divisors before dividing by them.
    if ! [ "${cmvn_sampling_divisor}" -ge 1 ] 2>/dev/null; then
      echo "$0: cmvn_sampling_divisor must be a positive integer, got '${cmvn_sampling_divisor}'" >&2
      exit 1
    fi

    full_size=$(wc -l < "${wav_scp}") || exit 1
    sampling_size=$((full_size / cmvn_sampling_divisor))
    # With fewer lines than the divisor the quotient is 0, which would produce
    # an empty sample; always keep at least one utterance.
    if [ "${sampling_size}" -lt 1 ]; then
      sampling_size=1
    fi

    shuf -n "${sampling_size}" "${wav_scp}" > "${sampled_scp}" || exit 1

    python3 utils/compute_cmvn_stats.py \
      --num_workers 16 \
      --train_config "${train_config}" \
      --in_scp "${sampled_scp}" \
      --out_cmvn "data/${train_set}/mean_std.json" \
      || exit 1
  fi
fi
# Stage 3: build shards for the dev, test and training sets under
# ${shards_dir}/<set> and write the list of produced shards to
# data/<set>/data.list. Any failure aborts the script so that an incomplete
# shard set is never reported as a successful run.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "Making shards, please wait..."
  RED='\033[0;31m'
  NOCOLOR='\033[0m'
  echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
  echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
  # ${test_sets} is intentionally unquoted: it holds a space-separated list.
  for x in ${dev_set} ${test_sets} ${train_set}; do
    dst=${shards_dir}/${x}
    mkdir -p "${dst}" || exit 1
    # The shard directory is passed as an absolute path (via realpath).
    utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
      --do_filter --num_node 1 --num_gpus_per_node 8 \
      --num_threads 32 --segments "data/${x}/segments" \
      "data/${x}/wav.scp" "data/${x}/text" \
      "$(realpath "${dst}")" "data/${x}/data.list" \
      || exit 1
  done
fi
# All requested stages finished: report completion and exit successfully.
printf '%s\n' "Wenetspeech data preparation done."
exit 0