PaddleSpeech/examples/wenetspeech/asr1/local/data.sh

#!/bin/bash

# Copyright 2021  Mobvoi Inc(Author: Di Wu, Binbin Zhang)
#                 NPU, ASLP Group (Author: Qijie Shao)
#
# Modified from wenet(https://github.com/wenet-e2e/wenet)

# ---------------------------------------------------------------------------
# Configuration. Every variable assigned before parse_options.sh is sourced
# is meant to be overridable from the command line as --<name> <value>.
# NOTE(review): this assumes utils/parse_options.sh is the Kaldi-style parser
# that only accepts options whose variable is already defined -- confirm.
# ---------------------------------------------------------------------------

# Inclusive range of stages to run.
stage=-1
stop_stage=100

# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=./wenetspeech
# Make sure you have 1.2T for ${shards_dir}
shards_dir=./wenetspeech_shards

# WenetSpeech training subset.
set=L
# Left empty on purpose: it is derived from ${set} *after* option parsing
# (see below) so that "--set M" and the directory used by later stages agree.
# An explicit "--train_set <name>" still takes precedence.
train_set=
dev_set=dev
test_sets="test_net test_meeting"

cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn

# Config file handed to utils/compute_cmvn_stats.py in stage 2. It was never
# declared, which made stage 2 abort under `set -u`. Declaring it here keeps
# any value inherited from the environment and (presumably, see the note
# above) lets it be passed as --train_config.
train_config=${train_config:-}


. "${MAIN_ROOT}/utils/parse_options.sh" || exit 1;

# Derive the training-set directory name from the (possibly overridden)
# subset: "train_" followed by the lower-cased subset name, e.g. L -> train_l.
if [ -z "${train_set}" ]; then
    train_set=train_$(echo "${set}" | tr 'A-Z' 'a-z')
fi

set -u
set -o pipefail


# Create the local working directory and the shared dataset directory under
# ${MAIN_ROOT}. The expansions are quoted so that a MAIN_ROOT containing
# whitespace or glob characters does not get split into several mkdir
# arguments.
mkdir -p data
TARGET_DIR="${MAIN_ROOT}/dataset"
mkdir -p "${TARGET_DIR}"

# Stage -1: download. Nothing is fetched here; the script only points at the
# upstream download instructions and then stops, so later stages run only
# when --stage is set to 0 or higher.
if [ "${stage}" -le -1 ]; then
    if [ "${stop_stage}" -ge -1 ]; then
        echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
        exit 0
    fi
fi

# Stage 0: run local/wenetspeech_data_prep.sh for the selected training
# subset, passing the corpus directory and `data` as its two positional
# arguments. Aborts the whole script if the helper fails.
# The corpus path is user-supplied, so it is quoted to survive whitespace.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "Data preparation"
    local/wenetspeech_data_prep.sh \
        --train-subset "${set}" \
        "${wenetspeech_data_dir}" \
        data || exit 1
fi

# Stage 1: build the token vocabulary file from the training transcripts.
# ${dict} is assigned unconditionally, outside the stage guard.
dict=data/lang_char/vocab.txt
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Make a dictionary"
    echo "dictionary: ${dict}"
    mkdir -p "$(dirname "${dict}")"

    # Reserved entries at the top of the file, one per line:
    #   <blank>  line 0, used for "blank" in CTC
    #   <unk>    must be entry 1
    #   ▁        stands for space
    {
        echo "<blank>"
        echo "<unk>"
        echo "▁"
    } > "${dict}"

    # Tokenise the transcripts, drop the first (space-delimited) field of
    # every line, put one token per line, de-duplicate, then remove blank
    # lines and the space marker (already written above) before appending.
    utils/text2token.py -s 1 -n 1 --space "▁" "data/${train_set}/text" \
        | cut -d" " -f 2- \
        | tr " " "\n" \
        | sort \
        | uniq \
        | grep -a -v -e '^\s*$' \
        | grep -v "▁" \
        | awk '{print $0}' >> "${dict}" \
        || exit 1

    # <eos> is always the last entry.
    echo "<eos>" >> "${dict}"
fi

# Stage 2: compute CMVN statistics (skipped unless ${cmvn} is "true").
# A random 1/${cmvn_sampling_divisor} sample of the training wav.scp is
# written to wav.scp.sampled and passed to utils/compute_cmvn_stats.py,
# which writes data/${train_set}/mean_std.json.
# Only a sample of the training data is used here, to save time.
# BUG!!! We should use the segmented data for CMVN
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  echo "Compute cmvn"
  if [ "${cmvn}" = true ]; then
    # train_config is not assigned by this stage; fail with an actionable
    # message instead of the bare "unbound variable" abort from `set -u`.
    if [ -z "${train_config:-}" ]; then
      echo "$0: train_config is not set; set it to the config file required by utils/compute_cmvn_stats.py" >&2
      exit 1
    fi

    # Number of entries in the training list (redirect avoids an extra cat).
    full_size=$(wc -l < "data/${train_set}/wav.scp") || exit 1
    sampling_size=$((full_size / cmvn_sampling_divisor))

    shuf -n "${sampling_size}" "data/${train_set}/wav.scp" \
      > "data/${train_set}/wav.scp.sampled" || exit 1

    python3 utils/compute_cmvn_stats.py \
      --num_workers 16 \
      --train_config "${train_config}" \
      --in_scp "data/${train_set}/wav.scp.sampled" \
      --out_cmvn "data/${train_set}/mean_std.json" \
      || exit 1
  fi
fi

# Stage 3: build shards for the dev, test and training sets.
# For every set, utils/make_filted_shard_list.py is given the set's
# segments / wav.scp / text files, the absolute shard directory and the
# path of the data.list to produce.
# A failure in any set now aborts the script; previously it was ignored and
# the script still reported success after a partial shard build.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  echo "Making shards, please wait..."
  RED='\033[0;31m'
  NOCOLOR='\033[0m'
  echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
  echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
  # test_sets holds a space-separated list, so the loop words are left
  # unquoted on purpose to let the shell split them.
  for x in ${dev_set} ${test_sets} ${train_set}; do
    dst="${shards_dir}/${x}"
    mkdir -p "${dst}" || exit 1
    utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
      --do_filter --num_node 1 --num_gpus_per_node 8 \
      --num_threads 32 --segments "data/${x}/segments" \
      "data/${x}/wav.scp" "data/${x}/text" \
      "$(realpath "${dst}")" "data/${x}/data.list" \
      || exit 1
  done
fi

# Reached only when no earlier stage called exit; report completion and
# return success to the caller.
printf '%s\n' "Wenetspeech data preparation done."
exit 0
add wenetspeech egs 3 years ago			`#!/bin/bash`

			`# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)`
			`# NPU, ASLP Group (Author: Qijie Shao)`
add training scripts 2 years ago			`#`
			`# Modified from wenet(https://github.com/wenet-e2e/wenet)`
add wenetspeech egs 3 years ago
			`stage=-1`
			`stop_stage=100`

			`# Use your own data path. You need to download the WenetSpeech dataset by yourself.`
			`wenetspeech_data_dir=./wenetspeech`
			`# Make sure you have 1.2T for ${shards_dir}`
			`shards_dir=./wenetspeech_shards`

			`#wenetspeech training set`
			`set=L`
			train_set=train_`echo $set \| tr 'A-Z' 'a-z'`
			`dev_set=dev`
			`test_sets="test_net test_meeting"`

			`cmvn=true`
			`cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn`


			`. ${MAIN_ROOT}/utils/parse_options.sh \|\| exit 1;`
			`set -u`
			`set -o pipefail`


			`mkdir -p data`
fix dataset dir in data.sh 3 years ago			`TARGET_DIR=${MAIN_ROOT}/dataset`
add wenetspeech egs 3 years ago			`mkdir -p ${TARGET_DIR}`

add training scripts 2 years ago			`if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then`
add wenetspeech egs 3 years ago			`# download data`
			`echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."`
			`exit 0;`
			`fi`

			`if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then`
			`echo "Data preparation"`
			`local/wenetspeech_data_prep.sh \`
			`--train-subset $set \`
			`$wenetspeech_data_dir \`
			`data \|\| exit 1;`
			`fi`

add training scripts 2 years ago			`dict=data/lang_char/vocab.txt`
add wenetspeech egs 3 years ago			`if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then`
add training scripts 2 years ago			`echo "Make a dictionary"`
			`echo "dictionary: ${dict}"`
			`mkdir -p $(dirname $dict)`
			`echo "<blank>" > ${dict} # 0 will be used for "blank" in CTC`
			`echo "<unk>" >> ${dict} # <unk> must be 1`
			`echo "▁" >> ${dict} # ▁ is for space`
			`utils/text2token.py -s 1 -n 1 --space "▁" data/${train_set}/text \`
			`\| cut -f 2- -d" " \| tr " " "\n" \`
			`\| sort \| uniq \| grep -a -v -e '^\s*$' \`
			`\| grep -v "▁" \`
			`\| awk '{print $0}' >> ${dict} \`
			`\|\| exit 1;`
			`echo "<eos>" >> $dict`
add wenetspeech egs 3 years ago			`fi`

			`if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then`
add training scripts 2 years ago			`echo "Compute cmvn"`
			`# Here we use all the training data, you can sample some some data to save time`
			`# BUG!!! We should use the segmented data for CMVN`
			`if $cmvn; then`
			full_size=`cat data/${train_set}/wav.scp \| wc -l`
			`sampling_size=$((full_size / cmvn_sampling_divisor))`
			`shuf -n $sampling_size data/$train_set/wav.scp \`
			`> data/$train_set/wav.scp.sampled`
			`python3 utils/compute_cmvn_stats.py \`
			`--num_workers 16 \`
			`--train_config $train_config \`
			`--in_scp data/$train_set/wav.scp.sampled \`
			`--out_cmvn data/$train_set/mean_std.json \`
			`\|\| exit 1;`
			`fi`
			`fi`
add wenetspeech egs 3 years ago
add training scripts 2 years ago			`if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then`
			`echo "Making shards, please wait..."`
			`RED='\033[0;31m'`
			`NOCOLOR='\033[0m'`
			`echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"`
			`echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"`
			`for x in $dev_set $test_sets ${train_set}; do`
			`dst=$shards_dir/$x`
			`mkdir -p $dst`
			`utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \`
			`--do_filter --num_node 1 --num_gpus_per_node 8 \`
			`--num_threads 32 --segments data/$x/segments \`
			`data/$x/wav.scp data/$x/text \`
			`$(realpath $dst) data/$x/data.list`
			`done`
add wenetspeech egs 3 years ago			`fi`

add training scripts 2 years ago			`echo "Wenetspeech data preparation done."`
add wenetspeech egs 3 years ago			`exit 0`