PaddleSpeech/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh

#!/usr/bin/env bash

# Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
#                 Seasalt AI, Inc (Author: Guoguo Chen)
#                 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
#                 NPU, ASLP Group (Author: Qijie Shao)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort on the first failing command, and make a pipeline fail when any of
# its stages fails (not only the last one).
set -e
set -o pipefail

# Default values of the command-line options. The usage text further down
# documents them as --stage, --prefix and --train-subset. A stage is executed
# only when its number is >= $stage.
stage=1
prefix=
train_subset=L

# Sourced relative to the current working directory.
# NOTE(review): presumably the Kaldi-style option parser, which overrides the
# variables above from "--name value" arguments and removes those arguments
# from the positional parameters -- confirm against tools/parse_options.sh.
. ./tools/parse_options.sh || exit 1;

filter_by_id () {
  idlist=$1
  input=$2
  output=$3
  field=1
  if [ $# -eq 4 ]; then
    field=$4
  fi
  cat $input | perl -se '
    open(F, "<$idlist") || die "Could not open id-list file $idlist";
    while(<F>) {
      @A = split;
      @A>=1 || die "Invalid id-list file line $_";
      $seen{$A[0]} = 1;
    }
    while(<>) {
      @A = split;
      @A > 0 || die "Invalid file line $_";
      @A >= $field || die "Invalid file line $_";
      if ($seen{$A[$field-1]}) {
        print $_;
      }
    }' -- -idlist="$idlist" -field="$field" > $output ||\
  (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}

# Create in <dest_dir> the subset of the data directory <src_dir> that is
# selected by <utt_list>.
#
# Arguments:
#   $1  utterance list; the first column of each line is the id to keep
#   $2  source directory holding utt2dur, text, segments and wav.scp
#   $3  destination directory (created if missing)
#
# utt2dur, text and segments are filtered on their first column. wav.scp is
# filtered on its first column against the distinct values found in the second
# column of the filtered segments file (kept in a temporary file named "reco",
# presumably recording ids). Any failure terminates the script with status 1.
subset_data_dir () {
  local utt_list=$1
  local src_dir=$2
  local dest_dir=$3
  local name
  mkdir -p "$dest_dir" || exit 1
  # wav.scp text segments utt2dur
  for name in utt2dur text segments; do
    filter_by_id "$utt_list" "$src_dir/$name" "$dest_dir/$name" || {
      echo "$0: subset_data_dir() error: $src_dir/$name" >&2
      exit 1
    }
  done
  # Checked explicitly: errexit is not in effect inside a function that the
  # caller invokes on the left-hand side of "||".
  awk '{print $2}' < "$dest_dir/segments" | sort | uniq > "$dest_dir/reco" \
    || exit 1
  filter_by_id "$dest_dir/reco" "$src_dir/wav.scp" "$dest_dir/wav.scp" || {
    echo "$0: subset_data_dir() error: $src_dir/wav.scp" >&2
    exit 1
  }
  rm -f -- "$dest_dir/reco"
}

# Exactly two positional arguments are required: the WenetSpeech dataset
# directory and the output data directory. Anything else prints the usage
# text and stops with status 1.
[[ $# -eq 2 ]] || {
  printf '%s\n' \
    "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>" \
    " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/" \
    "" \
    "This script takes the WenetSpeech source directory, and prepares the" \
    "WeNet format data directory." \
    "  --prefix <prefix>                # Prefix for output data directory." \
    "  --stage <stage>                  # Processing stage." \
    "  --train-subset <L|M|S|W>     # Train subset to be created."
  exit 1
}

wenetspeech_dir="$1"
data_dir="$2"

# Lookup table from a subset label (the value searched for in utt2subsets)
# to the name of the data directory generated for it.
declare -A subsets
subsets[L]="train_l"
subsets[M]="train_m"
subsets[S]="train_s"
subsets[W]="train_w"
subsets[DEV]="dev"
subsets[TEST_NET]="test_net"
subsets[TEST_MEETING]="test_meeting"

# A non-empty prefix gets a trailing underscore so it can be prepended
# directly to directory names; an empty prefix stays empty.
if [ -n "$prefix" ]; then
  prefix="${prefix}_"
else
  prefix=""
fi

# Directory that receives the metadata extracted from WenetSpeech.json; it is
# also the source directory from which stage 2 derives the subsets.
corpus_dir=$data_dir/${prefix}corpus/

# Stage 1: extract the corpus-wide metadata files.
# All path expansions are quoted so that directories containing whitespace or
# glob characters are handled correctly.
if [ "$stage" -le 1 ]; then
  echo "$0: Extract meta into $corpus_dir"
  # Sanity check.
  if [ ! -f "$wenetspeech_dir/WenetSpeech.json" ]; then
    echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" >&2
    exit 1
  fi
  if [ ! -d "$wenetspeech_dir/audio" ]; then
    echo "$0: Please download $wenetspeech_dir/audio!" >&2
    exit 1
  fi

  # mkdir -p succeeds when the directory already exists.
  mkdir -p "$corpus_dir" || exit 1

  # Files to be created:
  # wav.scp text segments utt2dur
  # NOTE(review): stage 2 additionally reads utt2subsets from this directory,
  # so extract_meta.py presumably writes that file too -- confirm.
  python3 local/extract_meta.py \
    "$wenetspeech_dir/WenetSpeech.json" "$corpus_dir" || exit 1
fi

# Stage 2: build one data directory per requested subset.
# For each label, the lines of utt2subsets that carry the label in any column
# after the first are written to an utterance list, and subset_data_dir then
# filters the corpus-wide files with that list.
if [ "$stage" -le 2 ]; then
  echo "$0: Split data to train, dev, test_net, and test_meeting"
  if [ ! -f "$corpus_dir/utt2subsets" ]; then
    echo "$0: No such file $corpus_dir/utt2subsets!" >&2
    exit 1
  fi
  # $train_subset is deliberately left unquoted: an empty value contributes
  # no label and a space-separated value contributes several, as before.
  # shellcheck disable=SC2086
  for label in $train_subset DEV TEST_NET TEST_MEETING; do
    # "+set" expands to a non-empty word only when the key exists in the table.
    if [ -z "${subsets[$label]+set}" ]; then
      echo "$0: Subset $label is not defined in WenetSpeech.json." >&2
      exit 1
    fi
    subset=${subsets[$label]}
    mkdir -p "$data_dir/${prefix}$subset" || exit 1
    # The file is fed through stdin rather than as an operand, so awk can never
    # mistake a path containing "=" for a variable assignment.
    awk -v s="$label" '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
      < "$corpus_dir/utt2subsets" \
      > "$corpus_dir/${prefix}${subset}_utt_list" || exit 1
    subset_data_dir "$corpus_dir/${prefix}${subset}_utt_list" \
      "$corpus_dir" "$data_dir/${prefix}$subset" || exit 1
  done
fi

echo "$0: Done"
add wenetspeech egs 3 years ago			`#!/usr/bin/env bash`

			`# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)`
			`# Seasalt AI, Inc (Author: Guoguo Chen)`
			`# Mobvoi Inc(Author: Di Wu, Binbin Zhang)`
			`# NPU, ASLP Group (Author: Qijie Shao)`

			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`set -e`
			`set -o pipefail`

			`stage=1`
			`prefix=`
			`train_subset=L`

			`. ./tools/parse_options.sh \|\| exit 1;`

			`filter_by_id () {`
			`idlist=$1`
			`input=$2`
			`output=$3`
			`field=1`
			`if [ $# -eq 4 ]; then`
			`field=$4`
			`fi`
			`cat $input \| perl -se '`
			`open(F, "<$idlist") \|\| die "Could not open id-list file $idlist";`
			`while(<F>) {`
			`@A = split;`
			`@A>=1 \|\| die "Invalid id-list file line $_";`
			`$seen{$A[0]} = 1;`
			`}`
			`while(<>) {`
			`@A = split;`
			`@A > 0 \|\| die "Invalid file line $_";`
			`@A >= $field \|\| die "Invalid file line $_";`
			`if ($seen{$A[$field-1]}) {`
			`print $_;`
			`}`
			`}' -- -idlist="$idlist" -field="$field" > $output \|\|\`
			`(echo "$0: filter_by_id() error: $input" && exit 1) \|\| exit 1;`
			`}`

			`subset_data_dir () {`
			`utt_list=$1`
			`src_dir=$2`
			`dest_dir=$3`
			`mkdir -p $dest_dir \|\| exit 1;`
			`# wav.scp text segments utt2dur`
			`filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur \|\|\`
			`(echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) \|\| exit 1;`
			`filter_by_id $utt_list $src_dir/text $dest_dir/text \|\|\`
			`(echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) \|\| exit 1;`
			`filter_by_id $utt_list $src_dir/segments $dest_dir/segments \|\|\`
			`(echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) \|\| exit 1;`
			`awk '{print $2}' $dest_dir/segments \| sort \| uniq > $dest_dir/reco`
			`filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp \|\|\`
			`(echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) \|\| exit 1;`
			`rm -f $dest_dir/reco`
			`}`

			`if [ $# -ne 2 ]; then`
			`echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"`
			`echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"`
			`echo ""`
			`echo "This script takes the WenetSpeech source directory, and prepares the"`
			`echo "WeNet format data directory."`
			`echo " --prefix <prefix> # Prefix for output data directory."`
			`echo " --stage <stage> # Processing stage."`
			`echo " --train-subset <L\|M\|S\|W> # Train subset to be created."`
			`exit 1`
			`fi`

			`wenetspeech_dir=$1`
			`data_dir=$2`

			`declare -A subsets`
			`subsets=(`
			`[L]="train_l"`
			`[M]="train_m"`
			`[S]="train_s"`
			`[W]="train_w"`
			`[DEV]="dev"`
			`[TEST_NET]="test_net"`
			`[TEST_MEETING]="test_meeting")`

			`prefix=${prefix:+${prefix}_}`

			`corpus_dir=$data_dir/${prefix}corpus/`
			`if [ $stage -le 1 ]; then`
			`echo "$0: Extract meta into $corpus_dir"`
			`# Sanity check.`
			`[ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\`
			`echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;`
			`[ ! -d $wenetspeech_dir/audio ] &&\`
			`echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;`

			`[ ! -d $corpus_dir ] && mkdir -p $corpus_dir`

			`# Files to be created:`
			`# wav.scp text segments utt2dur`
			`python3 local/extract_meta.py \`
			`$wenetspeech_dir/WenetSpeech.json $corpus_dir \|\| exit 1;`
			`fi`

			`if [ $stage -le 2 ]; then`
			`echo "$0: Split data to train, dev, test_net, and test_meeting"`
			`[ ! -f $corpus_dir/utt2subsets ] &&\`
			`echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;`
			`for label in $train_subset DEV TEST_NET TEST_MEETING; do`
			`if [ ! ${subsets[$label]+set} ]; then`
			`echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;`
			`fi`
			`subset=${subsets[$label]}`
			`[ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset`
			`cat $corpus_dir/utt2subsets \| \`
			`awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \`
			`> $corpus_dir/${prefix}${subset}_utt_list\|\| exit 1;`
			`subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \`
			`$corpus_dir $data_dir/${prefix}$subset \|\| exit 1;`
			`done`
			`fi`

			`echo "$0: Done"`