parent
f5369abdbe
commit
104743cccc
@ -0,0 +1,58 @@
|
||||
#!/bin/bash

# To be run from one directory above this script.
#
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh. It takes as input the files
#   data/local/lm/text
#   data/local/dict/lexicon.txt
# and trains an interpolated Kneser-Ney 3-gram LM with SRILM, writing
# data/local/lm/lm.arpa and reporting heldout perplexity.

. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

for f in "$text" "$lexicon"; do
  # BUGFIX: the original tested "$x" -- an unset variable -- so a missing
  # input file was never detected; test the loop variable "$f" instead.
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# Check SRILM tools
if ! which ngram-count > /dev/null; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

dir=data/local/lm
mkdir -p $dir

cleantext=$dir/text.no_oov

# Replace every word not present in the lexicon with <SPOKEN_NOISE>
# (simple OOV handling before LM training).
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

# Word counts over the transcript body (field 1 is the utterance-id,
# so counting starts at field 2).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# Vocabulary for ngram-count -limit-vocab: every observed word plus the
# sentence-boundary markers.
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist

heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
mkdir -p $dir
# Strip the utterance-id column; the first $heldout_sent sentences form the
# heldout set, the rest the training set.
# NOTE(review): "tail -n +$heldout_sent" starts AT line $heldout_sent, so
# train and heldout overlap by one sentence; kept as-is for comparability
# with existing results -- confirm before "fixing".
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $dir/train

# Train the LM and evaluate perplexity on the heldout set.
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
|
@ -0,0 +1,52 @@
|
||||
#!/bin/bash

# Build a TLG decoding graph for aishell:
#   stage 0: prepare the dict (units + lexicon)
#   stage 1: train an n-gram LM (SRILM recipe or utils/ngram_train.sh)
#   stage 2: compile the lexicon/token FSTs and the final TLG
#
# Requires MAIN_ROOT to be set (e.g. via path.sh) and the corpus prepared
# under ${MAIN_ROOT}/examples/dataset/${corpus}.

set -eo pipefail

stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

source utils/parse_options.sh

data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file $unit_file \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p $lm
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    # Keep only transcript lines whose utterance ids occur in the train set.
    utils/filter_scp.pl data/train/text \
        $text > $lm/text
    # BUGFIX: quote the variable and use the portable "=" comparison; the
    # original unquoted `[ $lmtype == 'srilm' ]` breaks if lmtype is empty
    # and `==` inside `[` is a bashism.
    if [ "${lmtype}" = 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "Aishell build TLG done."
exit 0
|
@ -0,0 +1 @@
|
||||
../../../utils
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
#                     Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch

# Option state: --exclude inverts the filter, -f <n> selects which
# (1-based) field of the input lines to match against the id list.
$exclude = 0;
$field = 1;
$shifted = 0;

# Manual option loop: keeps consuming leading options in any order until a
# full pass makes no progress ($shifted stays 0).
do {
  $shifted=0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted=1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted=1
  }
} while ($shifted);

# After options: expect "id_list" and optionally "in.scp" (else stdin).
if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
  "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
  "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
  "only the lines that were *not* in id_list.\n" .
  "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
  "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
  "-f option, add 1 to the argument.\n" .
  "See also: utils/filter_scp.pl .\n";
}

# Load the id list: only the first whitespace-separated field of each line
# is recorded in %seen.
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  @A = split;
  @A>=1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    # Grab the first non-whitespace token without a full split (fast path).
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  # General case: split the line and match on the $field'th field
  # (1-based on the command line, hence $field-1 below).
  while(<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}

# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
|
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Manifest file to key-value files."""
|
||||
import argparse
|
||||
import functools
|
||||
from pathlib import Path
|
||||
|
||||
from utils.utility import add_arguments
|
||||
from utils.utility import print_arguments
|
||||
from utils.utility import read_manifest
|
||||
|
||||
|
||||
def main(args):
    """Dump a JSON-lines manifest into Kaldi-style key/value files.

    Writes three files under ``args.output_path``:
      * ``wav.scp``  -- lines of ``<utt> <feat-path>``
      * ``duration`` -- lines of ``<utt> <duration>``
      * ``text``     -- lines of ``<utt> <transcript>``

    Only manifest entries whose ``feat`` path ends in ``.wav`` are written,
    but every manifest entry is counted in the printed total (original
    behavior, kept as-is).

    Args:
        args: parsed argparse namespace with ``manifest_path`` and
            ``output_path`` attributes.
    """
    print_arguments(args, globals())

    count = 0

    # Output files live directly under the (pre-existing) output directory.
    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # e.g. '.wav'
            text = line_json['text']
            # presumably feat_shape is (duration, feat_dim) -- only the
            # duration is needed here; TODO confirm against manifest writer.
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            # CLEANUP: the original also unpacked feat_dim and the optional
            # 'token'/'token_id'/'token_shape' fields into locals that were
            # never used; those dead assignments are removed.

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
                fdur.write(f"{utt} {dur}\n")
                ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    # Registration helper bound to this parser; exact signature comes from
    # utils.utility.add_arguments -- presumably (name, type, default, help);
    # verify against that module.
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of manifest to compute normalizer's mean and stddev.")
    add_arg('output_path', str,
            'data/train',
            "dir path to dump wav.scp/duaration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)
|
Loading…
Reference in new issue