parent
f5369abdbe
commit
104743cccc
@ -0,0 +1,58 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# To be run from one directory above this script.
|
||||||
|
. ./path.sh
|
||||||
|
|
||||||
|
# Inputs: the LM training transcript and the pronunciation lexicon.
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

# Abort early when an input is missing. The test must look at the loop
# variable "f" (quoted): an unquoted, unset variable collapses the test to
# `[ ! -f ]`, which is always false and lets missing files slip through.
for f in "$text" "$lexicon"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done
|
||||||
|
|
||||||
|
# Check SRILM tools
# ngram-count is used as the probe for the SRILM installation. The lookup uses
# the shell builtin "command -v" so it does not rely on an external "which".
if ! command -v ngram-count > /dev/null; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi
|
||||||
|
|
||||||
|
# This script takes no arguments. It assumes you have already run
|
||||||
|
# aishell_data_prep.sh.
|
||||||
|
# It takes as input the files
|
||||||
|
# data/local/lm/text
|
||||||
|
# data/local/dict/lexicon.txt
|
||||||
|
dir=data/local/lm
mkdir -p "$dir"

cleantext=$dir/text.no_oov

# Write a copy of the transcript in which every field that is not the first
# field of some lexicon line is replaced by <SPOKEN_NOISE>. Field 1 of each
# transcript line goes through the same lookup. Each output field is followed
# by one space.
# The transcript is attached by redirection rather than "cat |" so that an
# unreadable input makes the command fail and reach the "exit 1" below.
awk -v lex="$lexicon" '
  BEGIN {
    # Load the lexicon: remember the first field of every line.
    while ((getline < lex) > 0) {
      seen[$1] = 1
    }
  }
  {
    for (n = 1; n <= NF; n++) {
      if (seen[$n]) {
        printf("%s ", $n)
      } else {
        printf("<SPOKEN_NOISE> ")
      }
    }
    printf("\n")
  }' < "$text" > "$cleantext" || exit 1
|
||||||
|
|
||||||
|
# Emit every field of the cleaned transcript except field 1 of each line,
# one field per output line.
transcript_words() {
  awk '{ for (n = 2; n <= NF; n++) print $n }' "$cleantext"
}

# Frequencies over the transcript alone, highest count first.
transcript_words | sort | uniq -c | sort -nr > "$dir/word.counts" || exit 1

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
{
  transcript_words
  grep -w -v '!SIL' "$lexicon" | awk '{ print $1 }'
} | sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1

# Word list: column 2 of the unigram counts followed by the sentence
# boundary symbols.
{
  awk '{ print $2 }' "$dir/unigram.counts"
  echo "<s>"
  echo "</s>"
} > "$dir/wordlist"
|
||||||
|
|
||||||
|
heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
mkdir -p "$dir"

# Print each line of the cleaned transcript without its first field, the
# remaining fields joined by single spaces. Lines that have only one field
# produce no output line. Fields are printed through an explicit "%s" format
# so that a "%" inside a word is never interpreted as a format directive.
strip_first_field() {
  awk '{
    for (n = 2; n <= NF; n++) {
      printf "%s%s", $n, (n < NF ? " " : "\n")
    }
  }' "$cleantext"
}

# The first $heldout_sent sentences are held out for the perplexity check.
strip_first_field | head -n "$heldout_sent" > "$dir/heldout"

# NOTE(review): "tail -n +N" starts AT line N, so line $heldout_sent lands in
# both heldout and train. Left unchanged because moving the boundary would
# alter the trained LM -- confirm whether the overlap is intended.
strip_first_field | tail -n +"$heldout_sent" > "$dir/train"

# Train a 3-gram LM restricted to the word list, then report perplexity on the
# held-out set. Stop if training fails so that "ngram" never evaluates a
# missing or stale lm.arpa.
ngram-count -text "$dir/train" -order 3 -limit-vocab -vocab "$dir/wordlist" -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm "$dir/lm.arpa" || exit 1
ngram -lm "$dir/lm.arpa" -ppl "$dir/heldout"
|
@ -0,0 +1,52 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Stop on the first failing command, including failures inside pipelines.
set -e -o pipefail

# Defaults. Presumably each can be overridden as "--name value" through
# utils/parse_options.sh -- confirm against that helper.
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

. utils/parse_options.sh

# Corpus resources, located under ${MAIN_ROOT}.
data="${MAIN_ROOT}/examples/dataset/${corpus}"
lexicon="${data}/resource_aishell/lexicon.txt"
text="${data}/data_aishell/transcript/aishell_transcript_v0.8.txt"
|
||||||
|
|
||||||
|
# Stage 0 runs when 0 lies within [stage, stop_stage].
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  # 7.1 Prepare dict
  unit_file=data/vocab.txt
  mkdir -p data/local/dict
  # Keep a copy of the unit list next to the generated lexicon.
  cp "${unit_file}" data/local/dict/units.txt

  prepare_dict_args=(
    --unit_file "${unit_file}"
    --in_lexicon "${lexicon}"
    --out_lexicon data/local/dict/lexicon.txt
  )
  utils/fst/prepare_dict.py "${prepare_dict_args[@]}"
fi
|
||||||
|
|
||||||
|
# Stage 1 runs when 1 lies within [stage, stop_stage].
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  # 7.2 Train lm
  lm=data/local/lm
  mkdir -p data/train
  mkdir -p "${lm}"

  # Dump the training manifest into key-value files under data/train.
  utils/manifest_key_value.py --manifest_path data/manifest.train --output_path data/train

  # Keep only the corpus transcript lines whose first field is listed in
  # data/train/text.
  utils/filter_scp.pl data/train/text "${text}" > "${lm}/text"

  # "srilm" selects the SRILM recipe; any other value uses ngram_train.sh.
  case "${lmtype}" in
    srilm)
      local/aishell_train_lms.sh
      ;;
    *)
      utils/ngram_train.sh --order 3 "${lm}/text" "${lm}/lm.arpa"
      ;;
  esac
fi
|
||||||
|
|
||||||
|
# Stage 2 runs when 2 lies within [stage, stop_stage].
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  # 7.3 Build decoding TLG
  utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/tmp data/local/lang
  utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1
fi

echo "Aishell build TLG done."
exit 0
|
@ -0,0 +1 @@
|
|||||||
|
../../../utils
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
# Copyright 2010-2012 Microsoft Corporation
|
||||||
|
# Johns Hopkins University (author: Daniel Povey)
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||||
|
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||||
|
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||||
|
# See the Apache 2 License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
# This script takes a list of utterance-ids or any file whose first field
|
||||||
|
# of each line is an utterance-id, and filters an scp
|
||||||
|
# file (or any file whose "n-th" field is an utterance id), printing
|
||||||
|
# out only those lines whose "n-th" field is in id_list. The index of
|
||||||
|
# the "n-th" field is 1, by default, but can be changed by using
|
||||||
|
# the -f <n> switch
|
||||||
|
|
||||||
|
$exclude = 0;
$field = 1;

# Consume leading options, in whatever order and number they appear:
#   --exclude   print the lines that are NOT in id_list
#   -f <n>      filter on field <n> instead of the first field
for (;;) {
  my $opt = $ARGV[0];
  last unless defined $opt;
  if ($opt eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
  } elsif ($opt eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
  } else {
    last;
  }
}

# After the options exactly one or two positional arguments must remain:
# the id list and, optionally, the file to filter.
unless (@ARGV == 1 || @ARGV == 2) {
  die join("",
    "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n",
    "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n",
    "Note: only the first field of each line in id_list matters. With --exclude, prints\n",
    "only the lines that were *not* in id_list.\n",
    "Caution: previously, the -f option was interpreted as a zero-based field index.\n",
    "If your older scripts (written before Oct 2014) stopped working and you used the\n",
    "-f option, add 1 to the argument.\n",
    "See also: utils/filter_scp.pl .\n");
}
|
||||||
|
|
||||||
|
|
||||||
|
# Load the id list: the first whitespace-separated field of every line becomes
# a key of %seen. A line without any field is rejected.
$idlist = shift @ARGV;
# The two-argument "<$idlist" form is kept on purpose: with it an id list
# given as "-" is read from standard input.
open(my $idlist_fh, "<$idlist") || die "Could not open id-list file $idlist: $!";
while (my $entry = <$idlist_fh>) {
  my @fields = split(" ", $entry);
  @fields >= 1 || die "Invalid id-list file line $entry";
  $seen{$fields[0]} = 1;
}
# Release the handle before the main input is read.
close($idlist_fh);
|
||||||
|
|
||||||
|
# Return true when a line keyed by $key should be printed: normally the key
# has to be in %seen; with --exclude it has to be absent from it.
sub wanted_key {
  my ($key) = @_;
  return $exclude ? !defined $seen{$key} : $seen{$key};
}

if ($field == 1) { # Treat this as special case, since it is common.
  while (defined(my $line = <>)) {
    # The first run of non-blank characters is the key.
    $line =~ m/\s*(\S+)\s*/ || die "Bad line $line, could not get first field.";
    my $key = $1;
    print $line if wanted_key($key);
  }
} else {
  while (defined(my $line = <>)) {
    my @cols = split(" ", $line);
    @cols > 0 || die "Invalid scp file line $line";
    @cols >= $field || die "Invalid scp file line $line";
    # $field is one-based.
    print $line if wanted_key($cols[$field - 1]);
  }
}
|
||||||
|
|
||||||
|
# tests:
|
||||||
|
# the following should print "foo 1"
|
||||||
|
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
|
||||||
|
# the following should print "bar 2".
|
||||||
|
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
|
@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Manifest file to key-value files."""
|
||||||
|
import argparse
|
||||||
|
import functools
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from utils.utility import add_arguments
|
||||||
|
from utils.utility import print_arguments
|
||||||
|
from utils.utility import read_manifest
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Dump a manifest into `wav.scp`, `duration` and `text` key-value files.

    Each output line has the form "<utt> <value>".

    Args:
        args: parsed command-line namespace. `manifest_path` is handed to
            `read_manifest`; `output_path` is the directory the three files
            are opened in (it is not created here, so it must already exist).
    """
    print_arguments(args, globals())

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    count = 0
    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, \
            text_scp.open('w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            text = line_json['text']
            # The first entry of feat_shape is what gets written as duration.
            dur = line_json['feat_shape'][0]

            # NOTE(review): wav.scp gets an entry only for '.wav' features,
            # while duration and text are written for every manifest entry --
            # confirm this nesting against the upstream file.
            if Path(feat).suffix == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the command line: two string options, each with a default value
    # and a help text, registered through the project's add_arguments helper.
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of manifest to convert to key-value files.")
    add_arg('output_path', str,
            'data/train',
            "dir path to dump wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)
|
Loading…
Reference in new issue