Merge pull request #3292 from zh794390558/mfa

refactor mfa scripts
pull/3298/head
Hui Zhang 1 year ago committed by GitHub
commit 2fe97f2e3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@ -1,29 +1,32 @@
EXP_DIR=exp
exp=exp
data=data
mkdir -p $exp
mkdir -p $data
mkdir -p $EXP_DIR
LEXICON_NAME='simple'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon..."
python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone
python local/generate_lexicon.py "$exp/$LEXICON_NAME" --with-r --with-tone
echo "lexicon done"
fi
if [ ! -d $EXP_DIR/baker_corpus ]; then
if [ ! -d $exp/baker_corpus ]; then
echo "reorganizing baker corpus..."
python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio
echo "reorganization done. Check output in $EXP_DIR/baker_corpus."
python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$exp/baker_corpus --resample-audio
echo "reorganization done. Check output in $exp/baker_corpus."
echo "audio files are resampled to 16kHz"
# Report where the transcriptions were saved (message typo "namd" corrected to "name").
echo "transcription for each audio file is saved with the same name in $EXP_DIR/baker_corpus "
echo "transcription for each audio file is saved with the same name in $exp/baker_corpus "
fi
echo "detecting oov..."
python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon"
python local/detect_oov.py $exp/baker_corpus $exp/"$LEXICON_NAME.lexicon"
echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@ -37,11 +40,15 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi
# Prepend the bundled MFA bin directory to PATH instead of replacing PATH
# outright: overwriting it drops every system directory, so any external
# command looked up after this line (and any child process) can no longer
# be resolved.
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then
if [ ! -d "$exp/baker_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align
PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
$exp/baker_corpus "$exp/$LEXICON_NAME.lexicon" $exp/baker_alignment -o $exp/baker_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!"
echo "results: $EXP_DIR/baker_alignment"
echo "model: $EXP_DIR/baker_model"
echo "results: $exp/baker_alignment"
echo "model: $exp/baker_model"
fi

@ -1,16 +1,15 @@
EXP_DIR=exp
exp=exp
mkdir -p $EXP_DIR
mkdir -p $exp
LEXICON_NAME='canton'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon and training data..."
python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$exp/$LEXICON_NAME.lexicon" --output_wavlabs "$exp/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
echo "lexicon and training data done"
fi
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@ -24,11 +23,14 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi
# Prepend the bundled MFA bin directory to PATH instead of replacing PATH
# outright: overwriting it drops every system directory, so any external
# command looked up after this line (and any child process) can no longer
# be resolved.
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
if [ ! -d "$EXP_DIR/canton_alignment" ]; then
if [ ! -d "$exp/canton_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/canton_alignment -o $EXP_DIR/canton_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align
PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
"$exp/$LEXICON_NAME"_wavlabs "$exp/$LEXICON_NAME.lexicon" $exp/canton_alignment -o $exp/canton_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!"
echo "results: $EXP_DIR/canton_alignment"
echo "model: $EXP_DIR/canton_model"
echo "results: $exp/canton_alignment"
echo "model: $exp/canton_model"
fi

@ -33,6 +33,18 @@ ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.txt
```
```
==> data/lang_char/input.txt <==
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
nor is mister quilter's manner less interesting than his matter
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
on the general principles of art mister quilter writes with equal lucidity
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/input.bpe <==
▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l
▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter
@ -58,17 +70,6 @@ painting he tells us is of a different quality to mathematics and finish in art
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/input.txt <==
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
nor is mister quilter's manner less interesting than his matter
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
on the general principles of art mister quilter writes with equal lucidity
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/train_unigram100_units.txt <==
<blank> 0

Loading…
Cancel
Save