Merge pull request #3292 from zh794390558/mfa

refactor mfa scripts
pull/3298/head
Hui Zhang 1 year ago committed by GitHub
commit 2fe97f2e3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");

@ -1,29 +1,32 @@
EXP_DIR=exp exp=exp
data=data
mkdir -p $exp
mkdir -p $data
mkdir -p $EXP_DIR
LEXICON_NAME='simple' LEXICON_NAME='simple'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then MFA_DOWNLOAD_DIR=local/
if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon..." echo "generating lexicon..."
python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone python local/generate_lexicon.py "$exp/$LEXICON_NAME" --with-r --with-tone
echo "lexicon done" echo "lexicon done"
fi fi
if [ ! -d $EXP_DIR/baker_corpus ]; then if [ ! -d $exp/baker_corpus ]; then
echo "reorganizing baker corpus..." echo "reorganizing baker corpus..."
python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$exp/baker_corpus --resample-audio
echo "reorganization done. Check output in $EXP_DIR/baker_corpus." echo "reorganization done. Check output in $exp/baker_corpus."
echo "audio files are resampled to 16kHz" echo "audio files are resampled to 16kHz"
echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus " echo "transcription for each audio file is saved with the same namd in $exp/baker_corpus "
fi fi
echo "detecting oov..." echo "detecting oov..."
python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon" python local/detect_oov.py $exp/baker_corpus $exp/"$LEXICON_NAME.lexicon"
echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs." echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..." echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz) (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@ -37,11 +40,15 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi fi
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin" export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
if [ ! -d "$EXP_DIR/baker_alignment" ]; then
if [ ! -d "$exp/baker_alignment" ]; then
echo "Start MFA training..." echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
$exp/baker_corpus "$exp/$LEXICON_NAME.lexicon" $exp/baker_alignment -o $exp/baker_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!" echo "training done!"
echo "results: $EXP_DIR/baker_alignment" echo "results: $exp/baker_alignment"
echo "model: $EXP_DIR/baker_model" echo "model: $exp/baker_model"
fi fi

@ -1,16 +1,15 @@
EXP_DIR=exp exp=exp
mkdir -p $EXP_DIR mkdir -p $exp
LEXICON_NAME='canton' LEXICON_NAME='canton'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then MFA_DOWNLOAD_DIR=local/
if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon and training data..." echo "generating lexicon and training data..."
python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$exp/$LEXICON_NAME.lexicon" --output_wavlabs "$exp/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
echo "lexicon and training data done" echo "lexicon and training data done"
fi fi
MFA_DOWNLOAD_DIR=local/
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..." echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz) (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@ -24,11 +23,14 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi fi
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin" export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
if [ ! -d "$EXP_DIR/canton_alignment" ]; then if [ ! -d "$exp/canton_alignment" ]; then
echo "Start MFA training..." echo "Start MFA training..."
mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/canton_alignment -o $EXP_DIR/canton_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
"$exp/$LEXICON_NAME"_wavlabs "$exp/$LEXICON_NAME.lexicon" $exp/canton_alignment -o $exp/canton_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!" echo "training done!"
echo "results: $EXP_DIR/canton_alignment" echo "results: $exp/canton_alignment"
echo "model: $EXP_DIR/canton_model" echo "model: $exp/canton_model"
fi fi

@ -33,6 +33,18 @@ ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.txt
``` ```
``` ```
==> data/lang_char/input.txt <==
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
nor is mister quilter's manner less interesting than his matter
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
on the general principles of art mister quilter writes with equal lucidity
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/input.bpe <== ==> data/lang_char/input.bpe <==
▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l ▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l
▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter ▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter
@ -58,17 +70,6 @@ painting he tells us is of a different quality to mathematics and finish in art
as for etchings they are of two kinds british and foreign as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/input.txt <==
mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
nor is mister quilter's manner less interesting than his matter
he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
on the general principles of art mister quilter writes with equal lucidity
painting he tells us is of a different quality to mathematics and finish in art is adding more fact
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/train_unigram100_units.txt <== ==> data/lang_char/train_unigram100_units.txt <==
<blank> 0 <blank> 0

Loading…
Cancel
Save