From f7651711119b6f881f5d5fc77bf9ca80fbcdb50b Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Tue, 23 Nov 2021 07:52:17 +0000
Subject: [PATCH 01/40] add the readme for the run.sh in aishell asr1

---
 examples/aishell/asr1/READEME.md | 301 ++++++++++++++++++
 .../asr1/{README.md => model_performance.md} | 0
 examples/aishell/asr1/run.sh | 16 +-
 3 files changed, 309 insertions(+), 8 deletions(-)
 create mode 100644 examples/aishell/asr1/READEME.md
 rename examples/aishell/asr1/{README.md => model_performance.md} (100%)

diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md
new file mode 100644
index 00000000..3afd1954
--- /dev/null
+++ b/examples/aishell/asr1/READEME.md
@@ -0,0 +1,301 @@
+# Conformer ASR with Aishell
+
This example contains code used to train a [Conformer](http://arxiv.org/abs/2008.03802) model with the [Aishell dataset](http://www.openslr.org/resources/33).

## Overview

All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its own function.

| Stage | Function |
| :---- | :----------------------------------------------------------- |
| 0 | Process data. It includes:<br>
(1) Download the dataset
(2) Calculate the CMVN of the train dataset<br>
(3) Get the vocabulary file
(4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
| 5 | Infer the single audio file |
| 51 | (Not supported yet) Transform the dynamic graph model to a static graph model |
| 101 | (Needs further installation) Train language model and build TLG |

You can choose to run a range of stages by setting the ```stage``` and ```stop_stage``` variables.

For example, if you want to execute the code in stage 2 and stage 3, you can run this script:

```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set ```stage``` equal to ```stop_stage``` to only run one stage.
For example, if you only want to run ```stage 0```, you can use the script below:

```bash
bash run.sh --stage 0 --stop_stage 0
```



The document below will describe the scripts in ```run.sh``` in detail.

## The environment variables

The ```path.sh``` script contains the environment variables.
```bash
source path.sh
```
This script needs to be run first.

And another script is also needed:

```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```

It supports passing options in the ```--variable value``` form to the shell scripts.



## The local variables

Some local variables are set in ```run.sh```.
```gpus``` denotes the GPUs you want to use. If you set ```gpus=```, it means you only use the CPU.

```stage``` denotes the number of the stage you want to start from in the experiments.
```stop_stage``` denotes the number of the stage you want to end at in the experiments.

```conf_path``` denotes the config path of the model.

```avg_num``` denotes the number K of the top-K models you want to average to get the final model.

```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer"

```audio_file``` denotes the file path of the single file you want to infer in stage 5

You can set the local variables when you use ```run.sh```.

For example, you can set ```gpus``` and ```avg_num``` on the command line:

```bash
bash run.sh --gpus 0,1 --avg_num 20
```



## Stage 0: Data processing

To use this example, you need to process the data first, and you can use stage 0 in ```run.sh``` to do this. The code is shown below:
```bash
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
 fi
```

Stage 0 is for processing the data.

If you only want to process the data, you can run:

```bash
bash run.sh --stage 0 --stop_stage 0
```

You can also just run these scripts in your command line.

```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
```

After processing the data, the ``data`` directory will look like this:

```bash
data/
├── dev.meta
├── manifest.dev.raw
├── manifest.test.raw
├── manifest.train.raw
├── mean_std.json
├── test.meta
├── train.meta
└── vocab.txt
```



## Stage 1: Model training

If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
```

If you want to train the model, you can use the script below to execute stage 0 and stage 1:
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```



## Stage 2: Top-k model averaging

After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi
```
The ```avg.sh``` script is in ```../../../utils/```, which is defined in ```path.sh```.
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:

```bash
bash run.sh --stage 0 --stop_stage 2
```

or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
```



## Stage 3: Model Testing

To know the performance of the model, a test stage is needed. The code of the test stage is shown below:

```bash
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
```

If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:

```bash
bash run.sh --stage 0 --stop_stage 3
```

or you can run these scripts in the command line (only use CPU).

```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```



## Stage 4: CTC alignment

If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:

```bash
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
```

If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, stage 3, and stage 4:

```bash
bash run.sh --stage 0 --stop_stage 4
```

or if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):

```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```

or you can also use these scripts in the command line (only use CPU).
```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```



## Stage 5: Single audio file inference

In some situations, you may want to use the trained model to do inference on a single audio file. You can use stage 5 for this. The code is shown below:

```bash
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
```

You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model by the script below:

```
wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz
tar xzvf aishell.release.tar.gz
```

You need to prepare an audio file; please confirm that the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below.

```bash
CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/test_audio.wav
```



## Stage 51: Static model transforming (not supported yet)

To transform the dynamic graph model to a static graph model, stage 51 can be used. The code of this stage is shown below:

```bash
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
```

It is not supported yet, so we set a large stage number for this stage.



## Stage 101: Language model training and TLG building (needs further installation!)

You need to install Kaldi and SRILM to use stage 101, which is used for training the language model and building the TLG. To do the further installation, you need to run these commands:

```bash
# go to the root of the repo
cd ../../../
# Do the further installation
pip install -e .
cd tools
bash extras/install_openblas.sh
bash extras/install_kaldi.sh
```

You need to be patient, since installing Kaldi takes some time.


diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/model_performance.md
similarity index 100%
rename from examples/aishell/asr1/README.md
rename to examples/aishell/asr1/model_performance.md
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index 0b40e064..c687e966 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -4,7 +4,7 @@ set -e

 gpus=0,1,2,3
 stage=0
-stop_stage=100
+stop_stage=50
 conf_path=conf/conformer.yaml
 avg_num=20
@@ -41,18 +41,18 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
-
 # Optionally, you can add LM and test it with runtime.
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then +# if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi + +if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then echo "warning: deps on kaldi and srilm, please make sure installed." # train lm and build TLG ./local/tlg.sh --corpus aishell --lmtype srilm From 649fcc4c165c5bdf604c26bd23523db13df895e6 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 24 Nov 2021 05:29:19 +0000 Subject: [PATCH 02/40] revise some programming mistakes --- examples/aishell/asr1/READEME.md | 82 +++++++++++++++++--------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md index 3afd1954..fe968870 100644 --- a/examples/aishell/asr1/READEME.md +++ b/examples/aishell/asr1/READEME.md @@ -4,22 +4,22 @@ This example contains code used to train a [Conformer](http://arxiv.org/abs/2008 ## Overview -All the scirpt you need is in the ```run.sh```. There are several stages in the ```run.sh```, and each stage has it's function. +All the scirpts you need are in the ```run.sh```. There are several stages in the ```run.sh```, and each stage has its function. | Stage | Function | | :---- | :----------------------------------------------------------- | | 0 | Process data. It includes:
(1) Download the dataset
(2) Calculate the CMVN of the train dataset<br>
(3) Get the vocabulary file
(4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
| 5 | Infer the single audio file |
| 51 | (Not supported yet) Transform the dynamic graph model to a static graph model |
| 101 | (Needs further installation) Train language model and build TLG |

You can choose to run a range of stages by setting the ```stage``` and ```stop_stage``` variables.

For example, if you want to execute the code in stage 2 and stage 3, you can run this script:

```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set ```stage``` equal to ```stop_stage``` to only run one stage.
For example, if you only want to run ```stage 0```, you can use the script below:

```bash
bash run.sh --stage 0 --stop_stage 0
```



The document below will describe the scripts in ```run.sh``` in detail.

## The environment variables

The ```path.sh``` script contains the environment variables.
```bash
source path.sh
```
This script needs to be run first.

And another script is also needed:

```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```

It supports passing options in the ```--variable value``` form to the shell scripts.



## The local variables

Some local variables are set in ```run.sh```.
```gpus``` denotes the GPUs you want to use. If you set ```gpus=```, it means you only use the CPU.

```stage``` denotes the number of the stage you want to start from in the experiments.
```stop_stage``` denotes the number of the stage you want to end at in the experiments.

```conf_path``` denotes the config path of the model.

```avg_num``` denotes the number K of the top-K models you want to average to get the final model.

```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer"

```audio_file``` denotes the file path of the single file you want to infer in stage 5

You can set the local variables when you use ```run.sh```.

For example, you can set ```gpus``` and ```avg_num``` on the command line:

```bash
bash run.sh --gpus 0,1 --avg_num 20
```



## Stage 0: Data processing

To use this example, you need to process the data first, and you can use stage 0 in ```run.sh``` to do this. The code is shown below:
```bash
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
 fi
```

Stage 0 is for processing the data.

If you only want to process the data,
you can run:

```bash
bash run.sh --stage 0 --stop_stage 0
```

You can also just run these scripts in your command line.

```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
```

After processing the data, the ``data`` directory will look like this:

```bash
data/
|-- dev.meta
|-- lang_char
|   `-- vocab.txt
|-- manifest.dev
|-- manifest.dev.raw
|-- manifest.test
|-- manifest.test.raw
|-- manifest.train
|-- manifest.train.raw
|-- mean_std.json
|-- test.meta
`-- train.meta
```



## Stage 1: Model training

If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
```

If you want to train the model, you can use the script below to execute stage 0 and stage 1:
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```



## Stage 2: Top-k models averaging

After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss, or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi
```
The ```avg.sh``` script is in ```../../../utils/```, which is defined in ```path.sh```.
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:

```bash
bash run.sh --stage 0 --stop_stage 2
```

or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
```



## Stage 3: Model Testing

The test stage is to evaluate the model performance.
The code of the test stage is shown below:

```bash
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
```

If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:

```bash
bash run.sh --stage 0 --stop_stage 3
```

or you can run these scripts in the command line (only use CPU).

```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```



## Stage 4: CTC alignment

If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:

```bash
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
```

If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, stage 3, and stage 4:

```bash
bash run.sh --stage 0 --stop_stage 4
```

or if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):

```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```

or you can also use these scripts in the command line (only use CPU).

```bash
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```



## Stage 5: Single audio file inference

In some situations, you may want to use the trained model to do inference on a single audio file. You can use stage 5 for this. The code is shown below:

```bash
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
```

You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model by the script below:

```
wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz
tar xzvf aishell.release.tar.gz
```

You need to prepare an audio file; please confirm that the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below.

```bash
CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/test_audio.wav
```



## Stage 51: Static model transforming (not supported yet)

To transform the dynamic graph model to a static graph model, stage 51 can be used. The code of this stage is shown below:

```bash
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
```

It is not supported yet, so we set a large stage number for this stage.



## Stage 101: Language model training and TLG building (needs further installation!)

You need to install Kaldi and SRILM to use stage 101, which is used for training the language model and building the TLG.
To do the further installation, you need to run these commands:

```bash
# go to the root of the repo
cd ../../../
# Do the further installation
pip install -e .
cd tools
bash extras/install_openblas.sh
bash extras/install_kaldi.sh
```

You need to be patient, since installing Kaldi takes some time.


From 45ac9e0520377cab92b3c0e96f75c0ec5fce6532 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Thu, 25 Nov 2021 09:46:47 +0000
Subject: [PATCH 03/40] delete the unsupported stages

---
 examples/aishell/asr1/READEME.md | 169 +++++++++++++++++++------------
 examples/aishell/asr1/run.sh     |  14 +--
 2 files changed, 113 insertions(+), 70 deletions(-)

diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/READEME.md
index fe968870..a873a2dc 100644
--- a/examples/aishell/asr1/READEME.md
+++ b/examples/aishell/asr1/READEME.md
@@ -1,6 +1,6 @@
# Transformer/Conformer ASR with Aishell

This example contains code used to train a Transformer or [Conformer](http://arxiv.org/abs/2008.03802) model with the [Aishell dataset](http://www.openslr.org/resources/33).

## Overview

All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its own function.

| Stage | Function |
| :---- | :----------------------------------------------------------- |
| 0 | Process data. It includes:<br> (1) Download the dataset<br> (2) Calculate the CMVN of the train dataset<br> (3) Get the vocabulary file<br> (4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
| 5 | Infer the single audio file |


You can choose to run a range of stages by setting the ```stage``` and ```stop_stage``` variables.

For example, if you want to execute the code in stage 2 and stage 3, you can run this script:

```bash
bash run.sh --stage 2 --stop_stage 3
```

Or you can set ```stage``` equal to ```stop_stage``` to only run one stage.
For example, if you only want to run ```stage 0```, you can use the script below:

```bash
bash run.sh --stage 0 --stop_stage 0
```



The document below will describe the scripts in ```run.sh``` in detail.

## The environment variables

The ```path.sh``` script contains the environment variables.

```bash
source path.sh
```

This script needs to be run first.

And another script is also needed:

```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```

It supports passing options in the ```--variable value``` form to the shell scripts.



## The local variables

Some local variables are set in ```run.sh```.
```gpus``` denotes the GPUs you want to use. If you set ```gpus=```, it means you only use the CPU.

```stage``` denotes the number of the stage you want to start from in the experiments.
```stop_stage``` denotes the number of the stage you want to end at in the experiments.

```conf_path``` denotes the config path of the model.

```avg_num``` denotes the number K of the top-K models you want to average to get the final model.

```audio_file``` denotes the file path of the single file you want to infer in stage 5

```ckpt``` denotes the checkpoint prefix of the model, e.g.
"conformer" + +You can set the local variables (except ```ckpt```) when you use the ```run.sh``` For example, you can set the ```gpus``` and ``avg_num`` when you use the command line.: @@ -81,7 +83,8 @@ bash run.sh --gpus 0,1 --avg_num 20 ## Stage 0: Data processing -To use this example, you need to process data firstly and you can use stage 0 in the ```run.sh``` to do this. The code is shown below: +To use this example, you need to process data firstly and you can use stage 0 in the ```run.sh``` to do this. The code is shown below: + ```bash if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data @@ -101,7 +104,6 @@ You can also just run these scripts in your command line. ```bash source path.sh -source ${MAIN_ROOT}/utils/parse_options.sh bash ./local/data.sh ``` @@ -128,47 +130,52 @@ data/ ## Stage 1: Model training If you want to train the model. you can use stage 1 in the ```run.sh```. The code is shown below. + ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi ``` If you want to train the model, you can use the script below to execute stage 0 and stage 1: + ```bash bash run.sh --stage 0 --stop_stage 1 ``` + or you can run these scripts in the command line (only use CPU). + ```bash source path.sh -source ${MAIN_ROOT}/utils/parse_options.sh bash ./local/data.sh -CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer ``` -## Stage 2: Top-k models averaging +## Stage 2: Top-k Models Averaging + +After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: -After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: ```bash if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # avg n best model avg.sh best exp/${ckpt}/checkpoints ${avg_num} fi ``` + The ```avg.sh``` is in the ```../../../utils/``` which is define in the ```path.sh```. -If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: +If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2: ```bash bash run.sh --stage 0 --stop_stage 2 ``` or you can run these scripts in the command line (only use CPU). + ```bash source path.sh -source ${MAIN_ROOT}/utils/parse_options.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 @@ -187,7 +194,7 @@ The test stage is to evaluate the model performance.. 
The code of test stage is fi ``` -If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3: +If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 : ```bash bash run.sh --stage 0 --stop_stage 3 @@ -197,7 +204,6 @@ or you can run these scripts in the command line (only use CPU). ```bash source path.sh -source ${MAIN_ROOT}/utils/parse_options.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 @@ -206,7 +212,75 @@ CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoi -## Stage 4: CTC alignment +## Pretrained Model + +You can get the pretrained transfomer or conformer using the scripts below: + +```bash +Conformer: +wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz + +Chunk Conformer: +wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz + +Transfomer: +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz + +``` + +using the ```tar``` scripts to unpack the model and then you can use the script to test the modle. + +For example: + +``` +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz +tar xzvf transformer.model.tar.gz +source path.sh +# If you have process the data and get the manifest file, you can skip the following 2 steps +bash local/data.sh --stage -1 --stop_stage -1 +bash local/data.sh --stage 2 --stop_stage 2 + +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 +``` + + + +The performance of the released models are shown below: + +### Conformer + +| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER | +| --------- | ------ | ------------------- | ---------------- | -------- | ---------------------- | ---- | -------- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | + + +### Chunk Conformer + +Need set `decoding.decoding_chunk_size=16` when decoding. 
| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| --------- | ------ | ------------------------- | ---------------- | -------- | ---------------------- | ------------------------ | ---- | -------- |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |


### Transformer

| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
| ----------- | ------ | --------------------- | ------------ | -------- | ---------------------- | ----------------- | -------- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |



## Stage 4: CTC Alignment

If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:

```bash
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
```

If you want to train the model, test it and do the alignment, you can use the script below to execute stage 0, stage 1, stage 2, stage 3, and stage 4:

```bash
bash run.sh --stage 0 --stop_stage 4
```

or if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):

```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```

or you can also use these scripts in the command line (only use CPU).

```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```



## Stage 5: Single Audio File Inference

In some situations, you may want to use the trained model to do inference on a single audio file. You can use stage 5 for this. The code is shown below:

```bash
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
```

You can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model by the script below:

```bash
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
tar xzvf transformer.model.tar.gz
```

You need to prepare an audio file; please confirm that the sample rate of the audio is 16K.
Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below. - ```bash -CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/test_audio.wav +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz +tar xzvf transformer.model.tar.gz ``` - - -## Stage 51: Static model transforming(not supported at now) - -To transform the dynamic model to static model,stage 51 can be used. The code of this stage is shown below: - -```bash - if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit - fi -``` - -It is not supported at now, so we set a large stage number for this stage. - - - -## Stage: 101 Language model training and TLG building (Need further installation! ) - -You need to install the kaldi and srilm to use stage 101, it is used for training language model and building TLG. To do further installation, you need to do these: +You need to prepare an audio file, please confirm the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below. ```bash -# go to the root of the repo -cd ../../../ -# Do the further installation -pip install -e . -cd tools -bash extras/install_openblas.sh -bash extras/install_kaldi.sh +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav ``` -You need to be patient, since installing the kaldi takes some time. diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index c687e966..41af2445 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -14,7 +14,7 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.wav" +audio_file="data/test_single_audio.wav" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data @@ -44,14 +44,16 @@ fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi -# if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +# Not supported at now!!! +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi +# Need further installation! Read the install.md to complete further installation if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then echo "warning: deps on kaldi and srilm, please make sure installed." 
# train lm and build TLG From 895a086fdd02e6727789e146152631263abc25dc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 29 Nov 2021 08:17:59 +0000 Subject: [PATCH 04/40] rename the config.feat_size and the config.vocab.size to input_size and output_size --- examples/librispeech/asr1/RESULTS.md | 2 +- paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py | 4 ++-- paddlespeech/s2t/exps/deepspeech2/model.py | 8 ++++---- paddlespeech/s2t/models/ds2/deepspeech2.py | 4 ++-- paddlespeech/s2t/models/ds2_online/deepspeech2.py | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md index 19300ade..3dad7acb 100644 --- a/examples/librispeech/asr1/RESULTS.md +++ b/examples/librispeech/asr1/RESULTS.md @@ -24,4 +24,4 @@ | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | \ No newline at end of file +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py index 831bd1ad..b8544dc2 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py @@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.feat_size = self.collate_fn_test.feature_size - config.model.dict_size = self.collate_fn_test.vocab_size + config.model.input_dim = self.collate_fn_test.feature_size + config.model.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index e827414d..3e4ff1a8 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.feat_size = self.train_loader.collate_fn.feature_size - config.model.dict_size = self.train_loader.collate_fn.vocab_size + config.model.input_dim = self.train_loader.collate_fn.feature_size + config.model.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.feat_size = self.test_loader.collate_fn.feature_size - config.model.dict_size = self.test_loader.collate_fn.vocab_size + config.model.input_dim = self.test_loader.collate_fn.feature_size + config.model.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 4a7a7c15..317abc69 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer): The model built from config. 
""" model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size, diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index da04d5c5..d134239f 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer): The model built from config. """ model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size, From b48bc4e046654126075f01159f66e5dd6885e182 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 29 Nov 2021 12:42:28 +0000 Subject: [PATCH 05/40] fix the run.sh --- examples/aishell/asr1/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index d07a4ed5..d9c0ee3e 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -43,7 +43,7 @@ fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi # Not supported at now!!! From 383b68d8f47f15c86ea1f9bdce90fe39d8ee3b58 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:03 -0800 Subject: [PATCH 06/40] minor --- dataset/ted_en_zh/ted_en_zh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py index 9a3ba3b3..2d1fc671 100644 --- a/dataset/ted_en_zh/ted_en_zh.py +++ b/dataset/ted_en_zh/ted_en_zh.py @@ -28,7 +28,7 @@ import soundfile parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - "--src_dir", + "--src-dir", default="", type=str, help="Directory to kaldi splited data. 
(default: %(default)s)") From 6a50211c8042ce15392f37403edb54e59dd9a568 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:37 -0800 Subject: [PATCH 07/40] data process for ted-en-zh st1 --- examples/ted_en_zh/st1/local/data.sh | 214 +++++++++++++++++++-------- examples/ted_en_zh/st1/path.sh | 10 +- 2 files changed, 161 insertions(+), 63 deletions(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index aa958cfd..72d141e7 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,16 +2,18 @@ set -e -stage=-1 +stage=1 stop_stage=100 dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=8000 -bpemode=unigram +bpemode=bpe bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" data_dir=./TED_EnZh - +target_dir=data/ted_en_zh +dumpdir=data/dump +do_delta=false source ${MAIN_ROOT}/utils/parse_options.sh @@ -38,75 +40,163 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # generate manifests - python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \ - --manifest_prefix="data/manifest" \ - --src_dir="${data_dir}" + # # extract data + # echo "data Extraction" + # python3 local/ted_en_zh.py \ + # --tgt-dir=${target_dir} \ + # --src-dir=${data_dir} - echo "Complete raw data pre-process." fi - +prep_dir=${target_dir}/data_prep if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # compute mean and stddev for normalizer - num_workers=$(nproc) - python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ - --manifest_path="data/manifest.train.raw" \ - --num_samples=-1 \ - --spectrum_type="fbank" \ - --feat_dim=80 \ - --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ - --use_dB_normalization=False \ - --num_workers=${num_workers} \ - --output_path="data/mean_std.json" - - if [ $? -ne 0 ]; then - echo "Compute mean and stddev failed. Terminated." - exit 1 - fi + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + for set in train dev test; do + # for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${prep_dir}/${set}.en-zh + mkdir -p ${prep_dir}/${set}.en-zh + echo "remove duplicate lines..." 
+ cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l} + done + utils/fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${prep_dir}/${set}.en-zh + done fi +feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} +feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --text_keys 'text' 'text1' \ - --manifest_paths="data/manifest.train.raw" - - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi + ### Task dependent. You have to design training and dev sets by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Feature Generation" + fbankdir=data/fbank + # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame + for x in train dev test; do + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} + done + + echo "speed perturbation" + utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh + utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh + utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh + + utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \ + ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh + rm -r ${prep_dir}/temp*.en-zh + utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh + + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} + + for lang in en zh; do + cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang} + for p in "sp0.9-" "sp1.0-" "sp1.1-"; do + awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map + utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang} + done + done + + for x in train_sp dev test; do + local/divide_lang.sh ${prep_dir}/${x}.en-zh zh + done + + for x in train_sp dev; do + # remove utt having more than 3000 frames + # remove utt having more than 400 characters + for lang in zh en; do + remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp + done + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.${lang}.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 + comm -12 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist + + for lang in zh en; do + reduce_data_dir.sh 
${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang} + utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang} + done + rm -rf ${prep_dir}/${x}.en-zh.*.tmp + done + + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + + dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} fi +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt +bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # format manifest with tokenids, vocab size - for set in train dev test; do - { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ - --cmvn_path "data/mean_std.json" \ - --unit_type "spm" \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" - - if [ $? -ne 0 ]; then - echo "Formt mnaifest failed. Terminated." - exit 1 - fi - }& + echo "stage 2: Dictionary and Json Data Preparation" + # echo "make a non-linguistic symbol list for all languages" + # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} + # cat ${nlsyms} + + echo "make a joint source and target dictionary" + echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC + offset=$(wc -l < ${dict}) + grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt + spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0 + spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict} + wc -l ${dict} + + echo "make json files" + data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json + echo "update json (add source references)" + # update json (add source references) + for x in ${train_set} ${train_dev}; do + feat_dir=${dumpdir}/${x}/delta${do_delta} + data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en + update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done - wait fi - echo "Ted En-Zh Data preparation done." 
exit 0 diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index fd537917..ee4c9779 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -1,6 +1,6 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH} export LC_ALL=C export PYTHONDONTWRITEBYTECODE=1 @@ -13,3 +13,11 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ MODEL=u2_st export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin + +# Kaldi +export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh +export train_cmd="run.pl" \ No newline at end of file From cdd084512783303b9c606dc4c4e0aa739e6b8c3e Mon Sep 17 00:00:00 2001 From: Junkun Date: Sun, 28 Nov 2021 22:59:37 -0800 Subject: [PATCH 08/40] add translate function --- paddlespeech/s2t/exps/u2_st/model.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 52d3c3b7..034463fe 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -26,8 +26,10 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.collator import TripletSpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -423,6 +425,30 @@ class U2STTester(U2STTrainer): trans.append(''.join([chr(i) for i in ids])) return trans + def translate(self, audio, audio_len): + """"E2E translation from extracted audio feature""" + cfg = self.config.decoding + text_feature = self.test_loader.collate_fn.text_feature + + hyps = self.model.decode( + audio, + audio_len, + text_feature=text_feature, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch, + ctc_weight=cfg.ctc_weight, + word_reward=cfg.word_reward, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + return hyps + def compute_translation_metrics(self, utts, audio, From 8f3280af8e73c90b148a94800948e4dc7273696a Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:30:18 -0800 Subject: [PATCH 09/40] fix data process --- examples/ted_en_zh/st1/local/data.sh | 53 +++++++++--------- examples/ted_en_zh/st1/local/data_prep.sh | 54 +++++++++++++++++++ examples/ted_en_zh/st1/local/divide_lang.sh | 48 +++++++++++++++++ .../st1/local/espnet_json_to_manifest.py | 27 ++++++++++ .../ted_en_zh/st1/local/remove_punctuation.pl | 25 +++++++++ 5 files changed, 183 insertions(+), 24 deletions(-) create mode 100755 
examples/ted_en_zh/st1/local/data_prep.sh create mode 100755 examples/ted_en_zh/st1/local/divide_lang.sh create mode 100644 examples/ted_en_zh/st1/local/espnet_json_to_manifest.py create mode 100755 examples/ted_en_zh/st1/local/remove_punctuation.pl diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 72d141e7..8b829a8a 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=1 +stage=3 stop_stage=100 dict_dir=data/lang_char @@ -14,6 +14,7 @@ data_dir=./TED_EnZh target_dir=data/ted_en_zh dumpdir=data/dump do_delta=false +nj=20 source ${MAIN_ROOT}/utils/parse_options.sh @@ -40,11 +41,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # # extract data - # echo "data Extraction" - # python3 local/ted_en_zh.py \ - # --tgt-dir=${target_dir} \ - # --src-dir=${data_dir} + # extract data + echo "data Extraction" + python3 local/ted_en_zh.py \ + --tgt-dir=${target_dir} \ + --src-dir=${data_dir} fi prep_dir=${target_dir}/data_prep @@ -99,7 +100,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then done fi -feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir} feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -109,7 +110,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fbankdir=data/fbank # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame for x in train dev test; do - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} done @@ -123,7 +124,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -r ${prep_dir}/temp*.en-zh utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} for lang in en zh; do @@ -155,14 +156,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -rf ${prep_dir}/${x}.en-zh.*.tmp done - compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark - dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ - ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/dev.en-zh.zh/feats.scp 
${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt @@ -170,9 +171,6 @@ nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "stage 2: Dictionary and Json Data Preparation" - # echo "make a non-linguistic symbol list for all languages" - # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} - # cat ${nlsyms} echo "make a joint source and target dictionary" echo "<unk> 1" > ${dict} # <unk> must be 1; 0 will be used for "blank" in CTC @@ -183,20 +181,27 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then wc -l ${dict} echo "make json files" - data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json - data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json echo "update json (add source references)" # update json (add source references) - for x in ${train_set} ${train_dev}; do + for x in train_sp dev; do feat_dir=${dumpdir}/${x}/delta${do_delta} - data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en + data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en - update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \ ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: Format the Json Data" + python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train + python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev + python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test +fi echo "Ted En-Zh Data preparation done."
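+# Minimal end-to-end invocation sketch (the values are this script's own defaults,
+# shown for illustration only):
+#   bash ./local/data.sh --stage -1 --stop_stage 3 --data_dir ./TED_EnZh --nj 20
+# This runs download check, extraction, fbank+pitch, BPE dictionary/json, and manifests in order.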
exit 0 diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh new file mode 100755 index 00000000..339cee1e --- /dev/null +++ b/examples/ted_en_zh/st1/local/data_prep.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +export LC_ALL=C + +data_dir=${1} + +for set in train dev test; do +# for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${target_dir}/data_prep/${set}.en-zh + mkdir -p ${target_dir}/data_prep/${set}.en-zh + echo "remove duplicate lines..." + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} + done + fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${target_dir}/data_prep/${set}.en-zh +done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/divide_lang.sh b/examples/ted_en_zh/st1/local/divide_lang.sh new file mode 100755 index 00000000..4e5f85c8 --- /dev/null +++ b/examples/ted_en_zh/st1/local/divide_lang.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# 2021 PaddlePaddle +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. 
./path.sh + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <set-name> <lang>" + echo "e.g.: $0 dev zh" + exit 1 +fi + +set=$1 +lang=$2 +export LC_ALL=en_US.UTF-8 +# Copy stuff into its final locations [this has been moved from the format_data script] +# for En +mkdir -p ${set}.en +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.en/${f} + fi +done +sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text + +utils/fix_data_dir.sh ${set}.en +if [ -f ${set}.en/feats.scp ]; then + utils/validate_data_dir.sh ${set}.en || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1; +fi + +# for target language +mkdir -p ${set}.${lang} +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.${lang}/${f} + fi +done +sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text +utils/fix_data_dir.sh ${set}.${lang} +if [ -f ${set}.${lang}/feats.scp ]; then + utils/validate_data_dir.sh ${set}.${lang} || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1; +fi diff --git a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py new file mode 100644 index 00000000..60d25436 --- /dev/null +++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import argparse +import json + + +def main(args): + with open(args.json_file, 'r') as fin: + data_json = json.load(fin) + + with open(args.manifest_file, 'w') as fout: + for key, value in data_json['utts'].items(): + value['utt'] = key + fout.write(json.dumps(value, ensure_ascii=False)) + fout.write("\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--json-file', type=str, default=None, help="espnet data json file.") + parser.add_argument( + '--manifest-file', + type=str, + default='manifest.train', + help='manifest data json line file.') + args = parser.parse_args() + main(args) diff --git a/examples/ted_en_zh/st1/local/remove_punctuation.pl b/examples/ted_en_zh/st1/local/remove_punctuation.pl new file mode 100755 index 00000000..89e19c6f --- /dev/null +++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +while(<STDIN>) { + $_ = " $_ "; + + # remove punctuation except apostrophe + s/<space>/spacemark/g; # for scoring + s/'/apostrophe/g; + s/[[:punct:]]//g; + s/apostrophe/'/g; + s/spacemark/<space>/g; # for scoring + + # remove whitespace + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + + print "$_\n"; +} From ea35558ee03527b57cfacccf272f405ca427d0b2 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:31:45 -0800 Subject: [PATCH 10/40] add utils --- utils/addjson.py | 155 +++++++++++ utils/scp2json.py | 48 ++++ utils/tokenizer.perl | 596 +++++++++++++++++++++++++++++++++++++++ utils/update_json.sh | 88 +++++++ 4 files changed, 887 insertions(+) create mode 100755 utils/addjson.py create mode 100755 utils/scp2json.py create mode 100644 utils/tokenizer.perl create mode 100755 utils/update_json.sh diff --git a/utils/addjson.py b/utils/addjson.py new file mode 100755 index 00000000..7fabe625 --- /dev/null +++ b/utils/addjson.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2018 Nagoya University (Tomoki Hayashi) +# Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import logging +import sys + +from distutils.util import strtobool + +from espnet.utils.cli_utils import get_commandline_args + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="add multiple json values to an input or output value", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("jsons", type=str, nargs="+", help="json files") + parser.add_argument( + "-i", + "--is-input", + default=True, + type=strtobool, + help="If true, add to input. If false, add to output", + ) + parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + # logging info + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + # make intersection set for utterance keys + js = [] + intersec_ks = [] + for x in args.jsons: + with codecs.open(x, "r", encoding="utf-8") as f: + j = json.load(f) + ks = j["utts"].keys() + logging.info(x + ": has " + str(len(ks)) + " utterances") + if len(intersec_ks) > 0: + intersec_ks = intersec_ks.intersection(set(ks)) + if len(intersec_ks) == 0: + logging.warning("Empty intersection") + break + else: + intersec_ks = set(ks) + js.append(j) + logging.info("new json has " + str(len(intersec_ks)) + " utterances") + + # updated original dict to keep intersection + intersec_org_dic = dict() + for k in intersec_ks: + v = js[0]["utts"][k] + intersec_org_dic[k] = v + + intersec_add_dic = dict() + for k in intersec_ks: + v = js[1]["utts"][k] + for j in js[2:]: + v.update(j["utts"][k]) + intersec_add_dic[k] = v + + new_dic = dict() + for key_id in intersec_org_dic: + orgdic = intersec_org_dic[key_id] + adddic = intersec_add_dic[key_id] + + if "utt2spk" not in orgdic: + orgdic["utt2spk"] = "" + # NOTE: for machine translation + + # add as input + if args.is_input: + # original input + input_list = orgdic["input"] + # additional input + in_add_dic = {} + if "idim" in adddic and "ilen" in adddic: + in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])] + elif "idim" in adddic: + in_add_dic["shape"] = [int(adddic["idim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["idim", "ilen"]: + continue + in_add_dic[key] = value + # add name + in_add_dic["name"] = "input%d" % (len(input_list) + 1) + + input_list.append(in_add_dic) + new_dic[key_id] = { + "input": input_list, + "output": orgdic["output"], + "utt2spk": orgdic["utt2spk"], + } + # add as output + else: + # original output + output_list = orgdic["output"] + # additional output + out_add_dic = {} + # add shape + if "odim" in adddic and "olen" in adddic: + out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + elif "odim" in adddic: + out_add_dic["shape"] = [int(adddic["odim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["odim", "olen"]: + continue + out_add_dic[key] = value + # add name + out_add_dic["name"] = "target%d" % (len(output_list) + 1) + + output_list.append(out_add_dic) + new_dic[key_id] = { + "input": orgdic["input"], + "output": output_list, + "utt2spk": 
orgdic["utt2spk"], + } + if "lang" in orgdic.keys(): + new_dic[key_id]["lang"] = orgdic["lang"] + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + {"utts": new_dic}, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": "), + ) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py new file mode 100755 index 00000000..8e8de3e0 --- /dev/null +++ b/utils/scp2json.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import sys + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="convert scp to json", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--key", "-k", type=str, help="key") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + new_line = {} + sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + line = sys.stdin.readline() + while line: + x = line.rstrip().split() + v = {args.key: " ".join(x[1:])} + new_line[x[0]] = v + line = sys.stdin.readline() + + all_l = {"utts": new_line} + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") + ) + print(jsonstring) diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl new file mode 100644 index 00000000..ae97d658 --- /dev/null +++ b/utils/tokenizer.perl @@ -0,0 +1,596 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); ### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use FindBin qw($RealBin); +use strict; +use Time::HiRes; + +if (eval {require Thread;1;}) { + #module loaded + Thread->import(); +} + +my $mydir = "$RealBin/../share/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my @protected_patterns = (); +my $protected_patterns_file = ""; +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; +my $PENN = 0; +my $NO_ESCAPING = 0; +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 1, next); + /^-time$/ && ($TIMING = 1, next); + # Option to add list of regexps to be protected + /^-protected/ && ($protected_patterns_file = shift, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); + /^-penn$/ && ($PENN = 1, next); + /^-no-escape/ && ($NO_ESCAPING = 1, next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + print " -penn ... use Penn treebank-like tokenization.\n"; + print " -protected FILE ... specify file with patterns to be protected in tokenisation.\n"; + print " -no-escape ... 
don't perform HTML escaping on apostrophe, quotes, etc.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +# Load protected patterns +if ($protected_patterns_file) +{ + open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; + while(<PP>) { + chomp; + push @protected_patterns, $_; + } +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while(<STDIN>) + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while(<STDIN>) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array containing a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + + if ($PENN) { + return tokenize_penn($text); + } + + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # Find protected patterns + my @protected = (); + foreach my $protected_pattern (@protected_patterns) { + my $t = $text; + while ($t =~ /(?$protected_pattern)(?.*)$/) { + push @protected, $+{PATTERN}; + $t = $+{TAIL}; + } + } + + for (my $i = 0; $i < scalar(@protected); ++$i) { + my $subst = sprintf("THISISPROTECTED%.3d", $i); + $text =~ s,\Q$protected[$i], $subst ,g; + } + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # separate out all "other" special characters + if (($language eq "fi") or ($language eq "sv")) { + # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character: + # USA:n, 20:een, EU:ssa, USA:s, S:t + $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g; + # if a colon is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; + } + elsif ($language eq "tdt") { + # in Tetun, the apostrophe can be used inside words as an apostrophe-like character: + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g; + } + elsif (($language eq "ca")) { + # in Catalan, the middle dot can be used inside words: + # il�lusio + $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g; + # if a middot is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g; + } + else { + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + } + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # seperate out "," except if within numbers (5,300) + #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + + # separate out "," except if within numbers (5,300) + # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E + # first application uses up B so rule can't see B,C + # two-step version here may create extra spaces but these are removed later + # will also space digit,letter or letter,digit forms (redundant with next section) + $text =~ s/([^\p{IsN}])[,]/$1 , /g; + $text =~ s/[,]([^\p{IsN}])/ , $1/g; + + # separate "," after a number if it's the end of a sentence + $text =~ s/([\p{IsN}])[,]$/$1 ,/g; + + # separate , pre and post number + #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + #$text =~ 
s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn `into ' + #$text =~ s/\`/\'/g; + + #turn '' into " + #$text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + elsif (($language eq "so") or ($language eq "tdt")) + { + # Don't split glottals + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if ($i == scalar(@words)-1) { + # split last words independently as they are unlikely to be non-breaking prefixes + $word = $pre." ."; + } + elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) + { + #no change + } + elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) + { + #no change + } + else + { + $word = $pre." ."; + } + } + $text .= $word." "; + } + + # clean up extraneous spaces + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # .' at end of sentence is missed + $text =~ s/\.\' ?$/ . ' /; + + # restore protected + for (my $i = 0; $i < scalar(@protected); ++$i) { + my $subst = sprintf("THISISPROTECTED%.3d", $i); + $text =~ s/$subst/$protected[$i]/g; + } + + # restore multi-dots + while($text =~ /DOTDOTMULTI/) + { + $text =~ s/DOTDOTMULTI/DOTMULTI./g; + } + $text =~ s/DOTMULTI/./g; + + # escape special chars + if (!$NO_ESCAPING) + { + $text =~ s/\&/\&amp;/g; # escape escape + $text =~ s/\|/\&#124;/g; # factor separator + $text =~ s/\</\&lt;/g; # xml + $text =~ s/\>/\&gt;/g; # xml + $text =~ s/\'/\&apos;/g; # xml + $text =~ s/\"/\&quot;/g; # xml + $text =~ s/\[/\&#91;/g; # syntax non-terminal + $text =~ s/\]/\&#93;/g; # syntax non-terminal + } + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub tokenize_penn +{ + # Improved compatibility with Penn Treebank tokenization. Useful if + # the text is to later be parsed with a PTB-trained parser. + # + # Adapted from Robert MacIntyre's sed script: + # http://www.cis.upenn.edu/~treebank/tokenizer.sed + + my($text) = @_; + chomp($text); + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # attempt to get correct directional quotes + $text =~ s/^``/`` /g; + $text =~ s/^"/`` /g; + $text =~ s/^`([^`])/` $1/g; + $text =~ s/^'/` /g; + $text =~ s/([ ([{<])"/$1 `` /g; + $text =~ s/([ ([{<])``/$1 `` /g; + $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; + $text =~ s/([ ([{<])'/$1 ` /g; + # close quotes handled at end + + $text =~ s=\.\.\.= _ELLIPSIS_ =g; + + # separate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; +$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; + + # Separate out intra-token slashes. PTB tokenization doesn't do this, so + # the tokens should be merged prior to parsing with a PTB-trained parser + # (see syntax-hyphen-splitting.perl). + $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; + + # Assume sentence tokenization has been done first, so split FINAL periods + # only. + $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; + # however, we may as well split ALL question marks and exclamation points, + # since they shouldn't have the abbrev.-marker ambiguity problem + $text =~ s=([?!])= $1 =g; + + # parentheses, brackets, etc. 
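+    # (Descriptive note: the PTB-style substitutions below first space the brackets
+    # out, then map (/) to -LRB-/-RRB-, [/] to -LSB-/-RSB-, and {/} to -LCB-/-RCB-.)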
+ $text =~ s=([\]\[\(\){}<>])= $1 =g; + $text =~ s/\(/-LRB-/g; + $text =~ s/\)/-RRB-/g; + $text =~ s/\[/-LSB-/g; + $text =~ s/\]/-RSB-/g; + $text =~ s/{/-LCB-/g; + $text =~ s/}/-RCB-/g; + + $text =~ s=--= -- =g; + + # First off, add a space to the beginning and end of each line, to reduce + # necessary number of regexps. + $text =~ s=$= =; + $text =~ s=^= =; + + $text =~ s="= '' =g; + # possessive or close-single-quote + $text =~ s=([^'])' =$1 ' =g; + # as in it's, I'm, we'd + $text =~ s='([sSmMdD]) = '$1 =g; + $text =~ s='ll = 'll =g; + $text =~ s='re = 're =g; + $text =~ s='ve = 've =g; + $text =~ s=n't = n't =g; + $text =~ s='LL = 'LL =g; + $text =~ s='RE = 'RE =g; + $text =~ s='VE = 'VE =g; + $text =~ s=N'T = N'T =g; + + $text =~ s= ([Cc])annot = $1an not =g; + $text =~ s= ([Dd])'ye = $1' ye =g; + $text =~ s= ([Gg])imme = $1im me =g; + $text =~ s= ([Gg])onna = $1on na =g; + $text =~ s= ([Gg])otta = $1ot ta =g; + $text =~ s= ([Ll])emme = $1em me =g; + $text =~ s= ([Mm])ore'n = $1ore 'n =g; + $text =~ s= '([Tt])is = '$1 is =g; + $text =~ s= '([Tt])was = '$1 was =g; + $text =~ s= ([Ww])anna = $1an na =g; + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) + { + # no change + } + elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) + { + # no change + } + else + { + $word = $pre." ."; + } + } + $text .= $word." "; + } + + # restore ellipses + $text =~ s=_ELLIPSIS_=\.\.\.=g; + + # clean out extra spaces + $text =~ s=  *= =g; + $text =~ s=^ *==g; + $text =~ s= *$==g; + + # escape special chars + $text =~ s/\&/\&amp;/g; # escape escape + $text =~ s/\|/\&#124;/g; # factor separator + $text =~ s/\</\&lt;/g; # xml + $text =~ s/\>/\&gt;/g; # xml + $text =~ s/\'/\&apos;/g; # xml + $text =~ s/\"/\&quot;/g; # xml + $text =~ s/\[/\&#91;/g; # syntax non-terminal + $text =~ s/\]/\&#93;/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while (<PREFIX>) + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} \ No newline at end of file diff --git a/utils/update_json.sh b/utils/update_json.sh new file mode 100755 index 00000000..bf697475 --- /dev/null +++ b/utils/update_json.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Copyright 2020 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +echo "$0 $*" >&2 # Print the command line for logging +. ./path.sh + +nlsyms="" +oov="<unk>" +bpecode="" +verbose=0 + +text="" +multilingual=false + +help_message=$(cat << EOF +Usage: $0 <json> <data-dir> <dict> +e.g. $0 data/train/data.json data/train data/lang_1char/train_units.txt +Options: + --oov <oov-word> # Default: <unk> + --verbose <num> # Default: 0 EOF ) . utils/parse_options.sh + +if [ $# != 3 ]; then + echo "${help_message}" 1>&2 + exit 1; +fi + +set -euo pipefail + +json=$1 +dir=$2 +dic=$3 +json_dir=$(dirname ${json}) +tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) +trap 'rm -rf ${tmpdir}' EXIT + +if [ -z ${text} ]; then + text=${dir}/text +fi + +# 2. 
Create scp files for outputs +mkdir -p ${tmpdir}/output +if [ -n "${bpecode}" ]; then + if [ ${multilingual} = true ]; then + # remove a space before the language ID + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ + > ${tmpdir}/output/token.scp + else + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece) \ + > ${tmpdir}/output/token.scp + fi +elif [ -n "${nlsyms}" ]; then + text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp +else + text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp +fi +< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp +awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp +# +2 comes from CTC blank and EOS +vocsize=$(tail -n 1 ${dic} | awk '{print $2}') +odim=$(echo "$vocsize + 2" | bc) +awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp + +cat ${text} > ${tmpdir}/output/text.scp + + +# 4. Create JSON files from each scp files +rm -f ${tmpdir}/*/*.json +for x in "${tmpdir}"/output/*.scp; do + k=$(basename ${x} .scp) + < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json +done + +# add to json +addjson.py --verbose ${verbose} -i false \ + ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json +mkdir -p ${json_dir}/.backup +echo "json updated. original json is kept in ${json_dir}/.backup." +cp ${json} ${json_dir}/.backup/"$(basename ${json})" +cp ${tmpdir}/data.json ${json} + +rm -fr ${tmpdir} From 48207c14107a0de7d0c54d008220b1be832ba615 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:34:21 -0800 Subject: [PATCH 11/40] process scripts and configs --- examples/ted_en_zh/st1/conf/fbank.conf | 2 + examples/ted_en_zh/st1/conf/pitch.conf | 1 + examples/ted_en_zh/st1/local/ted_en_zh.py | 104 ++++++++++++++++++++++ examples/ted_en_zh/st1/steps | 1 + examples/ted_en_zh/st1/utils | 1 + 5 files changed, 109 insertions(+) create mode 100644 examples/ted_en_zh/st1/conf/fbank.conf create mode 100644 examples/ted_en_zh/st1/conf/pitch.conf create mode 100644 examples/ted_en_zh/st1/local/ted_en_zh.py create mode 120000 examples/ted_en_zh/st1/steps create mode 120000 examples/ted_en_zh/st1/utils diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf new file mode 100644 index 00000000..82ac7bd0 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/fbank.conf @@ -0,0 +1,2 @@ +--sample-frequency=16000 +--num-mel-bins=80 diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf new file mode 100644 index 00000000..e959a19d --- /dev/null +++ b/examples/ted_en_zh/st1/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py new file mode 100644 index 00000000..f30573b7 --- /dev/null +++ b/examples/ted_en_zh/st1/local/ted_en_zh.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+ +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import codecs +import os + + +# org_split = 'train-split/train-segment' +# text_file = 'En-Zh/train.en-zh' +# data_split = 'train' +def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list, + data_split_list): + + for org_split, text_file, data_split in zip(wav_dir_list, text_file_list, + data_split_list): + local_data_split_dir = os.path.join(tgt_dir, data_split) + + os.makedirs(local_data_split_dir, exist_ok=True) + utts = [] + utt2spk = {} + with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \ + open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf: + for files in os.listdir(os.path.join(src_dir, org_split)): + files = files.strip() + file_path = os.path.join(src_dir, org_split, files) + size = os.path.getsize(file_path) + if size <= 30000: + continue + utt = files.split('.')[0] + audio_name = utt.split('_')[0] + # format the name of the utterance + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + utts.append(utt) + spk = utt.split('_')[0] + utt2spk[utt] = spk + assert len(spk) == 16, "%r" % spk + print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf) + for utt in sorted(utts): + print(utt, utt2spk[utt], file=utt2spk_wf) + + with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \ + open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \ + open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \ + codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8', + errors='ignore') as rf: + count = 0 + for line in rf: + line = line.strip() + line_spl = line.split('\t') + assert len(line_spl) == 3, "%r" % line + wav, en, zh = line_spl + assert wav.endswith('wav'), "%r" % wav[-3:] + utt = wav.split('.')[0] + audio_name = utt.split('_')[0] + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + print(utt, file=yaml_wf) + print(en.lower(), file=en_wf) + print(zh, file=zh_wf) + count += 1 + print('%s set lines count: %d' % (data_split, count)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--src-dir", + default="", + type=str, + help="Directory of the Kaldi-split data. (default: %(default)s)") + parser.add_argument( + "--tgt-dir", + default="local/ted_en_zh", + type=str, + help="Directory to save processed data. 
(default: %(default)s)") + args = parser.parse_args() + + wav_dir_list = [ + 'train-split/train-segment', 'test-segment/tst2014', + 'test-segment/tst2015' + ] + text_file_list = [ + 'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh' + ] + data_split_list = ['train', 'dev', 'test'] + data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list, + data_split_list) diff --git a/examples/ted_en_zh/st1/steps b/examples/ted_en_zh/st1/steps new file mode 120000 index 00000000..91f2d234 --- /dev/null +++ b/examples/ted_en_zh/st1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/examples/ted_en_zh/st1/utils b/examples/ted_en_zh/st1/utils new file mode 120000 index 00000000..f49247da --- /dev/null +++ b/examples/ted_en_zh/st1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file From e867f3bb416a0c7b8349995ad2ff3f2c97fc6b4e Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 14:02:05 -0800 Subject: [PATCH 12/40] minor --- examples/ted_en_zh/st1/local/data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 8b829a8a..c61c9a9f 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=3 +stage=-1 stop_stage=100 dict_dir=data/lang_char From d2fab3238b7082ee5a5df33d6725514cf4cceb05 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:57:36 -0800 Subject: [PATCH 13/40] fix bugs --- paddlespeech/s2t/frontend/utility.py | 8 ++++---- paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/s2t/utils/checkpoint.py | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 703f2127..d423a604 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -102,10 +102,10 @@ def read_manifest( manifest = [] with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: - feat_len = json_data["feat_shape"][ - 0] if 'feat_shape' in json_data else 1.0 - token_len = json_data["token_shape"][ - 0] if 'token_shape' in json_data else 1.0 + feat_len = json_data["input"][0]["shape"][ + 0] if 'shape' in json_data["input"][0] else 1.0 + token_len = json_data["output"][0]["shape"][ + 0] if 'shape' in json_data["output"][0] else 1.0 conditions = [ feat_len >= min_input_len, feat_len <= max_input_len, diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 35b57524..0d5a16ce 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/utils/checkpoint.py b/paddlespeech/s2t/utils/checkpoint.py index 5105f95e..4c493715 100644 --- a/paddlespeech/s2t/utils/checkpoint.py +++ b/paddlespeech/s2t/utils/checkpoint.py @@ -94,6 +94,9 @@ class Checkpoint(): """ configs = {} + if len(checkpoint_path) == 0 or checkpoint_path == "None": + checkpoint_path = None + if checkpoint_path is not None: pass elif checkpoint_dir is not None 
and record_file is not None: From 3c8e87344a4ce38617adabd44f3496157e9e80e8 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:59:50 -0800 Subject: [PATCH 14/40] update run scripts --- .../st1/conf/transformer_mtl_noam.yaml | 4 +- examples/ted_en_zh/st1/local/data.sh | 2 +- .../ted_en_zh/st1/local/train_finetune.sh | 39 ------------------- examples/ted_en_zh/st1/run.sh | 17 ++++---- 4 files changed, 11 insertions(+), 51 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/train_finetune.sh diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index b4fb5107..3175aad9 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/lang_char/vocab.txt + vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt unit_type: 'spm' - spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc + spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index c61c9a9f..f9c876b1 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -166,7 +166,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi -dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh deleted file mode 100755 index e54c7fff..00000000 --- a/examples/ted_en_zh/st1/local/train_finetune.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - -config_path=$1 -ckpt_name=$2 -ckpt_path=$3 - -mkdir -p exp - -# seed may break model convergence -seed=0 -if [ ${seed} != 0 ]; then - export FLAGS_cudnn_deterministic=True -fi - -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ ---config ${config_path} \ ---output exp/${ckpt_name} \ ---checkpoint_path ${ckpt_path} \ ---seed ${seed} - -if [ ${seed} != 0 ]; then - unset FLAGS_cudnn_deterministic -fi - -if [ $? -ne 0 ]; then - echo "Failed in training!" 
- exit 1 -fi - -exit 0 \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f8adf4f6..a1c99af3 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -6,7 +6,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt_path=paddle.98 +ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -22,21 +22,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # download pretrained - bash ./local/download_pretrain.sh || exit -1 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path} ${ckpt} ${ckpt_path} + if [ -n "${ckpt_path}" ]; then + echo "Finetune from Pretrained Model" ${ckpt_path} + ./local/download_pretrain.sh || exit -1 + fi + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # avg n best model avg.sh best exp/${ckpt}/checkpoints ${avg_num} fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi \ No newline at end of file From 0cc81a52cdae3783972c4fa25d8de33784fb7f97 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:00:09 -0800 Subject: [PATCH 15/40] update format --- utils/addjson.py | 27 ++++++++++++--------------- utils/scp2json.py | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/utils/addjson.py b/utils/addjson.py index 7fabe625..013d1472 100755 --- a/utils/addjson.py +++ b/utils/addjson.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2018 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - from __future__ import print_function from __future__ import unicode_literals @@ -12,7 +10,6 @@ import codecs import json import logging import sys - from distutils.util import strtobool from espnet.utils.cli_utils import get_commandline_args @@ -23,17 +20,16 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="add multiple json values to an input or output value", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("jsons", type=str, nargs="+", help="json files") parser.add_argument( "-i", "--is-input", default=True, type=strtobool, - help="If true, add to input. If false, add to output", - ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + help="If true, add to input. 
If false, add to output", ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") return parser @@ -121,7 +117,8 @@ if __name__ == "__main__": out_add_dic = {} # add shape if "odim" in adddic and "olen" in adddic: - out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + out_add_dic[ + "shape"] = [int(adddic["olen"]), int(adddic["odim"])] elif "odim" in adddic: out_add_dic["shape"] = [int(adddic["odim"])] # add all other key value @@ -143,13 +140,13 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - {"utts": new_dic}, + { + "utts": new_dic + }, indent=4, ensure_ascii=False, sort_keys=True, - separators=(",", ": "), - ) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + separators=(",", ": "), ) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py index 8e8de3e0..e2a75766 100755 --- a/utils/scp2json.py +++ b/utils/scp2json.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) from __future__ import print_function @@ -17,8 +16,7 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="convert scp to json", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("--key", "-k", type=str, help="key") return parser @@ -28,10 +26,10 @@ if __name__ == "__main__": args = parser.parse_args() new_line = {} - sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + sys.stdin = codecs.getreader("utf-8")(sys.stdin + if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) line = sys.stdin.readline() while line: x = line.rstrip().split() @@ -43,6 +41,9 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") - ) + all_l, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": ")) print(jsonstring) From 351e4e8e87e1b5b678c4aded167cb735327da4ee Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:01:16 -0800 Subject: [PATCH 16/40] training script --- examples/ted_en_zh/st1/local/train.sh | 39 +++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 examples/ted_en_zh/st1/local/train.sh diff --git a/examples/ted_en_zh/st1/local/train.sh b/examples/ted_en_zh/st1/local/train.sh new file mode 100755 index 00000000..a8e4acaa --- /dev/null +++ b/examples/ted_en_zh/st1/local/train.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." 
+ +config_path=$1 +ckpt_name=$2 +ckpt_path=$3 + +mkdir -p exp + +# seed may break model convergence +seed=0 +if [ ${seed} != 0 ]; then + export FLAGS_cudnn_deterministic=True +fi + +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--checkpoint_path "${ckpt_path}" \ +--seed ${seed} + +if [ ${seed} != 0 ]; then + unset FLAGS_cudnn_deterministic +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 \ No newline at end of file From 79060e20e3d5f6285f49b503f95b8db9ddce9294 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:34:43 +0800 Subject: [PATCH 17/40] Update pack_model.sh --- utils/pack_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/pack_model.sh b/utils/pack_model.sh index 8acd59a6..d7df01eb 100755 --- a/utils/pack_model.sh +++ b/utils/pack_model.sh @@ -57,7 +57,7 @@ else echo "missing ${dec_conf}" exit 1 fi -# NOTE(kan-bayashi): preprocess conf is optional +# preprocess conf is optional if [ -n "${preprocess_conf}" ]; then tar rfh ${outfile}.tar ${preprocess_conf} echo -n " - preprocess config file: \`" From 507c3b52eab46beb411314e90e2928042abd1065 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:16 +0800 Subject: [PATCH 18/40] Update default.yaml --- examples/csmsc/voc3/conf/default.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index cc27220f..5dda835a 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). ########################################################### # FEATURE EXTRACTION SETTING # @@ -136,4 +135,4 @@ eval_interval_steps: 1000 # Interval steps to evaluate the network # OTHER SETTING # ########################################################### num_snapshots: 10 # max number of snapshots to keep while training -seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file +seed: 42 # random seed for paddle, random, and np.random From 2de7bc14b085f9b835a4acdc350475405a310ecf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:29 +0800 Subject: [PATCH 19/40] Update finetune.yaml --- examples/csmsc/voc3/conf/finetune.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index 80ab6bed..30227401 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). 
########################################################### # FEATURE EXTRACTION SETTING # From f225b1d88ecdf92b758e999e28ff4e6d433d95f6 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:39:07 -0800 Subject: [PATCH 20/40] minor updates --- examples/ted_en_zh/st1/local/data_prep.sh | 54 ----------------------- examples/ted_en_zh/st1/path.sh | 3 +- examples/ted_en_zh/st1/run.sh | 5 ++- 3 files changed, 4 insertions(+), 58 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/data_prep.sh diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh deleted file mode 100755 index 339cee1e..00000000 --- a/examples/ted_en_zh/st1/local/data_prep.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Kyoto University (Hirofumi Inaguma) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -export LC_ALL=C - -data_dir=${1} - -for set in train dev test; do -# for set in train; do - dst=${target_dir}/${set} - for lang in en zh; do - - if [ ${lang} = 'en' ]; then - echo "remove punctuation $lang" - # remove punctuation - local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw - else - cp ${dst}/${lang}.org ${dst}/${lang}.raw - fi - - paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} - - - done - # error check - n=$(cat ${dst}/.yaml | wc -l) - n_en=$(cat ${dst}/en.raw | wc -l) - n_tgt=$(cat ${dst}/zh.raw | wc -l) - [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; - [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; - - echo "done text processing" - cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp - cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk - - cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt - rm -rf ${target_dir}/data_prep/${set}.en-zh - mkdir -p ${target_dir}/data_prep/${set}.en-zh - echo "remove duplicate lines..." - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ - | sed 's/^[ \t]*//' > ${dst}/duplicate_lines - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ - | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist - reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh - echo "done wav processing" - for l in en zh; do - cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} - done - fix_data_dir.sh --utt_extra_files \ - "text.en text.zh" \ - ${target_dir}/data_prep/${set}.en-zh -done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index ee4c9779..867cdb48 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -19,5 +19,4 @@ export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" -[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh -export train_cmd="run.pl" \ No newline at end of file +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . 
$KALDI_ROOT/tools/config/common_path.sh \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index a1c99af3..f6362a8b 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -1,12 +1,13 @@ #!/bin/bash set -e -source path.sh +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) +ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; From aea1e92a3df7bad912f70ad84d953f02a43b8471 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:50:34 -0800 Subject: [PATCH 21/40] update cmd.sh --- examples/ted_en_zh/st1/cmd.sh | 89 +++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/ted_en_zh/st1/cmd.sh diff --git a/examples/ted_en_zh/st1/cmd.sh b/examples/ted_en_zh/st1/cmd.sh new file mode 100644 index 00000000..7b70ef5e --- /dev/null +++ b/examples/ted_en_zh/st1/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time