Merge remote-tracking branch 'update_stream/develop' into cli

pull/1048/head
KP 3 years ago
commit 54cf048b2a

@ -110,7 +110,7 @@ Developers can have a try of our model with only a few lines of code.
A tiny DeepSpeech2 **Speech-to-Text** model trained on a toy subset of LibriSpeech:
```shell
cd examples/tiny/asr0/
# source the environment
source path.sh
source ../../../utils/parse_options.sh
@ -177,20 +177,20 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
      <td rowspan="2" >Aishell</td>
      <td >DeepSpeech2 RNN + Conv based Models</td>
      <td>
      <a href = "./examples/aishell/asr0">deepspeech2-aishell</a>
      </td>
    </tr>
    <tr>
      <td>Transformer based Attention Models </td>
      <td>
      <a href = "./examples/aishell/asr1">u2.transformer.conformer-aishell</a>
      </td>
    </tr>
    <tr>
      <td> Librispeech</td>
      <td>Transformer based Attention Models </td>
      <td>
      <a href = "./examples/librispeech/asr0">deepspeech2-librispeech</a> / <a href = "./examples/librispeech/asr1">transformer.conformer.u2-librispeech</a> / <a href = "./examples/librispeech/asr2">transformer.conformer.u2-kaldi-librispeech</a>
      </td>
    </tr>
@ -199,7 +199,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
      <td>THCHS30</td>
      <td>MFA</td>
      <td>
      <a href = "./examples/thchs30/align0">mfa-thchs30</a>
      </td>
    </tr>
    <tr>
@ -213,7 +213,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
      <td>TIMIT</td>
      <td>Unified Streaming & Non-streaming Two-pass</td>
      <td>
      <a href = "./examples/timit/asr1"> u2-timit</a>
      </td>
    </tr>
  </tbody>

@ -28,7 +28,7 @@ import soundfile
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--src-dir",
    default="",
    type=str,
    help="Directory to kaldi split data. (default: %(default)s)")

@ -0,0 +1 @@
# Demos for PaddleSpeech

@ -0,0 +1,8 @@
# Style FastSpeech2
You can change the `pitch`, `duration`, and `energy` of `FastSpeech2`.
Run the following command line to get started:
```
./run.sh
```
For more details, please see `style_syn.py`.

@ -252,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"scrolled": true
},
@ -261,8 +261,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"The autoreload extension is already loaded. To reload it, use:\n", "env: CUDA_VISIBLE_DEVICES=0\n"
" %reload_ext autoreload\n"
] ]
} }
], ],
@ -284,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"scrolled": true
},
@ -317,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": true
},
@ -596,11 +595,19 @@
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Frontend done!\n"
]
}
],
"source": [ "source": [
"# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n", "# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
"frontend = Frontend(phone_vocab_path=phones_dict)\n", "frontend = Frontend(phone_vocab_path=phones_dict)\n",
@ -619,25 +626,11 @@
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
@ -701,8 +694,10 @@
"<br></br>\n", "<br></br>\n",
"在本教程中,我们使用 `FastSpeech2` 作为声学模型。\n", "在本教程中,我们使用 `FastSpeech2` 作为声学模型。\n",
"![FastSpeech2](source/fastspeech2.png)\n", "![FastSpeech2](source/fastspeech2.png)\n",
"\n",
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n", "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。\n",
"![FastPitch](source/fastpitch.png)\n", "![FastPitch](source/fastpitch.png)\n",
"\n",
"更多关于[声学模型的发展及改进](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html)。" "更多关于[声学模型的发展及改进](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html)。"
] ]
}, },
@ -1020,13 +1015,16 @@
"odim = fastspeech2_config.n_mels\n", "odim = fastspeech2_config.n_mels\n",
"model = FastSpeech2(\n", "model = FastSpeech2(\n",
" idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n", " idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
"\n", "# 加载预训练模型参数\n",
"model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"]) # 加载预训练模型参数\n", "model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n",
"model.eval() # 推理阶段不启用 batch norm 和 dropout\n", "# 推理阶段不启用 batch norm 和 dropout\n",
"model.eval()\n",
"stat = np.load(fastspeech2_stat)\n", "stat = np.load(fastspeech2_stat)\n",
"mu, std = stat # 读取数据预处理阶段数据集的均值和标准差\n", "# 读取数据预处理阶段数据集的均值和标准差\n",
"mu, std = stat\n",
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n", "mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
"fastspeech2_normalizer = ZScore(mu, std) # 构造归一化的新模型\n", "# 构造归一化的新模型\n",
"fastspeech2_normalizer = ZScore(mu, std)\n",
"fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n", "fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
"fastspeech2_inference.eval()\n", "fastspeech2_inference.eval()\n",
"print(fastspeech2_inference)\n", "print(fastspeech2_inference)\n",
@ -1153,16 +1151,18 @@
],
"source": [
"vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
"# load the pretrained parameters into the model\n",
"vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"])\n",
"vocoder.remove_weight_norm()\n",
"# batch norm and dropout are disabled at inference\n",
"vocoder.eval()\n",
"# read the dataset mean and std computed during preprocessing\n",
"stat = np.load(pwg_stat)\n",
"mu, std = stat\n",
"mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)\n",
"pwg_normalizer = ZScore(mu, std)\n",
"# build the normalized model\n",
"pwg_inference = PWGInference(pwg_normalizer, vocoder)\n",
"pwg_inference.eval()\n",
"print(\"Parallel WaveGAN done!\")"
] ]
@ -1266,7 +1266,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{

@ -0,0 +1,341 @@
# Transformer/Conformer ASR with Aishell
This example contains code used to train a Transformer or [Conformer](http://arxiv.org/abs/2008.03802) model with the [Aishell dataset](http://www.openslr.org/resources/33).
## Overview
All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its own function.
| Stage | Function |
| :---- | :----------------------------------------------------------- |
| 0     | Process data. It includes: <br>       (1) Download the dataset <br>       (2) Calculate the CMVN of the train dataset <br>       (3) Get the vocabulary file <br>       (4) Get the manifest files of the train, development and test datasets |
| 1 | Train the model |
| 2     | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4     | Get the CTC alignment of the test data using the final model |
| 5     | Run inference on a single audio file |
You can choose to run a range of stages by setting ```stage``` and ```stop_stage```.
For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set ```stage``` equal to ```stop_stage``` to run only one stage.
For example, if you only want to run ```stage 0```, you can use the script below:
```bash
bash run.sh --stage 0 --stop_stage 0
```
The sections below describe the scripts in ```run.sh``` in detail.
## The Environment Variables
The ```path.sh``` script contains the environment variables.
```bash
source path.sh
```
This script needs to be run first. Another script is also needed:
```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```
It enables the ```--variable value``` style of passing options to the shell scripts; a sketch of the behavior is shown below.
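A minimal sketch of how this works (the script and variable names here are just examples): ```parse_options.sh``` only overrides variables that are already defined before it is sourced.
```bash
#!/bin/bash
# defaults, defined before sourcing parse_options.sh
stage=0
stop_stage=100

source ../../../utils/parse_options.sh

# "bash demo.sh --stage 2 --stop_stage 3" now yields stage=2, stop_stage=3
echo "stage=${stage} stop_stage=${stop_stage}"
```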
## The Local Variables
Some local variables are set in ```run.sh```.
```gpus``` denotes the GPU(s) you want to use. If you set ```gpus=```, only the CPU is used.
```stage``` denotes the number of the stage you want to start from in the experiments.
```stop_stage``` denotes the number of the stage you want to end at in the experiments.
```conf_path``` denotes the config path of the model.
```avg_num``` denotes the number K of the top-K models you want to average to get the final model.
```audio_file``` denotes the file path of the single audio file you want to infer in stage 5.
```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer".
You can set the local variables (except ```ckpt```) when you use ```run.sh```.
For example, you can set ```gpus``` and ```avg_num``` on the command line:
```bash
bash run.sh --gpus 0,1 --avg_num 20
```
## Stage 0: Data Processing
To use this example, you need to process the data first, and you can use stage 0 in ```run.sh``` to do this. The code is shown below:
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
```
Stage 0 is for processing the data. If you only want to process the data, you can run:
```bash
bash run.sh --stage 0 --stop_stage 0
```
You can also just run these scripts in your command line:
```bash
source path.sh
bash ./local/data.sh
```
After processing the data, the ```data``` directory will look like this:
```bash
data/
|-- dev.meta
|-- lang_char
| `-- vocab.txt
|-- manifest.dev
|-- manifest.dev.raw
|-- manifest.test
|-- manifest.test.raw
|-- manifest.train
|-- manifest.train.raw
|-- mean_std.json
|-- test.meta
`-- train.meta
```
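Each ```manifest.*``` file is a JSON-lines file with one utterance per line; you can peek at an entry as shown below. The field names in the comment are indicative only and may differ across versions.
```bash
head -n 1 data/manifest.train
# e.g. {"utt": "...", "feat": "/path/to/audio.wav", "feat_shape": [...], "text": "..."}
```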
## Stage 1: Model Training
If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below:
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
```
If you want to train the model, you can use the script below to execute stage 0 and stage 1:
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line (using only the CPU):
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```
## Stage 2: Top-k Models Averaging
After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can either choose the best model based on the validation loss, or sort the checkpoints and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
```
The ```avg.sh``` script is in the ```../../../utils/``` directory, which is defined in ```path.sh```. A direct call is sketched below.
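For reference, a direct call looks like this (a sketch; ```best``` ranks checkpoints by validation loss, and the last argument is K):
```bash
# average the parameters of the 20 best checkpoints into the final model,
# referenced later as exp/conformer/checkpoints/avg_20
avg.sh best exp/conformer/checkpoints 20
```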
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
```bash
bash run.sh --stage 0 --stop_stage 2
```
or you can run these scripts in the command line (using only the CPU):
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
```
## Stage 3: Model Testing
The test stage evaluates the model performance. The code of the test stage is shown below:
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
```bash
bash run.sh --stage 0 --stop_stage 3
```
or you can run these scripts in the command line (using only the CPU):
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```
## Pretrained Model
You can get the pretrained Transformer or Conformer models using the scripts below:
```bash
# Conformer:
wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz
# Chunk Conformer:
wget https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz
# Transformer:
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
```
Use the ```tar``` command to unpack the model, and then you can use the script below to test the model.
For example:
```bash
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
tar xzvf transformer.model.tar.gz
source path.sh
# If you have processed the data and generated the manifest files, you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20
```
The performance of the released models is shown below:
### Conformer
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
| --------- | ------ | ------------------- | ---------------- | -------- | ---------------------- | ---- | -------- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
### Chunk Conformer
You need to set `decoding.decoding_chunk_size=16` when decoding.
| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| --------- | ------ | ------------------------- | ---------------- | -------- | ---------------------- | ------------------------ | ---- | -------- |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
### Transformer
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
| ----------- | ------ | --------------------- | ------------ | -------- | ---------------------- | ----------------- | -------- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
## Stage 4: CTC Alignment
If you want to get the alignment between the audio and the text, you can use CTC alignment. The code of this stage is shown below:
```bash
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train the model, test it, and do the alignment, you can use the script below to execute stages 0 through 4:
```bash
bash run.sh --stage 0 --stop_stage 4
```
or, if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):
```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```
or you can also use these scripts in the command line (using only the CPU):
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
```
## Stage 5: Single Audio File Inference
In some situations, you want to use the trained model to run inference on a single audio file. You can use stage 5. The code is shown below:
```bash
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
```
You can train the model yourself using ```bash run.sh --stage 0 --stop_stage 3```, or download the pretrained model with the script below:
```bash
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
tar xzvf transformer.model.tar.gz
```
You need to prepare an audio file; please confirm that its sample rate is 16 kHz (a quick check is sketched after the command below). Assuming the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below:
```bash
CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
```
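A quick way to check and, if necessary, fix the sample rate is with ```sox``` (assuming it is installed; the input file name is a placeholder):
```bash
soxi -r data/test_audio.wav                 # print the sample rate
sox input.wav -r 16000 data/test_audio.wav  # resample to 16 kHz (sketch)
```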

@ -4,7 +4,7 @@ set -e
gpus=0,1,2,3
stage=0
stop_stage=50
conf_path=conf/conformer.yaml
avg_num=20
audio_file=data/demo_01_03.wav
@ -15,7 +15,6 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
@ -41,18 +40,20 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

# Not supported yet!!!
if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi

# Needs further installation! Read install.md to complete the setup.
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
    echo "warning: deps on kaldi and srilm, please make sure installed."
    # train lm and build TLG
    ./local/tlg.sh --corpus aishell --lmtype srilm

@ -21,7 +21,7 @@ We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner)
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Pretrained GE2E Model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.
Download the pretrained GE2E model from here: [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.

@ -6,8 +6,7 @@
# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper so currently we train 1M iterations (not sure enough
# to converge).

###########################################################
#              FEATURE EXTRACTION SETTING                 #

@ -6,8 +6,7 @@
# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper so currently we train 1M iterations (not sure enough
# to converge).

###########################################################
#              FEATURE EXTRACTION SETTING                 #

@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.484564081827799 | 0.044355 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.484564081827799 | 0.050479 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.484564081827799 | 0.049890 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.484564081827799 | 0.039200 |

@ -1,3 +1,40 @@
# network architecture
model:
    cmvn_file:
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
    encoder_conf:
        output_size: 256    # dimension of attention
        attention_heads: 4
        linear_units: 2048  # the number of units of position-wise feed forward
        num_blocks: 12      # the number of encoder blocks
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
        normalize_before: true

    # decoder related
    decoder: transformer
    decoder_conf:
        attention_heads: 4
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        self_attention_dropout_rate: 0.0
        src_attention_dropout_rate: 0.0

    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
# https://yaml.org/type/float.html
data:
    train_manifest: data/manifest.train
@ -36,43 +73,6 @@ collator:
    num_workers: 2

training:
    n_epoch: 120
    accum_grad: 4

@ -4,7 +4,7 @@
## Transformer
| Model | Params | GPUS | Averaged Model | Config | Augmentation | Loss |
| :-: | :-: | :------------: | :------------: | :-: | :-: | :-: |
| transformer | 32.52M | 8 Tesla V100-SXM2-32GB | 10-best val_loss | conf/transformer.yaml | spec_aug | 6.3197922706604 |
### Attention Rescore

@ -31,7 +31,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false

@ -86,7 +86,7 @@ training:
    scheduler_conf:
        warmup_steps: 25000
        lr_decay: 1.0
    log_interval: 50
    checkpoint:
        kbest_n: 50
        latest_n: 5

@ -42,7 +42,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # generate manifests
    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
        --manifest_prefix="data/manifest" \
        --src-dir="${data_dir}"

    echo "Complete raw data pre-process."
fi
@ -76,8 +76,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --spm_vocab_size=${nbpe} \
        --spm_mode ${bpemode} \
        --spm_model_prefix ${bpeprefix} \
        --spm_character_coverage 1. \
        --vocab_path="${dict_dir}/vocab.txt" \
        --text_keys 'text' \
        --manifest_paths="data/manifest.train.raw"

    if [ $? -ne 0 ]; then

@ -5,7 +5,7 @@ source path.sh
gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/transformer_mtl_noam.yaml
avg_num=5
data_path=./TED_EnZh # path to unzipped data

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
#   --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
# These options are mapping to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# The other usage
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# Assuming you can login them without any password, i.e. You have to set ssh keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
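As a usage sketch (the decode script name below is hypothetical), the exported commands follow the `<cmd>.pl [options] JOB=1:<nj> <log> <command...>` pattern documented above:
```bash
# run 4 parallel jobs with the selected backend; logs go to exp/log/decode.N.log
${decode_cmd} JOB=1:4 exp/log/decode.JOB.log \
    python3 local/decode_one.py --part JOB
```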

@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80

@ -0,0 +1 @@
--sample-frequency=16000
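These conf files are passed through to Kaldi's feature binaries by `steps/make_fbank_pitch.sh`; roughly, the fbank side amounts to something like the following (a sketch, assuming a working Kaldi environment and the usual scp/ark paths):
```bash
compute-fbank-feats --config=conf/fbank.conf scp:data/wav.scp ark:data/feats.ark
```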

@ -11,9 +11,9 @@ data:
    max_output_input_ratio: 20.0

collator:
    vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
    unit_type: 'spm'
    spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
    mean_std_filepath: ""
    # augmentation_config: conf/augmentation.json
    batch_size: 10

@ -8,10 +8,13 @@ dict_dir=data/lang_char
# bpemode (unigram or bpe)
nbpe=8000
bpemode=bpe
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh
target_dir=data/ted_en_zh
dumpdir=data/dump
do_delta=false
nj=20

source ${MAIN_ROOT}/utils/parse_options.sh
@ -38,75 +41,167 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi

    # extract data
    echo "data Extraction"
    python3 local/ted_en_zh.py \
        --tgt-dir=${target_dir} \
        --src-dir=${data_dir}
fi

prep_dir=${target_dir}/data_prep

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to do the following data preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: Data preparation"
    for set in train dev test; do
        # for set in train; do
        dst=${target_dir}/${set}
        for lang in en zh; do
            if [ ${lang} = 'en' ]; then
                echo "remove punctuation $lang"
                # remove punctuation
                local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
            else
                cp ${dst}/${lang}.org ${dst}/${lang}.raw
            fi
            paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
        done

        # error check
        n=$(cat ${dst}/.yaml | wc -l)
        n_en=$(cat ${dst}/en.raw | wc -l)
        n_tgt=$(cat ${dst}/zh.raw | wc -l)
        [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data files, found ${n_en}" && exit 1;
        [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data files, found ${n_tgt}" && exit 1;
        echo "done text processing"

        cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
        cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
        cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt

        rm -rf ${prep_dir}/${set}.en-zh
        mkdir -p ${prep_dir}/${set}.en-zh
        echo "remove duplicate lines..."
        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
            | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
            | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
        reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
        echo "done wav processing"

        for l in en zh; do
            cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
        done
        utils/fix_data_dir.sh --utt_extra_files \
            "text.en text.zh" \
            ${prep_dir}/${set}.en-zh
    done
fi

feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 1: Feature Generation"
    fbankdir=data/fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    for x in train dev test; do
        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
            ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
    done

    echo "speed perturbation"
    utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
    utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
    utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
    utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
        ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
    rm -r ${prep_dir}/temp*.en-zh
    utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}

    for lang in en zh; do
        cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
        for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
            awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
            utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang}
        done
    done

    for x in train_sp dev test; do
        local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
    done

    for x in train_sp dev; do
        # remove utt having more than 3000 frames
        # remove utt having more than 400 characters
        for lang in zh en; do
            remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp
        done
        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1
        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.${lang}.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2
        comm -12 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist
        for lang in zh en; do
            reduce_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang}
            utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}
        done
        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
    done

    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark

    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
        ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
        ${prep_dir}/dev.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
        ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir}
fi

dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt
nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: Dictionary and Json Data Preparation"
    echo "make a joint source and target dictionary"
    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
    offset=$(wc -l < ${dict})
    grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
    spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
    spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
    wc -l ${dict}

    echo "make json files"
    data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
    data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json

    echo "update json (add source references)"
    # update json (add source references)
    for x in train_sp dev; do
        feat_dir=${dumpdir}/${x}/delta${do_delta}
        data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en
        update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
    done
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Format the Json Data"
    python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
    python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
    python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
fi

echo "Ted En-Zh Data preparation done."
exit 0

@ -0,0 +1,48 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# 2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <set> <lang>>"
echo "e.g.: $0 dev"
exit 1
fi
set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff intoc its final locations [this has been moved from the format_data script]
# for En
mkdir -p ${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f ${set}/${f} ]; then
sort ${set}/${f} > ${set}.en/${f}
fi
done
sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text
utils/fix_data_dir.sh ${set}.en
if [ -f ${set}.en/feats.scp ]; then
utils/validate_data_dir.sh ${set}.en || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1;
fi
# for target language
mkdir -p ${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f ${set}/${f} ]; then
sort ${set}/${f} > ${set}.${lang}/${f}
fi
done
sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text
utils/fix_data_dir.sh ${set}.${lang}
if [ -f ${set}.${lang}/feats.scp ]; then
utils/validate_data_dir.sh ${set}.${lang} || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1;
fi

@ -0,0 +1,27 @@
#!/usr/bin/env python
import argparse
import json


def main(args):
    with open(args.json_file, 'r') as fin:
        data_json = json.load(fin)

    with open(args.manifest_file, 'w') as fout:
        for key, value in data_json['utts'].items():
            value['utt'] = key
            fout.write(json.dumps(value, ensure_ascii=False))
            fout.write("\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--json-file', type=str, default=None, help="espnet data json file.")
    parser.add_argument(
        '--manifest-file',
        type=str,
        default='manifest.train',
        help='manifest data json line file.')
    args = parser.parse_args()
    main(args)
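A usage sketch mirroring stage 3 of `local/data.sh` (the json path assumes `bpemode=bpe`, `nbpe=8000`, and `do_delta=false`):
```bash
python3 local/espnet_json_to_manifest.py \
    --json-file data/dump/train_sp/deltafalse/data_bpe8000.json \
    --manifest-file data/manifest.train
```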

@ -0,0 +1,25 @@
#!/usr/bin/perl
use warnings;
use strict;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
while(<STDIN>) {
  $_ = " $_ ";

  # remove punctuation except apostrophe
  s/<space>/spacemark/g;  # for scoring
  s/'/apostrophe/g;
  s/[[:punct:]]//g;
  s/apostrophe/'/g;
  s/spacemark/<space>/g;  # for scoring

  # remove whitespace
  s/\s+/ /g;
  s/^\s+//;
  s/\s+$//;

  print "$_\n";
}
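A usage sketch matching stage 0 of `local/data.sh`:
```bash
# strip punctuation (apostrophes are kept) from the raw English transcripts
local/remove_punctuation.pl < data/ted_en_zh/train/en.org > data/ted_en_zh/train/en.raw
```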

@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import codecs
import os

# org_split = 'train-split/train-segment'
# text_file = 'En-Zh/train.en-zh'
# data_split = 'train'


def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
                 data_split_list):
    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
                                                data_split_list):
        local_data_split_dir = os.path.join(tgt_dir, data_split)

        os.makedirs(local_data_split_dir, exist_ok=True)
        utts = []
        utt2spk = {}
        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \
             open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf:
            for files in os.listdir(os.path.join(src_dir, org_split)):
                files = files.strip()
                file_path = os.path.join(src_dir, org_split, files)
                size = os.path.getsize(file_path)
                if size <= 30000:
                    continue
                utt = files.split('.')[0]
                audio_name = utt.split('_')[0]
                # format the name of the utterance
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                utts.append(utt)
                spk = utt.split('_')[0]
                utt2spk[utt] = spk
                assert len(spk) == 16, "%r" % spk
                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
            for utt in sorted(utts):
                print(utt, utt2spk[utt], file=utt2spk_wf)

        with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \
             open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \
             open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \
             codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
                         errors='ignore') as rf:
            count = 0
            for line in rf:
                line = line.strip()
                line_spl = line.split('\t')
                assert len(line_spl) == 3, "%r" % line
                wav, en, zh = line_spl
                assert wav.endswith('wav'), "%r" % wav[-3:]
                utt = wav.split('.')[0]
                audio_name = utt.split('_')[0]
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                print(utt, file=yaml_wf)
                print(en.lower(), file=en_wf)
                print(zh, file=zh_wf)
                count += 1
            print('%s set lines count: %d' % (data_split, count))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--src-dir",
        default="",
        type=str,
        help="Directory to kaldi split data. (default: %(default)s)")
    parser.add_argument(
        "--tgt-dir",
        default="local/ted_en_zh",
        type=str,
        help="Directory to save processed data. (default: %(default)s)")
    args = parser.parse_args()

    wav_dir_list = [
        'train-split/train-segment', 'test-segment/tst2014',
        'test-segment/tst2015'
    ]
    text_file_list = [
        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
    ]
    data_split_list = ['train', 'dev', 'test']

    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
                 data_split_list)
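A usage sketch, as invoked from `local/data.sh`:
```bash
python3 local/ted_en_zh.py --src-dir=./TED_EnZh --tgt-dir=data/ted_en_zh
```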

@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --checkpoint_path "${ckpt_path}" \
    --seed ${seed}

if [ ${seed} != 0 ]; then

@ -1,6 +1,6 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
@ -13,3 +13,10 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@ -1,12 +1,13 @@
#!/bin/bash
set -e

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=0,1,2,3
stage=1
stop_stage=4
conf_path=conf/transformer_mtl_noam.yaml
ckpt_path=  # paddle.98 # (finetune from FAT-ST pretrained model)
avg_num=5
data_path=./TED_EnZh # path to unzipped data

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -22,21 +23,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    if [ -n "${ckpt_path}" ]; then
        echo "Finetune from Pretrained Model" ${ckpt_path}
        ./local/download_pretrain.sh || exit -1
    fi
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -14,7 +14,7 @@ collator:
    mean_std_filepath: ""
    vocab_filepath: data/lang_char/vocab.txt
    unit_type: 'spm'
    spm_model_prefix: 'data/lang_char/bpe_unigram_200'
    augmentation_config: conf/preprocess.yaml
    batch_size: 4
    raw_wav: True  # use raw_wav or kaldi feature

@ -14,7 +14,7 @@ collator:
    mean_std_filepath: ""
    vocab_filepath: data/lang_char/vocab.txt
    unit_type: 'spm'
    spm_model_prefix: 'data/lang_char/bpe_unigram_200'
    augmentation_config: conf/preprocess.yaml
    batch_size: 4
    raw_wav: True  # use raw_wav or kaldi feature

@ -14,7 +14,7 @@ collator:
    mean_std_filepath: ""
    vocab_filepath: data/lang_char/vocab.txt
    unit_type: 'spm'
    spm_model_prefix: 'data/lang_char/bpe_unigram_200'
    augmentation_config: conf/preprocess.yaml
    batch_size: 4
    raw_wav: True  # use raw_wav or kaldi feature

@ -14,7 +14,7 @@ collator:
    mean_std_filepath: data/mean_std.json
    vocab_filepath: data/lang_char/vocab.txt
    unit_type: 'spm'
    spm_model_prefix: 'data/lang_char/bpe_unigram_200'
    augmentation_config: conf/preprocess.yaml
    batch_size: 4
    raw_wav: True  # use raw_wav or kaldi feature

@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub():
    def setup_model(self):
        config = self.config.clone()
        with UpdateConfig(config):
            config.model.input_dim = self.collate_fn_test.feature_size
            config.model.output_dim = self.collate_fn_test.vocab_size

        if self.args.model_type == 'offline':
            model = DeepSpeech2Model.from_config(config.model)

@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
if self.train: if self.train:
config.model.feat_size = self.train_loader.collate_fn.feature_size config.model.input_dim = self.train_loader.collate_fn.feature_size
config.model.dict_size = self.train_loader.collate_fn.vocab_size config.model.output_dim = self.train_loader.collate_fn.vocab_size
else: else:
config.model.feat_size = self.test_loader.collate_fn.feature_size config.model.input_dim = self.test_loader.collate_fn.feature_size
config.model.dict_size = self.test_loader.collate_fn.vocab_size config.model.output_dim = self.test_loader.collate_fn.vocab_size
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config.model)
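Both DeepSpeech2 entry points now write the collator's sizes into `input_dim`/`output_dim` instead of `feat_size`/`dict_size`, matching the renamed keys that `from_config` reads (see the model hunks further below). A condensed sketch of the flow; the import paths are assumptions based on module names visible in the diff, not confirmed by it:

```python
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model   # assumed path
from paddlespeech.s2t.utils.utility import UpdateConfig    # assumed path

def setup_model(config, collate_fn):
    # Condensed from the trainer/tester hunks above.
    with UpdateConfig(config):
        config.model.input_dim = collate_fn.feature_size   # was feat_size
        config.model.output_dim = collate_fn.vocab_size    # was dict_size
    return DeepSpeech2Model.from_config(config.model)      # reads input_dim/output_dim
```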

@ -128,8 +128,9 @@ class U2Trainer(Trainer):
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy() losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()}) losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v, for key, val in losses_np_v.items():
self.iteration - 1) self.visualizer.add_scalar(
tag='train/' + key, value=val, step=self.iteration - 1)
@paddle.no_grad() @paddle.no_grad()
def valid(self): def valid(self):
@ -237,9 +238,10 @@ class U2Trainer(Trainer):
logger.info( logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer: if self.visualizer:
self.visualizer.add_scalars( self.visualizer.add_scalar(
'epoch', {'cv_loss': cv_loss, tag='eval/cv_loss', value=cv_loss, step=self.epoch)
'lr': self.lr_scheduler()}, self.epoch) self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch() self.new_epoch()

@ -131,8 +131,9 @@ class U2Trainer(Trainer):
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy() losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()}) losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v, for key, val in losses_np_v.items():
self.iteration - 1) self.visualizer.add_scalar(
tag="train/" + key, value=val, step=self.iteration - 1)
@paddle.no_grad() @paddle.no_grad()
def valid(self): def valid(self):
@ -222,9 +223,11 @@ class U2Trainer(Trainer):
logger.info( logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer: if self.visualizer:
self.visualizer.add_scalars( self.visualizer.add_scalar(
'epoch', {'cv_loss': cv_loss, tag='eval/cv_loss', value=cv_loss, step=self.epoch)
'lr': self.lr_scheduler()}, self.epoch) self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch() self.new_epoch()
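These trainer hunks swap tensorboardX's `add_scalars` (one call, many tags) for visualdl's `LogWriter.add_scalar` (one tag per call), so dict-valued logging is unrolled into a loop. A self-contained sketch of the new pattern; the logdir and values are placeholders:

```python
from visualdl import LogWriter

losses_np_v = {"loss": 1.23, "att_loss": 0.98, "ctc_loss": 0.25, "lr": 2e-4}
with LogWriter(logdir="exp/demo/visual") as visualizer:
    for step in range(3):
        # visualdl has no add_scalars, so log each scalar under its own tag
        for key, val in losses_np_v.items():
            visualizer.add_scalar(tag="train/" + key, value=val, step=step)
```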

@ -26,8 +26,10 @@ from paddle import distributed as dist
from paddle.io import DataLoader from paddle.io import DataLoader
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.collator import TripletSpeechCollator from paddlespeech.s2t.io.collator import TripletSpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
@ -136,8 +138,9 @@ class U2STTrainer(Trainer):
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy() losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()}) losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v, for key, val in losses_np_v.items():
self.iteration - 1) self.visualizer.add_scalar(
tag="train/" + key, value=val, step=self.iteration - 1)
@paddle.no_grad() @paddle.no_grad()
def valid(self): def valid(self):
@ -233,9 +236,11 @@ class U2STTrainer(Trainer):
logger.info( logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer: if self.visualizer:
self.visualizer.add_scalars( self.visualizer.add_scalar(
'epoch', {'cv_loss': cv_loss, tag='eval/cv_loss', value=cv_loss, step=self.epoch)
'lr': self.lr_scheduler()}, self.epoch) self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch() self.new_epoch()
@ -423,6 +428,31 @@ class U2STTester(U2STTrainer):
trans.append(''.join([chr(i) for i in ids])) trans.append(''.join([chr(i) for i in ids]))
return trans return trans
def translate(self, audio, audio_len):
""""E2E translation from extracted audio feature"""
cfg = self.config.decoding
text_feature = self.test_loader.collate_fn.text_feature
self.model.eval()
hyps = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
beam_beta=cfg.beta,
beam_size=cfg.beam_size,
cutoff_prob=cfg.cutoff_prob,
cutoff_top_n=cfg.cutoff_top_n,
num_processes=cfg.num_proc_bsearch,
ctc_weight=cfg.ctc_weight,
word_reward=cfg.word_reward,
decoding_chunk_size=cfg.decoding_chunk_size,
num_decoding_left_chunks=cfg.num_decoding_left_chunks,
simulate_streaming=cfg.simulate_streaming)
return hyps
def compute_translation_metrics(self, def compute_translation_metrics(self,
utts, utts,
audio, audio,

@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains the impulse response augmentation model.""" """Contains the impulse response augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class ImpulseResponseAugmentor(AugmentorBase): class ImpulseResponseAugmentor(AugmentorBase):
@ -28,7 +29,8 @@ class ImpulseResponseAugmentor(AugmentorBase):
def __init__(self, rng, impulse_manifest_path): def __init__(self, rng, impulse_manifest_path):
self._rng = rng self._rng = rng
self._impulse_manifest = read_manifest(impulse_manifest_path) with jsonlines.open(impulse_manifest_path, 'r') as reader:
self._impulse_manifest = list(reader)
def __call__(self, x, uttid=None, train=True): def __call__(self, x, uttid=None, train=True):
if not train: if not train:

@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains the noise perturb augmentation model.""" """Contains the noise perturb augmentation model."""
import jsonlines
from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase from paddlespeech.s2t.frontend.augmentor.base import AugmentorBase
from paddlespeech.s2t.frontend.utility import read_manifest
class NoisePerturbAugmentor(AugmentorBase): class NoisePerturbAugmentor(AugmentorBase):
@ -34,7 +35,8 @@ class NoisePerturbAugmentor(AugmentorBase):
self._min_snr_dB = min_snr_dB self._min_snr_dB = min_snr_dB
self._max_snr_dB = max_snr_dB self._max_snr_dB = max_snr_dB
self._rng = rng self._rng = rng
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) with jsonlines.open(noise_manifest_path, 'r') as reader:
self._noise_manifest = list(reader)
def __call__(self, x, uttid=None, train=True): def __call__(self, x, uttid=None, train=True):
if not train: if not train:

@ -14,6 +14,7 @@
"""Contains feature normalizers.""" """Contains feature normalizers."""
import json import json
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle.io import DataLoader from paddle.io import DataLoader
@ -21,7 +22,6 @@ from paddle.io import Dataset
from paddlespeech.s2t.frontend.audio import AudioSegment from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
__all__ = ["FeatureNormalizer"] __all__ = ["FeatureNormalizer"]
@ -61,7 +61,10 @@ class CollateFunc(object):
class AudioDataset(Dataset): class AudioDataset(Dataset):
def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
self._rng = rng if rng else np.random.RandomState(random_seed) self._rng = rng if rng else np.random.RandomState(random_seed)
manifest = read_manifest(manifest_path)
with jsonlines.open(manifest_path, 'r') as reader:
manifest = list(reader)
if num_samples == -1: if num_samples == -1:
sampled_manifest = manifest sampled_manifest = manifest
else: else:
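This is one of several hunks that replace the project's `read_manifest` helper with direct `jsonlines` reads; a manifest is a JSON-lines file, one record per line. The pattern in isolation, with a placeholder manifest path:

```python
import jsonlines

# Each line of the manifest file is one JSON object describing an utterance.
with jsonlines.open("data/manifest.train", "r") as reader:
    manifest = list(reader)  # List[dict], the same shape read_manifest returned
```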

@ -98,14 +98,13 @@ def read_manifest(
Returns: Returns:
List[dict]: Manifest parsing results. List[dict]: Manifest parsing results.
""" """
manifest = [] manifest = []
with jsonlines.open(manifest_path, 'r') as reader: with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader: for json_data in reader:
feat_len = json_data["feat_shape"][ feat_len = json_data["input"][0]["shape"][
0] if 'feat_shape' in json_data else 1.0 0] if "input" in json_data and "shape" in json_data["input"][0] else 1.0
token_len = json_data["token_shape"][ token_len = json_data["output"][0]["shape"][
0] if 'token_shape' in json_data else 1.0 0] if "output" in json_data and "shape" in json_data["output"][0] else 1.0
conditions = [ conditions = [
feat_len >= min_input_len, feat_len >= min_input_len,
feat_len <= max_input_len, feat_len <= max_input_len,
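The length lookup now follows the new manifest schema, where features and transcripts sit in `input`/`output` lists with explicit `shape` fields instead of top-level `feat_shape`/`token_shape`. A worked example against a made-up record:

```python
# A made-up record in the new schema; shapes are [length, dim]-style lists.
json_data = {
    "input": [{"name": "input1", "shape": [1120, 83]}],
    "output": [{"name": "target1", "shape": [42, 5002]}],
}
feat_len = json_data["input"][0]["shape"][0] \
    if "input" in json_data and "shape" in json_data["input"][0] else 1.0
token_len = json_data["output"][0]["shape"][0] \
    if "output" in json_data and "shape" in json_data["output"][0] else 1.0
assert (feat_len, token_len) == (1120, 42)
```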

@ -16,10 +16,10 @@ from typing import Dict
from typing import List from typing import List
from typing import Text from typing import Text
import jsonlines
import numpy as np import numpy as np
from paddle.io import DataLoader from paddle.io import DataLoader
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset from paddlespeech.s2t.io.dataset import TransformDataset
@ -91,7 +91,9 @@ class BatchDataLoader():
self.n_iter_processes = n_iter_processes self.n_iter_processes = n_iter_processes
# read json data # read json data
self.data_json = read_manifest(json_file) with jsonlines.open(json_file, 'r') as reader:
self.data_json = list(reader)
self.feat_dim, self.vocab_size = feat_dim_and_vocab_size( self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
self.data_json, mode='asr') self.data_json, mode='asr')

@ -15,6 +15,7 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet) # Modified from wenet(https://github.com/wenet-e2e/wenet)
from typing import Optional from typing import Optional
import jsonlines
from paddle.io import Dataset from paddle.io import Dataset
from yacs.config import CfgNode from yacs.config import CfgNode
@ -184,7 +185,8 @@ class AudioDataset(Dataset):
""" """
assert batch_type in ['static', 'dynamic'] assert batch_type in ['static', 'dynamic']
# read manifest # read manifest
data = read_manifest(data_file) with jsonlines.open(data_file, 'r') as reader:
data = list(reader)
if sort: if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0]) data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav: if raw_wav:

@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer):
The model built from config. The model built from config.
""" """
model = cls( model = cls(
feat_size=config.feat_size, feat_size=config.input_dim,
dict_size=config.dict_size, dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,

@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
The model built from config. The model built from config.
""" """
model = cls( model = cls(
feat_size=config.feat_size, feat_size=config.input_dim,
dict_size=config.dict_size, dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,

@ -19,7 +19,7 @@ from pathlib import Path
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
from tensorboardX import SummaryWriter from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report from paddlespeech.s2t.training.reporter import report
@ -245,8 +245,9 @@ class Trainer():
self.maybe_batch_sampler_step() self.maybe_batch_sampler_step()
def after_train_batch(self): def after_train_batch(self):
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: if self.args.benchmark_max_step:
profiler.add_profiler_step(self.args.profiler_options) profiler.add_profiler_step(self.args.profiler_options)
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
logger.info( logger.info(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}") f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
sys.exit( sys.exit(
@ -309,9 +310,10 @@ class Trainer():
logger.info( logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer: if self.visualizer:
self.visualizer.add_scalars( self.visualizer.add_scalar(
'epoch', {'cv_loss': cv_loss, tag='eval/cv_loss', value=cv_loss, step=self.epoch)
'lr': self.lr_scheduler()}, self.epoch) self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
# after epoch # after epoch
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
@ -427,7 +429,7 @@ class Trainer():
unexpected behaviors. unexpected behaviors.
""" """
# visualizer # visualizer
visualizer = SummaryWriter(logdir=str(self.visual_dir)) visualizer = LogWriter(logdir=str(self.visual_dir))
self.visualizer = visualizer self.visualizer = visualizer
@mp_tools.rank_zero_only @mp_tools.rank_zero_only

@ -94,7 +94,7 @@ class Checkpoint():
""" """
configs = {} configs = {}
if checkpoint_path is not None: if checkpoint_path:
pass pass
elif checkpoint_dir is not None and record_file is not None: elif checkpoint_dir is not None and record_file is not None:
# load checkpint from record file # load checkpint from record file

@ -21,7 +21,7 @@ import wave
from time import gmtime from time import gmtime
from time import strftime from time import strftime
from paddlespeech.s2t.frontend.utility import read_manifest import jsonlines
__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] __all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"]
@ -44,7 +44,8 @@ def warm_up_test(audio_process_handler,
num_test_cases, num_test_cases,
random_seed=0): random_seed=0):
"""Warming-up test.""" """Warming-up test."""
manifest = read_manifest(manifest_path) with jsonlines.open(manifest_path) as reader:
manifest = list(reader)
rng = random.Random(random_seed) rng = random.Random(random_seed)
samples = rng.sample(manifest, num_test_cases) samples = rng.sample(manifest, num_test_cases)
for idx, sample in enumerate(samples): for idx, sample in enumerate(samples):

@ -34,7 +34,7 @@ from speechtask.punctuation_restoration.model.lstm import RnnLm
from speechtask.punctuation_restoration.utils import layer_tools from speechtask.punctuation_restoration.utils import layer_tools
from speechtask.punctuation_restoration.utils import mp_tools from speechtask.punctuation_restoration.utils import mp_tools
from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint
from tensorboardX import SummaryWriter from visualdl import LogWriter
__all__ = ["Trainer", "Tester"] __all__ = ["Trainer", "Tester"]
@ -252,10 +252,10 @@ class Trainer():
self.logger.info("Epoch {} Val info val_loss {}, F1_score {}". self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
format(self.epoch, total_loss, F1_score)) format(self.epoch, total_loss, F1_score))
if self.visualizer: if self.visualizer:
self.visualizer.add_scalars("epoch", { self.visualizer.add_scalar(
"total_loss": total_loss, tag='eval/cv_loss', value=cv_loss, step=self.epoch)
"lr": self.lr_scheduler() self.visualizer.add_scalar(
}, self.epoch) tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
self.save( self.save(
tag=self.epoch, infos={"val_loss": total_loss, tag=self.epoch, infos={"val_loss": total_loss,
@ -341,7 +341,7 @@ class Trainer():
unexpected behaviors. unexpected behaviors.
""" """
# visualizer # visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir)) visualizer = LogWriter(logdir=str(self.output_dir))
self.visualizer = visualizer self.visualizer = visualizer
@mp_tools.rank_zero_only @mp_tools.rank_zero_only

@ -40,7 +40,6 @@ snakeviz
soundfile~=0.10 soundfile~=0.10
sox sox
soxbindings soxbindings
tensorboardX
textgrid textgrid
timer timer
tqdm tqdm

@ -2,7 +2,7 @@ cd ../../../
pip install -e . # install pdspeech pip install -e . # install pdspeech
cd - cd -
#Enter the example dir #Enter the example dir
pushd ../../../examples/aishell/s1 pushd ../../../examples/aishell/asr1
#Prepare the data #Prepare the data
bash run.sh --stage 0 --stop_stage 0 bash run.sh --stage 0 --stop_stage 0

@ -8,7 +8,7 @@ cd ${CUR_DIR}
sed -i '/set\ -xe/d' run_benchmark.sh sed -i '/set\ -xe/d' run_benchmark.sh
#cd ** #cd **
pushd ../../../examples/aishell/s1 pushd ../../../examples/aishell/asr1
# 1. install the dependencies this model needs (note here if optimization strategies are enabled) # 1. install the dependencies this model needs (note here if optimization strategies are enabled)
# 2. copy the data and pretrained models this model needs # 2. copy the data and pretrained models this model needs

@ -1,5 +1,4 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -xe
# usage example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # usage example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
# parameter description # parameter description
function _set_params(){ function _set_params(){
@ -35,13 +34,15 @@ function _set_params(){
function _train(){ function _train(){
echo "Train on ${num_gpu_devices} GPUs" echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="--config=${config_path} train_cmd="--config=${config_path} \
--output=${output} --output=${output} \
--seed=${seed} --seed=${seed} \
--ngpu=${ngpu} --ngpu=${ngpu} \
--profiler-options "${profiler_options}" --benchmark-batch-size ${batch_size} \
--benchmark-batch-size ${batch_size}
--benchmark-max-step ${benchmark_max_step} " --benchmark-max-step ${benchmark_max_step} "
if [ ${profiler_options} != "None" ]; then
train_cmd=${train_cmd}" --profiler-options=${profiler_options}"
fi
case ${run_mode} in case ${run_mode} in
sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;; sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;

@ -0,0 +1,152 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import logging
import sys
from distutils.util import strtobool
from espnet.utils.cli_utils import get_commandline_args
is_python2 = sys.version_info[0] == 2
def get_parser():
parser = argparse.ArgumentParser(
description="add multiple json values to an input or output value",
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument("jsons", type=str, nargs="+", help="json files")
parser.add_argument(
"-i",
"--is-input",
default=True,
type=strtobool,
help="If true, add to input. If false, add to output", )
parser.add_argument(
"--verbose", "-V", default=0, type=int, help="Verbose option")
return parser
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
# logging info
logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
if args.verbose > 0:
logging.basicConfig(level=logging.INFO, format=logfmt)
else:
logging.basicConfig(level=logging.WARN, format=logfmt)
logging.info(get_commandline_args())
# make intersection set for utterance keys
js = []
intersec_ks = []
for x in args.jsons:
with codecs.open(x, "r", encoding="utf-8") as f:
j = json.load(f)
ks = j["utts"].keys()
logging.info(x + ": has " + str(len(ks)) + " utterances")
if len(intersec_ks) > 0:
intersec_ks = intersec_ks.intersection(set(ks))
if len(intersec_ks) == 0:
logging.warning("Empty intersection")
break
else:
intersec_ks = set(ks)
js.append(j)
logging.info("new json has " + str(len(intersec_ks)) + " utterances")
# updated original dict to keep intersection
intersec_org_dic = dict()
for k in intersec_ks:
v = js[0]["utts"][k]
intersec_org_dic[k] = v
intersec_add_dic = dict()
for k in intersec_ks:
v = js[1]["utts"][k]
for j in js[2:]:
v.update(j["utts"][k])
intersec_add_dic[k] = v
new_dic = dict()
for key_id in intersec_org_dic:
orgdic = intersec_org_dic[key_id]
adddic = intersec_add_dic[key_id]
if "utt2spk" not in orgdic:
orgdic["utt2spk"] = ""
# NOTE: for machine translation
# add as input
if args.is_input:
# original input
input_list = orgdic["input"]
# additional input
in_add_dic = {}
if "idim" in adddic and "ilen" in adddic:
in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
elif "idim" in adddic:
in_add_dic["shape"] = [int(adddic["idim"])]
# add all other key value
for key, value in adddic.items():
if key in ["idim", "ilen"]:
continue
in_add_dic[key] = value
# add name
in_add_dic["name"] = "input%d" % (len(input_list) + 1)
input_list.append(in_add_dic)
new_dic[key_id] = {
"input": input_list,
"output": orgdic["output"],
"utt2spk": orgdic["utt2spk"],
}
# add as output
else:
# original output
output_list = orgdic["output"]
# additional output
out_add_dic = {}
# add shape
if "odim" in adddic and "olen" in adddic:
out_add_dic[
"shape"] = [int(adddic["olen"]), int(adddic["odim"])]
elif "odim" in adddic:
out_add_dic["shape"] = [int(adddic["odim"])]
# add all other key value
for key, value in adddic.items():
if key in ["odim", "olen"]:
continue
out_add_dic[key] = value
# add name
out_add_dic["name"] = "target%d" % (len(output_list) + 1)
output_list.append(out_add_dic)
new_dic[key_id] = {
"input": orgdic["input"],
"output": output_list,
"utt2spk": orgdic["utt2spk"],
}
if "lang" in orgdic.keys():
new_dic[key_id]["lang"] = orgdic["lang"]
# use ensure_ascii=False: the default ASCII escaping would mangle non-ASCII text
jsonstring = json.dumps(
{
"utts": new_dic
},
indent=4,
ensure_ascii=False,
sort_keys=True,
separators=(",", ": "), )
sys.stdout = codecs.getwriter("utf-8")(sys.stdout
if is_python2 else sys.stdout.buffer)
print(jsonstring)
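For orientation, this is roughly what `addjson.py` produces when run with `-i false` (add to output): the added fields become a second entry in the utterance's `output` list, named `target2`. The utterance id and values below are invented for illustration:

```python
import json

merged = {
    "utts": {
        "utt_0001": {
            "input": [{"name": "input1", "shape": [1120, 83]}],
            "output": [
                {"name": "target1", "shape": [42, 5002]},
                {"name": "target2", "text": "added value from the second json"},
            ],
            "utt2spk": "spk1",
        }
    }
}
# Mirrors the script's dump settings, including ensure_ascii=False.
print(json.dumps(merged, indent=4, ensure_ascii=False, sort_keys=True,
                 separators=(",", ": ")))
```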

@ -21,9 +21,10 @@ import os
import tempfile import tempfile
from collections import Counter from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.frontend.utility import SOS from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK from paddlespeech.s2t.frontend.utility import UNK
@ -54,20 +55,41 @@ add_arg('text_keys', str,
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.") add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm") add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm") add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable # yapf: disable
args = parser.parse_args() args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path): def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = read_manifest(manifest_path) manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons: for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
line = text_feature.tokenize(line_json['text'], replace_space=False) line = text_feature.tokenize(line_json['text'], replace_space=False)
counter.update(line) counter.update(line)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
line = text_feature.tokenize(text, replace_space=False)
counter.update(line)
def dump_text_manifest(fileobj, manifest_path, key='text'): def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = read_manifest(manifest_path) manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons: for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n") fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def main(): def main():
print_arguments(args, globals()) print_arguments(args, globals())
@ -95,7 +117,7 @@ def main():
model_type=args.spm_mode, model_type=args.spm_mode,
model_prefix=args.spm_model_prefix, model_prefix=args.spm_model_prefix,
input_sentence_size=100000000, input_sentence_size=100000000,
character_coverage=0.9995) character_coverage=args.spm_character_coverage)
os.unlink(fp.name) os.unlink(fp.name)
# encode # encode
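The new `--spm_character_coverage` flag simply feeds through to sentencepiece training, keeping the previously hard-coded 0.9995 as its default. A standalone sketch of the call; the input file and sizes are placeholders:

```python
import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    input="transcripts.txt",                      # placeholder: one sentence per line
    model_prefix="data/lang_char/bpe_unigram_200",
    vocab_size=200,
    model_type="unigram",
    character_coverage=0.9995,                    # now settable via --spm_character_coverage
)
```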

@ -17,7 +17,7 @@ import argparse
from pathlib import Path from pathlib import Path
from typing import Union from typing import Union
from paddlespeech.s2t.frontend.utility import read_manifest import jsonlines
key_whitelist = set(['feat', 'text', 'syllable', 'phone']) key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = { filename = {
@ -32,7 +32,10 @@ def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser() output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser() manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
first_line = manifest_jsons[0] first_line = manifest_jsons[0]
file_map = {} file_map = {}

@ -17,9 +17,10 @@ import argparse
import functools import functools
import json import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments from paddlespeech.s2t.utils.utility import print_arguments
@ -71,7 +72,9 @@ def main():
# } # }
count = 0 count = 0
for manifest_path in args.manifest_paths: for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path) with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons: for line_json in manifest_jsons:
output_json = { output_json = {
"input": [], "input": [],

@ -17,9 +17,10 @@ import argparse
import functools import functools
import json import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments from paddlespeech.s2t.utils.utility import print_arguments
@ -63,7 +64,8 @@ def main():
count = 0 count = 0
for manifest_path in args.manifest_paths: for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path) with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons: for line_json in manifest_jsons:
# text: translation text, text1: transcript text. # text: translation text, text1: transcript text.
# Currently only support joint-vocab, will add separate vocabs setting. # Currently only support joint-vocab, will add separate vocabs setting.

@ -4,9 +4,10 @@ import argparse
import functools import functools
from pathlib import Path from pathlib import Path
import jsonlines
from utils.utility import add_arguments from utils.utility import add_arguments
from utils.utility import print_arguments from utils.utility import print_arguments
from utils.utility import read_manifest
def main(args): def main(args):
@ -19,7 +20,8 @@ def main(args):
dur_scp = outdir / 'duration' dur_scp = outdir / 'duration'
text_scp = outdir / 'text' text_scp = outdir / 'text'
manifest_jsons = read_manifest(args.manifest_path) with jsonlines.open(args.manifest_path, 'r') as reader:
manifest_jsons = list(reader)
with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
'w') as ftxt: 'w') as ftxt:

@ -57,7 +57,7 @@ else
echo "missing ${dec_conf}" echo "missing ${dec_conf}"
exit 1 exit 1
fi fi
# NOTE(kan-bayashi): preprocess conf is optional # preprocess conf is optional
if [ -n "${preprocess_conf}" ]; then if [ -n "${preprocess_conf}" ]; then
tar rfh ${outfile}.tar ${preprocess_conf} tar rfh ${outfile}.tar ${preprocess_conf}
echo -n " - preprocess config file: \`" echo -n " - preprocess config file: \`"
@ -104,7 +104,7 @@ if [ -n "${lm}" ]; then
lm_conf=$(dirname ${lm})/model.json lm_conf=$(dirname ${lm})/model.json
if [ ! -e ${lm_conf} ]; then if [ ! -e ${lm_conf} ]; then
echo missing ${lm_conf} echo missing ${lm_conf}
exit 1 #exit 1
else else
echo -n " - lm JSON file: \`" echo -n " - lm JSON file: \`"
echo ${lm_conf} | sed -e "s/$/\`/" echo ${lm_conf} | sed -e "s/$/\`/"

@ -0,0 +1,49 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import codecs
import json
import sys
is_python2 = sys.version_info[0] == 2
def get_parser():
parser = argparse.ArgumentParser(
description="convert scp to json",
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument("--key", "-k", type=str, help="key")
return parser
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
new_line = {}
sys.stdin = codecs.getreader("utf-8")(sys.stdin
if is_python2 else sys.stdin.buffer)
sys.stdout = codecs.getwriter("utf-8")(sys.stdout
if is_python2 else sys.stdout.buffer)
line = sys.stdin.readline()
while line:
x = line.rstrip().split()
v = {args.key: " ".join(x[1:])}
new_line[x[0]] = v
line = sys.stdin.readline()
all_l = {"utts": new_line}
# use ensure_ascii=False: the default ASCII escaping would mangle non-ASCII text
jsonstring = json.dumps(
all_l,
indent=4,
ensure_ascii=False,
sort_keys=True,
separators=(",", ": "))
print(jsonstring)
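The conversion `scp2json.py` performs is small enough to restate directly: each scp line `utt value...` becomes `utts[utt] = {key: value}`. The equivalent logic on two example lines, with `--key text`:

```python
# Equivalent of scp2json.py --key text on two example scp lines.
lines = ["utt1 HELLO WORLD", "utt2 GOOD MORNING"]
new_line = {}
for line in lines:
    x = line.rstrip().split()
    new_line[x[0]] = {"text": " ".join(x[1:])}
assert new_line == {"utt1": {"text": "HELLO WORLD"},
                    "utt2": {"text": "GOOD MORNING"}}
```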

@ -0,0 +1,596 @@
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
# Sample Tokenizer
### Version 1.1
# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
# Version 1.1 updates:
# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
### Version 1.0
# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
# written by Josh Schroeder, based on code by Philipp Koehn
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;
if (eval {require Thread;1;}) {
#module loaded
Thread->import();
}
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
my %NONBREAKING_PREFIX = ();
my @protected_patterns = ();
my $protected_patterns_file = "";
my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
my $NO_ESCAPING = 0;
while (@ARGV)
{
$_ = shift;
/^-b$/ && ($| = 1, next);
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
/^-time$/ && ($TIMING = 1, next);
# Option to add list of regexps to be protected
/^-protected/ && ($protected_patterns_file = shift, next);
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
/^-no-escape/ && ($NO_ESCAPING = 1, next);
}
# for time calculation
my $start_time;
if ($TIMING)
{
$start_time = [ Time::HiRes::gettimeofday( ) ];
}
# print help message
if ($HELP)
{
print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
print "Options:\n";
print " -q ... quiet.\n";
print " -a ... aggressive hyphen splitting.\n";
print " -b ... disable Perl buffering.\n";
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
print " -protected FILE ... specify file with patters to be protected in tokenisation.\n";
print " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
exit;
}
if (!$QUIET)
{
print STDERR "Tokenizer Version 1.1\n";
print STDERR "Language: $language\n";
print STDERR "Number of threads: $NUM_THREADS\n";
}
# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
load_prefixes($language,\%NONBREAKING_PREFIX);
if (scalar(%NONBREAKING_PREFIX) eq 0)
{
print STDERR "Warning: No known abbreviations for language '$language'\n";
}
# Load protected patterns
if ($protected_patterns_file)
{
open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
while(<PP>) {
chomp;
push @protected_patterns, $_;
}
}
my @batch_sentences = ();
my @thread_list = ();
my $count_sentences = 0;
if ($NUM_THREADS > 1)
{# multi-threading tokenization
while(<STDIN>)
{
$count_sentences = $count_sentences + 1;
push(@batch_sentences, $_);
if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
{
# assign each thread work
for (my $i=0; $i<$NUM_THREADS; $i++)
{
my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
push(@thread_list, $new_thread);
}
foreach (@thread_list)
{
my $tokenized_list = $_->join;
foreach (@$tokenized_list)
{
print $_;
}
}
# reset for the new run
@thread_list = ();
@batch_sentences = ();
}
}
# the last batch
if (scalar(@batch_sentences)>0)
{
# assign each thread work
for (my $i=0; $i<$NUM_THREADS; $i++)
{
my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
if ($start_index >= scalar(@batch_sentences))
{
last;
}
my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
if ($end_index >= scalar(@batch_sentences))
{
$end_index = scalar(@batch_sentences)-1;
}
my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
push(@thread_list, $new_thread);
}
foreach (@thread_list)
{
my $tokenized_list = $_->join;
foreach (@$tokenized_list)
{
print $_;
}
}
}
}
else
{# single thread only
while(<STDIN>)
{
if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
{
#don't try to tokenize XML/HTML tag lines
print $_;
}
else
{
print &tokenize($_);
}
}
}
if ($TIMING)
{
my $duration = Time::HiRes::tv_interval( $start_time );
print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
}
#####################################################################################
# subroutines afterward
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;
my(@tokenized_list) = ();
foreach (@text_list)
{
if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
{
#don't try to tokenize XML/HTML tag lines
push(@tokenized_list, $_);
}
else
{
push(@tokenized_list, &tokenize($_));
}
}
return \@tokenized_list;
}
# the actual tokenize function which tokenizes one input string
# input: one string
# return: the tokenized string for the input string
sub tokenize
{
my($text) = @_;
if ($PENN) {
return tokenize_penn($text);
}
chomp($text);
$text = " $text ";
# remove ASCII junk
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# Find protected patterns
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
my $t = $text;
while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
push @protected, $+{PATTERN};
$t = $+{TAIL};
}
}
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s,\Q$protected[$i], $subst ,g;
}
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# separate out all "other" special characters
if (($language eq "fi") or ($language eq "sv")) {
# in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
# USA:n, 20:een, EU:ssa, USA:s, S:t
$text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
elsif ($language eq "tdt") {
# in Tetun, the apostrophe can be used inside words as an apostrophe-like character:
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
# if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
}
elsif (($language eq "ca")) {
# in Catalan, the middle dot can be used inside words:
# il·lusio
$text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
# if a middot is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
}
else {
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
}
# aggressive hyphen splitting
if ($AGGRESSIVE)
{
$text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
}
#multi-dots stay together
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
while($text =~ /DOTMULTI\./)
{
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
# separate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)
# previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
# first application uses up B so rule can't see B,C
# two-step version here may create extra spaces but these are removed later
# will also space digit,letter or letter,digit forms (redundant with next section)
$text =~ s/([^\p{IsN}])[,]/$1 , /g;
$text =~ s/[,]([^\p{IsN}])/ , $1/g;
# separate "," after a number if it's the end of a sentence
$text =~ s/([\p{IsN}])[,]$/$1 ,/g;
# separate , pre and post number
#$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
#$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
# turn `into '
#$text =~ s/\`/\'/g;
#turn '' into "
#$text =~ s/\'\'/ \" /g;
if ($language eq "en")
{
#split contractions right
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
#special case for "1990's"
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
}
elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
{
#split contractions left
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
}
elsif (($language eq "so") or ($language eq "tdt"))
{
# Don't split glottals
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
}
else
{
$text =~ s/\'/ \' /g;
}
#word token method
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++)
{
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/)
{
my $pre = $1;
if ($i == scalar(@words)-1) {
# split last words independently as they are unlikely to be non-breaking prefixes
$word = $pre." .";
}
elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
{
#no change
}
elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
{
#no change
}
else
{
$word = $pre." .";
}
}
$text .= $word." ";
}
# clean up extraneous spaces
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# .' at end of sentence is missed
$text =~ s/\.\' ?$/ . ' /;
# restore protected
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s/$subst/$protected[$i]/g;
}
#restore multi-dots
while($text =~ /DOTDOTMULTI/)
{
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#escape special chars
if (!$NO_ESCAPING)
{
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
}
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub tokenize_penn
{
# Improved compatibility with Penn Treebank tokenization. Useful if
# the text is to later be parsed with a PTB-trained parser.
#
# Adapted from Robert MacIntyre's sed script:
# http://www.cis.upenn.edu/~treebank/tokenizer.sed
my($text) = @_;
chomp($text);
# remove ASCII junk
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# attempt to get correct directional quotes
$text =~ s/^``/`` /g;
$text =~ s/^"/`` /g;
$text =~ s/^`([^`])/` $1/g;
$text =~ s/^'/` /g;
$text =~ s/([ ([{<])"/$1 `` /g;
$text =~ s/([ ([{<])``/$1 `` /g;
$text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
$text =~ s/([ ([{<])'/$1 ` /g;
# close quotes handled at end
$text =~ s=\.\.\.= _ELLIPSIS_ =g;
# separate out "," except if within numbers (5,300)
$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate , pre and post number
$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
#$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
# Separate out intra-token slashes. PTB tokenization doesn't do this, so
# the tokens should be merged prior to parsing with a PTB-trained parser
# (see syntax-hyphen-splitting.perl).
$text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
$text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
$text =~ s=([?!])= $1 =g;
# parentheses, brackets, etc.
$text =~ s=([\]\[\(\){}<>])= $1 =g;
$text =~ s/\(/-LRB-/g;
$text =~ s/\)/-RRB-/g;
$text =~ s/\[/-LSB-/g;
$text =~ s/\]/-RSB-/g;
$text =~ s/{/-LCB-/g;
$text =~ s/}/-RCB-/g;
$text =~ s=--= -- =g;
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
$text =~ s=$= =;
$text =~ s=^= =;
$text =~ s="= '' =g;
# possessive or close-single-quote
$text =~ s=([^'])' =$1 ' =g;
# as in it's, I'm, we'd
$text =~ s='([sSmMdD]) = '$1 =g;
$text =~ s='ll = 'll =g;
$text =~ s='re = 're =g;
$text =~ s='ve = 've =g;
$text =~ s=n't = n't =g;
$text =~ s='LL = 'LL =g;
$text =~ s='RE = 'RE =g;
$text =~ s='VE = 'VE =g;
$text =~ s=N'T = N'T =g;
$text =~ s= ([Cc])annot = $1an not =g;
$text =~ s= ([Dd])'ye = $1' ye =g;
$text =~ s= ([Gg])imme = $1im me =g;
$text =~ s= ([Gg])onna = $1on na =g;
$text =~ s= ([Gg])otta = $1ot ta =g;
$text =~ s= ([Ll])emme = $1em me =g;
$text =~ s= ([Mm])ore'n = $1ore 'n =g;
$text =~ s= '([Tt])is = '$1 is =g;
$text =~ s= '([Tt])was = '$1 was =g;
$text =~ s= ([Ww])anna = $1an na =g;
#word token method
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++)
{
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/)
{
my $pre = $1;
if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
{
#no change
}
elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
{
#no change
}
else
{
$word = $pre." .";
}
}
$text .= $word." ";
}
# restore ellipses
$text =~ s=_ELLIPSIS_=\.\.\.=g;
# clean out extra spaces
$text =~ s= *= =g;
$text =~ s=^ *==g;
$text =~ s= *$==g;
#escape special chars
$text =~ s/\&/\&amp;/g; # escape escape
$text =~ s/\|/\&#124;/g; # factor separator
$text =~ s/\</\&lt;/g; # xml
$text =~ s/\>/\&gt;/g; # xml
$text =~ s/\'/\&apos;/g; # xml
$text =~ s/\"/\&quot;/g; # xml
$text =~ s/\[/\&#91;/g; # syntax non-terminal
$text =~ s/\]/\&#93;/g; # syntax non-terminal
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub load_prefixes
{
my ($language, $PREFIX_REF) = @_;
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
#default back to English if we don't have a language-specific prefix file
if (!(-e $prefixfile))
{
$prefixfile = "$mydir/nonbreaking_prefix.en";
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
}
if (-e "$prefixfile")
{
open(PREFIX, "<:utf8", "$prefixfile");
while (<PREFIX>)
{
my $item = $_;
chomp($item);
if (($item) && (substr($item,0,1) ne "#"))
{
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
{
$PREFIX_REF->{$1} = 2;
}
else
{
$PREFIX_REF->{$item} = 1;
}
}
}
close(PREFIX);
}
}

@ -0,0 +1,88 @@
#!/bin/bash
# Copyright 2020 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh
nlsyms=""
oov="<unk>"
bpecode=""
verbose=0
text=""
multilingual=false
help_message=$(cat << EOF
Usage: $0 <json> <data-dir> <dict>
e.g. $0 data/train/data.json data/train data/lang_1char/train_units.txt
Options:
--oov <oov-word> # Default: <unk>
--verbose <num> # Default: 0
EOF
)
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "${help_message}" 1>&2
exit 1;
fi
set -euo pipefail
json=$1
dir=$2
dic=$3
json_dir=$(dirname ${json})
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT
if [ -z ${text} ]; then
text=${dir}/text
fi
# 2. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
if [ ${multilingual} = true ]; then
# remove a space before the language ID
paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
| spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
> ${tmpdir}/output/token.scp
else
paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
| spm_encode --model=${bpecode} --output_format=piece) \
> ${tmpdir}/output/token.scp
fi
elif [ -n "${nlsyms}" ]; then
text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp
else
text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
# +2 comes from CTC blank and EOS
vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
odim=$(echo "$vocsize + 2" | bc)
awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp
cat ${text} > ${tmpdir}/output/text.scp
# 4. Create JSON files from each scp file
rm -f ${tmpdir}/*/*.json
for x in "${tmpdir}"/output/*.scp; do
k=$(basename ${x} .scp)
< ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
done
# add to json
addjson.py --verbose ${verbose} -i false \
${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
mkdir -p ${json_dir}/.backup
echo "json updated. original json is kept in ${json_dir}/.backup."
cp ${json} ${json_dir}/.backup/"$(basename ${json})"
cp ${tmpdir}/data.json ${json}
rm -fr ${tmpdir}

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import hashlib import hashlib
import json
import os import os
import sys import sys
import tarfile import tarfile
@ -22,31 +21,10 @@ from typing import Text
__all__ = [ __all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download", "check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments", "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"read_manifest", "get_commandline_args" "get_commandline_args"
] ]
def read_manifest(manifest_path):
"""Load and parse manifest file.
Args:
manifest_path ([type]): Manifest file to load and parse.
Raises:
IOError: If failed to parse the manifest.
Returns:
List[dict]: Manifest parsing results.
"""
manifest = []
for json_line in open(manifest_path, 'r'):
try:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
return manifest
def get_commandline_args(): def get_commandline_args():
extra_chars = [ extra_chars = [
" ", " ",
