commit d4ee5916b1 (pull/1007/head)
Author: gongel, 3 years ago

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into ted_en_zh_t0

@@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
 ```shell
 cd examples/csmsc/tts3
 # download the pretrained models and unzip them
-wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
 unzip pwg_baker_ckpt_0.4.zip
-wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
 unzip fastspeech2_nosil_baker_ckpt_0.4.zip
 # source the environment
 source path.sh

@@ -25,9 +25,9 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # download pretrained tts models and unzip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
     unzip -d download download/pwg_baker_ckpt_0.4.zip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
     unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
 fi

@@ -19,9 +19,9 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # download pretrained tts models and unzip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
     unzip -d download download/pwg_baker_ckpt_0.4.zip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
     unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
 fi

@@ -14,9 +14,9 @@ mkdir -p download
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # download pretrained tts models and unzip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
     unzip -d download download/pwg_baker_ckpt_0.4.zip
-    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
     unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
 fi

@@ -1,4 +1,3 @@
 # Released Models
 ## Speech-to-Text Models
@@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions
 ### Acoustic Models
 Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
-TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
-FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
-FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
-FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
-FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
+Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
+TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
+FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
+FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
+FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
 ### Vocoders
 Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
-Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
-Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
-Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
-Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|
+WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
+Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB|
+Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
+Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
+Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
+|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
 ### Voice Cloning
 Model Type | Dataset| Example Link | Pretrained Models
 :-------------:| :------------:| :-----: | :-----:
-GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
-GE2E + Tacotron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
+GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip)
+GE2E + Tacotron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip)
+GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)

@@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
      type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>
@@ -441,7 +441,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 <td>
   <audio controls="controls">
     <source
-      src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
+      src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
       type="audio/wav">
       Your browser does not support the <code>audio</code> element.
   </audio>

@@ -1,7 +1,9 @@
 # ASR
-* s0 for deepspeech2
-* s1 for u2/transformer/conformer
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
 ## Data

@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --manifest_path="data/manifest.train.raw" \
     --spectrum_type="linear" \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
     --sample_rate=16000 \
     --use_dB_normalization=True \
     --num_samples=2000 \
@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     for dataset in train dev test; do
     {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
         --cmvn_path "data/mean_std.json" \
        --unit_type "char" \
        --vocab_path="data/vocab.txt" \
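As a quick sanity check (arithmetic assumed here, not part of this change), the millisecond-based frame flags map to sample counts at the given `--sample_rate`:

```python
# Assumption: ms-based frame flags convert to sample counts at 16 kHz.
sample_rate = 16000
stride_ms, window_ms = 10, 20                    # values passed above
hop_samples = sample_rate * stride_ms // 1000    # 160 samples per frame shift
win_samples = sample_rate * window_ms // 1000    # 320 samples per window
print(hop_samples, win_samples)                  # 160 320
```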

@@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
+
+## Transformer
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |

@@ -15,7 +15,7 @@ collator:
   vocab_filepath: data/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-  cmvn_file: "data/mean_std.json"
+  cmvn_file:
   cmvn_file_type: "json"
   # encoder related
   encoder: conformer

@@ -15,7 +15,7 @@ collator:
   vocab_filepath: data/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
 # network architecture
 model:
-  cmvn_file: "data/mean_std.json"
+  cmvn_file:
   cmvn_file_type: "json"
   # encoder related
   encoder: conformer

@@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
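The last three stages are SpecAugment-style masking. A minimal NumPy sketch of what `freq_mask`/`time_mask` amount to with the parameters above (an illustration under stated assumptions, not the repo's implementation; `replace_with_zero: false` is taken to mean mean-filling):

```python
import numpy as np

def mask_axis(x, width, n_mask, axis, replace_with_zero=False):
    """Mask n_mask random bands of size <= width along one axis of a (T, F) fbank."""
    x = x.copy()
    fill = 0.0 if replace_with_zero else x.mean()
    size = x.shape[axis]
    for _ in range(n_mask):
        w = np.random.randint(0, width + 1)
        start = np.random.randint(0, max(size - w, 1))
        sl = [slice(None)] * x.ndim
        sl[axis] = slice(start, start + w)
        x[tuple(sl)] = fill
    return x

feats = np.random.randn(300, 80).astype(np.float32)  # 3 s of 80-dim fbank at 10 ms shift
feats = mask_axis(feats, width=30, n_mask=2, axis=1)  # freq_mask: F=30
feats = mask_axis(feats, width=40, n_mask=2, axis=0)  # time_mask: T=40
```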

@@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  min_input_len: 0.5
+  max_input_len: 20.0 # second
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/preprocess.yaml
+  batch_size: 64
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+# network architecture
+model:
+  cmvn_file:
+  cmvn_file_type: "json"
+  # encoder related
+  encoder: transformer
+  encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+  # decoder related
+  decoder: transformer
+  decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+  # hybrid CTC/attention
+  model_conf:
+    ctc_weight: 0.3
+    ctc_dropoutrate: 0.0
+    ctc_grad_norm_type: null
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+training:
+  n_epoch: 120
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.002
+    weight_decay: 1e-6
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 100
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+decoding:
+  batch_size: 128
+  error_rate_type: cer
+  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here.
+  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False # simulate streaming inference. Defaults to False.
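The `scheduler: warmuplr` entry pairs `lr: 0.002` with `warmup_steps: 25000`. A minimal sketch of the usual inverse-square-root warmup rule (assumed here; the scheduler actually shipped with the repo may differ in detail):

```python
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    """Linear warmup to base_lr at warmup_steps, then ~step**-0.5 decay."""
    step = max(step, 1)
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

print(warmup_lr(25000))   # 0.002 (peak at the end of warmup)
print(warmup_lr(100000))  # 0.001 (decayed by sqrt(4))
```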

@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --sample_rate=16000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     for dataset in train dev test; do
     {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
         --cmvn_path "data/mean_std.json" \
         --unit_type "char" \
         --vocab_path="data/vocab.txt" \

@@ -23,8 +23,6 @@ fi
 # exit 1
 #fi
 for type in attention_rescoring; do
     echo "decoding ${type}"
     batch_size=1

@@ -97,7 +97,7 @@ optional arguments:
 ### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
 ```bash
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
@@ -202,7 +202,7 @@ optional arguments:
 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence at the edge of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+Pretrained FastSpeech2 model with no silence at the edge of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
 FastSpeech2 checkpoint contains files listed below.

@@ -41,7 +41,7 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyinso th
 We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
-You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model with reference to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model with reference to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
 ```bash
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided
 CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
 ```
 ## Pretrained Model
-[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
+[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip).

@@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech
 ## Pretrained GE2E model
 We use pretrained GE2E model to generate speaker embedding for each sentence.
-Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it.
+Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
@@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl
 ### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
 ```bash
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
@@ -115,7 +115,7 @@ ref_audio
 CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
 ```
 ## Pretrained Model
-[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
+[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip)
 FastSpeech2 checkpoint contains files listed below.
 (There is no need for `speaker_id_map.txt` here.)

@@ -132,7 +132,7 @@ optional arguments:
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 ## Pretrained Models
-Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
+Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip).
 Parallel WaveGAN checkpoint contains files listed below.

@@ -15,7 +15,7 @@ collator:
   vocab_filepath: data/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-  cmvn_file: "data/mean_std.json"
+  cmvn_file:
   cmvn_file_type: "json"
   # encoder related
   encoder: conformer

@@ -15,7 +15,7 @@ collator:
   vocab_filepath: data/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -37,7 +37,7 @@ collator:
 # network architecture
 model:
-  cmvn_file: "data/mean_std.json"
+  cmvn_file:
   cmvn_file_type: "json"
   # encoder related
   encoder: conformer

@@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false

@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --sample_rate=8000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     for dataset in train dev test; do
     {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
         --cmvn_path "data/mean_std.json" \
         --unit_type "char" \
         --vocab_path="data/vocab.txt" \

@@ -90,7 +90,7 @@ optional arguments:
 ### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
 unzip pwg_baker_ckpt_0.4.zip
 ```
@@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ```
 ## Pretrained Model
-Pretrained SpeedySpeech model with no silence at the edge of audios: [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip).
-Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
+Pretrained SpeedySpeech model with no silence at the edge of audios: [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip).
+Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip).
 SpeedySpeech checkpoint contains files listed below.
 ```text

@@ -88,7 +88,7 @@ optional arguments:
 ### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
-Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
+Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
 unzip pwg_baker_ckpt_0.4.zip
 ```
@@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ```
 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence at the edge of audios: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip).
-Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip).
+Pretrained FastSpeech2 model with no silence at the edge of audios: [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip).
+Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
 FastSpeech2 checkpoint contains files listed below.
 ```text

@@ -122,9 +122,9 @@ optional arguments:
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 ## Pretrained Models
-Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
-Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
+Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip).
+Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip).
 Parallel WaveGAN checkpoint contains files listed below.

@@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul
 But since we are fine-tuning, we should use the statistics computed during the training step.
-You should first download the pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
+You should first download the pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it.
 Assume the path to the dump-dir of the training step is `dump`.
 Assume the path to the duration result of CSMSC is `durations.txt` (generated during the training step's preprocessing).
@@ -147,11 +147,11 @@ TODO:
 The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set).
 ## Pretrained Models
-Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip).
-Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip).
-Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip)
+Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
+Finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
+Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip)
 Multi Band MelGAN checkpoint contains files listed below.

@@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path

 import soundfile
@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             audio_path = os.path.abspath(os.path.join(subfolder, fname))
             audio_id = os.path.basename(fname)[:-4]
+            utt2spk = Path(audio_path).parent.name

             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json.dumps(
                     {
                         'utt': audio_id,
+                        'utt2spk': str(utt2spk),
                         'feat': audio_path,
                         'feat_shape': (duration, ),  # second
                         'text': text,
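Each manifest line is a JSON object, so the new `utt2spk` field can be consumed directly. A small sketch (the manifest path is assumed) that counts utterances per speaker:

```python
import json
from collections import Counter

# 'data/manifest.train.raw' is an assumed path; one JSON object per line.
spk_counts = Counter()
with open('data/manifest.train.raw', encoding='utf8') as f:
    for line in f:
        record = json.loads(line)
        spk_counts[record['utt2spk']] += 1
print(spk_counts.most_common(5))
```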

@@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path

 import soundfile
@@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
             # if no transcription for audio then skipped
             if audio_id not in transcript_dict:
                 continue
+            utt2spk = Path(audio_path).parent.name

             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
             text = transcript_dict[audio_id]
@@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json.dumps(
                     {
                         'utt': audio_id,
+                        'utt2spk': str(utt2spk),
                         'feat': audio_path,
                         'feat_shape': (duration, ),  # second
                         'text': text

@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
     total_sec = 0.0
-    total_text = 0.0
+    total_char = 0.0
     total_num = 0

     for subfolder, _, filelist in sorted(os.walk(data_dir)):
@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
         text_filepath = os.path.join(subfolder, text_filelist[0])
         for line in io.open(text_filepath, encoding="utf8"):
             segments = line.strip().split()
+            nchars = len(segments[1:])
             text = ' '.join(segments[1:]).lower()

             audio_filepath = os.path.abspath(
                 os.path.join(subfolder, segments[0] + '.flac'))
             audio_data, samplerate = soundfile.read(audio_filepath)
             duration = float(len(audio_data)) / samplerate
+            utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+            utt2spk = '-'.join(utt.split('-')[:2])

             json_lines.append(
                 json.dumps({
-                    'utt':
-                    os.path.splitext(os.path.basename(audio_filepath))[0],
-                    'feat':
-                    audio_filepath,
-                    'feat_shape': (duration, ),  #second
-                    'text':
-                    text
+                    'utt': utt,
+                    'utt2spk': utt2spk,
+                    'feat': audio_filepath,
+                    'feat_shape': (duration, ),  # second
+                    'text': text,
                 }))

             total_sec += duration
-            total_text += len(text)
+            total_char += nchars
             total_num += 1

     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
             print(f"{subset}:", file=f)
             print(f"{total_num} utts", file=f)
             print(f"{total_sec / (60*60)} h", file=f)
-            print(f"{total_text} text", file=f)
-            print(f"{total_text / total_sec} text/sec", file=f)
+            print(f"{total_char} char", file=f)
+            print(f"{total_char / total_sec} char/sec", file=f)
             print(f"{total_sec / total_num} sec/utt", file=f)

@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
             audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
             audio_data, samplerate = soundfile.read(audio_filepath)
             duration = float(len(audio_data)) / samplerate
+            utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+            utt2spk = '-'.join(utt.split('-')[:2])
             json_lines.append(
                 json.dumps({
-                    'utt':
-                    os.path.splitext(os.path.basename(audio_filepath))[0],
-                    'feat':
-                    audio_filepath,
+                    'utt': utt,
+                    'utt2spk': utt2spk,
+                    'feat': audio_filepath,
                     'feat_shape': (duration, ),  #second
-                    'text':
-                    text
+                    'text': text,
                 }))

             total_sec += duration

@@ -72,14 +72,16 @@ def create_manifest(data_dir, manifest_path_prefix):
                 continue
             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
+            translation_str = " ".join(translation.split())
+            trancription_str = " ".join(trancription.split())
             json_lines.append(
                 json.dumps(
                     {
                         'utt': utt,
                         'feat': audio_path,
                         'feat_shape': (duration, ),  # second
-                        'text': " ".join(translation.split()),
-                        'text1': " ".join(trancription.split())
+                        'text': [translation_str, trancription_str],
                     },
                     ensure_ascii=False))
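After this change a TED-En-Zh manifest entry carries the translation and the transcription as a two-element list under `text`, instead of separate `text`/`text1` keys. Reading it back looks like this (values invented for illustration):

```python
import json

# Invented example entry matching the new schema above.
line = ('{"utt": "ted_0001", "feat": "/path/to/ted_0001.wav", '
        '"feat_shape": [2.5], "text": ["the english translation", "对应的中文转写"]}')
entry = json.loads(line)
translation, transcription = entry['text']
```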

@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
         assert os.path.exists(audio_path) and os.path.exists(text_path)

         audio_id = os.path.basename(audio_path)[:-4]
+        spk = audio_id.split('_')[0]
         word_text, syllable_text, phone_text = read_trn(text_path)
         audio_data, samplerate = soundfile.read(audio_path)
         duration = float(len(audio_data) / samplerate)
@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             json.dumps(
                 {
                     'utt': audio_id,
+                    'utt2spk': spk,
                     'feat': audio_path,
                     'feat_shape': (duration, ),  # second
                     'text': word_text,  # character

@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
             json.dumps(
                 {
                     'utt': utt_id,
+                    'utt2spk': spk,
+                    'utt2gender': gender,
                     'feat': str(audio_path),
                     'feat_shape': (duration, ),  # second
                     'text': word_text,  # word
                     'phone': phone_text,
-                    'spk': spk,
-                    'gender': gender,
                 },
                 ensure_ascii=False))

@@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path

 import soundfile
@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
         audio_data, samplerate = soundfile.read(audio_path)
         duration = float(len(audio_data) / samplerate)
         text = phn_dict[audio_id]
+        gender_spk = str(Path(audio_path).parent.stem)
+        spk = gender_spk[1:]
+        gender = gender_spk[0]
+        utt_id = '_'.join([spk, gender, audio_id])
         json_lines.append(
             json.dumps(
                 {
                     'utt': audio_id,
+                    'utt2spk': spk,
+                    'utt2gender': gender,
                     'feat': audio_path,
                     'feat_shape': (duration, ),  # second
                     'text': text

@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
         audio_data, samplerate = soundfile.read(u)
         duration = float(len(audio_data)) / samplerate
+        utt = os.path.splitext(os.path.basename(u))[0]
         json_lines.append(
             json.dumps({
-                'utt': os.path.splitext(os.path.basename(u))[0],
+                'utt': utt,
+                'utt2spk': speaker,
                 'feat': u,
                 'feat_shape': (duration, ),  #second
                 'text': trans.lower()

@@ -1,8 +1,9 @@
 # ASR
-* s0 is for deepspeech2 offline
-* s1 is for transformer/conformer/U2
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
 ## Data
 | Data Subset | Duration in Seconds |

@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
     --use_dB_normalization=True \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     for set in train dev test dev-clean dev-other test-clean test-other; do
     {
         python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
         --cmvn_path "data/mean_std.json" \
         --unit_type ${unit_type} \
         --vocab_path="data/vocab.txt" \

@@ -21,7 +21,7 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 16 batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 64 batch_size: 64
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: transformer encoder: transformer

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 16 batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: conformer encoder: conformer

@ -0,0 +1,25 @@
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
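For reference, a minimal NumPy sketch of the `freq_mask` / `time_mask` steps configured above (the SpecAugment-style masking); this is illustrative only, not PaddleSpeech's implementation, and the input shape is an assumption.

```python
import numpy as np

def freq_mask(feat, F=30, n_mask=2, replace_with_zero=False):
    # feat: (num_frames, n_mels) fbank features
    feat = feat.copy()
    n_mels = feat.shape[1]
    fill = 0.0 if replace_with_zero else feat.mean()
    for _ in range(n_mask):
        f = np.random.randint(0, F + 1)               # mask width in mel bins
        f0 = np.random.randint(0, max(1, n_mels - f)) # mask start
        feat[:, f0:f0 + f] = fill                     # blank out a mel band
    return feat

def time_mask(feat, T=40, n_mask=2, replace_with_zero=False):
    feat = feat.copy()
    n_frames = feat.shape[0]
    fill = 0.0 if replace_with_zero else feat.mean()
    for _ in range(n_mask):
        t = np.random.randint(0, T + 1)                 # mask width in frames
        t0 = np.random.randint(0, max(1, n_frames - t)) # mask start
        feat[t0:t0 + t, :] = fill                       # blank out a span of frames
    return feat

feat = np.random.randn(500, 80)  # 500 frames x 80 mel bins, matching n_mels above
augmented = time_mask(freq_mask(feat))
```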

@ -15,7 +15,7 @@ collator:
unit_type: 'spm' unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000' spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.json augmentation_config: conf/preprocess.yaml
batch_size: 32 batch_size: 32
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
# network architecture # network architecture
model: model:
cmvn_file: "data/mean_std.json" cmvn_file:
cmvn_file_type: "json" cmvn_file_type: "json"
# encoder related # encoder related
encoder: transformer encoder: transformer

@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}" bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1 exit 1
fi fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw mv data/manifest.${sub} data/manifest.${sub}.raw
done done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw cat data/manifest.${sub}.raw >> data/manifest.train.raw
done done
for set in dev-clean dev-other; do for sub in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done done
for set in test-clean test-other; do for sub in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw cat data/manifest.${sub}.raw >> data/manifest.test.raw
done done
fi fi
@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--num_samples=-1 \ --num_samples=-1 \
--spectrum_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=${feat_dim} \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=${sample_rate} \
--stride_ms=10.0 \ --stride_ms=${stride_ms} \
--window_ms=25.0 \ --window_ms=${window_ms} \
--use_dB_normalization=False \ --use_dB_normalization=False \
--num_workers=${num_workers} \ --num_workers=${num_workers} \
--output_path="data/mean_std.json" --output_path="data/mean_std.json"
@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size # format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do for sub in train dev test dev-clean dev-other test-clean test-other; do
{ {
python3 ${MAIN_ROOT}/utils/format_data.py \ python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \ --cmvn_path "data/mean_std.json" \
--unit_type "spm" \ --unit_type "spm" \
--spm_model_prefix ${bpeprefix} \ --spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \ --manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${set}" --output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated." echo "Formt mnaifest failed. Terminated."
@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}& }&
done done
wait wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi fi
echo "LibriSpeech Data preparation done." echo "LibriSpeech Data preparation done."

Some files were not shown because too many files have changed in this diff