diff --git a/README.md b/README.md index 2f9d9928..ec2d0f30 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: ```shell cd examples/csmsc/tts3 # download the pretrained models and unaip them -wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip pwg_baker_ckpt_0.4.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip fastspeech2_nosil_baker_ckpt_0.4.zip # source the environment source path.sh diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index ea7f683c..ba7d7980 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -25,9 +25,9 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 069ec12e..44259cd3 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -19,9 +19,9 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh index f035dd1b..6f6d6068 100755 --- a/demos/style_fs2/run.sh +++ b/demos/style_fs2/run.sh @@ -14,9 +14,9 @@ mkdir -p download if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 78f5c92f..ca04f6a7 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,4 +1,3 @@ - # Released Models ## Speech-to-Text Models @@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic 
Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)||| -TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB| -FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| -FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| -FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| +Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| +FastSpeech2| CSMSC 
|[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| +FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)||| -Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB| -Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)||| -Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)||| -Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB| +WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| +Parallel WaveGAN| CSMSC 
|[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| +Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| +Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| +Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| +|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: -GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) -GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip) +GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) +GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip) +GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index f47c0892..4c2f86b1 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -441,7 +441,7 @@ Audio samples generated by a TTS system. 
Text is first transformed into spectrog diff --git a/examples/aishell/README.md b/examples/aishell/README.md index 82ef91da..a9bba074 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -1,7 +1,9 @@ # ASR -* s0 for deepspeech2 -* s1 for u2/transformer/conformer +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data diff --git a/examples/aishell/s0/.gitignore b/examples/aishell/asr0/.gitignore similarity index 100% rename from examples/aishell/s0/.gitignore rename to examples/aishell/asr0/.gitignore diff --git a/examples/aishell/s0/README.md b/examples/aishell/asr0/README.md similarity index 100% rename from examples/aishell/s0/README.md rename to examples/aishell/asr0/README.md diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/asr0/conf/augmentation.json similarity index 100% rename from examples/aishell/s0/conf/augmentation.json rename to examples/aishell/asr0/conf/augmentation.json diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2.yaml rename to examples/aishell/asr0/conf/deepspeech2.yaml diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2_online.yaml rename to examples/aishell/asr0/conf/deepspeech2_online.yaml diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/asr0/local/data.sh similarity index 96% rename from examples/aishell/s0/local/data.sh rename to examples/aishell/asr0/local/data.sh index f4fccbe6..23f04f2a 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --spectrum_type="linear" \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --sample_rate=16000 \ --use_dB_normalization=True \ --num_samples=2000 \ @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s0/local/download_lm_ch.sh b/examples/aishell/asr0/local/download_lm_ch.sh similarity index 100% rename from examples/aishell/s0/local/download_lm_ch.sh rename to examples/aishell/asr0/local/download_lm_ch.sh diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/asr0/local/export.sh similarity index 100% rename from examples/aishell/s0/local/export.sh rename to examples/aishell/asr0/local/export.sh diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/asr0/local/test.sh similarity index 100% rename from examples/aishell/s0/local/test.sh rename to examples/aishell/asr0/local/test.sh diff --git a/examples/aishell/s0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh similarity index 100% rename from examples/aishell/s0/local/test_export.sh rename to examples/aishell/asr0/local/test_export.sh diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh similarity index 100% rename from examples/aishell/s0/local/test_hub.sh rename to examples/aishell/asr0/local/test_hub.sh diff 
--git a/examples/aishell/s0/local/train.sh b/examples/aishell/asr0/local/train.sh similarity index 100% rename from examples/aishell/s0/local/train.sh rename to examples/aishell/asr0/local/train.sh diff --git a/examples/aishell/s0/path.sh b/examples/aishell/asr0/path.sh similarity index 100% rename from examples/aishell/s0/path.sh rename to examples/aishell/asr0/path.sh diff --git a/examples/aishell/s0/run.sh b/examples/aishell/asr0/run.sh similarity index 100% rename from examples/aishell/s0/run.sh rename to examples/aishell/asr0/run.sh diff --git a/examples/aishell/s1/.gitignore b/examples/aishell/asr1/.gitignore similarity index 100% rename from examples/aishell/s1/.gitignore rename to examples/aishell/asr1/.gitignore diff --git a/examples/aishell/s1/README.md b/examples/aishell/asr1/README.md similarity index 67% rename from examples/aishell/s1/README.md rename to examples/aishell/asr1/README.md index 0096c73e..8c53f95f 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/asr1/README.md @@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding. | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | \ No newline at end of file diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json similarity index 100% rename from examples/aishell/s1/conf/augmentation.json rename to examples/aishell/asr1/conf/augmentation.json diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/chunk_conformer.yaml rename to examples/aishell/asr1/conf/chunk_conformer.yaml index 8682538b..336a6c46 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/conformer.yaml rename to examples/aishell/asr1/conf/conformer.yaml index 71cd044e..0e9d79d8 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ 
-15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml new file mode 100644 index 00000000..c021f66b --- /dev/null +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -0,0 +1,112 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 + max_input_len: 20.0 # second + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + +# network architecture +model: + cmvn_file: + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 2 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.002 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 128 + error_rate_type: cer + 
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/asr1/local/aishell_train_lms.sh similarity index 100% rename from examples/aishell/s1/local/aishell_train_lms.sh rename to examples/aishell/asr1/local/aishell_train_lms.sh diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/asr1/local/align.sh similarity index 100% rename from examples/aishell/s1/local/align.sh rename to examples/aishell/asr1/local/align.sh diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/asr1/local/data.sh similarity index 96% rename from examples/aishell/s1/local/data.sh rename to examples/aishell/asr1/local/data.sh index 2b9f69ae..76e28075 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=16000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s1/local/export.sh b/examples/aishell/asr1/local/export.sh similarity index 100% rename from examples/aishell/s1/local/export.sh rename to examples/aishell/asr1/local/export.sh diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/asr1/local/test.sh similarity index 100% rename from examples/aishell/s1/local/test.sh rename to examples/aishell/asr1/local/test.sh diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh similarity index 99% rename from examples/aishell/s1/local/test_hub.sh rename to examples/aishell/asr1/local/test_hub.sh index 99b141c8..6e78ec78 100755 --- a/examples/aishell/s1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -23,8 +23,6 @@ fi # exit 1 #fi - - for type in attention_rescoring; do echo "decoding ${type}" batch_size=1 diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh similarity index 100% rename from examples/aishell/s1/local/tlg.sh rename to examples/aishell/asr1/local/tlg.sh diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/asr1/local/train.sh similarity index 100% rename from examples/aishell/s1/local/train.sh rename to examples/aishell/asr1/local/train.sh diff --git a/examples/aishell/s1/path.sh b/examples/aishell/asr1/path.sh similarity index 100% rename from examples/aishell/s1/path.sh rename to examples/aishell/asr1/path.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/asr1/run.sh similarity index 100% 
rename from examples/aishell/s1/run.sh rename to examples/aishell/asr1/run.sh diff --git a/examples/aishell/s1/utils b/examples/aishell/asr1/utils similarity index 100% rename from examples/aishell/s1/utils rename to examples/aishell/asr1/utils diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index fe4887b9..056f35ba 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -97,7 +97,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -202,7 +202,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) FastSpeech2 checkpoint contains files listed below. diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 2f1b37ee..376d4a33 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -41,7 +41,7 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). +[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip). 
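The pretrained-model links updated above all move to the same relocated layout: a `released_models/<model-family>/` directory inserted before the archive name. A minimal shell sketch of fetching and unpacking one of these archives, using the `pwgan` family directory and the `download/` target directory that appear in the links and demo scripts above:

```bash
# Fetch a pretrained Parallel WaveGAN vocoder from the relocated URL layout
# and unpack it next to the other downloads.
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
unzip -d download download/pwg_aishell3_ckpt_0.5.zip
```

The same pattern applies to the fastspeech2, speedyspeech, mb_melgan, tacotron2, waveflow and ge2e archives referenced elsewhere in this change.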
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 834942fa..ae53443e 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech ## Pretrained GE2E model We use pretrained GE2E model to generate spwaker embedding for each sentence. -Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it. +Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. @@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -115,7 +115,7 @@ ref_audio CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) +[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. (There is no need for `speaker_id_map.txt` here ) diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index d67af726..bc28bba1 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -132,7 +132,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below. 
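The vc1 recipe above passes its voice-cloning arguments positionally to `./local/voice_cloning.sh`. A sketch of a concrete invocation under stated assumptions; every value below (config name, experiment directory, checkpoint and GE2E parameter file names, reference-audio directory) is a placeholder rather than something taken from the recipe:

```bash
# Hypothetical argument values for the vc1 voice-cloning step shown above;
# substitute the paths produced by your own training run or the pretrained zips.
gpus=0
conf_path=conf/default.yaml                          # placeholder config
train_output_path=exp/default                        # placeholder experiment dir
ckpt_name=snapshot_iter_xxxxx.pdz                    # placeholder FastSpeech2 checkpoint
ge2e_params_path=ge2e_ckpt_0.3/step-xxxxx.pdparams   # placeholder GE2E parameters
ref_audio_dir=ref_audio                              # directory holding the reference wavs
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```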
diff --git a/examples/callcenter/s1/.gitignore b/examples/callcenter/asr1/.gitignore similarity index 100% rename from examples/callcenter/s1/.gitignore rename to examples/callcenter/asr1/.gitignore diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/asr1/README.md similarity index 100% rename from examples/callcenter/s1/README.md rename to examples/callcenter/asr1/README.md diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json similarity index 100% rename from examples/callcenter/s1/conf/augmentation.json rename to examples/callcenter/asr1/conf/augmentation.json diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/chunk_conformer.yaml rename to examples/callcenter/asr1/conf/chunk_conformer.yaml index a853658a..b18b46fe 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/conformer.yaml rename to examples/callcenter/asr1/conf/conformer.yaml index bd4f4578..47c438a6 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/callcenter/s1/local/align.sh b/examples/callcenter/asr1/local/align.sh similarity index 100% rename from examples/callcenter/s1/local/align.sh rename to examples/callcenter/asr1/local/align.sh diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/asr1/local/data.sh similarity index 96% rename from examples/callcenter/s1/local/data.sh rename to examples/callcenter/asr1/local/data.sh index 634bb8d0..c40c752a 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/asr1/local/data.sh @@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=8000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/callcenter/s1/local/download_lm_ch.sh b/examples/callcenter/asr1/local/download_lm_ch.sh similarity index 100% rename from examples/callcenter/s1/local/download_lm_ch.sh rename to examples/callcenter/asr1/local/download_lm_ch.sh diff --git a/examples/callcenter/s1/local/export.sh b/examples/callcenter/asr1/local/export.sh similarity index 100% rename from examples/callcenter/s1/local/export.sh rename to examples/callcenter/asr1/local/export.sh diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/asr1/local/test.sh similarity index 100% rename from examples/callcenter/s1/local/test.sh rename to examples/callcenter/asr1/local/test.sh diff --git a/examples/callcenter/s1/local/train.sh b/examples/callcenter/asr1/local/train.sh similarity index 100% rename from examples/callcenter/s1/local/train.sh rename to examples/callcenter/asr1/local/train.sh diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/asr1/path.sh similarity index 100% rename from examples/callcenter/s1/path.sh rename to examples/callcenter/asr1/path.sh diff --git a/examples/callcenter/s1/run.sh b/examples/callcenter/asr1/run.sh similarity index 100% rename from examples/callcenter/s1/run.sh rename to examples/callcenter/asr1/run.sh diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 61c4972b..5ebf3cf4 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. 
```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip). +Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip). -Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip). +Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip). SpeedySpeech checkpoint contains files listed below. ```text diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 6570d33d..104964c8 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -88,7 +88,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip). +Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip). -Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip). +Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index b9c8a465..86114a42 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -122,9 +122,9 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). +Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip). -Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). +Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). 
Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index a72f60f1..4925b649 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul But since we are fine-tuning, we should use the statistics computed during training step. -You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. +You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. Assume the path to the dump-dir of training step is `dump`. Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing). @@ -147,11 +147,11 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip). +Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). -Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip). +Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) +Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) Multi Band MelGAN checkpoint contains files listed below. 
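The voc3 fine-tuning notes above reuse the statistics computed when the FastSpeech2 acoustic model was trained, so that archive has to be fetched before fine-tuning MB MelGAN. A short sketch; the name of the directory produced by `unzip` is an assumption to verify after unpacking:

```bash
# Download the pretrained FastSpeech2 model referenced above; the statistics
# computed during its training are used to normalize mels for fine-tuning.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
ls fastspeech2_nosil_baker_ckpt_0.4/   # assumed extraction directory
```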
diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py index e32f619e..85f478c2 100644 --- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix): audio_path = os.path.abspath(os.path.join(subfolder, fname)) audio_id = os.path.basename(fname)[:-4] + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text, diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index 66e06901..7431fc08 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix): # if no transcription for audio then skipped if audio_id not in transcript_dict: continue + + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = transcript_dict[audio_id] @@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index e85bbb3a..69f0db59 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path): print("Creating manifest %s ..." 
% manifest_path) json_lines = [] total_sec = 0.0 - total_text = 0.0 + total_char = 0.0 total_num = 0 for subfolder, _, filelist in sorted(os.walk(data_dir)): @@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() + nchars = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, - 'feat_shape': (duration, ), #second - 'text': - text + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, })) total_sec += duration - total_text += len(text) + total_char += nchars total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: @@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path): print(f"{subset}:", file=f) print(f"{total_num} utts", file=f) print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py index 65fee81a..730c73a8 100644 --- a/examples/dataset/mini_librispeech/mini_librispeech.py +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path): audio_filepath = os.path.join(subfolder, segments[0] + '.flac') audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, 'feat_shape': (duration, ), #second - 'text': - text + 'text': text, })) total_sec += duration diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d..9a3ba3b3 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,16 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 77a264cb..cdfc0a75 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix): assert os.path.exists(audio_path) and 
os.path.exists(text_path) audio_id = os.path.basename(audio_path)[:-4] + spk = audio_id.split('_')[0] + word_text, syllable_text, phone_text = read_trn(text_path) audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': word_text, # charactor diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py index 311d445c..c4a9f066 100644 --- a/examples/dataset/timit/timit.py +++ b/examples/dataset/timit/timit.py @@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': utt_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': str(audio_path), 'feat_shape': (duration, ), # second 'text': word_text, # word 'phone': phone_text, - 'spk': spk, - 'gender': gender, }, ensure_ascii=False)) diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py index 2b494c06..473fc856 100644 --- a/examples/dataset/timit/timit_kaldi_standard_split.py +++ b/examples/dataset/timit/timit_kaldi_standard_split.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = phn_dict[audio_id] + + gender_spk = str(Path(audio_path).parent.stem) + spk = gender_spk[1:] + gender = gender_spk[0] + utt_id = '_'.join([spk, gender, audio_id]) json_lines.append( json.dumps( { 'utt': audio_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py index 36282bd6..373791bf 100644 --- a/examples/dataset/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path): audio_data, samplerate = soundfile.read(u) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(u))[0] json_lines.append( json.dumps({ - 'utt': os.path.splitext(os.path.basename(u))[0], + 'utt': utt, + 'utt2spk': speaker, 'feat': u, 'feat_shape': (duration, ), #second 'text': trans.lower() diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 5943cf1d..74441fd0 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,8 +1,9 @@ # ASR -* s0 is for deepspeech2 offline -* s1 is for transformer/conformer/U2 -* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data | Data Subset | Duration in Seconds | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/asr0/README.md similarity index 100% rename from examples/librispeech/s0/README.md rename to examples/librispeech/asr0/README.md diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/asr0/conf/augmentation.json similarity index 100% rename from examples/librispeech/s0/conf/augmentation.json rename to examples/librispeech/asr0/conf/augmentation.json diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml 
b/examples/librispeech/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2.yaml rename to examples/librispeech/asr0/conf/deepspeech2.yaml diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2_online.yaml rename to examples/librispeech/asr0/conf/deepspeech2_online.yaml diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/asr0/local/data.sh similarity index 97% rename from examples/librispeech/s0/local/data.sh rename to examples/librispeech/asr0/local/data.sh index fd2b0c01..0f276cec 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/asr0/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s0/local/download_lm_en.sh rename to examples/librispeech/asr0/local/download_lm_en.sh diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/asr0/local/export.sh similarity index 100% rename from examples/librispeech/s0/local/export.sh rename to examples/librispeech/asr0/local/export.sh diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/asr0/local/test.sh similarity index 100% rename from examples/librispeech/s0/local/test.sh rename to examples/librispeech/asr0/local/test.sh diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh similarity index 100% rename from examples/librispeech/s0/local/test_hub.sh rename to examples/librispeech/asr0/local/test_hub.sh diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/asr0/local/train.sh similarity index 100% rename from examples/librispeech/s0/local/train.sh rename to examples/librispeech/asr0/local/train.sh diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/asr0/path.sh similarity index 100% rename from examples/librispeech/s0/path.sh rename to examples/librispeech/asr0/path.sh diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/asr0/run.sh similarity index 100% rename from examples/librispeech/s0/run.sh rename to examples/librispeech/asr0/run.sh diff --git a/examples/librispeech/s1/.gitignore b/examples/librispeech/asr1/.gitignore similarity index 100% rename from examples/librispeech/s1/.gitignore rename to examples/librispeech/asr1/.gitignore diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/asr1/README.md similarity index 74% rename from examples/librispeech/s1/README.md rename to examples/librispeech/asr1/README.md index b7ec93eb..73f0863e 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/asr1/README.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode 
method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 | diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/asr1/cmd.sh similarity index 100% rename from examples/librispeech/s1/cmd.sh rename to examples/librispeech/asr1/cmd.sh diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json similarity index 100% rename from examples/librispeech/s1/conf/augmentation.json rename to examples/librispeech/asr1/conf/augmentation.json diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_conformer.yaml rename to examples/librispeech/asr1/conf/chunk_conformer.yaml index 4d0e6ceb..2bfb0fb6 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_transformer.yaml rename to examples/librispeech/asr1/conf/chunk_transformer.yaml index c7b53f95..fe533777 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/conformer.yaml rename to examples/librispeech/asr1/conf/conformer.yaml 
index 3bc942dc..c844baaa 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..021ca4c5 --- /dev/null +++ b/examples/librispeech/asr1/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/transformer.yaml rename to examples/librispeech/asr1/conf/transformer.yaml index 3cc17004..5a158f3e 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/asr1/local/align.sh similarity index 100% rename from examples/librispeech/s1/local/align.sh rename to examples/librispeech/asr1/local/align.sh diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/asr1/local/data.sh similarity index 66% rename from examples/librispeech/s1/local/data.sh rename to examples/librispeech/asr1/local/data.sh index 56fec846..35f4e635 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -8,6 +8,11 @@ nbpe=5000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" +stride_ms=10 +window_ms=25 +sample_rate=16000 +feat_dim=80 + source ${MAIN_ROOT}/utils/parse_options.sh @@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - mv data/manifest.${set} data/manifest.${set}.raw + for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${sub} data/manifest.${sub}.raw done rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw - for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set}.raw >> data/manifest.train.raw + for sub in train-clean-100 
train-clean-360 train-other-500; do + cat data/manifest.${sub}.raw >> data/manifest.train.raw done - for set in dev-clean dev-other; do - cat data/manifest.${set}.raw >> data/manifest.dev.raw + for sub in dev-clean dev-other; do + cat data/manifest.${sub}.raw >> data/manifest.dev.raw done - for set in test-clean test-other; do - cat data/manifest.${set}.raw >> data/manifest.test.raw + for sub in test-clean test-other; do + cat data/manifest.${sub}.raw >> data/manifest.test.raw done fi @@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ --spectrum_type="fbank" \ - --feat_dim=80 \ + --feat_dim=${feat_dim} \ --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --sample_rate=${sample_rate} \ + --stride_ms=${stride_ms} \ + --window_ms=${window_ms} \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -85,16 +90,15 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size - for set in train dev test dev-clean dev-other test-clean test-other; do + for sub in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" + --manifest_path="data/manifest.${sub}.raw" \ + --output_path="data/manifest.${sub}" if [ $? -ne 0 ]; then echo "Formt mnaifest failed. Terminated." @@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then }& done wait + + for sub in train dev; do + mv data/manifest.${sub} data/manifest.${sub}.fmt + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + for sub in train dev; do + remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub} + done fi echo "LibriSpeech Data preparation done." 
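For reference, the new stage 3 added to `examples/librispeech/asr1/local/data.sh` above filters over-long utterances out of the formatted manifests (`data/manifest.train.fmt`, `data/manifest.dev.fmt`) before training, via `remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms}`. Below is a minimal sketch of that kind of filter, not the actual utility: the manifest field names (`duration`, `text`) and the frame computation are illustrative assumptions and may differ from what `remove_longshortdata.py` really does.

```python
# Hypothetical sketch of a long/short-utterance filter like the one invoked in stage 3.
# Assumptions: each manifest line is a JSON object with a "duration" (seconds) and a
# "text" transcript; frame count is approximated as duration * 1000 / stride_ms.
import json
import sys


def filter_manifest(in_path, out_path, maxframes=3000, maxchars=400, stride_ms=10):
    """Copy manifest entries whose frame and character counts are within the limits."""
    kept, dropped = 0, 0
    with open(in_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            record = json.loads(line)
            n_frames = record["duration"] * 1000.0 / stride_ms  # frames at a 10 ms hop
            n_chars = len(record["text"])
            if n_frames <= maxframes and n_chars <= maxchars:
                fout.write(line)
                kept += 1
            else:
                dropped += 1
    print(f"kept {kept}, dropped {dropped}", file=sys.stderr)


if __name__ == "__main__":
    filter_manifest("data/manifest.train.fmt", "data/manifest.train")
```

With `--maxframes 3000` and the script's `stride_ms=10`, this caps training utterances at roughly 30 seconds of audio, which keeps batch padding and memory use bounded.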
diff --git a/examples/librispeech/s1/local/download_lm_en.sh b/examples/librispeech/asr1/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s1/local/download_lm_en.sh rename to examples/librispeech/asr1/local/download_lm_en.sh diff --git a/examples/librispeech/s1/local/export.sh b/examples/librispeech/asr1/local/export.sh similarity index 100% rename from examples/librispeech/s1/local/export.sh rename to examples/librispeech/asr1/local/export.sh diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/asr1/local/test.sh similarity index 100% rename from examples/librispeech/s1/local/test.sh rename to examples/librispeech/asr1/local/test.sh diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh similarity index 100% rename from examples/librispeech/s1/local/test_hub.sh rename to examples/librispeech/asr1/local/test_hub.sh diff --git a/examples/librispeech/s1/local/train.sh b/examples/librispeech/asr1/local/train.sh similarity index 100% rename from examples/librispeech/s1/local/train.sh rename to examples/librispeech/asr1/local/train.sh diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/asr1/path.sh similarity index 100% rename from examples/librispeech/s1/path.sh rename to examples/librispeech/asr1/path.sh diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/asr1/run.sh similarity index 100% rename from examples/librispeech/s1/run.sh rename to examples/librispeech/asr1/run.sh diff --git a/examples/librispeech/s1/utils b/examples/librispeech/asr1/utils similarity index 100% rename from examples/librispeech/s1/utils rename to examples/librispeech/asr1/utils diff --git a/examples/librispeech/s2/.gitignore b/examples/librispeech/asr2/.gitignore similarity index 100% rename from examples/librispeech/s2/.gitignore rename to examples/librispeech/asr2/.gitignore diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/asr2/README.md similarity index 100% rename from examples/librispeech/s2/README.md rename to examples/librispeech/asr2/README.md diff --git a/examples/librispeech/s2/cmd.sh b/examples/librispeech/asr2/cmd.sh similarity index 100% rename from examples/librispeech/s2/cmd.sh rename to examples/librispeech/asr2/cmd.sh diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json similarity index 100% rename from examples/librispeech/s2/conf/augmentation.json rename to examples/librispeech/asr2/conf/augmentation.json diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode.yaml rename to examples/librispeech/asr2/conf/decode/decode.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_att.yaml rename to examples/librispeech/asr2/conf/decode/decode_att.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_ctc.yaml rename to examples/librispeech/asr2/conf/decode/decode_ctc.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_wo_lm.yaml 
rename to examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml diff --git a/examples/librispeech/s2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf similarity index 100% rename from examples/librispeech/s2/conf/fbank.conf rename to examples/librispeech/asr2/conf/fbank.conf diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/lm/transformer.yaml rename to examples/librispeech/asr2/conf/lm/transformer.yaml diff --git a/examples/librispeech/s2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf similarity index 100% rename from examples/librispeech/s2/conf/pitch.conf rename to examples/librispeech/asr2/conf/pitch.conf diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/transformer.yaml rename to examples/librispeech/asr2/conf/transformer.yaml diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/asr2/local/align.sh similarity index 100% rename from examples/librispeech/s2/local/align.sh rename to examples/librispeech/asr2/local/align.sh diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/asr2/local/cacu_perplexity.sh similarity index 100% rename from examples/librispeech/s2/local/cacu_perplexity.sh rename to examples/librispeech/asr2/local/cacu_perplexity.sh diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/asr2/local/data.sh similarity index 100% rename from examples/librispeech/s2/local/data.sh rename to examples/librispeech/asr2/local/data.sh diff --git a/examples/librispeech/s2/local/data_prep.sh b/examples/librispeech/asr2/local/data_prep.sh similarity index 100% rename from examples/librispeech/s2/local/data_prep.sh rename to examples/librispeech/asr2/local/data_prep.sh diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/asr2/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s2/local/download_lm_en.sh rename to examples/librispeech/asr2/local/download_lm_en.sh diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/asr2/local/espnet_json_to_manifest.py similarity index 100% rename from examples/librispeech/s2/local/espnet_json_to_manifest.py rename to examples/librispeech/asr2/local/espnet_json_to_manifest.py diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/asr2/local/export.sh similarity index 100% rename from examples/librispeech/s2/local/export.sh rename to examples/librispeech/asr2/local/export.sh diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/asr2/local/recog.sh similarity index 100% rename from examples/librispeech/s2/local/recog.sh rename to examples/librispeech/asr2/local/recog.sh diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/asr2/local/test.sh similarity index 100% rename from examples/librispeech/s2/local/test.sh rename to examples/librispeech/asr2/local/test.sh diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/asr2/local/train.sh similarity index 100% rename from examples/librispeech/s2/local/train.sh rename to examples/librispeech/asr2/local/train.sh diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/asr2/path.sh similarity index 100% rename from examples/librispeech/s2/path.sh rename to 
examples/librispeech/asr2/path.sh diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/asr2/run.sh similarity index 100% rename from examples/librispeech/s2/run.sh rename to examples/librispeech/asr2/run.sh diff --git a/examples/librispeech/s2/steps b/examples/librispeech/asr2/steps similarity index 100% rename from examples/librispeech/s2/steps rename to examples/librispeech/asr2/steps diff --git a/examples/librispeech/s2/utils b/examples/librispeech/asr2/utils similarity index 100% rename from examples/librispeech/s2/utils rename to examples/librispeech/asr2/utils diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index 09fd0c13..305add20 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -80,6 +80,6 @@ optional arguments: ## Pretrained Models Pretrained Models can be downloaded from links below. We provide 2 models with different configurations. -1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) +1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip) -2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) +2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip) diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 12e43e2e..8a43ecd9 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -79,7 +79,7 @@ optional arguments: ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. -Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. +Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. ```bash unzip waveflow_ljspeech_ckpt_0.3.zip ``` @@ -173,7 +173,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) +Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) TransformerTTS checkpoint contains files listed below. 
```text diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index cda53541..5bdaf4b8 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -87,7 +87,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_ljspeech_ckpt_0.5.zip ``` @@ -191,7 +191,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 09856c36..0d4e6c51 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -48,4 +48,4 @@ Synthesize waveform. 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). +Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 0506d5d8..24f6dbca 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -123,7 +123,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) +Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. 
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 0bf35e1f..85574260 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index f0bde77f..8e378ff0 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 6f9bc556..7387472d 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index d86c8c13..d58ca513 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -95,7 +95,7 @@ In `${BIN_DIR}/inference.py`: ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. -Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). +Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip). 
## References diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md index 5664b06b..6d6886da 100644 --- a/examples/ted_en_zh/README.md +++ b/examples/ted_en_zh/README.md @@ -1,3 +1,3 @@ # TED En -> Zh -* t0 for u2 speech translation +* st0 - conformer/transformer speech translation diff --git a/examples/ted_en_zh/st0/.gitignore b/examples/ted_en_zh/st0/.gitignore new file mode 100644 index 00000000..469c6171 --- /dev/null +++ b/examples/ted_en_zh/st0/.gitignore @@ -0,0 +1,3 @@ +TED-En-Zh +data +exp diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/st0/README.md similarity index 100% rename from examples/ted_en_zh/t0/README.md rename to examples/ted_en_zh/st0/README.md diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer.yaml rename to examples/ted_en_zh/st0/conf/transformer.yaml diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml rename to examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh similarity index 91% rename from examples/ted_en_zh/t0/local/data.sh rename to examples/ted_en_zh/st0/local/data.sh index b080a5b4..d3acbd44 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." 
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh similarity index 100% rename from examples/ted_en_zh/t0/local/test.sh rename to examples/ted_en_zh/st0/local/test.sh diff --git a/examples/ted_en_zh/t0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh similarity index 100% rename from examples/ted_en_zh/t0/local/train.sh rename to examples/ted_en_zh/st0/local/train.sh diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/st0/path.sh similarity index 100% rename from examples/ted_en_zh/t0/path.sh rename to examples/ted_en_zh/st0/path.sh diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/st0/run.sh similarity index 93% rename from examples/ted_en_zh/t0/run.sh rename to examples/ted_en_zh/st0/run.sh index ed9ab5f8..fb4bc338 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -22,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/t0/.gitignore deleted file mode 100644 index 123e5174..00000000 --- a/examples/ted_en_zh/t0/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -TED_EnZh -data -exp diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md index 7b3cc3d9..9a0026a0 100644 --- a/examples/thchs30/README.md +++ b/examples/thchs30/README.md @@ -1,3 +1,3 @@ # thchs30 -* a0 for mfa alignment +* align0 - mfa alignment diff --git a/examples/thchs30/a0/README.md b/examples/thchs30/align0/README.md similarity index 100% rename from examples/thchs30/a0/README.md rename to examples/thchs30/align0/README.md diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon similarity index 100% rename from examples/thchs30/a0/data/dict/syllable.lexicon rename to examples/thchs30/align0/data/dict/syllable.lexicon diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/align0/local/data.sh similarity index 100% rename from examples/thchs30/a0/local/data.sh rename to examples/thchs30/align0/local/data.sh diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py similarity index 100% rename from examples/thchs30/a0/local/gen_word2phone.py rename to examples/thchs30/align0/local/gen_word2phone.py diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py similarity index 100% rename from examples/thchs30/a0/local/reorganize_thchs30.py rename to examples/thchs30/align0/local/reorganize_thchs30.py diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/align0/path.sh similarity 
index 100% rename from examples/thchs30/a0/path.sh rename to examples/thchs30/align0/path.sh diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/align0/run.sh similarity index 100% rename from examples/thchs30/a0/run.sh rename to examples/thchs30/align0/run.sh diff --git a/examples/timit/README.md b/examples/timit/README.md index b7c8b754..51fcfd57 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,3 +1,7 @@ # TIMIT -* s1 u2 model with phone unit +asr model with phone unit + +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/timit/s1/.gitignore b/examples/timit/asr1/.gitignore similarity index 100% rename from examples/timit/s1/.gitignore rename to examples/timit/asr1/.gitignore diff --git a/examples/timit/s1/README.md b/examples/timit/asr1/README.md similarity index 100% rename from examples/timit/s1/README.md rename to examples/timit/asr1/README.md diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json similarity index 100% rename from examples/timit/s1/conf/augmentation.json rename to examples/timit/asr1/conf/augmentation.json diff --git a/examples/timit/s1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list similarity index 100% rename from examples/timit/s1/conf/dev_spk.list rename to examples/timit/asr1/conf/dev_spk.list diff --git a/examples/timit/s1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map similarity index 100% rename from examples/timit/s1/conf/phones.60-48-39.map rename to examples/timit/asr1/conf/phones.60-48-39.map diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/timit/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/timit/s1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list similarity index 100% rename from examples/timit/s1/conf/test_spk.list rename to examples/timit/asr1/conf/test_spk.list diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml similarity index 97% rename from examples/timit/s1/conf/transformer.yaml rename to examples/timit/asr1/conf/transformer.yaml index d3ced898..1d18468b 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -14,7 +14,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: "word" mean_std_filepath: "" - augmentation_config: "" + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/timit/s1/local/align.sh b/examples/timit/asr1/local/align.sh similarity index 100% rename from examples/timit/s1/local/align.sh rename to examples/timit/asr1/local/align.sh diff --git a/examples/timit/s1/local/data.sh b/examples/timit/asr1/local/data.sh similarity index 96% rename from examples/timit/s1/local/data.sh rename to examples/timit/asr1/local/data.sh index ad4ddde3..e588e48d 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/timit/s1/local/export.sh b/examples/timit/asr1/local/export.sh similarity index 100% rename from examples/timit/s1/local/export.sh rename to examples/timit/asr1/local/export.sh diff --git a/examples/timit/s1/local/test.sh b/examples/timit/asr1/local/test.sh similarity index 100% rename from examples/timit/s1/local/test.sh rename to examples/timit/asr1/local/test.sh diff --git a/examples/timit/s1/local/timit_data_prep.sh b/examples/timit/asr1/local/timit_data_prep.sh similarity index 100% rename from examples/timit/s1/local/timit_data_prep.sh rename to examples/timit/asr1/local/timit_data_prep.sh diff --git a/examples/timit/s1/local/timit_norm_trans.pl b/examples/timit/asr1/local/timit_norm_trans.pl similarity index 100% rename from examples/timit/s1/local/timit_norm_trans.pl rename to examples/timit/asr1/local/timit_norm_trans.pl diff --git a/examples/timit/s1/local/train.sh b/examples/timit/asr1/local/train.sh similarity index 100% rename from examples/timit/s1/local/train.sh rename to examples/timit/asr1/local/train.sh diff --git a/examples/timit/s1/path.sh b/examples/timit/asr1/path.sh similarity index 100% rename from examples/timit/s1/path.sh rename to examples/timit/asr1/path.sh diff 
--git a/examples/timit/s1/run.sh b/examples/timit/asr1/run.sh similarity index 100% rename from examples/timit/s1/run.sh rename to examples/timit/asr1/run.sh diff --git a/examples/tiny/README.md b/examples/tiny/README.md index 6766f59a..f36baae6 100644 --- a/examples/tiny/README.md +++ b/examples/tiny/README.md @@ -1,2 +1,3 @@ -* s0 for deepspeech2 -* s1 for U2 +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/tiny/s0/.gitignore b/examples/tiny/asr0/.gitignore similarity index 100% rename from examples/tiny/s0/.gitignore rename to examples/tiny/asr0/.gitignore diff --git a/examples/tiny/s0/README.md b/examples/tiny/asr0/README.md similarity index 100% rename from examples/tiny/s0/README.md rename to examples/tiny/asr0/README.md diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/asr0/conf/augmentation.json similarity index 100% rename from examples/tiny/s0/conf/augmentation.json rename to examples/tiny/asr0/conf/augmentation.json diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2.yaml rename to examples/tiny/asr0/conf/deepspeech2.yaml diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2_online.yaml rename to examples/tiny/asr0/conf/deepspeech2_online.yaml diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/asr0/local/data.sh similarity index 96% rename from examples/tiny/s0/local/data.sh rename to examples/tiny/asr0/local/data.sh index 711ebee4..f1fb8cb1 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -63,7 +63,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/asr0/local/download_lm_en.sh similarity index 100% rename from examples/tiny/s0/local/download_lm_en.sh rename to examples/tiny/asr0/local/download_lm_en.sh diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/asr0/local/export.sh similarity index 100% rename from examples/tiny/s0/local/export.sh rename to examples/tiny/asr0/local/export.sh diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/asr0/local/test.sh similarity index 100% rename from examples/tiny/s0/local/test.sh rename to examples/tiny/asr0/local/test.sh diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/asr0/local/train.sh similarity index 100% rename from examples/tiny/s0/local/train.sh rename to examples/tiny/asr0/local/train.sh diff --git a/examples/tiny/s0/path.sh b/examples/tiny/asr0/path.sh similarity index 100% rename from examples/tiny/s0/path.sh rename to examples/tiny/asr0/path.sh diff --git a/examples/tiny/s0/run.sh b/examples/tiny/asr0/run.sh similarity index 100% rename from examples/tiny/s0/run.sh rename to 
examples/tiny/asr0/run.sh diff --git a/examples/tiny/s1/.gitignore b/examples/tiny/asr1/.gitignore similarity index 100% rename from examples/tiny/s1/.gitignore rename to examples/tiny/asr1/.gitignore diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json similarity index 100% rename from examples/tiny/s1/conf/augmentation.json rename to examples/tiny/asr1/conf/augmentation.json diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_confermer.yaml rename to examples/tiny/asr1/conf/chunk_confermer.yaml index c5186669..6bed27f5 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_transformer.yaml rename to examples/tiny/asr1/conf/chunk_transformer.yaml index 29c30b26..7aed1b19 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml similarity index 98% rename from examples/tiny/s1/conf/conformer.yaml rename to examples/tiny/asr1/conf/conformer.yaml index 8487da77..2c09b3ae 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/tiny/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml similarity index 96% rename from examples/tiny/s1/conf/transformer.yaml rename to examples/tiny/asr1/conf/transformer.yaml index cc9b5c51..1378e848 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -11,11 +11,11 @@ data: max_output_input_ratio: 10.0 collator: - mean_std_filepath: "" + mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/asr1/local/align.sh similarity index 100% rename from examples/tiny/s1/local/align.sh rename to examples/tiny/asr1/local/align.sh diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/asr1/local/data.sh similarity index 96% rename from examples/tiny/s1/local/data.sh rename to examples/tiny/asr1/local/data.sh index b25f993f..87539d5e 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -69,7 +69,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/tiny/s1/local/export.sh b/examples/tiny/asr1/local/export.sh similarity index 100% rename from examples/tiny/s1/local/export.sh rename to examples/tiny/asr1/local/export.sh diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/asr1/local/test.sh similarity index 100% rename from examples/tiny/s1/local/test.sh rename to examples/tiny/asr1/local/test.sh diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/asr1/local/train.sh similarity index 100% rename from examples/tiny/s1/local/train.sh rename to examples/tiny/asr1/local/train.sh diff --git a/examples/tiny/s1/path.sh b/examples/tiny/asr1/path.sh similarity index 100% rename from examples/tiny/s1/path.sh rename to examples/tiny/asr1/path.sh diff --git a/examples/tiny/s1/run.sh b/examples/tiny/asr1/run.sh similarity index 100% rename from examples/tiny/s1/run.sh rename to examples/tiny/asr1/run.sh diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 334372f9..894d6b14 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. 
-Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it. ```bash unzip pwg_vctk_ckpt_0.5.zip ``` @@ -196,7 +196,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 5063b869..8692f010 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -127,7 +127,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md new file mode 100644 index 00000000..cbd01eb8 --- /dev/null +++ b/examples/wenetspeech/README.md @@ -0,0 +1,58 @@ +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) + +A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition + +## Description + +### Creation + +All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data. + +### Categories + +In summary, WenetSpeech groups all data into 3 categories, as the following table shows: + +| Set | Hours | Confidence | Usage | +|------------|-------|-------------|---------------------------------------| +| High Label | 10005 | >=0.95 | Supervised Training | +| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training | +| Unlabel | 9952 | / | Unsupervised training or Pre-training | +| In Total | 22435 | / | All above | + +### High Label Data + +We classify the high label into 10 groups according to its domain, speaking style, and scenarios. 
+ +| Domain | Youtube | Podcast | Total | +|-------------|---------|---------|--------| +| audiobook | 0 | 250.9 | 250.9 | +| commentary | 112.6 | 135.7 | 248.3 | +| documentary | 386.7 | 90.5 | 477.2 | +| drama | 4338.2 | 0 | 4338.2 | +| interview | 324.2 | 614 | 938.2 | +| news | 0 | 868 | 868 | +| reading | 0 | 1110.2 | 1110.2 | +| talk | 204 | 90.7 | 294.7 | +| variety | 603.3 | 224.5 | 827.8 | +| others | 144 | 507.5 | 651.5 | +| Total | 6113 | 3892 | 10005 | + +As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales. + +| Training Subsets | Confidence | Hours | +|------------------|-------------|-------| +| L | [0.95, 1.0] | 10005 | +| M | 1.0 | 1000 | +| S | 1.0 | 100 | + +### Evaluation Sets + +| Evaluation Sets | Hours | Source | Description | +|-----------------|-------|--------------|-----------------------------------------------------------------------------------------| +| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | +| TEST\_NET | 23 | Internet | Match test | +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore new file mode 100644 index 00000000..02a22922 --- /dev/null +++ b/examples/wenetspeech/asr1/.gitignore @@ -0,0 +1,3 @@ +data +exp +*.profile diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md new file mode 100644 index 00000000..c08b94e2 --- /dev/null +++ b/examples/wenetspeech/asr1/README.md @@ -0,0 +1,14 @@ +## Pack Model + +pack model to tar.gz, e.g. + +```bash +./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeec +h.pdparams + +``` + +show model.tar.gz +``` +tar tf model.tar.gz +``` diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md new file mode 100644 index 00000000..5c2b8143 --- /dev/null +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml 
new file mode 100644 index 00000000..0340dc85 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -0,0 +1,113 @@ +# network architecture +model: + # encoder related + encoder: conformer + encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.1 # second + max_input_len: 12.0 # second + min_output_len: 1.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +training: + n_epoch: 240 + accum_grad: 16 + global_grad_clip: 5.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh new file mode 100755 index 00000000..67b3d5a5 --- /dev/null +++ b/examples/wenetspeech/asr1/local/data.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +stage=-1 +stop_stage=100 + +# Use your own data path. You need to download the WenetSpeech dataset by yourself. +wenetspeech_data_dir=./wenetspeech +# Make sure you have 1.2T for ${shards_dir} +shards_dir=./wenetspeech_shards + +#wenetspeech training set +set=L +train_set=train_`echo $set | tr 'A-Z' 'a-z'` +dev_set=dev +test_sets="test_net test_meeting" + +cmvn=true +cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn + + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +set -u +set -o pipefail + + +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + # download data + echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data." + exit 0; +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "Data preparation" + local/wenetspeech_data_prep.sh \ + --train-subset $set \ + $wenetspeech_data_dir \ + data || exit 1; +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # generate manifests + python3 ${TARGET_DIR}/aishell/aishell.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${TARGET_DIR}/aishell" + + if [ $? -ne 0 ]; then + echo "Prepare Aishell failed. Terminated." + exit 1 + fi + + for dataset in train dev test; do + mv data/manifest.${dataset} data/manifest.${dataset}.raw + done +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # compute mean and stddev for normalizer + if $cmvn; then + full_size=`cat data/${train_set}/wav.scp | wc -l` + sampling_size=$((full_size / cmvn_sampling_divisor)) + shuf -n $sampling_size data/$train_set/wav.scp \ + > data/$train_set/wav.scp.sampled + num_workers=$(nproc) + + python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ + --manifest_path="data/manifest.train.raw" \ + --spectrum_type="fbank" \ + --feat_dim=80 \ + --delta_delta=false \ + --stride_ms=10 \ + --window_ms=25 \ + --sample_rate=16000 \ + --use_dB_normalization=False \ + --num_samples=-1 \ + --num_workers=${num_workers} \ + --output_path="data/mean_std.json" + + if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." 
+            exit 1
+        fi
+    fi
+fi
+
+dict=data/dict/lang_char.txt
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+        --unit_type="char" \
+        --count_threshold=0 \
+        --vocab_path="data/vocab.txt" \
+        --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for dataset in train dev test; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    } &
+    done
+    wait
+fi
+
+echo "WenetSpeech data preparation done."
+exit 0
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
new file mode 100644
index 00000000..0e1b2727
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+#                Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import sys
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""
+    This script is used to process raw json dataset of WenetSpeech,
+    where the long wav is split into segments and
+    data of wenet format is generated.
+ """) + parser.add_argument('input_json', help="""Input json file of WenetSpeech""") + parser.add_argument('output_dir', help="""Output dir for prepared data""") + + args = parser.parse_args() + return args + + +def meta_analysis(input_json, output_dir): + input_dir = os.path.dirname(input_json) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + try: + with open(input_json, 'r') as injson: + json_data = json.load(injson) + except Exception: + sys.exit(f'Failed to load input json file: {input_json}') + else: + if json_data['audios'] is not None: + with open(f'{output_dir}/text', 'w') as utt2text, \ + open(f'{output_dir}/segments', 'w') as segments, \ + open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ + open(f'{output_dir}/wav.scp', 'w') as wavscp, \ + open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ + open(f'{output_dir}/reco2dur', 'w') as reco2dur: + for long_audio in json_data['audios']: + try: + long_audio_path = os.path.realpath( + os.path.join(input_dir, long_audio['path'])) + aid = long_audio['aid'] + segments_lists = long_audio['segments'] + duration = long_audio['duration'] + assert (os.path.exists(long_audio_path)) + except AssertionError: + print(f'''Warning: {aid} something is wrong, + maybe AssertionError, skipped''') + continue + except Exception: + print(f'''Warning: {aid} something is wrong, maybe the + error path: {long_audio_path}, skipped''') + continue + else: + wavscp.write(f'{aid}\t{long_audio_path}\n') + reco2dur.write(f'{aid}\t{duration}\n') + for segment_file in segments_lists: + try: + sid = segment_file['sid'] + start_time = segment_file['begin_time'] + end_time = segment_file['end_time'] + dur = end_time - start_time + text = segment_file['text'] + segment_subsets = segment_file["subsets"] + except Exception: + print(f'''Warning: {segment_file} something + is wrong, skipped''') + continue + else: + utt2text.write(f'{sid}\t{text}\n') + segments.write( + f'{sid}\t{aid}\t{start_time}\t{end_time}\n') + utt2dur.write(f'{sid}\t{dur}\n') + segment_sub_names = " ".join(segment_subsets) + utt2subsets.write( + f'{sid}\t{segment_sub_names}\n') + + +def main(): + args = get_args() + + meta_analysis(args.input_json, args.output_dir) + + +if __name__ == '__main__': + main() diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py new file mode 100644 index 00000000..f1b9287e --- /dev/null +++ b/examples/wenetspeech/asr1/local/process_opus.py @@ -0,0 +1,99 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
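`extract_meta.py` expects each entry under the `audios` key of `WenetSpeech.json` to carry an `aid`, a relative `path`, a `duration` and a list of `segments`, each with `sid`, `begin_time`, `end_time`, `text` and `subsets`. A trimmed example of that layout (field values made up for illustration):

```python
# One long recording with a single segment, as consumed by meta_analysis().
sample = {
    "audios": [{
        "aid": "Y0000000000_example",
        "path": "audio/train/youtube/B00000/Y0000000000_example.opus",
        "duration": 2494.57,
        "segments": [{
            "sid": "Y0000000000_example_S00000",
            "begin_time": 20.08,
            "end_time": 24.40,
            "text": "example transcript",
            "subsets": ["L"],
        }],
    }]
}
# Each audio becomes one wav.scp/reco2dur line; each segment becomes one line
# in text, segments, utt2dur and utt2subsets.
```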
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# process_opus.py: segmentation and downsampling of opus audio +# usage: python3 process_opus.py wav.scp segments output_wav.scp +import os +import sys + +from pydub import AudioSegment + + +def read_file(wav_scp, segments): + wav_scp_dict = {} + with open(wav_scp, 'r', encoding='UTF-8') as fin: + for line_str in fin: + wav_id, path = line_str.strip().split() + wav_scp_dict[wav_id] = path + + utt_list = [] + seg_path_list = [] + start_time_list = [] + end_time_list = [] + with open(segments, 'r', encoding='UTF-8') as fin: + for line_str in fin: + arr = line_str.strip().split() + assert len(arr) == 4 + utt_list.append(arr[0]) + seg_path_list.append(wav_scp_dict[arr[1]]) + start_time_list.append(float(arr[2])) + end_time_list.append(float(arr[3])) + return utt_list, seg_path_list, start_time_list, end_time_list + + +# TODO(Qijie): Fix the process logic +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list): + num_utts = len(utt_list) + step = int(num_utts * 0.01) + with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + previous_wav_path = "" + for i in range(num_utts): + utt_id = utt_list[i] + current_wav_path = seg_path_list[i] + output_dir = (os.path.dirname(current_wav_path)) \ + .replace("audio", 'audio_seg') + seg_wav_path = os.path.join(output_dir, utt_id + '.wav') + + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + + if current_wav_path != previous_wav_path: + source_wav = AudioSegment.from_file(current_wav_path) + previous_wav_path = current_wav_path + + start = int(start_time_list[i] * 1000) + end = int(end_time_list[i] * 1000) + target_audio = source_wav[start:end].set_frame_rate(16000) + target_audio.export(seg_wav_path, format="wav") + + fout.write("{} {}\n".format(utt_id, seg_wav_path)) + if i % step == 0: + print("seg wav finished: {}%".format(int(i / step))) + + +def main(): + wav_scp = sys.argv[1] + segments = sys.argv[2] + output_wav_scp = sys.argv[3] + + utt_list, seg_path_list, start_time_list, end_time_list \ + = read_file(wav_scp, segments) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list) + + +if __name__ == '__main__': + main() diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh new file mode 100755 index 00000000..47bd2f63 --- /dev/null +++ b/examples/wenetspeech/asr1/local/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? 
-ne 0 ]; then +# exit 1 +#fi + + +for type in attention ctc_greedy_search; do + echo "decoding ${type}" + if [ ${chunk_mode} == true ];then + # stream decoding only support batchsize=1 + batch_size=1 + else + batch_size=64 + fi + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh new file mode 100755 index 00000000..85853053 --- /dev/null +++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Seasalt AI, Inc (Author: Guoguo Chen) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -o pipefail + +stage=1 +prefix= +train_subset=L + +. 
./tools/parse_options.sh || exit 1;
+
+filter_by_id () {
+    idlist=$1
+    input=$2
+    output=$3
+    field=1
+    if [ $# -eq 4 ]; then
+        field=$4
+    fi
+    cat $input | perl -se '
+        open(F, "<$idlist") || die "Could not open id-list file $idlist";
+        while(<F>) {
+            @A = split;
+            @A>=1 || die "Invalid id-list file line $_";
+            $seen{$A[0]} = 1;
+        }
+        while(<>) {
+            @A = split;
+            @A > 0 || die "Invalid file line $_";
+            @A >= $field || die "Invalid file line $_";
+            if ($seen{$A[$field-1]}) {
+                print $_;
+            }
+        }' -- -idlist="$idlist" -field="$field" > $output ||\
+    (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
+}
+
+subset_data_dir () {
+    utt_list=$1
+    src_dir=$2
+    dest_dir=$3
+    mkdir -p $dest_dir || exit 1;
+    # wav.scp text segments utt2dur
+    filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
+        (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
+    filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
+        (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
+    filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
+        (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
+    awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
+    filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
+        (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
+    rm -f $dest_dir/reco
+}
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
+    echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
+    echo ""
+    echo "This script takes the WenetSpeech source directory, and prepares the"
+    echo "WeNet format data directory."
+    echo "  --prefix <prefix>         # Prefix for output data directory."
+    echo "  --stage <stage>           # Processing stage."
+    echo "  --train-subset <L|M|S|W>  # Train subset to be created."
+    exit 1
+fi
+
+wenetspeech_dir=$1
+data_dir=$2
+
+declare -A subsets
+subsets=(
+    [L]="train_l"
+    [M]="train_m"
+    [S]="train_s"
+    [W]="train_w"
+    [DEV]="dev"
+    [TEST_NET]="test_net"
+    [TEST_MEETING]="test_meeting")
+
+prefix=${prefix:+${prefix}_}
+
+corpus_dir=$data_dir/${prefix}corpus/
+if [ $stage -le 1 ]; then
+    echo "$0: Extract meta into $corpus_dir"
+    # Sanity check.
+    [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
+        echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
+    [ ! -d $wenetspeech_dir/audio ] &&\
+        echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
+
+    [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
+
+    # Files to be created:
+    # wav.scp text segments utt2dur
+    python3 local/extract_meta.py \
+        $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+    echo "$0: Split data to train, dev, test_net, and test_meeting"
+    [ ! -f $corpus_dir/utt2subsets ] &&\
+        echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
+    for label in $train_subset DEV TEST_NET TEST_MEETING; do
+        if [ ! ${subsets[$label]+set} ]; then
+            echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
+        fi
+        subset=${subsets[$label]}
+        [ !
-d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset + cat $corpus_dir/utt2subsets | \ + awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \ + > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; + subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ + $corpus_dir $data_dir/${prefix}$subset || exit 1; + done +fi + +echo "$0: Done" \ No newline at end of file diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh new file mode 100644 index 00000000..666b29bc --- /dev/null +++ b/examples/wenetspeech/asr1/path.sh @@ -0,0 +1,15 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +# model exp +MODEL=u2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh new file mode 100644 index 00000000..8c4a12cb --- /dev/null +++ b/examples/wenetspeech/asr1/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +. path.sh || exit 1; +set -e + +gpus=0,1,2,3,4,5,6,7 +stage=0 +stop_stage=100 +conf_path=conf/conformer.yaml + +average_checkpoint=true +avg_num=10 + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') +echo "checkpoint name ${ckpt}" + +audio_file="data/tmp.wav" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi diff --git a/examples/wenetspeech/asr1/utils b/examples/wenetspeech/asr1/utils new file mode 120000 index 00000000..973afe67 --- /dev/null +++ b/examples/wenetspeech/asr1/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 177d710b..e827414d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): @paddle.no_grad() def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: from 
paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( batch_size=self.config.decoding.batch_size, @@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.report() def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): @@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_len_list = np.split(x_len_batch, batch_size, axis=0) for x, x_len in zip(x_list, x_len_list): - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() x_len = x_len[0] assert (chunk_size <= x_len) @@ -547,7 +547,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): probs_chunk_list = [] probs_chunk_lens_list = [] - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model preprocessing time self.autolog.times.stamp() @@ -606,7 +606,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): [output_probs, output_probs_padding], axis=1) output_probs_list.append(output_probs) output_lens_list.append(output_lens) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time @@ -641,12 +641,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): audio_len_handle.reshape(x_len.shape) audio_len_handle.copy_from_cpu(x_len) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() # record the prefix processing time self.autolog.times.stamp() self.predictor.run() - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 22d4238a..27bc47d2 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -24,13 +24,10 @@ import jsonlines import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader from yacs.config import CfgNode -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.io.sampler import SortagradBatchSampler -from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -213,7 +210,7 @@ class U2Trainer(Trainer): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += f" {k.split(',')[1]}" if len( - k.split(',')) == 2 else f"" + k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 @@ -249,92 +246,103 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - - 
collate_fn_train = SpeechCollator.from_config(config) - - config.collator.augmentation_config = "" - collate_fn_dev = SpeechCollator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.data.train_manifest, + train_mode=True, + sortagrad=False, batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.data.dev_manifest, + train_mode=False, + sortagrad=False, batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. - config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') - - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") def setup_model(self): config = self.config model_conf = config.model with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) @@ -343,6 +351,11 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return train_config = config.training optim_type = train_config.optim @@ -383,10 +396,9 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler - logger.info("Setup model/optimizer/lr_scheduler!") + logger.info("Setup optimizer/lr_scheduler!") class U2Tester(U2Trainer): @@ -421,14 +433,19 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.collator.unit_type, + vocab_filepath=self.config.collator.vocab_filepath, + spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def compute_metrics(self, @@ -444,12 +461,11 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - target_transcripts = self.ordid2token(texts, texts_len) + target_transcripts = self.id2token(texts, texts_len, self.text_feature) 
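The new `id2token` replaces `ordid2token` (which abused `chr()` on raw ids): token ids are now mapped back to text through `TextFeaturizer.defeaturize`. A simplified sketch of that lookup with a made-up character vocabulary (not the real `TextFeaturizer` API):

```python
def ids_to_text(ids, vocab_list, specials=("<blank>", "<unk>", "<eos>")):
    """Look token ids up in the vocabulary and join them into a transcript."""
    tokens = [vocab_list[i] for i in ids]
    return "".join(t for t in tokens if t not in specials)

vocab_list = ["<blank>", "<unk>", "h", "i", "<eos>"]
print(ids_to_text([2, 3], vocab_list))  # -> "hi"
```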
result_transcripts, result_tokenids = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -499,7 +515,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,8 +574,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -573,7 +588,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.collate_fn.feature_size + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 0d8508c2..d82034c8 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -392,6 +392,7 @@ class U2Tester(U2Trainer): unit_type=self.config.collator.unit_type, vocab_filepath=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -529,8 +530,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 13dc3a44..65dccad3 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -24,6 +24,8 @@ import soundfile import soxbindings as sox from scipy import signal +from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 from .utility import subfile_from_tar @@ -689,15 +691,7 @@ class AudioSegment(): Audio sample type is usually integer or float-point. Integers will be scaled to [-1, 1] in float32. """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. / 2**(bits - 1)) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples + return convert_samples_to_float32(samples) def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. @@ -708,20 +702,4 @@ class AudioSegment(): This is for writing a audio file. """ - dtype = np.dtype(dtype) - output_samples = samples.copy() - if dtype in np.sctypes['int']: - bits = np.iinfo(dtype).bits - output_samples *= (2**(bits - 1) / 1.) 
- min_val = np.iinfo(dtype).min - max_val = np.iinfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - elif samples.dtype in np.sctypes['float']: - min_val = np.finfo(dtype).min - max_val = np.finfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return output_samples.astype(dtype) + return convert_samples_from_float32(samples, dtype) diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7f3bd9e1..21f512e9 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -92,7 +92,9 @@ class TextFeaturizer(): tokens = self.tokenize(text) ids = [] for token in tokens: - token = token if token in self.vocab_dict else self.unk + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk ids.append(self.vocab_dict[token]) return ids diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 089890d2..703f2127 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -30,7 +30,8 @@ logger = Log(__name__).getlog() __all__ = [ "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", - "EOS", "UNK", "BLANK", "MASKCTC", "SPACE" + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" ] IGNORE_ID = -1 @@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str): else: raise ValueError(f"cmvn file type no support: {filetype}") return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." 
% samples.dtype) + return output_samples.astype(dtype) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index cb7349d0..5f233549 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -199,8 +199,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index c503107a..61eeb00f 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -103,7 +103,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest) @@ -188,34 +188,16 @@ class AudioDataset(Dataset): if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: - assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', - '.scp') - data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) + path_suffix = data[0]['feat'].split(':')[0].splitext()[-1] + assert path_suffix not in ('.ark', '.scp') + # m second to n frame + data = list( + map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms), + data)) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] - # with open(data_file, 'r') as f: - # for line in f: - # arr = line.strip().split('\t') - # if len(arr) != 7: - # continue - # key = arr[0].split(':')[1] - # tokenid = arr[5].split(':')[1] - # output_dim = int(arr[6].split(':')[1].split(',')[1]) - # if raw_wav: - # wav_path = ':'.join(arr[1].split(':')[1:]) - # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) - # data.append((key, wav_path, duration, tokenid)) - # else: - # feat_ark = ':'.join(arr[1].split(':')[1:]) - # feat_info = arr[2].split(':')[1].split(',') - # feat_dim = int(feat_info[1].strip()) - # num_frames = int(feat_info[0].strip()) - # data.append((key, feat_ark, num_frames, tokenid)) - # self.input_dim = feat_dim - # self.output_dim = output_dim - valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] @@ -223,17 +205,17 @@ class AudioDataset(Dataset): # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: - # logging.warn('ignore utterance {} feature {}'.format( - # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") self.minibatch = [] - num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) @@ -258,7 
+240,9 @@ class AudioDataset(Dataset): cur = end def __len__(self): + """number of example(batch)""" return len(self.minibatch) def __getitem__(self, idx): + """batch example of idx""" return self.minibatch[idx] diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index e810662d..38ff1396 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -18,8 +18,10 @@ import kaldiio import numpy as np import soundfile -from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation +from .utility import feat_type +from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation __all__ = ["LoadInputsAndTargets"] @@ -322,20 +324,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[-1].lower() - if suffix == 'ark': - return 'mat' - elif suffix == 'scp': - return 'scp' - elif suffix == 'npy': - return 'npy' - elif suffix == 'npz': - return 'npz' - elif suffix in ['wav', 'flac']: - # PCM16 - return 'sound' - else: - raise ValueError(f"Not support filetype: {suffix}") + return feat_type(filepath) class SoundHDF5File(): diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index 392031ba..1a90e3d0 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -17,7 +17,7 @@ import numpy as np from paddlespeech.s2t.utils.log import Log -__all__ = ["pad_list", "pad_sequence"] +__all__ = ["pad_list", "pad_sequence", "feat_type"] logger = Log(__name__).getlog() @@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray], out_tensor[:length, i, ...] = tensor return out_tensor + + +def feat_type(filepath): + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9977cecc..4f833372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if 'cmvn_file' in configs and configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file']: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( @@ -934,8 +934,8 @@ class U2Model(U2DecodeModel): DeepSpeech2Model: The model built from pretrained result. """ with UpdateConfig(config): - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size + config.input_dim = dataloader.feat_dim + config.output_dim = dataloader.vocab_size model = cls.from_config(config) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 80eaf975..3d5f8cd1 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 6e97f824..67f71b66 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 7601a5cc..7ec92554 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index b0ab869a..6b4d9591 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 4d516068..520b18de 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 9207658f..5d4e9175 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 0cde5b9f..5c8ba081 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 29d5a2d8..d39c0695 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 5750f5a0..c7d9bd45 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 6576cb92..d6b63761 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index 347264e9..e2619cd4 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 759bd540..99a8300f 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py index 4d2d2324..aa1e6b44 100644 --- a/paddlespeech/s2t/transform/cmvn.py +++ b/paddlespeech/s2t/transform/cmvn.py @@ -13,6 +13,7 @@ # limitations under the License. 
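The `GlobalCMVN` transform added to `transform/cmvn.py` below reads the JSON statistics written by `utils/compute_mean_std.py` (`frame_num`, `mean_stat`, `var_stat`) and turns them into a per-dimension mean and a floored standard deviation. A small sketch of that arithmetic with made-up numbers:

```python
import numpy as np

cmvn_stats = {"frame_num": 3, "mean_stat": [3.0, 6.0], "var_stat": [5.0, 14.0]}

count = cmvn_stats["frame_num"]
mean = np.array(cmvn_stats["mean_stat"]) / count           # E[x]
var = np.array(cmvn_stats["var_stat"]) / count - mean**2   # E[x^2] - E[x]^2
std = np.maximum(np.sqrt(var), 1.0e-20)                    # std_floor as in the class

feats = np.random.randn(100, 2)        # (time, dim)
normalized = (feats - mean) / std      # what __call__ applies per frame
```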
# Modified from espnet(https://github.com/espnet/espnet) import io +import json import h5py import kaldiio @@ -157,3 +158,40 @@ class UtteranceCMVN(): x = np.divide(x, std) return x + + +class GlobalCMVN(): + "Apply Global CMVN" + + def __init__(self, + cmvn_path, + norm_means=True, + norm_vars=True, + std_floor=1.0e-20): + self.cmvn_path = cmvn_path + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + + with open(cmvn_path) as f: + cmvn_stats = json.load(f) + self.count = cmvn_stats['frame_num'] + self.mean = np.array(cmvn_stats['mean_stat']) / self.count + self.square_sums = np.array(cmvn_stats['var_stat']) + self.var = self.square_sums / self.count - self.mean**2 + self.std = np.maximum(np.sqrt(self.var), self.std_floor) + + def __repr__(self): + return f"""{self.__class__.__name__}( + cmvn_path={self.cmvn_path}, + norm_means={self.norm_means}, + norm_vars={self.norm_vars},)""" + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + if self.norm_means: + x = np.subtract(x, self.mean) + + if self.norm_vars: + x = np.divide(x, self.std) + return x diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 153d494b..873adb0b 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,6 +16,7 @@ import librosa import numpy import scipy import soundfile +import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -82,7 +83,6 @@ class SpeedPerturbation(): def __call__(self, x, uttid=None, train=True): if not train: return x - x = x.astype(numpy.float32) if self.accept_uttid: ratio = self.utt2ratio[uttid] @@ -108,6 +108,110 @@ class SpeedPerturbation(): return y +class SpeedPerturbationSox(): + """SpeedPerturbationSox + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. + 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + tempo option: + sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9 + + speed option: + sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 + + If we use speed option like above, the pitch of audio also will be changed, + but the tempo option does not change the pitch. 
+ """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + sr=16000, + seed=None, ): + self.sr = sr + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return f"""{self.__class__.__name__}( + lower={self.lower}, + upper={self.upper}, + keep_length={self.keep_length}, + sample_rate={self.sr})""" + + else: + return f"""{self.__class__.__name__}( + utt2ratio={self.utt2ratio_file}, + sample_rate={self.sr})""" + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(ratio) + y = tfm.build_array(input_array=x, sample_rate_in=self.sr) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + + if y.ndim == 2 and x.ndim == 1: + # (T, C) -> (T) + y = y.sequence(1) + return y + + class BandpassPerturbation(): """BandpassPerturbation diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/s2t/transform/spec_augment.py index 83e4e2e7..5ce95085 100644 --- a/paddlespeech/s2t/transform/spec_augment.py +++ b/paddlespeech/s2t/transform/spec_augment.py @@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): :returns numpy.ndarray: time warped spectrogram (time, freq) """ window = max_time_warp + if window == 0: + return x + if mode == "PIL": t = x.shape[0] if t - window <= window: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index df3130da..da91ef92 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +from python_speech_features import logfbank def stft(x, @@ -304,3 +305,94 @@ class IStft(): win_length=self.win_length, window=self.window, center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_fft=512, # fft point + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + window="povey", + fmin=20, + fmax=None, + eps=1e-10, + dither=False): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + if n_shift > win_length: + raise ValueError("Stride size must not be greater than " + "window size.") + self.n_shift = n_shift / fs # unit: ms + self.win_length = win_length / fs # unit: ms + + self.window = window + self.fmin = fmin + if fmax is None: + fmax_ = fmax if fmax else self.fs / 2 + elif fmax > int(self.fs / 2): + raise ValueError("fmax must 
not be greater than half of " + "sample rate.") + self.fmax = fmax_ + + self.eps = eps + self.remove_dc_offset = True + self.preemph = 0.97 + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + preemph=self.preemph, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, + dither=self.dither, )) + + def __call__(self, x): + """ + + Args: + x (np.ndarray): shape (Ti,) + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + + if x.dtype in np.sctypes['float']: + # PCM32 -> PCM16 + bits = np.iinfo(np.int16).bits + x = x * 2**(bits - 1) + + # logfbank need PCM16 input + y = logfbank( + signal=x, + samplerate=self.fs, + winlen=self.win_length, # unit ms + winstep=self.n_shift, # unit ms + nfilt=self.n_mels, + nfft=self.n_fft, + lowfreq=self.fmin, + highfreq=self.fmax, + dither=self.dither, + remove_dc_offset=self.remove_dc_offset, + preemph=self.preemph, + wintype=self.window) + return y diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 1aee4b36..381b0cdc 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -45,7 +45,8 @@ import_alias = dict( stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", -) + fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", + cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN") class Transformation(): diff --git a/tests/chains/speedyspeech/prepare.sh b/tests/chains/speedyspeech/prepare.sh index fb6ef285..1ddcd677 100755 --- a/tests/chains/speedyspeech/prepare.sh +++ b/tests/chains/speedyspeech/prepare.sh @@ -32,7 +32,7 @@ trainer_list=$(func_parser_value "${lines[14]}") # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip pwg_baker_ckpt_0.4.zip) # download data rm -rf ./train_data/mini_BZNSYP @@ -40,7 +40,7 @@ if [ ${MODE} = "lite_train_infer" ];then cd ./train_data/ && tar xzf mini_BZNSYP.tar.gz cd ../ elif [ ${MODE} = "whole_train_infer" ];then - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip speedyspeech_nosil_baker_ckpt_0.5.zip && unzip pwg_baker_ckpt_0.4.zip) rm -rf ./train_data/processed_BZNSYP diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 296d272a..e47554dc 100755 --- a/utils/compute_mean_std.py +++ 
@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
         choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim', int, 13, "Audio feature dim.")
 add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0, "stride length in ms.")
-add_arg('window_ms', float, 20.0, "stride length in ms.")
+add_arg('stride_ms', int, 10, "stride length in ms.")
+add_arg('window_ms', int, 20, "window length in ms.")
 add_arg('sample_rate', int, 16000, "target sample rate.")
 add_arg('use_dB_normalization', bool, True, "do dB normalization.")
 add_arg('target_dB', int, -20, "target dB.")
@@ -61,8 +61,8 @@ def main():
         spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
-        stride_ms=args.stride_ms,
-        window_ms=args.window_ms,
+        stride_ms=float(args.stride_ms),
+        window_ms=float(args.window_ms),
         n_fft=None,
         max_freq=None,
         target_sample_rate=args.sample_rate,
diff --git a/utils/format_data.py b/utils/format_data.py
index 6fe36997..2fa1924a 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -20,13 +20,13 @@ import json
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.json',
         "Filepath of cmvn.")
@@ -62,27 +62,76 @@ def main():
     vocab_size = text_feature.vocab_size
     print(f"Vocab size: {vocab_size}")

+    # each output line is a JSON object (jsonlines format) like this:
+    # {
+    #     "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+    #     "output": [{"name": "target1", "shape": (40, 5002), "text": "a b c de"}],
+    #     "utt2spk": "111-2222",
+    #     "utt": "111-2222-333"
+    # }
     count = 0
     for manifest_path in args.manifest_paths:
         manifest_jsons = read_manifest(manifest_path)
         for line_json in manifest_jsons:
+            output_json = {
+                "input": [],
+                "output": [],
+                'utt': line_json['utt'],
+                'utt2spk': line_json.get('utt2spk', 'global'),
+            }
+
+            # output
             line = line_json['text']
-            tokens = text_feature.tokenize(line)
-            tokenids = text_feature.featurize(line)
-            line_json['token'] = tokens
-            line_json['token_id'] = tokenids
-            line_json['token_shape'] = (len(tokenids), vocab_size)
-            feat_shape = line_json['feat_shape']
-            assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-            if args.feat_type == 'raw':
-                feat_shape.append(feat_dim)
-                line_json['filetype'] = 'sound'
-            else:  # kaldi
-                raise NotImplementedError('no support kaldi feat now!')
-            fout.write(json.dumps(line_json) + '\n')
+            if isinstance(line, str):
+                # only one target
+                tokens = text_feature.tokenize(line)
+                tokenids = text_feature.featurize(line)
+                output_json['output'].append({
+                    'name': 'target1',
+                    'shape': (len(tokenids), vocab_size),
+                    'text': line,
+                    'token': ' '.join(tokens),
+                    'tokenid': ' '.join(map(str, tokenids)),
+                })
+            else:
+                # isinstance(line, list), multiple targets in one vocab
+                for i, item in enumerate(line, 1):
+                    tokens = text_feature.tokenize(item)
+                    tokenids = text_feature.featurize(item)
+                    output_json['output'].append({
+                        'name': f'target{i}',
+                        'shape': (len(tokenids), vocab_size),
+                        'text': item,
+                        'token': ' '.join(tokens),
+                        'tokenid': ' '.join(map(str, tokenids)),
+                    })
+
+            # input
+            line = line_json['feat']
+            if isinstance(line, str):
+                # only one input
+                feat_shape = line_json['feat_shape']
+                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+                filetype = feat_type(line)
+                if filetype == 'sound':
+                    feat_shape.append(feat_dim)
+                else:  # kaldi
+                    raise NotImplementedError('no support kaldi feat now!')
+
+                output_json['input'].append({
+                    "name": "input1",
+                    "shape": feat_shape,
+                    "feat": line,
+                    "filetype": filetype,
+                })
+            else:
+                # isinstance(line, list), multiple inputs
+                raise NotImplementedError("not support multi input now!")
+
+            fout.write(json.dumps(output_json) + '\n')
             count += 1
-    print(f"Examples number: {count}")
+    print(f"{args.manifest_paths} Examples number: {count}")
     fout.close()
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index 79b3d2cb..e0b5ece3 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -20,13 +20,13 @@ import json
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.json',
         "Filepath of cmvn.")
@@ -79,9 +79,11 @@ def main():
             line_json['token1'] = tokens
             line_json['token_id1'] = tokenids
             line_json['token_shape1'] = (len(tokenids), vocab_size)
+
             feat_shape = line_json['feat_shape']
             assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-            if args.feat_type == 'raw':
+            filetype = feat_type(line_json['feat'])
+            if filetype == 'sound':
                 feat_shape.append(feat_dim)
             else:  # kaldi
                 raise NotImplementedError('no support kaldi feat now!')
diff --git a/utils/pack_model.sh b/utils/pack_model.sh
new file mode 100755
index 00000000..8acd59a6
--- /dev/null
+++ b/utils/pack_model.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+[ -f ./path.sh ] && . ./path.sh
+
+results=""
+# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
+#        exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"
+lm=""
+dict=""
+etc=""
+outfile="model"
+preprocess_conf=""
+
+help_message=$(cat <<EOF
+Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example:
+<lm>: exp/train_rnnlm/rnnlm.model.best
+<dict>: data/lang_char
+<tr_conf>: conf/train.yaml
+<dec_conf>: conf/decode.yaml
+<cmvn>: data/tr_it/cmvn.ark
+<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best
+EOF
+)
+
+. utils/parse_options.sh
+
+if [ $# != 4 ]; then
+    echo "${help_message}"
+    exit 1
+fi
+
+tr_conf=$1
+dec_conf=$2
+cmvn=$3
+e2e=$4
+
+echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
+echo " - model link: (put the model link manually.)"
+
+# configs
+if [ -e ${tr_conf} ]; then
+    tar cfh ${outfile}.tar ${tr_conf}
+    echo -n " - training config file: \`"
+    echo ${tr_conf} | sed -e "s/$/\`/"
+else
+    echo "missing ${tr_conf}"
+    exit 1
+fi
+if [ -e ${dec_conf} ]; then
+    tar rfh ${outfile}.tar ${dec_conf}
+    echo -n " - decoding config file: \`"
+    echo ${dec_conf} | sed -e "s/$/\`/"
+else
+    echo "missing ${dec_conf}"
+    exit 1
+fi
+# NOTE(kan-bayashi): preprocess conf is optional
+if [ -n "${preprocess_conf}" ]; then
+    tar rfh ${outfile}.tar ${preprocess_conf}
+    echo -n " - preprocess config file: \`"
+    echo ${preprocess_conf} | sed -e "s/$/\`/"
+fi
+
+# cmvn
+if [ -e ${cmvn} ]; then
+    tar rfh ${outfile}.tar ${cmvn}
+    echo -n " - cmvn file: \`"
+    echo ${cmvn} | sed -e "s/$/\`/"
+else
+    echo "missing ${cmvn}"
+    exit 1
+fi
+
+# e2e
+if [ -e ${e2e} ]; then
+    tar rfh ${outfile}.tar ${e2e}
+    echo -n " - e2e file: \`"
+    echo ${e2e} | sed -e "s/$/\`/"
+
+    e2e_conf=$(dirname ${e2e})/model.json
+    if [ ! -e ${e2e_conf} ]; then
+        echo missing ${e2e_conf}
+        #exit 1
+    else
+        echo -n " - e2e JSON file: \`"
+        echo ${e2e_conf} | sed -e "s/$/\`/"
+        tar rfh ${outfile}.tar ${e2e_conf}
+    fi
+else
+    echo "missing ${e2e}"
+    exit 1
+fi
+
+# lm
+if [ -n "${lm}" ]; then
+    if [ -e ${lm} ]; then
+        tar rfh ${outfile}.tar ${lm}
+        echo -n " - lm file: \`"
+        echo ${lm} | sed -e "s/$/\`/"
+
+        lm_conf=$(dirname ${lm})/model.json
+        if [ ! -e ${lm_conf} ]; then
+            echo missing ${lm_conf}
+            exit 1
+        else
+            echo -n " - lm JSON file: \`"
+            echo ${lm_conf} | sed -e "s/$/\`/"
+            tar rfh ${outfile}.tar ${lm_conf}
+        fi
+    else
+        echo "missing ${lm}"
+        exit 1
+    fi
+fi
+
+# dict
+if [ -n "${dict}" ]; then
+    if [ -e ${dict} ]; then
+        tar rfh ${outfile}.tar ${dict}
+        echo -n " - dict file: \`"
+        echo ${dict} | sed -e "s/$/\`/"
+    else
+        echo "missing ${dict}"
+        exit 1
+    fi
+fi
+
+# etc
+for x in ${etc}; do
+    if [ -e ${x} ]; then
+        tar rfh ${outfile}.tar ${x}
+        echo -n " - etc file: \`"
+        echo ${x} | sed -e "s/$/\`/"
+    else
+        echo "missing ${x}"
+        exit 1
+    fi
+done
+
+# finally compress the tar file
+gzip -f ${outfile}.tar
+
+# results
+if [ -n "${results}" ]; then
+    echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
+    echo "\`\`\`"
+fi
+for x in ${results}; do
+    if [ -e ${x} ]; then
+        echo "${x}"
+        grep -e Avg -e SPKR -m 2 ${x}
+    else
+        echo "missing ${x}"
+        exit 1
+    fi
done
+if [ -n "${results}" ]; then
+    echo "\`\`\`"
+fi
+
+exit 0
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
new file mode 100755
index 00000000..131b4a58
--- /dev/null
+++ b/utils/remove_longshortdata.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""Remove utterances that are too long or too short from a formatted manifest."""
+import argparse
+import logging
+
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format: one JSON object (jsonlines format) per line, like this
+# {
+#     "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+#     "output": [{"name": "target1", "shape": (40, 5002), "text": "a b c de"}],
+#     "utt2spk": "111-2222",
+#     "utt": "111-2222-333"
+# }
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="remove too long or too short utterances from a formatted manifest",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--iaxis",
+        default=0,
+        type=int,
+        help="multi inputs index, 0 is the first")
+    parser.add_argument(
+        "--oaxis",
+        default=0,
+        type=int,
+        help="multi outputs index, 0 is the first")
+    parser.add_argument("--maxframes", default=2000, type=int, help="max frames")
+    parser.add_argument("--minframes", default=10, type=int, help="min frames")
+    parser.add_argument("--maxchars", default=200, type=int, help="max characters in transcript")
+    parser.add_argument("--minchars", default=0, type=int, help="min characters in transcript")
+    parser.add_argument(
+        "--stride_ms", default=10, type=int, help="stride in ms unit.")
+    parser.add_argument(
+        "rspecifier",
+        type=str,
+        help="jsonl format manifest. e.g. manifest.jsonl")
+    parser.add_argument(
+        "wspecifier_or_wxfilename",
+        type=str,
+        help="Write specifier. e.g. manifest.jsonl")
+    return parser
+
+
+def filter_input(args, line):
+    tmp = line['input'][args.iaxis]
+    if args.sound:
+        # convert duration in seconds to number of frames
+        nframe = tmp['shape'][0] * 1000 / args.stride_ms
+    else:
+        nframe = tmp['shape'][0]
+
+    if nframe < args.minframes or nframe > args.maxframes:
+        return True
+    else:
+        return False
+
+
+def filter_output(args, line):
+    nchars = len(line['output'][args.oaxis]['text'])
+    if nchars < args.minchars or nchars > args.maxchars:
+        return True
+    else:
+        return False
+
+
+def main():
+    args = get_parser().parse_args()
+
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose > 0:
+        logging.basicConfig(level=logging.INFO, format=logfmt)
+    else:
+        logging.basicConfig(level=logging.WARN, format=logfmt)
+    logging.info(get_commandline_args())
+
+    with jsonlines.open(args.rspecifier, 'r') as reader:
+        lines = list(reader)
+    logging.info(f"Examples: {len(lines)}")
+    feat = lines[0]['input'][args.iaxis]['feat']
+    args.sound = False
+    if feat.split('.')[-1].split(':')[0] not in ('ark', 'scp'):
+        args.sound = True  # raw sound input: shape[0] is the duration in seconds
+
+    count = 0
+    filtered = 0
+    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+        for line in lines:
+            if filter_input(args, line) or filter_output(args, line):
+                filtered += 1
+                continue
+            writer.write(line)
+            count += 1
+    logging.info(f"Examples after filter: {count}/{filtered}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/show_results.sh b/utils/show_results.sh
new file mode 100755
index 00000000..42f80ee6
--- /dev/null
+++ b/utils/show_results.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+mindepth=0
+maxdepth=1
+
+. utils/parse_options.sh
+
+if [ $# -gt 1 ]; then
+    echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
+    echo ""
+    echo "Show the system environments and the evaluation results in Markdown format."
+    echo 'The default of <exp> is "exp/".'
+    exit 1
+fi
+
+[ -f ./path.sh ] && . ./path.sh
+set -euo pipefail
+if [ $# -eq 1 ]; then
+    exp=$1
+else
+    exp=exp
+fi
+
+
+cat << EOF
+
+# RESULTS
+## Environments
+- date: \`$(LC_ALL=C date)\`
+EOF
+
+python3 << EOF
+import sys, paddle
+pyversion = sys.version.replace('\n', ' ')
+
+print(f"""- python version: \`{pyversion}\`
+- paddle version: \`paddle {paddle.__version__}\`""")
+EOF
+
+cat << EOF
+- Git hash: \`$(git rev-parse HEAD)\`
+  - Commit date: \`$(git log -1 --format='%cd')\`
+
+EOF
+
+while IFS= read -r expdir; do
+    if ls ${expdir}/decode_*/result.txt &> /dev/null; then
+        # 1. Show the result table for CER
+        cat << EOF
+## $(basename ${expdir})
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+        grep -e Avg ${expdir}/decode_*/result.txt \
+            | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
+            | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+        echo
+
+        # 2. Show the result table for WER
+        if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
+            cat << EOF
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+            grep -e Avg ${expdir}/decode_*/result.wrd.txt \
+                | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
+                | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+            echo
+        fi
+    fi
+done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)
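For reference, a minimal usage sketch of the three utilities added above; the manifest, config, and checkpoint paths (`data/manifest.train.jsonl`, `data/mean_std.json`, `exp/train/checkpoints/avg_30.pdparams`, etc.) are illustrative placeholders, not files created by this patch:

```shell
# drop utterances that are too long or too short from a formatted jsonline manifest
python3 utils/remove_longshortdata.py \
    --maxframes 2000 --minframes 10 \
    --maxchars 200 --minchars 0 \
    data/manifest.train.jsonl data/manifest.train.filtered.jsonl

# pack a trained model together with its configs and cmvn stats into model.tar.gz
./utils/pack_model.sh --dict data/lang_char \
    conf/train.yaml conf/decode.yaml data/mean_std.json exp/train/checkpoints/avg_30.pdparams

# render decoding results under exp/ as a Markdown report
./utils/show_results.sh exp > RESULTS.md
```

`show_results.sh` writes the Markdown report to stdout, so it can be redirected straight into a `RESULTS.md` file.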