diff --git a/README.md b/README.md index 811fde50e..42d24df62 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,8 @@ paddlespeech cls --input input.wav paddlespeech asr --lang zh --input input_16k.wav ``` **Speech Translation** (English to Chinese) + +(not support for Windows now) ```shell paddlespeech st --input input_16k.wav ``` @@ -170,7 +172,8 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! ``` - web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: https://huggingface.co/spaces/akhaliq/paddlespeech -If you want to try more functions like training and tuning, please have a look at documents of [Speech-to-Text](./docs/source/asr/quick_start.md) and [Text-to-Speech](./docs/source/tts/quick_start.md). + +If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). ## Model List @@ -258,15 +261,15 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle - - - - + + + + - +
Text-to-Speech Module Type Model Type Dataset Link Text-to-Speech Module Type Model Type Dataset Link
Text Frontend Text Frontend tn / g2p @@ -352,10 +355,10 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle - - - - + + + + diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py index b8758c9a7..85f478c20 100644 --- a/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -25,6 +25,7 @@ import os from pathlib import Path import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py index 32dc119d2..7431fc083 100644 --- a/dataset/aishell/aishell.py +++ b/dataset/aishell/aishell.py @@ -25,6 +25,7 @@ import os from pathlib import Path import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py index 0c779696d..69f0db599 100644 --- a/dataset/librispeech/librispeech.py +++ b/dataset/librispeech/librispeech.py @@ -27,6 +27,7 @@ import os from multiprocessing.pool import Pool import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py index d96b5d64d..730c73a8b 100644 --- a/dataset/mini_librispeech/mini_librispeech.py +++ b/dataset/mini_librispeech/mini_librispeech.py @@ -26,6 +26,7 @@ import os from multiprocessing.pool import Pool import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py index dc237c30a..2ac701bed 100644 --- a/dataset/musan/musan.py +++ b/dataset/musan/musan.py @@ -28,6 +28,7 @@ import json import os import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py index 0e055f17b..e7b122890 100644 --- a/dataset/rir_noise/rir_noise.py +++ b/dataset/rir_noise/rir_noise.py @@ -28,6 +28,7 @@ import json import os import soundfile + from utils.utility import download from utils.utility import unzip diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py index 879ed58db..cdfc0a75c 100644 --- a/dataset/thchs30/thchs30.py +++ b/dataset/thchs30/thchs30.py @@ -26,6 +26,7 @@ from multiprocessing.pool import Pool from pathlib import Path import soundfile + from utils.utility import download from utils.utility import unpack diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py index d03c48a1e..c4a9f0663 100644 --- a/dataset/timit/timit.py +++ b/dataset/timit/timit.py @@ -27,6 +27,7 @@ import string from pathlib import Path import soundfile + from utils.utility import unzip URL_ROOT = "" diff --git a/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py index c388f4491..373791bff 100644 --- a/dataset/voxforge/voxforge.py +++ b/dataset/voxforge/voxforge.py @@ -27,6 +27,7 @@ import shutil import subprocess import soundfile + from utils.utility import download_multi from utils.utility import getfile_insensitive from utils.utility import unpack diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index 8bb322c52..caca05dd1 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -19,7 +19,7 @@ Here are sample files for this demo that can be downloaded: wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav 
``` -### 3. Usage +### 3. Usage (not supported on Windows yet) - Command Line (Recommended) ```bash paddlespeech st --input ./en.wav diff --git a/docs/source/install.md b/docs/source/install.md index 3eb175322..a976674d3 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -1,8 +1,8 @@ # Installation -There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, the 3 ways can be divided into `Easy`, `Medium` and `Hard`. +There are 3 ways to use `PaddleSpeech`. In order of increasing difficulty, they are **Easy**, **Medium** and **Hard**. -## Easy: Get the Basic Funcition Without Your Own Mechine -If you are a newer of `PaddleSpeech` and want to experience it easily without your own mechine. We recommand you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step tutorial for `PaddleSpeech` and you can use the basic function of `PaddleSpeech` with a free machine. +## Easy: Get the Basic Function without Your Own Machine +If you are new to `PaddleSpeech` and want to try it easily without your own machine, we recommend using [AI Studio](https://aistudio.baidu.com/aistudio/index). There is a step-by-step tutorial for `PaddleSpeech`, and you can use the basic functions of `PaddleSpeech` on a free machine. ## Prerequisites for Medium and Hard - Python >= 3.7 @@ -10,11 +10,11 @@ If you are a newer of `PaddleSpeech` and want to experience it easily without yo - Only Linux is supported - Tip: Do not use the command `sh` instead of `bash` -## Medium: Get the Basic Funciton on Your Mechine -If you want to install `paddlespeech` on your own mechine. There are 3 steps you need to do. +## Medium: Get the Basic Function on Your Machine +If you want to install `paddlespeech` on your own machine, there are 3 steps you need to follow. -### Install the Conda -Conda is environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to select a version (py>=3.7) and install it by yourself or you can use the following command: +### Install Conda +Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version (py>=3.7) and install it yourself, or you can use the following commands: ```bash # download the miniconda wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -35,7 +35,14 @@ conda activate tools/venv ``` Install conda dependencies for `paddlespeech` : ```bash -conda install -y -c conda-forge sox libsndfile swig bzip2 gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 +conda install -y -c conda-forge sox libsndfile swig bzip2 +``` +Do not forget to install `gcc` and `gxx` on your system. +If you use Linux, you can use the command below to install them. + +(Tip: Do not use this command if you plan to install via the **Hard** way): +``` +conda install -y -c conda-forge gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 +``` ### Install PaddlePaddle For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: @@ -43,30 +50,27 @@ For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: python3 -m pip install paddlepaddle-gpu==2.2.0 ``` ### Install PaddleSpeech -To Install `paddlespeech`, there are two methods. You can use the following command: +There are two ways to install `paddlespeech`. You can use the following command: ```bash pip install paddlespeech ``` -If you install `paddlespeech` by `pip`, you can use it to help you build your own model.
However, you can not use the `ready-made `examples in paddlespeech. +If you install `paddlespeech` by `pip`, you can use it to help you build your own model. However, you cannot use the `ready-made` examples in paddlespeech. -If you want to use the` ready-made `examples in `paddlespeech`, you need to clone this repository and install `paddlespeech` by the foll +If you want to use the `ready-made` examples in `paddlespeech`, you need to clone this repository and install `paddlespeech` with the following commands: ```bash git clone https://github.com/PaddlePaddle/PaddleSpeech.git cd PaddleSpeech pip install . ``` -## Hard: Get the Full Funciton on Your Mechine +## Hard: Get the Full Function on Your Machine ### Prerequisites - choice 1: working with `Ubuntu` Docker Container. - - or - - choice 2: working on `Ubuntu` with `root` privilege. -To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise If you work on `Ubuntu` with `root` privilege, you can skip the next step. +To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise, if you work on `Ubuntu` with `root` privilege, you can skip the next step. ### Choice 1: Running in Docker Container (Recommended) -Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. +Docker is an open-source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided on [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires NVIDIA GPU support, so please make sure a GPU is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.
Take several steps to launch the Docker image: - Download the Docker image @@ -115,7 +119,7 @@ For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: ```bash python3 -m pip install paddlepaddle-gpu==2.2.0 ``` -### Get the Funcition for Developing PaddleSpeech +### Get the Function for Developing PaddleSpeech ```bash pip install .[develop] ``` diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 58650e593..91ef6d166 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -2,32 +2,31 @@ ## Speech-to-Text Models -### Acoustic Model Released in paddle 2.X -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link +### Speech Recognition Model +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0) -[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) -[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) -[Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) - - -### Acoustic Model Transformed from paddle 1.8 -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :--------- -[Ds2 Offline Aishell 
model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h| -[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h| -[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h| - -### Language Model Released +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) +[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) +[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) +[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0538 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) +[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2) + +### Language Model based on NGram Language Model | Training Data | Token-based | Size | Descriptions :-------------:| :------------:| :-----: | -----: | :----------------- [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | 
Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings +### Speech Translation Models + +| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | +| ------------------------------------------------------------ | ------------- | ----------- | ---- | ------------------------------------------------------------ | ----- | ------------------------------------------------------------ | +| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | + ## Text-to-Speech Models @@ -69,8 +68,10 @@ PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[panns_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz), [panns_cnn10](https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz), [panns_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz) -## Speech Translation Models +## Speech Recognition Model from paddle 1.8 -Model Type | Dataset| Example Link | Pretrained Models | Model Size -:-------------:| :------------:| :-----: | :-----: | :-----: -FAT-ST | TED En-Zh |[FAT + Transformer+ASR MTL](./examples/ted_en_zh/st1)|[fat_st_ted-en-zh.tar.gz](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | 50.26M +| Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | +| :----------------------------------------------------------: | :----------------------------: | :---------: | -----: | :------------------------------------------------- | :----- | :----- | :-------------- | +| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h | +| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h | +| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h | diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index 4c2f86b14..ca2fd98e4 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -455,6 +455,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog CSMSC(Chinese)

+
Task Dataset Model Type Link Task Dataset Model Type Link
@@ -634,6 +635,106 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
Text
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FastSpeech2-Conformer + ParallelWaveGAN
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +


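The demo page edited above showcases samples from a two-stage pipeline: an acoustic model turns text into a mel spectrogram, and a vocoder turns the spectrogram into a waveform. Below is a minimal, illustrative sketch of that flow. The three stand-in functions are hypothetical placeholders, not PaddleSpeech APIs; they only mirror the shape of the frontend → `am_inference` → `voc_inference` calls that appear in `paddlespeech/cli/tts/infer.py` later in this patch.

```python
# Illustrative two-stage TTS flow (text -> mel spectrogram -> waveform).
# All three functions are hypothetical stand-ins, not PaddleSpeech code.
import numpy as np


def text_to_phone_ids(text: str) -> np.ndarray:
    # Stand-in for the text frontend (g2p): map each character to a fake phone id.
    return np.array([ord(ch) % 100 for ch in text], dtype=np.int64)


def acoustic_model(phone_ids: np.ndarray) -> np.ndarray:
    # Stand-in for an acoustic model such as FastSpeech2:
    # phone ids -> mel spectrogram frames of shape (T, 80).
    rng = np.random.default_rng(0)
    return rng.standard_normal((len(phone_ids) * 4, 80)).astype(np.float32)


def vocoder(mel: np.ndarray) -> np.ndarray:
    # Stand-in for a vocoder such as ParallelWaveGAN:
    # mel frames -> waveform samples, assuming a hop size of 256.
    rng = np.random.default_rng(1)
    return rng.standard_normal(mel.shape[0] * 256).astype(np.float32)


if __name__ == "__main__":
    phone_ids = text_to_phone_ids("hello paddle")
    mel = acoustic_model(phone_ids)   # text-to-spectrogram stage
    wav = vocoder(mel)                # spectrogram-to-waveform stage
    print(phone_ids.shape, mel.shape, wav.shape)
```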
diff --git a/examples/aishell/asr1/READEME.md b/examples/aishell/asr1/README.md similarity index 100% rename from examples/aishell/asr1/READEME.md rename to examples/aishell/asr1/README.md diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md index 1aba73d1c..d5f5a9a46 100644 --- a/examples/librispeech/asr1/RESULTS.md +++ b/examples/librispeech/asr1/RESULTS.md @@ -1,15 +1,18 @@ # LibriSpeech ## Conformer +train: Epoch 70, 4 V100-32G, best avg: 20 + | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.738649845123291 | 0.041159 | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.738649845123291 | 0.039847 | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.738649845123291 | 0.039790 | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.738649845123291 | 0.034617 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention | 6.433612394332886 | 0.039771 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.433612394332886 | 0.040342 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.433612394332886 | 0.040342 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention_rescoring | 6.433612394332886 | 0.033761 | ## Chunk Conformer + | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.11 | 0.063193 | @@ -20,7 +23,7 @@ ## Transformer -train: Epoch 120, 4 V100-32G, 27 Day, avg: 10 +train: Epoch 120, 4 V100-32G, 27 Day, best avg: 10 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index 7f5930378..2872b69ef 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -1,41 +1,3 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - # network architecture model: cmvn_file: @@ -80,6 +42,39 @@ model: length_normalized_loss: false +data: + train_manifest: data/manifest.train + 
dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + + +collator: + vocab_filepath: data/lang_char/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/preprocess.yaml + batch_size: 16 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + + training: n_epoch: 240 accum_grad: 8 diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index 366d6de0f..275e940af 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -1,41 +1,3 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - # network architecture model: cmvn_file: @@ -73,6 +35,37 @@ model: length_normalized_loss: false +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + +collator: + vocab_filepath: data/lang_char/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + training: n_epoch: 120 accum_grad: 1 diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index f02f24dc6..1193f14b1 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -1,41 +1,3 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.5 # seconds - max_input_len: 30.0 # seconds - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 
'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - # network architecture model: cmvn_file: @@ -76,8 +38,40 @@ model: length_normalized_loss: false +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + + +collator: + vocab_filepath: data/lang_char/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' + mean_std_filepath: "" + augmentation_config: conf/preprocess.yaml + batch_size: 16 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + training: - n_epoch: 120 + n_epoch: 70 accum_grad: 8 global_grad_clip: 3.0 optim: adam @@ -98,13 +92,7 @@ decoding: batch_size: 64 error_rate_type: wer decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 25f1f718b..34466ec2f 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -17,6 +17,8 @@ ``` ## Speech Translation (English to Chinese) + + (not support for Windows now) ```bash paddlespeech st --input input_16k.wav ``` diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index 99a53c37e..80ca7a665 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -11,9 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import _locale + from .asr import ASRExecutor from .base_commands import BaseCommand from .base_commands import HelpCommand from .cls import CLSExecutor from .st import STExecutor from .tts import TTSExecutor + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 8fe5f90ad..b3733e059 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -405,8 +405,6 @@ class TTSExecutor(BaseExecutor): with open(self.voc_config) as f: self.voc_config = CfgNode(yaml.safe_load(f)) - # Enter the path of model root - with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) @@ -463,11 +461,12 @@ class TTSExecutor(BaseExecutor): am_std = paddle.to_tensor(am_std) am_normalizer = ZScore(am_mu, am_std) self.am_inference = am_inference_class(am_normalizer, am) + self.am_inference.eval() print("acoustic model done!") # vocoder # model: {model_name}_{dataset} - voc_name = '_'.join(voc.split('_')[:-1]) + voc_name = voc[:voc.rindex('_')] voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) @@ -480,6 +479,7 @@ class TTSExecutor(BaseExecutor): voc_std = paddle.to_tensor(voc_std) voc_normalizer = ZScore(voc_mu, voc_std) self.voc_inference = voc_inference_class(voc_normalizer, voc) + self.voc_inference.eval() print("voc done!") def preprocess(self, input: Any, *args, **kwargs): @@ -501,10 +501,10 @@ class TTSExecutor(BaseExecutor): """ Model inference and result stored in self.output. """ - model_name = am[:am.rindex('_')] - dataset = am[am.rindex('_') + 1:] + am_name = am[:am.rindex('_')] + am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False - if 'speedyspeech' in model_name: + if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': input_ids = self.frontend.get_input_ids( @@ -521,15 +521,14 @@ class TTSExecutor(BaseExecutor): print("lang should in {'zh', 'en'}!") # am - if 'speedyspeech' in model_name: + if am_name == 'speedyspeech': mel = self.am_inference(phone_ids, tone_ids) # fastspeech2 else: # multi speaker - if dataset in {"aishell3", "vctk"}: + if am_dataset in {"aishell3", "vctk"}: mel = self.am_inference( phone_ids, spk_id=paddle.to_tensor(spk_id)) - else: mel = self.am_inference(phone_ids) diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index ee566ed4f..d4e5c22fb 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -16,10 +16,11 @@ import os import numpy as np from paddle import inference +from scipy.special import softmax + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import melspectrogram -from scipy.special import softmax # yapf: disable parser = argparse.ArgumentParser() diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py index 63b22981a..c295c6a33 100644 --- a/paddlespeech/cls/exps/panns/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -15,8 +15,8 @@ import argparse import os import paddle -from paddleaudio.datasets import ESC50 +from paddleaudio.datasets import ESC50 from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 0a1b6cccf..9cfd8b6ce 
100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -16,11 +16,11 @@ import argparse import numpy as np import paddle import paddle.nn.functional as F + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.features import melspectrogram - from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index 9508a977e..121309789 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -15,11 +15,11 @@ import argparse import os import paddle + from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer - from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/models/panns/panns.py b/paddlespeech/cls/models/panns/panns.py index b442b2fd1..6d2dac56a 100644 --- a/paddlespeech/cls/models/panns/panns.py +++ b/paddlespeech/cls/models/panns/panns.py @@ -15,6 +15,7 @@ import os import paddle.nn as nn import paddle.nn.functional as F + from paddleaudio.utils.download import load_state_dict_from_url from paddleaudio.utils.env import MODEL_HOME diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index d494cc4fd..d0368cc8d 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -356,7 +356,7 @@ class AudioSegment(): # sox, slow try: import soxbindings as sox - except: + except ImportError: try: from paddlespeech.s2t.utils import dynamic_pip_install package = "sox" @@ -364,8 +364,9 @@ class AudioSegment(): package = "soxbindings" dynamic_pip_install.install(package) import soxbindings as sox - except: - raise RuntimeError("Can not install soxbindings on your system." 
) + except Exception: + raise RuntimeError( + "Can not install soxbindings on your system.") tfm = sox.Transformer() tfm.set_globals(multithread=False) diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index e6c7603fa..d35785db6 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -102,9 +102,11 @@ def read_manifest( with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: feat_len = json_data["input"][0]["shape"][ - 0] if "input" in json_data and "shape" in json_data["input"][0] else 1.0 + 0] if "input" in json_data and "shape" in json_data["input"][ + 0] else 1.0 token_len = json_data["output"][0]["shape"][ - 0] if "output" in json_data and "shape" in json_data["output"][0] else 1.0 + 0] if "output" in json_data and "shape" in json_data["output"][ + 0] else 1.0 conditions = [ feat_len >= min_input_len, feat_len <= max_input_len, diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 35b57524b..ac55af123 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -20,13 +20,13 @@ from paddle.io import DistributedBatchSampler from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + __all__ = [ "SortagradDistributedBatchSampler", "SortagradBatchSampler", ] -logger = Log(__name__).getlog() - def _batch_shuffle(indices, batch_size, epoch, clipped=False): """Put similarly-sized instances into minibatches for better efficiency diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py index efa50863b..8d5959c8b 100644 --- a/paddlespeech/s2t/models/ds2/__init__.py +++ b/paddlespeech/s2t/models/ds2/__init__.py @@ -17,11 +17,11 @@ from paddlespeech.s2t.utils import dynamic_pip_install try: import swig_decoders -except: +except ImportError: try: package_name = 'paddlespeech_ctcdecoders' dynamic_pip_install.install(package_name) - except: + except Exception: raise RuntimeError( "Can not install package paddlespeech_ctcdecoders on your system. \ The DeepSpeech2 model is not supported for your system") diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index f0a553ec8..0dfaec29c 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -129,7 +129,7 @@ class DeepSpeech2Model(nn.Layer): rnn_layer_size=1024, #RNN layer size (number of RNN cells). use_gru=True, #Use gru if set True. Use simple rnn if set False. share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - ctc_grad_norm_type=None,)) + ctc_grad_norm_type=None, )) if config is not None: config.merge_from_other_cfg(default) return default diff --git a/paddlespeech/s2t/models/ds2_online/__init__.py b/paddlespeech/s2t/models/ds2_online/__init__.py index 65ddd5122..2d304237b 100644 --- a/paddlespeech/s2t/models/ds2_online/__init__.py +++ b/paddlespeech/s2t/models/ds2_online/__init__.py @@ -17,11 +17,11 @@ from paddlespeech.s2t.utils import dynamic_pip_install try: import swig_decoders -except: +except ImportError: try: package_name = 'paddlespeech_ctcdecoders' dynamic_pip_install.install(package_name) - except: + except Exception: raise RuntimeError( "Can not install package paddlespeech_ctcdecoders on your system. 
\ The DeepSpeech2 model is not supported for your system") diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 774bcc62e..ffc9f0387 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -28,7 +28,7 @@ try: from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401 from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401 from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401 -except: +except ImportError: try: from paddlespeech.s2t.utils import dynamic_pip_install package_name = 'paddlespeech_ctcdecoders' diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index cc8f50317..9bf1ca4db 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -221,6 +221,8 @@ class Trainer(): if hasattr(self.train_loader, "batch_sampler"): batch_sampler = self.train_loader.batch_sampler if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): + logger.debug( + f"train_loader.batch_sample set epoch: {self.epoch}") batch_sampler.set_epoch(self.epoch) def before_train(self): diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 90144197c..226885f36 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -147,7 +147,7 @@ class SpeedPerturbationSox(): try: import soxbindings as sox - except: + except ImportError: try: from paddlespeech.s2t.utils import dynamic_pip_install package = "sox" @@ -155,8 +155,10 @@ class SpeedPerturbationSox(): package = "soxbindings" dynamic_pip_install.install(package) import soxbindings as sox - except: - raise RuntimeError("Can not install soxbindings on your system." ) + except Exception: + raise RuntimeError( + "Can not install soxbindings on your system.") + self.sox = sox if utt2ratio is not None: self.utt2ratio = {} @@ -200,7 +202,7 @@ class SpeedPerturbationSox(): else: ratio = self.state.uniform(self.lower, self.upper) - tfm = sox.Transformer() + tfm = self.sox.Transformer() tfm.set_globals(multithread=False) tfm.speed(ratio) y = tfm.build_array(input_array=x, sample_rate_in=self.sr) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index b59060a36..8eb55ff25 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -137,6 +137,10 @@ class Frontend(): phones_list.append(phones) if merge_sentences: merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] phones_list = [] phones_list.append(merge_list) return phones_list diff --git a/setup.py b/setup.py index 1ac671f1c..a5b773edf 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ requirements = { "paddleaudio", "paddlespeech_feat", "praatio~=4.1", - "pypi-kenlm", "pypinyin", "python-dateutil", "pyworld", @@ -71,6 +70,7 @@ requirements = { "phkit", "Pillow", "pybind11", + "pypi-kenlm", "snakeviz", "sox", "soxbindings", diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index 3a8009039..fb3d3aaaf 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -5,6 +5,7 @@ import functools from pathlib import Path import jsonlines + from utils.utility import add_arguments from utils.utility import print_arguments
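Several hunks in this patch (`audio.py`, `perturb.py`, `ds2/__init__.py`, `ds2_online/__init__.py`, `ctc.py`) converge on the same optional-dependency pattern: catch `ImportError` instead of a bare `except:`, try to install the missing package at runtime, and raise a `RuntimeError` only if that also fails. Below is a minimal, self-contained sketch of the pattern; `install_package` is a hypothetical stand-in for `paddlespeech.s2t.utils.dynamic_pip_install.install`, not the project's actual helper.

```python
# Sketch of the optional-dependency fallback used by the soxbindings /
# paddlespeech_ctcdecoders hunks above. `install_package` is a hypothetical
# stand-in for paddlespeech.s2t.utils.dynamic_pip_install.install.
import importlib
import subprocess
import sys


def install_package(name: str) -> None:
    # Hypothetical helper: install a package into the current interpreter.
    subprocess.check_call([sys.executable, "-m", "pip", "install", name])


def import_with_fallback(module: str, packages: list):
    try:
        return importlib.import_module(module)        # normal case
    except ImportError:                               # only catch a missing module
        try:
            for pkg in packages:                      # e.g. ["sox", "soxbindings"]
                install_package(pkg)
            return importlib.import_module(module)    # retry after installing
        except Exception as exc:                      # install or re-import failed
            raise RuntimeError(
                f"Can not install {module} on your system.") from exc


# Example mirroring the audio.py / perturb.py hunks (commented out so importing
# this sketch does not trigger a pip install):
# sox = import_with_fallback("soxbindings", ["sox", "soxbindings"])
```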