Merge branch 'develop' into replace_view_with_reshape

pull/3887/head
Wang Xin 10 months ago
commit 6712fe592b

@@ -7,7 +7,7 @@
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-red.svg"></a>
<a href="https://github.com/PaddlePaddle/PaddleSpeech/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/PaddleSpeech?color=ffa"></a>
<a href="support os"><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
-<a href=""><img src="https://img.shields.io/badge/python-3.7+-aff.svg"></a>
+<a href=""><img src="https://img.shields.io/badge/python-3.8+-aff.svg"></a>
<a href="https://github.com/PaddlePaddle/PaddleSpeech/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/PaddleSpeech?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/PaddleSpeech/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/PaddleSpeech?color=3af"></a>
<a href="https://github.com/PaddlePaddle/PaddleSpeech/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/PaddleSpeech?color=9cc"></a>
@@ -179,6 +179,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
### Recent Update
- 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech.
+- 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training for ASR on Aishell.
- 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech.
- ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), with the upgrade of paddlepaddle==2.5, the problem of modifying 0-d tensor has been solved.
- 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).

@@ -184,6 +184,7 @@
### Recent Update
- 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), English ASR fine-tuning based on WavLM, using the LibriSpeech dataset.
+- 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training using the Aishell dataset.
- 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), English ASR fine-tuning based on HuBERT, using the LibriSpeech dataset.
- ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), the 0-d tensor issue was fixed along with the PaddlePaddle 2.5 upgrade.
- 👑 2023.04.25: Add [AMP training for the U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).

@@ -14,7 +14,7 @@ Linux test build whl environment:
* gcc/g++ - 8.2.0
* cmake - 3.18.0 (need install)
-MACtest build whl envrioment
+MACtest build whl environment
* os
* gcc/g++ 12.2.0
* cpu Intel Xeon E5 x86_64

@@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
self.waveform = self.waveform.astype(
np.float32
-) # paddlespeech.s2t.transform.spectrogram only supports float32
+) # paddlespeech.audio.transform.spectrogram only supports float32
dim = len(self.waveform.shape)
assert dim in [1, 2]

@@ -18,8 +18,8 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import IStft
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import IStft
+from paddlespeech.audio.transform.spectrogram import Stft
class TestIstft(FeatTest):

@@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram
class TestLogMelSpectrogram(FeatTest):

@@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Spectrogram
+from paddlespeech.audio.transform.spectrogram import Spectrogram
class TestSpectrogram(FeatTest):

@@ -18,7 +18,7 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import Stft
class TestStft(FeatTest):

@@ -30,6 +30,7 @@ import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
+from paddlespeech.utils.argparse import strtobool
URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
@@ -63,7 +64,7 @@ parser.add_argument(
parser.add_argument(
"--full_download",
default="True",
-type=distutils.util.strtobool,
+type=strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")

@@ -18,4 +18,4 @@ This directory contains many speech applications in multiple scenarios.
* style_fs2 - multi style control for FastSpeech2 model
* text_to_speech - convert text into speech
* self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2
-* Wishper - speech recognize and translate based on Whisper model
+* Whisper - speech recognize and translate based on Whisper model

@@ -19,7 +19,7 @@ You can choose one way from meduim and hard to install paddlespeech.
The dependency refers to the requirements.txt, and install the dependency as follows:
```
-pip install -r requriement.txt
+pip install -r requirements.txt
```
### 2. Prepare Input File
@@ -30,11 +30,20 @@ Here are sample files for this demo that can be downloaded:
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```
-### 3. Usage
+### 3. Run paddlespeech_server
+Before using the client, you need to start the paddlespeech_server first.
+Here is a sample server configuration:
+```bash
+bash demos/audio_content_search/run.sh
+```
+With this configuration, the logs of the two services are written to `acs.log` and `streaming_asr.log`.
+### 4. Usage
- Command Line(Recommended)
```bash
# Chinese
-paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
```
Usage:
@@ -19,7 +19,7 @@
See requirements.txt for the dependencies and install them as follows:
```
-pip install -r requriement.txt
+pip install -r requirements.txt
```
### 2. Prepare Input
@@ -29,16 +29,26 @@ pip install -r requriement.txt
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```
-### 3. Usage
+### 3. Start the server
+You need to start the paddlespeech_server before using the client.
+The default server configuration can be used:
+```bash
+bash demos/audio_content_search/run.sh
+```
+With this configuration, the logs of the two services are written to `acs.log` and `streaming_asr.log`.
+### 4. Usage
- Command Line (Recommended)
```bash
# Chinese
-paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
```
Usage:
```bash
-paddlespeech acs --help
+paddlespeech asr --help
```
Arguments:
- `input` (required): audio file to recognize.

@@ -26,8 +26,10 @@ asr_online:
sample_rate: 16000
cfg_path:
decode_method: 'attention_rescoring'
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
force_yes: True
device: 'cpu' # cpu or gpu:id
+continuous_decoding: False # disable continue decoding when endpoint detected
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True

@@ -31,6 +31,7 @@ asr_online:
force_yes: True
device: 'cpu' # cpu or gpu:id
decode_method: "attention_rescoring"
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True

@@ -1,5 +1,5 @@
-diskcache==5.2.1
+diskcache
-dtaidistance==2.3.1
+dtaidistance
fastapi
librosa==0.8.0
numpy==1.22.0

@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
# recognize text
text = ssl_executor(
-model='wav2vec2,
+model='wav2vec2',
task='asr',
lang='en',
sample_rate=16000,

@@ -429,7 +429,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
-paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
```
Output:
```text
@@ -507,7 +507,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
-python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
```
Output:
```text

@@ -428,7 +428,7 @@ bash server.sh
If `127.0.0.1` is not accessible, use the actual service IP address.
```bash
-paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
```
Output:
```text
@@ -506,7 +506,7 @@ bash server.sh
If `127.0.0.1` is not accessible, use the actual service IP address.
```bash
-python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
```
Output:
```text

@@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
---text=${BIN_DIR}/../sentences.txt \
+--text=./sentences.txt \
--output-dir=output \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

@@ -236,8 +236,8 @@
"warnings.filterwarnings('ignore')\n",
"\n",
"from yacs.config import CfgNode\n",
-"from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
+"from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
-"from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
+"from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
"from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
"from paddlespeech.s2t.models.u2 import U2Model\n",
"\n",

@@ -103,12 +103,19 @@ If you want to train the model, you can use the script below to execute stage 0
```bash
bash run.sh --stage 0 --stop_stage 1
```
-or you can run these scripts in the command line (only use CPU).
+Or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
```
+If you want to use GPU, you can run these scripts in the command line (suppose you have only 1 GPU).
+```bash
+source path.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/deepspeech2.yaml deepspeech2
+```
## Stage 2: Top-k Models Averaging
After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
@@ -148,7 +155,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
-CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10
```
## Pretrained Model
You can get the pretrained models from [this](../../../docs/source/released_model.md).
@@ -157,14 +164,14 @@ using the `tar` scripts to unpack the model and then you can use the script to t
For example:
```
-wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
-tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
+tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
source path.sh
# If you have process the data and get the manifest file you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
-CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10
```
The performance of the released models are shown in [this](./RESULTS.md)
## Stage 4: Static graph model Export
@@ -178,7 +185,7 @@ This stage is to transform dygraph to static graph.
If you already have a dynamic graph model, you can run this script:
```bash
source path.sh
-./local/export.sh deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 exp/deepspeech2/checkpoints/avg_1.jit offline
+./local/export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10 exp/deepspeech2/checkpoints/avg_10.jit
```
## Stage 5: Static graph Model Testing
Similar to stage 3, the static graph model can also be tested.
@@ -190,7 +197,7 @@ Similar to stage 3, the static graph model can also be tested.
```
If you already have exported the static graph, you can run this script:
```bash
-CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1.jit offline
+CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10.jit
```
## Stage 6: Single Audio File Inference
In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below
@@ -202,8 +209,8 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
```
you can train the model by yourself, or you can download the pretrained model by the script below:
```bash
-wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
-tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
+tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
```
You can download the audio demo:
```bash
@@ -211,5 +218,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
```
You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_01_03.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10 data/demo_01_03.wav
```

@@ -28,7 +28,8 @@ import xml.etree.ElementTree as et
from ami_splits import get_AMI_split
from dataio import load_pkl
from dataio import save_pkl
-from distutils.util import strtobool
+from paddlespeech.utils.argparse import strtobool
logger = logging.getLogger(__name__)
SAMPLERATE = 16000

@ -0,0 +1,33 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi

@ -0,0 +1,46 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi

@ -0,0 +1,99 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's Dygraph to Static Graph is not ready now
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi

@ -0,0 +1,124 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's Dygraph to Static Graph is not ready now
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi

@ -0,0 +1,90 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# style melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

@ -0,0 +1,110 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi

@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
# export MLU_VISIBLE_DEVICES=8
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nmlu=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nnpu=1 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@ -0,0 +1,76 @@
#!/bin/bash
set -e
source path.sh
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
mlus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
fi
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi
# must run after stage 3 (the stage that generates the static models)
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi

@ -0,0 +1,42 @@
#!/bin/bash
set -e
source path.sh
npus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run_xpu.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
fi

@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
import io
import os
+import sys
import h5py
import librosa
@@ -98,7 +99,7 @@ class SoundHDF5File():
def __contains__(self, item):
return item in self.file
-def __len__(self, item):
+def __len__(self):
return len(self.file)
def __enter__(self):
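For context on the `__len__` fix above: Python calls `__len__` with no extra arguments, so the stray `item` parameter would make `len(obj)` raise a TypeError. A minimal illustration with a hypothetical container (not code from the commit):
```python
class Bag:
    def __init__(self, items):
        self.items = list(items)

    def __len__(self):          # correct signature: only 'self'
        return len(self.items)

print(len(Bag([1, 2, 3])))      # 3
```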

@@ -248,7 +248,7 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor,
# >>> tensor([[ 2, 1, 0],
# >>> [ 2, 1, 0],
# >>> [ 0, -1, -2]])
-index = index * seq_mask
+index = index * seq_mask.astype(index.dtype)
# >>> index
# >>> tensor([[2, 1, 0],
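The cast above avoids multiplying an integer index tensor by a boolean mask directly, which recent Paddle releases reject because elementwise ops expect matching dtypes. A self-contained sketch with illustrative values (not taken from the function):
```python
import paddle

index = paddle.to_tensor([[2, 1, 0], [2, 1, 0], [0, -1, -2]], dtype="int64")
seq_mask = paddle.to_tensor([[1, 1, 1], [1, 1, 1], [1, 0, 0]], dtype="bool")

# index * seq_mask would mix int64 and bool; casting the mask keeps dtypes aligned.
masked = index * seq_mask.astype(index.dtype)
print(masked.numpy())  # [[2 1 0] [2 1 0] [0 0 0]]
```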

@@ -119,6 +119,7 @@ class SSLExecutor(BaseExecutor):
'--verbose',
action='store_true',
help='Increase logger verbosity of current task.')
+self.last_call_params = None
def _init_from_path(self,
model_type: str=None,
@@ -453,6 +454,23 @@
Python API to call an executor.
"""
+current_call_params = {
+"model": model,
+"task": task,
+"lang": lang,
+"sample_rate": sample_rate,
+"config": config,
+"ckpt_path": ckpt_path,
+"decode_method": decode_method,
+"force_yes": force_yes,
+"rtf": rtf,
+"device": device
+}
+if self.last_call_params is not None and self.last_call_params != current_call_params and hasattr(
+        self, 'model'):
+    del self.model
+self.last_call_params = current_call_params
audio_file = os.path.abspath(audio_file)
paddle.set_device(device)
self._init_from_path(model, task, lang, sample_rate, config,

@@ -17,14 +17,14 @@ import os
import numpy as np
from paddle import inference
from paddle.audio.datasets import ESC50
-from paddle.audio.features import MelSpectrogram
+from paddle.audio.features import LogMelSpectrogram
from paddleaudio.backends import soundfile_load as load_audio
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
-parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'gcu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
@@ -53,7 +53,10 @@ def extract_features(files: str, **kwargs):
pad_width = max_length - len(waveforms[i])
waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))
-feat = MelSpectrogram(waveforms[i], sr, **kwargs).transpose()
+feature_extractor = LogMelSpectrogram(sr, **kwargs)
+feat = feature_extractor(paddle.to_tensor(waveforms[i]))
+feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0)
feats.append(feat)
return np.stack(feats, axis=0)
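`paddle.audio.features.LogMelSpectrogram` is a layer that is constructed once and then called on a waveform tensor, which is why the hunk above builds `feature_extractor` before computing features. A minimal hedged sketch; the parameter values and shapes are assumptions, not the deploy script's actual configuration:
```python
import numpy as np
import paddle
from paddle.audio.features import LogMelSpectrogram

sr = 32000
waveform = np.random.randn(sr).astype("float32")            # one second of dummy audio

extractor = LogMelSpectrogram(sr=sr, n_fft=1024, n_mels=64)
feat = extractor(paddle.to_tensor(waveform).unsqueeze(0))   # (1, n_mels, num_frames)
feat = paddle.transpose(feat.squeeze(0), perm=[1, 0])       # (num_frames, n_mels)
print(feat.shape)
```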

@@ -21,7 +21,8 @@ import sys
import configargparse
import numpy as np
-from distutils.util import strtobool
+from paddlespeech.utils.argparse import strtobool
def get_parser():

@@ -32,9 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
-# save jit model to
-parser.add_argument(
-"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args)

@@ -32,9 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
-# save asr result to
-parser.add_argument(
-"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@@ -32,12 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
-# save asr result to
-parser.add_argument(
-"--result_file", type=str, help="path of save the asr result")
-#load jit model from
-parser.add_argument(
-"--export_path", type=str, help="path of the jit model to save")
parser.add_argument(
"--enable-auto-log", action="store_true", help="use auto log")
args = parser.parse_args()

@@ -171,10 +171,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
-parser.add_argument("--audio_file", type=str, help='audio file path')
-# save asr result to
-parser.add_argument(
-"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
if not os.path.isfile(args.audio_file):

@@ -335,7 +335,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.test_loader, self.config, self.args.checkpoint_path)
infer_model.eval()
static_model = infer_model.export()
-logger.info(f"Export code: {static_model.forward.code}")
+try:
+    logger.info(f"Export code: {static_model.forward.code}")
+except:
+    logger.info(
+        f"Fail to print Export code, static_model.forward.code can not be run."
+    )
paddle.jit.save(static_model, self.args.export_path)

@@ -27,6 +27,7 @@ from paddlespeech.s2t.models.whisper import transcribe
from paddlespeech.s2t.models.whisper import Whisper
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.log import Log
+from paddlespeech.utils.argparse import strtobool
logger = Log(__name__).getlog()
@@ -103,10 +104,7 @@ if __name__ == "__main__":
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
parser.add_argument(
-"--debug",
-type=distutils.util.strtobool,
-default=False,
-help="for debug.")
+"--debug", type=strtobool, default=False, help="for debug.")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@@ -14,9 +14,12 @@
# Modified from espnet(https://github.com/espnet/espnet)
from collections import OrderedDict
+import io
+import os
import kaldiio
import numpy as np
import soundfile
+import h5py
from .utility import feat_type
from paddlespeech.audio.transform.transformation import Transformation
@@ -401,7 +404,7 @@ class SoundHDF5File():
def __contains__(self, item):
return item in self.file
-def __len__(self, item):
+def __len__(self):
return len(self.file)
def __enter__(self):

@@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
-topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen)
+topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
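This hunk is the change the branch name (`replace_view_with_reshape`) refers to: `Tensor.reshape` is the long-standing Paddle API, while `Tensor.view` is newer and constrained to shapes compatible with the tensor's memory layout. A small sketch of the equivalent call with made-up sizes:
```python
import paddle

batch_size, max_len = 2, 5
# Stand-in for the top-1 CTC indices with shape (B, maxlen, 1); values are arbitrary.
topk_index = paddle.randint(0, 10, shape=[batch_size, max_len, 1])

topk_index = topk_index.reshape([batch_size, max_len])  # (B, maxlen)
print(topk_index.shape)  # [2, 5]
```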

@@ -16,6 +16,8 @@ import argparse
import distutils
from yacs.config import CfgNode
+from paddlespeech.utils.argparse import strtobool
class ExtendAction(argparse.Action):
"""
@@ -73,7 +75,7 @@ def default_argument_parser(parser=None):
'--conf', type=open, action=LoadFromFile, help="config file.")
parser.add_argument(
"--debug",
-type=distutils.util.strtobool,
+type=strtobool,
default=False,
help="logging with debug mode.")
parser.add_argument(

@@ -16,11 +16,12 @@ import sys
from collections.abc import Sequence
import numpy
-from distutils.util import strtobool as dist_strtobool
+from paddlespeech.utils.argparse import strtobool as dist_strtobool
def strtobool(x):
-# distutils.util.strtobool returns integer, but it's confusing,
+# paddlespeech.utils.argparse.strtobool returns integer, but it's confusing,
return bool(dist_strtobool(x))

@@ -76,8 +76,8 @@ class TTSServerExecutor(TTSExecutor):
version=None, # default version
)
self.am_res_path = self.task_resource.res_dir
-self.am_ckpt = os.path.join(
-self.am_res_path, self.task_resource.res_dict['ckpt'][0])
+self.am_ckpt = os.path.join(self.am_res_path,
+self.task_resource.res_dict['ckpt'])
# must have phones_dict in acoustic
self.phones_dict = os.path.join(
self.am_res_path,
@@ -154,7 +154,7 @@ class TTSServerExecutor(TTSExecutor):
self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf)
logger.debug("Create voc sess successfully.")
-with open(self.phones_dict, "r") as f:
+with open(self.phones_dict, "r", encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
self.vocab_size = len(phn_id)
logger.debug(f"vocab_size: {self.vocab_size}")

@@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
-choices=["gpu", "cpu", "xpu"],
+choices=["gpu", "cpu", "xpu", "npu", "mlu", "gcu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)

@ -45,15 +45,20 @@ def train_sp(args, config):
# decides device type and whether to run in parallel # decides device type and whether to run in parallel
# setup running environment correctly # setup running environment correctly
world_size = paddle.distributed.get_world_size() world_size = paddle.distributed.get_world_size()
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: if paddle.is_compiled_with_cuda() and args.ngpu > 0:
if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("xpu")
else:
paddle.set_device("gpu") paddle.set_device("gpu")
if world_size > 1: if world_size > 1:
paddle.distributed.init_parallel_env() paddle.distributed.init_parallel_env()
elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
paddle.device.set_device("xpu")
elif args.nnpu > 0:
paddle.device.set_device("npu")
if world_size > 1:
paddle.distributed.init_parallel_env()
elif args.nmlu > 0:
paddle.device.set_device("mlu")
else:
paddle.set_device("cpu")
# set the random seed, it is a must for multiprocess training # set the random seed, it is a must for multiprocess training
seed_everything(config.seed) seed_everything(config.seed)
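
Flattened into the side-by-side view, the new branch order is hard to follow; written out as plain code it is roughly the sketch below (the `setup_device` name is illustrative only, and the `nxpu`/`nnpu`/`nmlu` flags are the ones added further down):

```python
import paddle

def setup_device(args, world_size):
    # illustrative consolidation of the selection order in the hunk above
    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()
    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
        paddle.device.set_device("xpu")
    elif args.nnpu > 0:
        paddle.device.set_device("npu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()
    elif args.nmlu > 0:
        paddle.device.set_device("mlu")
    else:
        paddle.set_device("cpu")
```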
@ -191,9 +196,25 @@ def main():
"--nxpu", "--nxpu",
type=int, type=int,
default=0, default=0,
help="if nxpu == 0 and ngpu == 0, use cpu.") help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu") "--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--ngpu",
type=int,
default=1,
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
parser.add_argument( parser.add_argument(
"--use-relative-path", "--use-relative-path",

@ -591,7 +591,8 @@ def get_predictor(
config = inference.Config( config = inference.Config(
str(Path(model_dir) / model_file), str(Path(model_dir) / params_file)) str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
config.enable_memory_optim() if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
config.enable_memory_optim()
config.switch_ir_optim(True) config.switch_ir_optim(True)
if device == "gpu": if device == "gpu":
config.enable_use_gpu(100, device_id) config.enable_use_gpu(100, device_id)
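
The memory-optimisation pass is now gated on the installed Paddle version; a minimal sketch of the guard, with placeholder model paths:

```python
import paddle
from paddle import inference

# placeholder paths, used only to show the guard around enable_memory_optim()
config = inference.Config("model.pdmodel", "model.pdiparams")
# "0.0.0" is the version string reported by source builds, which skip the optimisation
if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
    config.enable_memory_optim()
config.switch_ir_optim(True)
```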

@ -219,12 +219,28 @@ def parse_args():
) )
# other # other
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") "--ngpu",
type=int,
default=1,
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument( parser.add_argument(
"--nxpu", "--nxpu",
type=int, type=int,
default=0, default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
) )
parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--test_metadata", type=str, help="test metadata.")
parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument("--output_dir", type=str, help="output dir.")
@ -245,10 +261,16 @@ def main():
paddle.set_device("gpu") paddle.set_device("gpu")
elif args.nxpu > 0: elif args.nxpu > 0:
paddle.set_device("xpu") paddle.set_device("xpu")
elif args.ngpu == 0 and args.nxpu == 0: elif args.nnpu > 0:
paddle.set_device("npu")
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu") paddle.set_device("cpu")
else: else:
print("ngpu or nxpu should >= 0 !") print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0"
)
evaluate(args) evaluate(args)

@ -299,12 +299,28 @@ def parse_args():
default=None, default=None,
help="dir to save inference models") help="dir to save inference models")
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") "--ngpu",
type=int,
default=1,
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument( parser.add_argument(
"--nxpu", "--nxpu",
type=int, type=int,
default=0, default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
) )
parser.add_argument( parser.add_argument(
"--text", "--text",
@ -339,10 +355,16 @@ def main():
paddle.set_device("gpu") paddle.set_device("gpu")
elif args.nxpu > 0: elif args.nxpu > 0:
paddle.set_device("xpu") paddle.set_device("xpu")
elif args.ngpu == 0 and args.nxpu == 0: elif args.nnpu > 0:
paddle.set_device("npu")
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu") paddle.set_device("cpu")
else: else:
print("ngpu or nxpu should >= 0 !") print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0"
)
evaluate(args) evaluate(args)

@ -237,30 +237,25 @@ class ToneSandhi():
# output seg: [['听一听', 'v']] # output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = [] new_seg = []
skip_next = False
# function 1 # function 1
for i, (word, pos) in enumerate(seg): for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ if skip_next:
0] == seg[i + 1][0] and seg[i - 1][1] == "v": skip_next = False
if i - 1 < len(new_seg): continue
new_seg[i - if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
else: skip_next = True
new_seg.append([word, pos])
new_seg.append([seg[i + 1][0], pos])
else: else:
if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ new_seg.append((word, pos))
0] == word and pos == "v":
continue
else:
new_seg.append([word, pos])
seg = new_seg seg = new_seg
new_seg = [] new_seg = []
# function 2 # function 2
for i, (word, pos) in enumerate(seg): for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == "一": if new_seg and new_seg[-1][0] == "一":
new_seg[-1][0] = new_seg[-1][0] + word new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else: else:
new_seg.append([word, pos]) new_seg.append((word, pos))
return new_seg return new_seg
# the first and the second words are all_tone_three # the first and the second words are all_tone_three
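
Assuming the usual ToneSandhi class from paddlespeech.t2s.frontend.tone_sandhi, the refactor keeps the behaviour shown in the comment above while avoiding the in-place index juggling; a small illustrative check:

```python
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
# "v + 一 + v" is merged into a single reduplicated verb, as in the comment above
seg = [("听", "v"), ("一", "m"), ("听", "v")]
print(sandhi._merge_yi(seg))  # expected: [('听一听', 'v')]
```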

@ -28,7 +28,7 @@ UNITS = OrderedDict({
8: '亿', 8: '亿',
}) })
COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分|(公(里|引|丈|尺|寸|分|釐)))'
# 分数表达式 # 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')

@ -35,7 +35,9 @@ measure_dict = {
"ml": "毫升", "ml": "毫升",
"m": "", "m": "",
"mm": "毫米", "mm": "毫米",
"s": "" "s": "",
"h": "小时",
"mg": "毫克"
} }

@ -17,7 +17,7 @@ import numpy as np
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from scipy.signal import kaiser from scipy.signal.windows import kaiser
def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
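
The window function moved from scipy.signal to scipy.signal.windows in recent SciPy releases; a quick check that the new import provides the Kaiser window design_prototype_filter expects:

```python
from scipy.signal.windows import kaiser

# design_prototype_filter builds a Kaiser window of length taps + 1
taps, beta = 62, 9.0
w = kaiser(taps + 1, beta)
print(w.shape)  # (63,)
```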

@ -164,9 +164,10 @@ class Trainer(object):
self.updater. self.updater.
batch_size) + "avg_ips: {:.5f} sequences/sec,".format( batch_size) + "avg_ips: {:.5f} sequences/sec,".format(
self.updater.batch_size / avg_batch_cost) self.updater.batch_size / avg_batch_cost)
max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved()} B" if paddle.device.is_compiled_with_cuda():
max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated()} B" max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB"
msg += max_mem_reserved_str + "," + max_mem_allocated_str max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
msg += max_mem_reserved_str + "," + max_mem_allocated_str
logger.info(msg) logger.info(msg)
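
Peak-memory reporting is now skipped on non-CUDA builds and converted from bytes to MB; a standalone sketch of the same guard:

```python
import paddle

if paddle.device.is_compiled_with_cuda():
    # report peak memory in MB instead of raw bytes
    reserved_mb = paddle.device.cuda.max_memory_reserved() // (1024 ** 2)
    allocated_mb = paddle.device.cuda.max_memory_allocated() // (1024 ** 2)
    print(f"max_mem_reserved: {reserved_mb} MB, max_mem_allocated: {allocated_mb} MB")
```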

@ -18,7 +18,9 @@ from typing import Text
import distutils import distutils
__all__ = ["print_arguments", "add_arguments", "get_commandline_args"] __all__ = [
"print_arguments", "add_arguments", "get_commandline_args", "strtobool"
]
def get_commandline_args(): def get_commandline_args():
@ -80,6 +82,27 @@ def print_arguments(args, info=None):
print("-----------------------------------------------------------") print("-----------------------------------------------------------")
def strtobool(value):
"""Convert a string value to an integer boolean (1 for True, 0 for False).
The function recognizes the following strings as True (case insensitive):
- "yes"
- "true"
- "1"
All other values are considered False.
NOTE: Since Python 3.10 the distutils module, particularly distutils.util, has been deprecated, so strtobool is reimplemented here to keep existing code working.
"""
if isinstance(value, bool):
return int(value)
value = value.strip().lower()
if value in ('yes', 'true', '1'):
return 1
else:
return 0
def add_arguments(argname, type, default, help, argparser, **kwargs): def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument. """Add argparse's argument.
@ -91,7 +114,7 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
add_argument("name", str, "Jonh", "User name.", parser) add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args() args = parser.parse_args()
""" """
type = distutils.util.strtobool if type == bool else type type = strtobool if type == bool else type
argparser.add_argument( argparser.add_argument(
"--" + argname, "--" + argname,
default=default, default=default,
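
Taken together with the in-repo strtobool above, bool-typed options keep their old command-line behaviour; a small usage sketch (the flag name is illustrative only):

```python
import argparse

from paddlespeech.utils.argparse import add_arguments, strtobool

parser = argparse.ArgumentParser()
# bool-typed options now go through the in-repo strtobool instead of distutils
add_arguments("use_gpu", bool, True, "Whether to run on GPU.", parser)
args = parser.parse_args(["--use_gpu", "false"])
print(strtobool("yes"), bool(args.use_gpu))  # 1 False
```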

@ -24,7 +24,6 @@ import warnings
import numpy as np import numpy as np
import scipy import scipy
import sklearn import sklearn
from distutils.util import strtobool
from scipy import linalg from scipy import linalg
from scipy import sparse from scipy import sparse
from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import connected_components
@ -34,6 +33,8 @@ from sklearn.cluster import SpectralClustering
from sklearn.cluster._kmeans import k_means from sklearn.cluster._kmeans import k_means
from sklearn.neighbors import kneighbors_graph from sklearn.neighbors import kneighbors_graph
from paddlespeech.utils.argparse import strtobool
def _graph_connected_component(graph, node_id): def _graph_connected_component(graph, node_id):
""" """

@ -43,21 +43,23 @@ base = [
# paddleaudio align with librosa==0.8.1, which need numpy==1.23.x # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
"numpy==1.23.5", "numpy==1.23.5",
"librosa==0.8.1", "librosa==0.8.1",
"scipy>=1.4.0", "scipy>=1.4.0, <=1.12.0",
"loguru", "loguru",
"matplotlib", "matplotlib<=3.8.4",
"nara_wpe", "nara_wpe",
"onnxruntime>=1.11.0", "onnxruntime>=1.11.0",
"opencc", "opencc==1.1.6",
"opencc-python-reimplemented", "opencc-python-reimplemented",
"pandas", "pandas",
"paddleaudio>=1.1.0", "paddleaudio>=1.1.0",
"paddlenlp>=2.4.8", "paddlenlp>=2.4.8",
"paddlepaddle-gpu==2.5.1",
"paddleslim>=2.3.4", "paddleslim>=2.3.4",
"ppdiffusers>=0.9.0", "ppdiffusers>=0.9.0",
"paddlespeech_feat", "paddlespeech_feat",
"praatio>=5.0.0, <=5.1.1", "praatio>=5.0.0, <=5.1.1",
"prettytable", "prettytable",
"pydantic>=1.10.14, <2.0",
"pypinyin<=0.44.0", "pypinyin<=0.44.0",
"pypinyin-dict", "pypinyin-dict",
"python-dateutil", "python-dateutil",

@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then
pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install jsonlines pip install jsonlines
pip install -U scipy==1.12.0 # newer versions break part of the data processing
pip install -U matplotlib==3.7.1 # newer versions fail with: cannot import name 'get_cmap' from 'matplotlib.cm'
pip list pip list
cd - cd -
if [[ ${model_name} == "conformer" ]]; then if [[ ${model_name} == "conformer" ]]; then

@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=1024, rnn_size=1024,
use_gru=False, use_gru=False,
share_rnn_weights=False, ) rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len) loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=1024, rnn_size=1024,
use_gru=True, use_gru=True,
share_rnn_weights=False, ) rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len) loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=1024, rnn_size=1024,
use_gru=False, use_gru=False,
share_rnn_weights=True, ) rnn_direction="bidirect", )
loss = model(self.audio, self.audio_len, self.text, self.text_len) loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=1024, rnn_size=1024,
use_gru=True, use_gru=True,
share_rnn_weights=True, ) rnn_direction="bidirect", )
loss = model(self.audio, self.audio_len, self.text, self.text_len) loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=1024, rnn_size=1024,
use_gru=False, use_gru=False,
share_rnn_weights=False, ) rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len) loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
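
The DeepSpeech2 tests now build the merged model with rnn_direction instead of the removed share_rnn_weights switch; a hedged sketch of one such constructor call (feature and vocabulary sizes are illustrative):

```python
import paddle
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model

paddle.set_device("cpu")
# rnn_direction replaces share_rnn_weights: "forward" or "bidirect"
model = DeepSpeech2Model(
    feat_size=161,
    dict_size=10,
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False,
    rnn_direction="forward", )
```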

@ -19,11 +19,11 @@ import numpy as np
import paddle import paddle
from paddle import inference from paddle import inference
from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
class TestDeepSpeech2ModelOnline(unittest.TestCase): class TestDeepSpeech2Model(unittest.TestCase):
def setUp(self): def setUp(self):
paddle.set_device('cpu') paddle.set_device('cpu')
@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.text_len = paddle.to_tensor(text_len, dtype='int64') self.text_len = paddle.to_tensor(text_len, dtype='int64')
def test_ds2_1(self): def test_ds2_1(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
def test_ds2_2(self): def test_ds2_2(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
def test_ds2_3(self): def test_ds2_3(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
def test_ds2_4(self): def test_ds2_4(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
def test_ds2_5(self): def test_ds2_5(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1) self.assertEqual(loss.numel(), 1)
def test_ds2_6(self): def test_ds2_6(self):
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
def test_ds2_7(self): def test_ds2_7(self):
use_gru = False use_gru = False
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
def test_ds2_8(self): def test_ds2_8(self):
use_gru = True use_gru = True
model = DeepSpeech2ModelOnline( model = DeepSpeech2Model(
feat_size=self.feat_dim, feat_size=self.feat_dim,
dict_size=10, dict_size=10,
num_conv_layers=2, num_conv_layers=2,
@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
export_prefix = "exp/deepspeech2_online/checkpoints/test_export" export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
if not os.path.exists(os.path.dirname(export_prefix)): if not os.path.exists(os.path.dirname(export_prefix)):
os.makedirs(os.path.dirname(export_prefix), mode=0o755) os.makedirs(os.path.dirname(export_prefix), mode=0o755)
infer_model = DeepSpeech2InferModelOnline( infer_model = DeepSpeech2InferModel(
feat_size=161, feat_size=161,
dict_size=4233, dict_size=4233,
num_conv_layers=2, num_conv_layers=2,

@ -0,0 +1,36 @@
function main(){
set -ex
speech_ci_path=`pwd`
echo "Start asr"
cd ${speech_ci_path}/asr
bash deepspeech2_online_model_test.sh
python error_rate_test.py
python mask_test.py
python reverse_pad_list.py
echo "End asr"
echo "Start TTS"
cd ${speech_ci_path}/tts
python test_data_table.py
python test_enfrontend.py
python test_mixfrontend.py
echo "End TTS"
echo "Start Vector"
cd ${speech_ci_path}/vector
python test_augment.py
echo "End Vector"
echo "Start cli"
cd ${speech_ci_path}/cli
bash test_cli.sh
echo "End cli"
echo "Start server"
cd ${speech_ci_path}/server/offline
bash test_server_client.sh
echo "End server"
}
main

@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
# Speech SSL # Speech SSL
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task asr --lang en --input ./en.wav
paddlespeech ssl --task vector --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav
# Speech_recognition # Speech_recognition
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
paddlespeech asr --input ./zh.wav paddlespeech asr --input ./zh.wav
paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav
paddlespeech asr --model conformer_online_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@ -110,5 +111,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
# whisper recognize text and translate to English # whisper recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav paddlespeech whisper --task translate --input ./zh.wav
# switch to the English-only model
paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
echo -e "\033[32mTest success !!!\033[0m" echo -e "\033[32mTest success !!!\033[0m"

@ -0,0 +1,29 @@
# test CLI documentation
This document describes the CLI tests, which currently cover most of the CLI inference paths in paddlespeech. Once this CI is in place, it is used to quickly verify that fixes are correct.
# Test procedure
## 1. Environment setup
When the CI is rebuilt, it first runs against the last passing versions paddlepaddle-gpu==2.5.1, paddlespeech==develop.
After the rebuild, it runs against paddlepaddle-gpu==develop, paddlespeech==develop.
### Other dependencies
gcc >= 4.8.5,
python >= 3.8
## 2. Functional test
Run the following in tests/unit/cli of the repo:
```shell
source path.sh
bash test_cli.sh
```
## 3. Expected result
The run is considered successful if "Test success" is printed and no errors occur during execution.

@ -1,5 +1,7 @@
#!/bin/bash #!/bin/bash
# bash test_server_client.sh # bash test_server_client.sh
## require lsof to get server pid
## apt-get install -y lsof
StartService(){ StartService(){
# Start service # Start service

@ -48,7 +48,7 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--text", "--text",
type=str, type=str,
default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt", default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
help="text to synthesize, a 'utt_id sentence' pair per line") help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id') parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed') parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')

@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddlespeech.t2s.datasets.data_tabel import DataTable from paddlespeech.t2s.datasets.data_table import DataTable
def test_audio_dataset(): def test_audio_dataset():

@ -0,0 +1,4 @@
FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82
RUN apt-get update -y
RUN apt-get -y install libsndfile1 lsof
RUN pip3.8 install pytest-runner

@ -19,18 +19,18 @@ fi
tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz
rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz rm -rf OpenMathLib-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz
if [ -d "$DOWNLOAD_DIR" ]; then if [ -d "$DOWNLOAD_DIR" ]; then
cp -p "$DOWNLOAD_DIR/$tarball" . cp -p "$DOWNLOAD_DIR/$tarball" .
else else
url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') url=$($WGET -qO- "https://api.github.com/repos/OpenMathLib/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])')
test -n "$url" test -n "$url"
$WGET -t3 -nv -O $tarball "$url" $WGET -t3 -nv -O $tarball "$url"
fi fi
tar xzf $tarball tar xzf $tarball
mv xianyi-OpenBLAS-* OpenBLAS mv OpenMathLib-OpenBLAS-* OpenBLAS
make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then

@ -0,0 +1,54 @@
set +x
# use pre-commit 2.17
if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
pip install pre-commit==2.17.0 1>nul
fi
# Install clang-format before git commit to avoid repeat installation due to
# pre-commit multi-thread running.
readonly VERSION="13.0.0"
version=$(clang-format -version)
if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then
echo "clang-format installation by pip need python version great equal 3.6,
please change the default python to higher version."
exit 1
fi
diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH})
num_diff_files=$(echo "$diff_files" | wc -l)
echo -e "diff files between pr and ${BRANCH}:\n${diff_files}"
echo "Checking code style by pre-commit ..."
pre-commit run --files ${diff_files};check_error=$?
if test ! -z "$(git diff)"; then
echo -e '\n************************************************************************************'
echo -e "These files have been formatted by code format hook. You should use pre-commit to \
format them before git push."
echo -e '************************************************************************************\n'
git diff 2>&1
fi
echo -e '\n************************************************************************************'
if [ ${check_error} != 0 ];then
echo "Your PR code style check failed."
echo "Please install pre-commit locally and set up git hook scripts:"
echo ""
echo " pip install pre-commit==2.17.0"
echo " pre-commit install"
echo ""
if [[ $num_diff_files -le 100 ]];then
echo "Then, run pre-commit to check codestyle issues in your PR:"
echo ""
echo " pre-commit run --files" $(echo ${diff_files} | tr "\n" " ")
echo ""
fi
echo "For more information, please refer to our codestyle check guide:"
echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html"
else
echo "Your PR code style check passed."
fi
echo -e '************************************************************************************\n'
exit ${check_error}

@ -28,7 +28,8 @@ import re
import subprocess import subprocess
import numpy as np import numpy as np
from distutils.util import strtobool
from paddlespeech.utils.argparse import strtobool
FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)") FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+") SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")

@ -11,9 +11,10 @@ import json
import logging import logging
import sys import sys
from distutils.util import strtobool
from espnet.utils.cli_utils import get_commandline_args from espnet.utils.cli_utils import get_commandline_args
from paddlespeech.utils.argparse import strtobool
is_python2 = sys.version_info[0] == 2 is_python2 = sys.version_info[0] == 2

@ -4,13 +4,13 @@ import logging
import kaldiio import kaldiio
import numpy import numpy
from distutils.util import strtobool
from paddlespeech.s2t.transform.cmvn import CMVN from paddlespeech.audio.transform.cmvn import CMVN
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
from paddlespeech.s2t.utils.cli_writers import file_writer_helper from paddlespeech.s2t.utils.cli_writers import file_writer_helper
from paddlespeech.utils.argparse import strtobool
def get_parser(): def get_parser():

@ -5,7 +5,7 @@ import logging
import kaldiio import kaldiio
import numpy as np import numpy as np
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -2,13 +2,12 @@
import argparse import argparse
import logging import logging
from distutils.util import strtobool from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
from paddlespeech.s2t.utils.cli_writers import file_writer_helper from paddlespeech.s2t.utils.cli_writers import file_writer_helper
from paddlespeech.utils.argparse import strtobool
def get_parser(): def get_parser():

@ -3,7 +3,7 @@ import argparse
import logging import logging
import sys import sys
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -7,9 +7,8 @@ import logging
import sys import sys
from io import open from io import open
from distutils.util import strtobool
from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.utils.argparse import strtobool
PY2 = sys.version_info[0] == 2 PY2 = sys.version_info[0] == 2
sys.stdin = codecs.getreader("utf-8")(sys.stdin if PY2 else sys.stdin.buffer) sys.stdin = codecs.getreader("utf-8")(sys.stdin if PY2 else sys.stdin.buffer)
