Merge branch 'PaddlePaddle:develop' into develop

pull/3715/head
commit 1973d7c941

@@ -14,7 +14,7 @@ Linux test build whl environment:
 * gcc/g++ - 8.2.0
 * cmake - 3.18.0 (need install)
-MACtest build whl envrioment
+MACtest build whl environment
 * os
 * gcc/g++ 12.2.0
 * cpu Intel Xeon E5 x86_64

@@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
         self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
         self.waveform = self.waveform.astype(
             np.float32
-        )  # paddlespeech.s2t.transform.spectrogram only supports float32
+        )  # paddlespeech.audio.transform.spectrogram only supports float32
         dim = len(self.waveform.shape)
         assert dim in [1, 2]

@@ -18,8 +18,8 @@ import paddle
 from paddleaudio.functional.window import get_window
 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import IStft
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import IStft
+from paddlespeech.audio.transform.spectrogram import Stft
 class TestIstft(FeatTest):

@@ -18,7 +18,7 @@ import paddle
 import paddleaudio
 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram
 class TestLogMelSpectrogram(FeatTest):

@@ -18,7 +18,7 @@ import paddle
 import paddleaudio
 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Spectrogram
+from paddlespeech.audio.transform.spectrogram import Spectrogram
 class TestSpectrogram(FeatTest):

@@ -18,7 +18,7 @@ import paddle
 from paddleaudio.functional.window import get_window
 from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import Stft
 class TestStft(FeatTest):
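All of the test hunks above move the spectrogram transforms from `paddlespeech.s2t.transform` to `paddlespeech.audio.transform`. For code that must run against both pre- and post-migration releases, a hedged compatibility shim can paper over the rename (a sketch, not part of this PR):

```python
# Sketch: prefer the new paddlespeech.audio location, fall back to the
# old paddlespeech.s2t path on releases that predate the migration.
try:
    from paddlespeech.audio.transform.spectrogram import IStft, Stft
except ImportError:  # pre-migration paddlespeech
    from paddlespeech.s2t.transform.spectrogram import IStft, Stft
```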

@@ -19,7 +19,7 @@ You can choose one way from meduim and hard to install paddlespeech.
 The dependency refers to the requirements.txt, and install the dependency as follows:
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```
 ### 2. Prepare Input File
@@ -30,11 +30,20 @@ Here are sample files for this demo that can be downloaded:
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```
-### 3. Usage
+### 3. Run paddlespeech_server
+You need to start the paddlespeech_server before using the client.
+Here is a sample server configuration:
+```bash
+bash demos/audio_content_search/run.sh
+```
+With this configuration, the logs of the two services are recorded in `acs.log` and `streaming_asr.log`.
+### 4. Usage
 - Command Line(Recommended)
 ```bash
 # Chinese
-paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
 ```
 Usage:
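Equivalently, the client can be driven from Python by shelling out to the same CLI. A minimal sketch, assuming the servers started by `run.sh` above are already listening on port 8490:

```python
# Sketch: call the ACS client CLI from Python and capture its output.
import subprocess

result = subprocess.run(
    ["paddlespeech_client", "acs",
     "--server_ip", "127.0.0.1",
     "--port", "8490",
     "--input", "./zh.wav"],
    capture_output=True, text=True, check=True)
print(result.stdout)
```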

@@ -19,7 +19,7 @@
 See requirements.txt for the dependencies, and install them as follows:
 ```
-pip install -r requriement.txt
+pip install -r requirements.txt
 ```
 ### 2. Prepare Input
@@ -29,16 +29,26 @@ pip install -r requriement.txt
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```
-### 3. Usage
+### 3. Start the server
+You need to start the paddlespeech_server before using the client.
+You can use the default server configuration:
+```bash
+bash demos/audio_content_search/run.sh
+```
+With this configuration, the logs of the two services are recorded in `acs.log` and `streaming_asr.log`.
+### 4. Usage
 - Command line (recommended)
 ```bash
 # Chinese
-paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
 ```
 Usage:
 ```bash
-paddlespeech acs --help
+paddlespeech asr --help
 ```
 Arguments:
 - `input` (required): the audio file to recognize.

@@ -26,8 +26,10 @@ asr_online:
     sample_rate: 16000
     cfg_path:
     decode_method: 'attention_rescoring'
+    num_decoding_left_chunks: -1  # number of left chunks for decoding, defaults to -1
     force_yes: True
     device: 'cpu'  # cpu or gpu:id
+    continuous_decoding: False  # disable continuous decoding when an endpoint is detected
     am_predictor_conf:
         device:  # set 'gpu:id' or 'cpu'
         switch_ir_optim: True
@@ -40,4 +42,4 @@ asr_online:
     window_ms: 25  # ms
     shift_ms: 10  # ms
     sample_rate: 16000
     sample_width: 2

@@ -31,6 +31,7 @@ asr_online:
     force_yes: True
     device: 'cpu'  # cpu or gpu:id
    decode_method: "attention_rescoring"
+    num_decoding_left_chunks: -1  # number of left chunks for decoding, defaults to -1
     am_predictor_conf:
         device:  # set 'gpu:id' or 'cpu'
         switch_ir_optim: True
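Both config hunks add `num_decoding_left_chunks` under the `asr_online` section. A sketch of reading the new keys with yacs (the config library used elsewhere in this diff); the file path is hypothetical and `asr_online` is assumed to be a top-level key:

```python
# Sketch: load the server YAML and inspect the new streaming options.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("conf/application.yaml")  # hypothetical path

asr = config.asr_online
print(asr.num_decoding_left_chunks)           # -1: use all left chunks
print(asr.get("continuous_decoding", False))  # only present in one config
```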

@@ -1,5 +1,5 @@
-diskcache==5.2.1
-dtaidistance==2.3.1
+diskcache
+dtaidistance
 fastapi
 librosa==0.8.0
 numpy==1.22.0
@@ -10,4 +10,4 @@ python-multipart
 soundfile==0.10.3.post1
 starlette
 typing
 uvicorn

@@ -429,7 +429,7 @@ bash server.sh
 If `127.0.0.1` is not accessible, you need to use the actual service IP address.
 ```bash
-paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
 ```
 Output:
 ```text
@@ -507,7 +507,7 @@ bash server.sh
 If `127.0.0.1` is not accessible, you need to use the actual service IP address.
 ```bash
-python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
 ```
 Output:
 ```text

@@ -428,7 +428,7 @@ bash server.sh
 If `127.0.0.1` is not accessible, use the actual service IP address.
 ```bash
-paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
 ```
 Output:
 ```text
@@ -506,7 +506,7 @@ bash server.sh
 If `127.0.0.1` is not accessible, use the actual service IP address.
 ```bash
-python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
 ```
 Output:
 ```text

@@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
         --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=./sentences.txt \
         --output-dir=output \
         --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 fi

@@ -236,8 +236,8 @@
     "warnings.filterwarnings('ignore')\n",
     "\n",
     "from yacs.config import CfgNode\n",
-    "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
-    "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
+    "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
+    "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
     "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
     "from paddlespeech.s2t.models.u2 import U2Model\n",
     "\n",

@@ -0,0 +1,33 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi

@@ -0,0 +1,46 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi

@@ -0,0 +1,99 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static conversion is not ready yet
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi

@@ -0,0 +1,124 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi

@@ -0,0 +1,90 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# style melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

@@ -0,0 +1,110 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi

@@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
# export MLU_VISIBLE_DEVICES=8
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nmlu=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nnpu=1 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@@ -0,0 +1,76 @@
#!/bin/bash
set -e
source path.sh
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
mlus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
fi
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi
# must run after stage 3 (which generates the static models)
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi

@@ -0,0 +1,42 @@
#!/bin/bash
set -e
source path.sh
npus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
fi

@@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):
             # fbank
             audio = preprocessing(audio, **preprocess_args)
-            audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
+            audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
             audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
             self._inputs["audio"] = audio

@@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
         x_lens = x.shape[1]
         ctc_probs = self.ctc.log_softmax(x)  # (B, maxlen, vocab_size)
         topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
-        topk_index = topk_index.view([batch_size, x_lens])  # (B, maxlen)
+        topk_index = topk_index.reshape([batch_size, x_lens])  # (B, maxlen)
         hyps = [hyp.tolist() for hyp in topk_index]
         hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
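The lines above are the greedy CTC decode path; the hunk swaps `view` for `reshape`, likely for Paddle-version compatibility. A self-contained sketch of the same decode rule, independent of the model (blank id 0 assumed):

```python
# Sketch: frame-wise argmax, then collapse repeats and drop blanks.
import numpy as np

def ctc_greedy_decode(log_probs: np.ndarray, blank: int = 0) -> list:
    """log_probs: (T, vocab) per-frame log-probabilities."""
    best = log_probs.argmax(axis=-1)      # best token per frame
    hyp, prev = [], None
    for tok in best.tolist():
        if tok != prev and tok != blank:  # collapse repeats, skip blanks
            hyp.append(tok)
        prev = tok
    return hyp

frames = np.log(np.array([[0.6, 0.3, 0.1],
                          [0.2, 0.7, 0.1],
                          [0.2, 0.7, 0.1],
                          [0.8, 0.1, 0.1]]))
print(ctc_greedy_decode(frames))  # [1]
```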

@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--device",
         default="gpu",
-        choices=["gpu", "cpu", "xpu"],
+        choices=["gpu", "cpu", "xpu", "npu", "mlu"],
         help="Device selected for inference.", )
     parser.add_argument('--cpu_threads', type=int, default=1)

@@ -45,15 +45,20 @@ def train_sp(args, config):
     # decides device type and whether to run in parallel
     # setup running environment correctly
     world_size = paddle.distributed.get_world_size()
-    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
-        if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device("xpu")
-    else:
+    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
         paddle.set_device("gpu")
         if world_size > 1:
             paddle.distributed.init_parallel_env()
+    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
+        paddle.device.set_device("xpu")
+    elif args.nnpu > 0:
+        paddle.device.set_device("npu")
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+    elif args.nmlu > 0:
+        paddle.device.set_device("mlu")
+    else:
+        paddle.set_device("cpu")
     # set the random seed, it is a must for multiprocess training
     seed_everything(config.seed)
@@ -191,9 +196,25 @@ def main():
         "--nxpu",
         type=int,
         default=0,
-        help="if nxpu == 0 and ngpu == 0, use cpu.")
+        help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
+    )
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=1,
+        help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
     parser.add_argument(
         "--use-relative-path",

@@ -591,7 +591,8 @@ def get_predictor(
     config = inference.Config(
         str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
-    config.enable_memory_optim()
+    if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
+        config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
         config.enable_use_gpu(100, device_id)
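Note that the gate above compares version strings lexicographically, which happens to work for these particular values but would misclassify e.g. "2.10.0". A hedged alternative sketch using `packaging` ("0.0.0" is the placeholder Paddle reports for develop builds):

```python
# Sketch: robust version gate for enable_memory_optim.
import paddle
from packaging.version import Version

def memory_optim_supported(ver: str = paddle.__version__) -> bool:
    if ver == "0.0.0":  # develop build: version unknown, stay conservative
        return False
    return Version(ver) <= Version("2.5.2")
```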

@@ -219,12 +219,28 @@ def parse_args():
     )
     # other
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument("--test_metadata", type=str, help="test metadata.")
     parser.add_argument("--output_dir", type=str, help="output dir.")
@@ -245,10 +261,16 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print(
+            "one of ngpu, nxpu, nnpu or nmlu should be greater than 0, or all of them should equal 0"
+        )
     evaluate(args)

@@ -299,12 +299,28 @@ def parse_args():
         default=None,
         help="dir to save inference models")
     parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
+    )
     parser.add_argument(
         "--nxpu",
         type=int,
         default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
+    )
+    parser.add_argument(
+        "--nmlu",
+        type=int,
+        default=0,
+        help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
     )
     parser.add_argument(
         "--text",
@@ -339,10 +355,16 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.nmlu > 0:
+        paddle.set_device("mlu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print(
+            "one of ngpu, nxpu, nnpu or nmlu should be greater than 0, or all of them should equal 0"
+        )
     evaluate(args)

@@ -237,30 +237,25 @@ class ToneSandhi():
     # output seg: [['听一听', 'v']]
     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
         new_seg = []
+        skip_next = False
         # function 1
         for i, (word, pos) in enumerate(seg):
-            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
-                    0] == seg[i + 1][0] and seg[i - 1][1] == "v":
-                if i - 1 < len(new_seg):
-                    new_seg[i -
-                            1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
-                else:
-                    new_seg.append([word, pos])
-                    new_seg.append([seg[i + 1][0], pos])
+            if skip_next:
+                skip_next = False
+                continue
+            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+                new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
+                skip_next = True
             else:
-                if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
-                        0] == word and pos == "v":
-                    continue
-                else:
-                    new_seg.append([word, pos])
+                new_seg.append((word, pos))
         seg = new_seg
         new_seg = []
         # function 2
         for i, (word, pos) in enumerate(seg):
             if new_seg and new_seg[-1][0] == "一":
-                new_seg[-1][0] = new_seg[-1][0] + word
+                new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
             else:
-                new_seg.append([word, pos])
+                new_seg.append((word, pos))
         return new_seg
     # the first and the second words are all_tone_three
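A worked, self-contained sketch of the new "function 1" logic above: for the reduplicated-verb pattern V + 一 + V, the three tokens fuse into the previous segment and the trailing verb is skipped via `skip_next`. This mirrors the diff; it is not the full class:

```python
# Sketch: fuse "V 一 V" (e.g. 听 一 听 -> 听一听) into one segment.
from typing import List, Tuple

def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    new_seg: List[Tuple[str, str]] = []
    skip_next = False
    for i, (word, pos) in enumerate(seg):
        if skip_next:  # trailing verb was already merged
            skip_next = False
            continue
        if (i - 1 >= 0 and word == "一" and i + 1 < len(seg)
                and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
            skip_next = True
        else:
            new_seg.append((word, pos))
    return new_seg

print(merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))  # [('听一听', 'v')]
```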

@@ -43,16 +43,17 @@ base = [
     # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
     "numpy==1.23.5",
     "librosa==0.8.1",
-    "scipy>=1.4.0",
+    "scipy>=1.4.0, <=1.12.0",
     "loguru",
-    "matplotlib",
+    "matplotlib<=3.8.4",
     "nara_wpe",
     "onnxruntime>=1.11.0",
-    "opencc",
+    "opencc==1.1.6",
     "opencc-python-reimplemented",
     "pandas",
     "paddleaudio>=1.1.0",
     "paddlenlp>=2.4.8",
+    "paddlepaddle-gpu==2.5.1",
     "paddleslim>=2.3.4",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",

@@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then
     pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
     pip install jsonlines
+    pip install -U scipy==1.12.0  # newer versions fail in part of the data processing
+    pip install -U matplotlib==3.7.1  # newer versions fail with: cannot import name 'get_cmap' from 'matplotlib.cm'
     pip list
     cd -
     if [[ ${model_name} == "conformer" ]]; then

@@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
@@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
@@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
@@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=True,
-            share_rnn_weights=True, )
+            rnn_direction="bidirect", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
@@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
             num_rnn_layers=3,
             rnn_size=1024,
             use_gru=False,
-            share_rnn_weights=False, )
+            rnn_direction="forward", )
         loss = model(self.audio, self.audio_len, self.text, self.text_len)
         self.assertEqual(loss.numel(), 1)
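The updated tests swap the removed `share_rnn_weights` flag for `rnn_direction`. A minimal construction sketch mirroring the test fixtures; argument values are taken from the hunks, and the exact signature belongs to this PR's `ds2` model:

```python
# Sketch: build the ds2 model with the new rnn_direction argument.
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model

model = DeepSpeech2Model(
    feat_size=161,  # per the export test below
    dict_size=10,
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False,
    rnn_direction="forward")  # or "bidirect"
```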

@@ -19,11 +19,11 @@ import numpy as np
 import paddle
 from paddle import inference
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
+from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
+from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-class TestDeepSpeech2ModelOnline(unittest.TestCase):
+class TestDeepSpeech2Model(unittest.TestCase):
     def setUp(self):
         paddle.set_device('cpu')
@@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.text_len = paddle.to_tensor(text_len, dtype='int64')
     def test_ds2_1(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
     def test_ds2_2(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
     def test_ds2_3(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
     def test_ds2_4(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
     def test_ds2_5(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
         self.assertEqual(loss.numel(), 1)
     def test_ds2_6(self):
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
     def test_ds2_7(self):
         use_gru = False
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
     def test_ds2_8(self):
         use_gru = True
-        model = DeepSpeech2ModelOnline(
+        model = DeepSpeech2Model(
             feat_size=self.feat_dim,
             dict_size=10,
             num_conv_layers=2,
@@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
         export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
         if not os.path.exists(os.path.dirname(export_prefix)):
             os.makedirs(os.path.dirname(export_prefix), mode=0o755)
-        infer_model = DeepSpeech2InferModelOnline(
+        infer_model = DeepSpeech2InferModel(
             feat_size=161,
             dict_size=4233,
             num_conv_layers=2,

@@ -0,0 +1,31 @@
function main(){
set -ex
speech_ci_path=`pwd`
echo "Start asr"
cd ${speech_ci_path}/asr
bash deepspeech2_online_model_test.sh
python error_rate_test.py
python mask_test.py
python reverse_pad_list.py
echo "End asr"
echo "Start TTS"
cd ${speech_ci_path}/tts
python test_data_table.py
python test_enfrontend.py
python test_mixfrontend.py
echo "End TTS"
echo "Start Vector"
cd ${speech_ci_path}/vector
python test_augment.py
echo "End Vector"
echo "Start cli"
cd ${speech_ci_path}/cli
bash test_cli.sh
echo "End cli"
}
main

@@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
 paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
 # Speech SSL
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 paddlespeech ssl --task asr --lang en --input ./en.wav
 paddlespeech ssl --task vector --lang en --input ./en.wav
 # Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 paddlespeech asr --input ./zh.wav
 paddlespeech asr --model conformer_aishell --input ./zh.wav
 paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@@ -110,5 +111,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
 # whisper recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
+# switch to the English-only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
 echo -e "\033[32mTest success !!!\033[0m"

@ -0,0 +1,29 @@
# test CLI 测试文档
该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。
# 测试流程
## 1. 环境安装
CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlepseech==develop 下运行。
CI 重建后在 paddlepaddle-gpu==develop, paddlepseech==develop 下运行。
### 其他相关依赖
gcc >= 4.8.5,
python >= 3.8
## 2. 功能测试
在 repo 的 tests/unit/cli 中运行:
```shell
source path.sh
bash test_cli.sh
```
## 3. 预期结果
输出 "Test success",且运行过程中无报错或 Error 即为成功。

@@ -1,5 +1,7 @@
 #!/bin/bash
 # bash test_server_client.sh
+## require lsof to get server pid
+## apt-get install -y lsof
 StartService(){
     # Start service

@@ -48,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--text",
         type=str,
-        default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
+        default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
     parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddlespeech.t2s.datasets.data_tabel import DataTable
+from paddlespeech.t2s.datasets.data_table import DataTable
 def test_audio_dataset():

@@ -0,0 +1,4 @@
FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82
RUN apt-get update -y
RUN apt-get -y install libsndfile1
RUN pip3.8 install pytest-runner

@@ -0,0 +1,54 @@
set +x
# use pre-commit 2.17
if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
pip install pre-commit==2.17.0 1>/dev/null
fi
# Install clang-format before git commit to avoid repeat installation due to
# pre-commit multi-thread running.
readonly VERSION="13.0.0"
version=$(clang-format -version)
if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then
echo "clang-format installation by pip need python version great equal 3.6,
please change the default python to higher version."
exit 1
fi
diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH})
num_diff_files=$(echo "$diff_files" | wc -l)
echo -e "diff files between pr and ${BRANCH}:\n${diff_files}"
echo "Checking code style by pre-commit ..."
pre-commit run --files ${diff_files};check_error=$?
if test ! -z "$(git diff)"; then
echo -e '\n************************************************************************************'
echo -e "These files have been formatted by code format hook. You should use pre-commit to \
format them before git push."
echo -e '************************************************************************************\n'
git diff 2>&1
fi
echo -e '\n************************************************************************************'
if [ ${check_error} != 0 ];then
echo "Your PR code style check failed."
echo "Please install pre-commit locally and set up git hook scripts:"
echo ""
echo " pip install pre-commit==2.17.0"
echo " pre-commit install"
echo ""
if [[ $num_diff_files -le 100 ]];then
echo "Then, run pre-commit to check codestyle issues in your PR:"
echo ""
echo " pre-commit run --files" $(echo ${diff_files} | tr "\n" " ")
echo ""
fi
echo "For more information, please refer to our codestyle check guide:"
echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html"
else
echo "Your PR code style check passed."
fi
echo -e '************************************************************************************\n'
exit ${check_error}

@@ -6,7 +6,7 @@ import kaldiio
 import numpy
 from distutils.util import strtobool
-from paddlespeech.s2t.transform.cmvn import CMVN
+from paddlespeech.audio.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@@ -5,7 +5,7 @@ import logging
 import kaldiio
 import numpy as np
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@@ -4,7 +4,7 @@ import logging
 from distutils.util import strtobool
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@@ -3,7 +3,7 @@ import argparse
 import logging
 import sys
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
