Merge branch 'PaddlePaddle:develop' into develop

pull/3715/head
commit 1973d7c941

@ -14,7 +14,7 @@ Linux test build whl environment:
* gcc/g++ - 8.2.0
* cmake - 3.18.0 (needs to be installed)
MACtest build whl envrioment
MACtest build whl environment
* os
* gcc/g++ 12.2.0
* cpu Intel Xeon E5 x86_64

@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
self.waveform = self.waveform.astype(
np.float32
) # paddlespeech.s2t.transform.spectrogram only supports float32
) # paddlespeech.audio.transform.spectrogram only supports float32
dim = len(self.waveform.shape)
assert dim in [1, 2]

@ -18,8 +18,8 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import IStft
from paddlespeech.s2t.transform.spectrogram import Stft
from paddlespeech.audio.transform.spectrogram import IStft
from paddlespeech.audio.transform.spectrogram import Stft
class TestIstft(FeatTest):

@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram
class TestLogMelSpectrogram(FeatTest):

@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import Spectrogram
from paddlespeech.audio.transform.spectrogram import Spectrogram
class TestSpectrogram(FeatTest):

@ -18,7 +18,7 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import Stft
from paddlespeech.audio.transform.spectrogram import Stft
class TestStft(FeatTest):

@ -19,7 +19,7 @@ You can choose one way from medium and hard to install paddlespeech.
The dependencies are listed in requirements.txt; install them as follows:
```
pip install -r requriement.txt
pip install -r requirements.txt
```
### 2. Prepare Input File
@ -30,11 +30,20 @@ Here are sample files for this demo that can be downloaded:
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```
### 3. Usage
### 3. Run paddlespeech_server
Before using the client, it is necessary to start the paddlespeech servers.
Here is a sample server configuration:
```bash
bash demos/audio_content_search/run.sh
```
With this configuration, the logs of the two services are recorded in `acs.log` and `streaming_asr.log`.
### 4. Usage
- Command Line(Recommended)
```bash
# Chinese
paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
```
Usage:

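Besides the command line, the same query can be issued from Python. The sketch below assumes the `ACSClientExecutor` client exposed by `paddlespeech.server.bin.paddlespeech_client` and the port configured by `run.sh` above; adjust both to your installation.
```python
from paddlespeech.server.bin.paddlespeech_client import ACSClientExecutor

# assumes the servers started by run.sh are listening on 127.0.0.1:8490
acs_executor = ACSClientExecutor()
res = acs_executor(
    input="./zh.wav",
    server_ip="127.0.0.1",
    port=8490)
print(res)
```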
@ -19,7 +19,7 @@
Dependencies are listed in requirements.txt; install them as follows:
```
pip install -r requriement.txt
pip install -r requirements.txt
```
### 2. Prepare Input File
@ -29,16 +29,26 @@ pip install -r requriement.txt
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```
### 3. Usage
### 3. Start the Server
Before using the client, you need to start paddlespeech_server first.
You can use the default server configuration:
```bash
bash demos/audio_content_search/run.sh
```
With this configuration, the logs of the two services are recorded in `acs.log` and `streaming_asr.log`.
### 4. Usage
- Command Line (Recommended)
```bash
# Chinese
paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav
```
Usage:
```bash
paddlespeech acs --help
paddlespeech asr --help
```
Arguments:
- `input` (required): Audio file used for recognition.

@ -26,8 +26,10 @@ asr_online:
sample_rate: 16000
cfg_path:
decode_method: 'attention_rescoring'
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
force_yes: True
device: 'cpu' # cpu or gpu:id
continuous_decoding: False # disable continuous decoding when an endpoint is detected
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
@ -40,4 +42,4 @@ asr_online:
window_ms: 25 # ms
shift_ms: 10 # ms
sample_rate: 16000
sample_width: 2
sample_width: 2

@ -31,6 +31,7 @@ asr_online:
force_yes: True
device: 'cpu' # cpu or gpu:id
decode_method: "attention_rescoring"
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
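As a quick sanity check that the extended configs still parse and expose the new key, the sketch below loads one of them with yacs (already used elsewhere in this change set); the yaml path is illustrative.
```python
from yacs.config import CfgNode

# illustrative path; point this at the streaming ASR yaml edited above
config = CfgNode(new_allowed=True)
config.merge_from_file("demos/streaming_asr_server/conf/ws_conformer_application.yaml")

asr_cfg = config.asr_online
print(asr_cfg.decode_method)             # attention_rescoring
print(asr_cfg.num_decoding_left_chunks)  # -1, the newly added default
```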

@ -1,5 +1,5 @@
diskcache==5.2.1
dtaidistance==2.3.1
diskcache
dtaidistance
fastapi
librosa==0.8.0
numpy==1.22.0
@ -10,4 +10,4 @@ python-multipart
soundfile==0.10.3.post1
starlette
typing
uvicorn
uvicorn

@ -429,7 +429,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
```
Output:
```text
@ -507,7 +507,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
```
Output:
```text

@ -428,7 +428,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
```
Output:
```text
@ -506,7 +506,7 @@ bash server.sh
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```bash
python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
```
Output:
```text

@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--text=./sentences.txt \
--output-dir=output \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

@ -236,8 +236,8 @@
"warnings.filterwarnings('ignore')\n",
"\n",
"from yacs.config import CfgNode\n",
"from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
"from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
"from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
"from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
"from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
"from paddlespeech.s2t.models.u2 import U2Model\n",
"\n",

@ -0,0 +1,33 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device mlu
fi

@ -0,0 +1,46 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device npu
fi

@ -0,0 +1,99 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static conversion is not ready yet
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nmlu=1
fi

@ -0,0 +1,124 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--inference_dir=${train_output_path}/inference \
--ngpu=0 \
--nnpu=1
fi

@ -0,0 +1,90 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# style melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi
# wavernn
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nmlu=1
fi

@ -0,0 +1,110 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# style melgan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn"
FLAGS_allocator_strategy=naive_best_fit \
python3 ${BIN_DIR}/../synthesize.py \
--am=speedyspeech_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--tones_dict=dump/tone_id_map.txt \
--phones_dict=dump/phone_id_map.txt \
--ngpu=0 \
--nnpu=1
fi

@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
# export MLU_VISIBLE_DEVICES=8
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nmlu=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@ -0,0 +1,16 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=0 \
--nnpu=1 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True

@ -0,0 +1,76 @@
#!/bin/bash
set -e
source path.sh
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
mlus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
fi
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi
# must run after stage 3 (which generates the static models)
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
# ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi

@ -0,0 +1,42 @@
#!/bin/bash
set -e
source path.sh
npus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run_xpu.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
fi

@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):
# fbank
audio = preprocessing(audio, **preprocess_args)
audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0)
audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
self._inputs["audio"] = audio
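A minimal sketch of why this one-line change matters: wrapping the Python int in a list before `unsqueeze` produced a `[1, 1]` tensor, while converting the scalar directly yields the expected `[1]` shape (assuming a Paddle version where `to_tensor` on a Python int creates a 0-D tensor).
```python
import paddle

num_frames = 123  # illustrative frame count

old_style = paddle.to_tensor([num_frames]).unsqueeze(axis=0)  # shape [1, 1]
new_style = paddle.to_tensor(num_frames).unsqueeze(axis=0)    # shape [1]
print(old_style.shape, new_style.shape)
```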

@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
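For context, a small self-contained sketch of the `topk` + `reshape` pattern on dummy data; `reshape` is the documented Paddle tensor API, whereas the torch-style `view` may not be available in every Paddle release.
```python
import paddle

batch_size, max_len, vocab_size = 2, 5, 10
ctc_probs = paddle.rand([batch_size, max_len, vocab_size])

topk_prob, topk_index = ctc_probs.topk(1, axis=2)        # (B, maxlen, 1)
topk_index = topk_index.reshape([batch_size, max_len])   # (B, maxlen)
print(topk_index.shape)  # [2, 5]
```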

@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu", "xpu"],
choices=["gpu", "cpu", "xpu", "npu", "mlu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)

@ -45,15 +45,20 @@ def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
world_size = paddle.distributed.get_world_size()
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("xpu")
else:
if paddle.is_compiled_with_cuda() and args.ngpu > 0:
paddle.set_device("gpu")
if world_size > 1:
paddle.distributed.init_parallel_env()
elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
paddle.device.set_device("xpu")
elif args.nnpu > 0:
paddle.device.set_device("npu")
if world_size > 1:
paddle.distributed.init_parallel_env()
elif args.nmlu > 0:
paddle.device.set_device("mlu")
else:
paddle.set_device("cpu")
# set the random seed, it is a must for multiprocess training
seed_everything(config.seed)
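Condensed for readability, the priority the new branches implement is gpu, then xpu, then npu, then mlu, then cpu. The helper below is an illustrative sketch of that order, not part of the patch.
```python
import paddle

def select_device(ngpu: int, nxpu: int, nnpu: int, nmlu: int) -> str:
    # mirrors the branch order in the patched train_sp
    if paddle.is_compiled_with_cuda() and ngpu > 0:
        return "gpu"
    if paddle.is_compiled_with_xpu() and nxpu > 0:
        return "xpu"
    if nnpu > 0:
        return "npu"
    if nmlu > 0:
        return "mlu"
    return "cpu"

print(select_device(ngpu=0, nxpu=0, nnpu=0, nmlu=1))  # mlu
```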
@ -191,9 +196,25 @@ def main():
"--nxpu",
type=int,
default=0,
help="if nxpu == 0 and ngpu == 0, use cpu.")
help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
"--nnpu",
type=int,
default=0,
help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=1,
help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--ngpu",
type=int,
default=1,
help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu.")
parser.add_argument(
"--use-relative-path",

@ -591,7 +591,8 @@ def get_predictor(
config = inference.Config(
str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
config.enable_memory_optim()
if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":
config.enable_memory_optim()
config.switch_ir_optim(True)
if device == "gpu":
config.enable_use_gpu(100, device_id)

@ -219,12 +219,28 @@ def parse_args():
)
# other
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
"--ngpu",
type=int,
default=1,
help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument(
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=0,
help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument("--test_metadata", type=str, help="test metadata.")
parser.add_argument("--output_dir", type=str, help="output dir.")
@ -245,10 +261,16 @@ def main():
paddle.set_device("gpu")
elif args.nxpu > 0:
paddle.set_device("xpu")
elif args.ngpu == 0 and args.nxpu == 0:
elif args.nnpu > 0:
paddle.set_device("npu")
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu")
else:
print("ngpu or nxpu should >= 0 !")
print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0, or all of them should equal 0"
)
evaluate(args)

@ -299,12 +299,28 @@ def parse_args():
default=None,
help="dir to save inference models")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
"--ngpu",
type=int,
default=1,
help="if you wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument(
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
help="if you wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if you wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
parser.add_argument(
"--nmlu",
type=int,
default=0,
help="if you wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--text",
@ -339,10 +355,16 @@ def main():
paddle.set_device("gpu")
elif args.nxpu > 0:
paddle.set_device("xpu")
elif args.ngpu == 0 and args.nxpu == 0:
elif args.nnpu > 0:
paddle.set_device("npu")
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu")
else:
print("ngpu or nxpu should >= 0 !")
print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0, or all of them should equal 0"
)
evaluate(args)

@ -237,30 +237,25 @@ class ToneSandhi():
# output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
skip_next = False
# function 1
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
0] == seg[i + 1][0] and seg[i - 1][1] == "v":
if i - 1 < len(new_seg):
new_seg[i -
1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
else:
new_seg.append([word, pos])
new_seg.append([seg[i + 1][0], pos])
if skip_next:
skip_next = False
continue
if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
skip_next = True
else:
if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
0] == word and pos == "v":
continue
else:
new_seg.append([word, pos])
new_seg.append((word, pos))
seg = new_seg
new_seg = []
# function 2
for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == "一":
new_seg[-1][0] = new_seg[-1][0] + word
new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else:
new_seg.append([word, pos])
new_seg.append((word, pos))
return new_seg
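To make the new control flow concrete, here is a self-contained sketch of the patched first pass, run on the example from the comment above; the helper name is illustrative.
```python
def merge_yi_function1(seg):
    # merge patterns like [X/v, 一, X/v] into a single "X一X" token
    new_seg = []
    skip_next = False
    for i, (word, pos) in enumerate(seg):
        if skip_next:
            skip_next = False
            continue
        if (i - 1 >= 0 and word == "一" and i + 1 < len(seg)
                and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
            skip_next = True
        elif (i - 2 >= 0 and seg[i - 1][0] == "一"
                and seg[i - 2][0] == word and pos == "v"):
            continue
        else:
            new_seg.append((word, pos))
    return new_seg

print(merge_yi_function1([("听", "v"), ("一", "m"), ("听", "v")]))
# [('听一听', 'v')]
```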
# the first and the second words are all_tone_three

@ -43,16 +43,17 @@ base = [
# paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
"numpy==1.23.5",
"librosa==0.8.1",
"scipy>=1.4.0",
"scipy>=1.4.0, <=1.12.0",
"loguru",
"matplotlib",
"matplotlib<=3.8.4",
"nara_wpe",
"onnxruntime>=1.11.0",
"opencc",
"opencc==1.1.6",
"opencc-python-reimplemented",
"pandas",
"paddleaudio>=1.1.0",
"paddlenlp>=2.4.8",
"paddlepaddle-gpu==2.5.1",
"paddleslim>=2.3.4",
"ppdiffusers>=0.9.0",
"paddlespeech_feat",

@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then
pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install jsonlines
pip install -U scipy==1.12.0 # newer versions break part of the data processing
pip install -U matplotlib==3.7.1 # newer versions fail with: cannot import name 'get_cmap' from 'matplotlib.cm'
pip list
cd -
if [[ ${model_name} == "conformer" ]]; then

@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=False, )
rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1)
@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3,
rnn_size=1024,
use_gru=True,
share_rnn_weights=False, )
rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1)
@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True, )
rnn_direction="bidirect", )
loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1)
@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3,
rnn_size=1024,
use_gru=True,
share_rnn_weights=True, )
rnn_direction="bidirect", )
loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1)
@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=False, )
rnn_direction="forward", )
loss = model(self.audio, self.audio_len, self.text, self.text_len)
self.assertEqual(loss.numel(), 1)

@ -19,11 +19,11 @@ import numpy as np
import paddle
from paddle import inference
from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
class TestDeepSpeech2ModelOnline(unittest.TestCase):
class TestDeepSpeech2Model(unittest.TestCase):
def setUp(self):
paddle.set_device('cpu')
@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.text_len = paddle.to_tensor(text_len, dtype='int64')
def test_ds2_1(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1)
def test_ds2_2(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1)
def test_ds2_3(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1)
def test_ds2_4(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1)
def test_ds2_5(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
self.assertEqual(loss.numel(), 1)
def test_ds2_6(self):
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
def test_ds2_7(self):
use_gru = False
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
def test_ds2_8(self):
use_gru = True
model = DeepSpeech2ModelOnline(
model = DeepSpeech2Model(
feat_size=self.feat_dim,
dict_size=10,
num_conv_layers=2,
@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
if not os.path.exists(os.path.dirname(export_prefix)):
os.makedirs(os.path.dirname(export_prefix), mode=0o755)
infer_model = DeepSpeech2InferModelOnline(
infer_model = DeepSpeech2InferModel(
feat_size=161,
dict_size=4233,
num_conv_layers=2,

@ -0,0 +1,31 @@
function main(){
set -ex
speech_ci_path=`pwd`
echo "Start asr"
cd ${speech_ci_path}/asr
bash deepspeech2_online_model_test.sh
python error_rate_test.py
python mask_test.py
python reverse_pad_list.py
echo "End asr"
echo "Start TTS"
cd ${speech_ci_path}/tts
python test_data_table.py
python test_enfrontend.py
python test_mixfrontend.py
echo "End TTS"
echo "Start Vector"
cd ${speech_ci_path}/vector
python test_augment.py
echo "End Vector"
echo "Start cli"
cd ${speech_ci_path}/cli
bash test_cli.sh
echo "End cli"
}
main

@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
# Speech SSL
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech ssl --task asr --lang en --input ./en.wav
paddlespeech ssl --task vector --lang en --input ./en.wav
# Speech_recognition
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
paddlespeech asr --input ./zh.wav
paddlespeech asr --model conformer_aishell --input ./zh.wav
paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@ -110,5 +111,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
# whisper recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav
# change to the English-only model
paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
echo -e "\033[32mTest success !!!\033[0m"
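The CLI calls above map directly onto the Python executors; as a hedged sketch (arguments follow the documented ASRExecutor API, adjust the wav path to your setup):
```python
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
result = asr(audio_file="./zh.wav")  # downloads the default Chinese model on first use
print(result)
```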

@ -0,0 +1,29 @@
# Test CLI Documentation
This document describes the CLI tests, which currently cover most of the CLI inference in paddlespeech. Once set up, this CI is used to quickly verify that fixes are correct.
# Test Procedure
## 1. Environment Setup
When the CI is rebuilt, it runs against the previously passing versions paddlepaddle-gpu==2.5.1 and paddlespeech==develop.
After the rebuild, it runs against paddlepaddle-gpu==develop and paddlespeech==develop.
### Other Dependencies
gcc >= 4.8.5,
python >= 3.8
## 2. Functional Tests
Run the following in tests/unit/cli of the repo:
```shell
source path.sh
bash test_cli.sh
```
## 3. Expected Results
The run is successful if "Test success" is printed and no errors appear during execution.

@ -1,5 +1,7 @@
#!/bin/bash
# bash test_server_client.sh
## require lsof to get server pid
## apt-get install -y lsof
StartService(){
# Start service

@ -48,7 +48,7 @@ if __name__ == "__main__":
parser.add_argument(
"--text",
type=str,
default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')

@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.datasets.data_tabel import DataTable
from paddlespeech.t2s.datasets.data_table import DataTable
def test_audio_dataset():

@ -0,0 +1,4 @@
FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82
RUN apt-get update -y
RUN apt-get -y install libsndfile1
RUN pip3.8 install pytest-runner

@ -0,0 +1,54 @@
set +x
# use pre-commit 2.17
if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
pip install pre-commit==2.17.0 1>nul
fi
# Install clang-format before git commit to avoid repeated installation caused by
# pre-commit running with multiple threads.
readonly VERSION="13.0.0"
version=$(clang-format -version)
if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then
echo "clang-format installation by pip needs python version greater than or equal to 3.6,
please change the default python to a higher version."
exit 1
fi
diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH})
num_diff_files=$(echo "$diff_files" | wc -l)
echo -e "diff files between pr and ${BRANCH}:\n${diff_files}"
echo "Checking code style by pre-commit ..."
pre-commit run --files ${diff_files};check_error=$?
if test ! -z "$(git diff)"; then
echo -e '\n************************************************************************************'
echo -e "These files have been formatted by code format hook. You should use pre-commit to \
format them before git push."
echo -e '************************************************************************************\n'
git diff 2>&1
fi
echo -e '\n************************************************************************************'
if [ ${check_error} != 0 ];then
echo "Your PR code style check failed."
echo "Please install pre-commit locally and set up git hook scripts:"
echo ""
echo " pip install pre-commit==2.17.0"
echo " pre-commit install"
echo ""
if [[ $num_diff_files -le 100 ]];then
echo "Then, run pre-commit to check codestyle issues in your PR:"
echo ""
echo " pre-commit run --files" $(echo ${diff_files} | tr "\n" " ")
echo ""
fi
echo "For more information, please refer to our codestyle check guide:"
echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html"
else
echo "Your PR code style check passed."
fi
echo -e '************************************************************************************\n'
exit ${check_error}

@ -6,7 +6,7 @@ import kaldiio
import numpy
from distutils.util import strtobool
from paddlespeech.s2t.transform.cmvn import CMVN
from paddlespeech.audio.transform.cmvn import CMVN
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -5,7 +5,7 @@ import logging
import kaldiio
import numpy as np
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -4,7 +4,7 @@ import logging
from distutils.util import strtobool
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

@ -3,7 +3,7 @@ import argparse
import logging
import sys
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
