Merge branch 'hongliang1124' of https://github.com/david-95/PaddleSpeech into hongliang1124
commit
737ad50692
@ -0,0 +1,102 @@
([简体中文](./README_cn.md)|English)

# Speech SSL (Self-Supervised Learning)

## Introduction
Speech SSL, or Self-Supervised Learning, refers to training on large-scale unlabeled speech datasets. A model trained this way produces good acoustic representations and can be applied to other downstream speech tasks by fine-tuning on labeled datasets.

This demo shows how to recognize text or extract an acoustic representation from an audio file with speech SSL models, using a single command or a few lines of Python with `PaddleSpeech`.

## Usage
### 1. Installation
See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).

You can choose one of the easy, medium, or hard ways to install paddlespeech.

### 2. Prepare Input File
The input of this demo should be a WAV file (`.wav`), and its sample rate must match the model's.

Here is a sample file for this demo that can be downloaded:
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
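
If you use your own recording instead, it may have a different sample rate; you can pass `--yes` to let the CLI resample it, or resample it yourself beforehand. A minimal sketch, assuming `soundfile` and `librosa` are installed (file names are placeholders):
```python
import librosa
import soundfile as sf

audio, sr = sf.read("my_audio.wav")  # hypothetical input path
if sr != 16000:
    # resample to the 16 kHz the pretrained models expect
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    sf.write("my_audio_16k.wav", audio, samplerate=16000)
```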

### 3. Usage
- Command Line (Recommended)
  ```bash
  # to recognize text
  paddlespeech ssl --task asr --lang en --input ./en.wav

  # to get acoustic representation
  paddlespeech ssl --task vector --lang en --input ./en.wav
  ```

  Usage:
  ```bash
  paddlespeech ssl --help
  ```
  Arguments:
  - `input` (required): Audio file to recognize.
  - `model`: Model type of asr task. Default: `wav2vec2ASR_librispeech`.
  - `task`: Output type. Default: `asr`.
  - `lang`: Model language. Default: `en`.
  - `sample_rate`: Sample rate of the model. Default: `16000`.
  - `config`: Config of asr task. Use a pretrained model when it is None. Default: `None`.
  - `ckpt_path`: Model checkpoint. Use a pretrained model when it is None. Default: `None`.
  - `yes`: No additional argument required. Once set, the program's confirmation requests, such as resampling the input audio, are accepted by default. Default: `False`.
  - `device`: Device to run model inference on. Default: the default device of paddlepaddle in the current environment.
  - `verbose`: Show the log information.

- Python API
  ```python
  import paddle
  from paddlespeech.cli.ssl import SSLExecutor

  ssl_executor = SSLExecutor()

  # to recognize text
  text = ssl_executor(
      model='wav2vec2ASR_librispeech',
      task='asr',
      lang='en',
      sample_rate=16000,
      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
      ckpt_path=None,
      audio_file='./en.wav',
      device=paddle.get_device())
  print('ASR Result: \n{}'.format(text))

  # to get acoustic representation
  feature = ssl_executor(
      model='wav2vec2',
      task='vector',
      lang='en',
      sample_rate=16000,
      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
      ckpt_path=None,
      audio_file='./en.wav',
      device=paddle.get_device())
  print('Representation: \n{}'.format(feature))
  ```

  Output:
  ```bash
  ASR Result:
  i knocked at the door on the ancient side of the building

  Representation:
  Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
         [[[ 0.02351918, -0.12980647,  0.17868176, ...,  0.10118122,
            -0.04614586,  0.17853957],
           [ 0.02361383, -0.12978461,  0.17870593, ...,  0.10103855,
            -0.04638699,  0.17855372],
           [ 0.02345137, -0.12982975,  0.17883906, ...,  0.10104341,
            -0.04643029,  0.17856732],
           ...,
           [ 0.02313030, -0.12918393,  0.17845058, ...,  0.10073373,
            -0.04701405,  0.17862988],
           [ 0.02176583, -0.12929161,  0.17797582, ...,  0.10097728,
            -0.04687393,  0.17864393],
           [ 0.05269200,  0.01297141, -0.23336855, ..., -0.11257174,
            -0.17227529,  0.20338398]]])
  ```
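
  The representation is a paddle `Tensor` of shape `[batch, frames, hidden_size]`. If downstream tooling expects NumPy, it can be converted directly; a minimal sketch, assuming `feature` from the Python API example above:
  ```python
  rep = feature.numpy()          # paddle.Tensor -> numpy.ndarray, shape (1, frames, 1024)
  frame_mean = rep.mean(axis=1)  # e.g. one utterance-level vector, shape (1, 1024)
  ```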
@ -0,0 +1,10 @@
#!/bin/bash

# audio download
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

# to recognize text
paddlespeech ssl --task asr --lang en --input ./en.wav

# to get acoustic representation
paddlespeech ssl --task vector --lang en --input ./en.wav
@ -1,10 +1,13 @@
#!/bin/bash

# audio download
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

# to recognize text
paddlespeech whisper --task transcribe --input ./zh.wav

# to recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav

+# to switch to the English-only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
@ -0,0 +1,32 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=pwgan_aishell3 \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=hifigan_aishell3 \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0
fi
@ -0,0 +1,43 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi
@ -0,0 +1,40 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi
@ -0,0 +1,47 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi
@ -1,8 +1,8 @@
# LibriSpeech

## Wav2VecASR
-train: Epoch 1, 1*V100-32G, batchsize:10
+train: Epoch 1, 1*V100-32G, batchsize: 6

| Model | Params | Config | Augmentation | Test set | Decode method | WER |
| --- | --- | --- | --- | --- | --- | --- |
-| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
+| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018906 |
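
For reference, the WER column is the standard word error rate, the word-level edit distance between hypothesis and reference normalized by reference length:

$$\mathrm{WER} = \frac{S + D + I}{N}$$

where $S$, $D$, and $I$ count substituted, deleted, and inserted words and $N$ is the number of words in the reference, so 0.018906 corresponds to roughly 1.9 errors per 100 words.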
@ -0,0 +1,30 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=pwgan_ljspeech \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=hifigan_ljspeech \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
fi
@ -0,0 +1,34 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=pwgan_vctk \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --lang=en
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=hifigan_vctk \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --lang=en
fi
@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .infer import SSLExecutor
@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .wav2vec2_ASR import Wav2vec2ASR
from .wav2vec2_ASR import Wav2vec2Base

__all__ = ["Wav2vec2ASR", "Wav2vec2Base"]
@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,97 @@
# Authors
#  * Mirco Ravanelli 2020
#  * Guillermo Cámbara 2021
#  * Sarthak Yadav 2022
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/normalization.py)
import paddle.nn as nn

from paddlespeech.s2t.modules.align import BatchNorm1D


class BatchNorm1d(nn.Layer):
    """Applies 1d batch normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to the standard deviation estimate to improve
        numerical stability.
    momentum : float
        Value used for the running_mean and running_var computation.
    combine_batch_time : bool
        When True, the batch and time axes are combined before normalization.
    skip_transpose : bool
        When True, the input is assumed to be channel-first already and is
        not transposed before normalization.

    Example
    -------
    >>> input = paddle.randn([100, 10])
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    [100, 10]
    """

    def __init__(
            self,
            input_shape=None,
            input_size=None,
            eps=1e-05,
            momentum=0.9,
            combine_batch_time=False,
            skip_transpose=False, ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        if input_size is None and skip_transpose:
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        self.norm = BatchNorm1D(input_size, momentum=momentum, epsilon=eps)

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : paddle.Tensor (batch, time, [channels])
            Input to normalize. 2d or 3d tensors are expected as input;
            4d tensors can be used when ``combine_batch_time=True``.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            if x.ndim == 3:
                x = x.reshape([shape_or[0] * shape_or[1], shape_or[2]])
            else:
                x = x.reshape(
                    [shape_or[0] * shape_or[1], shape_or[3], shape_or[2]])

        elif not self.skip_transpose:
            x = x.transpose([0, 2, 1])

        x_n = self.norm(x)
        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose([0, 2, 1])

        return x_n
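
A minimal usage sketch for the layer above (hypothetical values, to illustrate the expected shapes):
```python
import paddle

norm = BatchNorm1d(input_size=80)  # features with 80 channels
x = paddle.randn([4, 100, 80])     # (batch, time, channels)
y = norm(x)                        # normalized over channels, same shape
assert y.shape == x.shape
```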
@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,168 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=[
            'speedyspeech_csmsc',
            'fastspeech2_csmsc',
            'fastspeech2_aishell3',
            'fastspeech2_ljspeech',
            'fastspeech2_vctk',
            'fastspeech2_mix',
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # voc
    parser.add_argument(
        '--voc',
        type=str,
        default='pwgan_csmsc',
        choices=[
            'pwgan_csmsc',
            'pwgan_aishell3',
            'pwgan_ljspeech',
            'pwgan_vctk',
            'mb_melgan_csmsc',
            'hifigan_csmsc',
            'hifigan_aishell3',
            'hifigan_ljspeech',
            'hifigan_vctk',
        ],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en or mix')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    # frontend
    frontend = get_frontend(
        lang=args.lang,
        phones_dict=args.phones_dict,
        tones_dict=args.tones_dict)

    # am_predictor
    am_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    # voc_predictor
    voc_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True
    fs = 24000 if am_dataset != 'ljspeech' else 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            mel = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id, )
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
        # samples generated per second of wall-clock time
        speed = wav.size / t.elapse
        # real-time factor: synthesis time divided by audio duration
        rtf = fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    for utt_id, sentence in sentences:
        with timer() as t:
            mel = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id, )
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)

        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")


if __name__ == "__main__":
    main()
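
On the printed metrics above: `speed` is generated samples per wall-clock second, and RTF (real-time factor) is synthesis time divided by audio duration, so values below 1.0 mean faster than real time. A quick sanity check of the arithmetic, with hypothetical numbers:
```python
fs = 24000        # output sample rate (Hz)
elapse = 0.5      # wall-clock synthesis time (s)
wav_size = 48000  # generated samples -> 2.0 s of audio

speed = wav_size / elapse  # 96000 samples/s
rtf = fs / speed           # 0.25, i.e. elapse / (wav_size / fs)
assert abs(rtf - elapse / (wav_size / fs)) < 1e-9
```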
@ -0,0 +1,230 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import numpy as np
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_lite_am_sublayer_output
from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_lite_streaming_am_output
from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.utils import str2bool


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=['fastspeech2_csmsc'],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--am_stat",
        type=str,
        default=None,
        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
    )
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # voc
    parser.add_argument(
        '--voc',
        type=str,
        default='pwgan_csmsc',
        choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")

    # streaming related
    parser.add_argument(
        "--am_streaming",
        type=str2bool,
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    # frontend
    frontend = get_frontend(
        lang=args.lang,
        phones_dict=args.phones_dict,
        tones_dict=args.tones_dict)

    # am_predictor
    am_encoder_infer_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_encoder_infer" + "_x86.nb")
    am_decoder_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_decoder" + "_x86.nb")
    am_postnet_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_postnet" + "_x86.nb")
    am_mu, am_std = np.load(args.am_stat)
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    # voc_predictor
    voc_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True

    fs = 24000 if am_dataset != 'ljspeech' else 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            normalized_mel = get_lite_streaming_am_output(
                input=sentence,
                am_encoder_infer_predictor=am_encoder_infer_predictor,
                am_decoder_predictor=am_decoder_predictor,
                am_postnet_predictor=am_postnet_predictor,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences, )
            mel = denorm(normalized_mel, am_mu, am_std)
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    block_size = args.block_size
    pad_size = args.pad_size
    get_tone_ids = False
    for utt_id, sentence in sentences:
        with timer() as t:
            # frontend
            frontend_dict = run_frontend(
                frontend=frontend,
                text=sentence,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids,
                lang=args.lang)
            phone_ids = frontend_dict['phone_ids']
            phones = phone_ids[0].numpy()
            # acoustic model
            orig_hs = get_lite_am_sublayer_output(
                am_encoder_infer_predictor, input=phones)

            if args.am_streaming:
                hss = get_chunks(orig_hs, block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
                    am_decoder_output = get_lite_am_sublayer_output(
                        am_decoder_predictor, input=hs)
                    am_postnet_output = get_lite_am_sublayer_output(
                        am_postnet_predictor,
                        input=np.transpose(am_decoder_output, (0, 2, 1)))
                    am_output_data = am_decoder_output + np.transpose(
                        am_postnet_output, (0, 2, 1))
                    normalized_mel = am_output_data[0]

                    sub_mel = denorm(normalized_mel, am_mu, am_std)
                    # clip the padded frames from the output
                    if i == 0:
                        sub_mel = sub_mel[:-pad_size]
                    elif i == chunk_num - 1:
                        # the right side of the last chunk is certainly not fully padded
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # the right side of the last few chunks may not be fully padded either
                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)

            else:
                am_decoder_output = get_lite_am_sublayer_output(
                    am_decoder_predictor, input=orig_hs)
                am_postnet_output = get_lite_am_sublayer_output(
                    am_postnet_predictor,
                    input=np.transpose(am_decoder_output, (0, 2, 1)))
                am_output_data = am_decoder_output + np.transpose(
                    am_postnet_output, (0, 2, 1))
                normalized_mel = am_output_data[0]
                mel = denorm(normalized_mel, am_mu, am_std)
            # vocoder
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)

        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")


if __name__ == "__main__":
    main()
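
The streaming branch above decodes the encoder output chunk by chunk: each chunk carries `pad_size` frames of neighboring context, which is trimmed again after decoding so the concatenated mel lines up with the non-streaming result. A minimal standalone sketch of that idea (a hypothetical helper, not the library's `get_chunks`):
```python
import numpy as np

def chunk_with_context(x: np.ndarray, block_size: int, pad_size: int):
    """Split frames x[T, D] into blocks of block_size with pad_size context."""
    chunks = []
    for start in range(0, len(x), block_size):
        lo = max(0, start - pad_size)
        hi = min(len(x), start + block_size + pad_size)
        chunks.append(x[lo:hi])
    return chunks

x = np.arange(100 * 4, dtype=np.float32).reshape(100, 4)
parts = chunk_with_context(x, block_size=42, pad_size=12)
# trimming the context from each chunk and concatenating
# recovers the original frame count
trimmed = [
    p[(0 if i == 0 else 12):(len(p) if i == len(parts) - 1 else len(p) - 12)]
    for i, p in enumerate(parts)
]
assert np.concatenate(trimmed).shape == x.shape
```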