Merge branch 'hongliang1124' of https://github.com/david-95/PaddleSpeech into hongliang1124 (commit 737ad50692)

@@ -0,0 +1,102 @@
([简体中文](./README_cn.md)|English)

# Speech SSL (Self-Supervised Learning)

## Introduction

Speech SSL (Self-Supervised Learning) refers to training models on large-scale unlabeled speech datasets. A model trained in this way produces good acoustic representations, and can be applied to other downstream speech tasks by fine-tuning on labeled datasets.

This demo shows how to recognize text or extract an acoustic representation from a specific audio file with speech SSL models, using either a single command or a few lines of Python with `PaddleSpeech`.

## Usage

### 1. Installation

See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).

You can choose one of three ways (easy, medium, or hard) to install paddlespeech.
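For reference, the easy way is typically a plain pip install; the exact, up-to-date commands live in the linked document, so treat this as a sketch for a CPU environment:

```bash
pip install paddlepaddle paddlespeech
```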

### 2. Prepare Input File

The input of this demo should be a WAV file (`.wav`), and its sample rate must match that of the model.

Here is a sample file for this demo that can be downloaded:

```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```

### 3. Usage

- Command Line (Recommended)

```bash
# to recognize text
paddlespeech ssl --task asr --lang en --input ./en.wav

# to get acoustic representation
paddlespeech ssl --task vector --lang en --input ./en.wav
```

Usage:

```bash
paddlespeech ssl --help
```

Arguments:

- `input` (required): Audio file to recognize.
- `model`: Model type of asr task. Default: `wav2vec2ASR_librispeech`.
- `task`: Output type. Default: `asr`.
- `lang`: Model language. Default: `en`.
- `sample_rate`: Sample rate of the model. Default: `16000`.
- `config`: Config of asr task. Use a pretrained model when it is `None`. Default: `None`.
- `ckpt_path`: Model checkpoint. Use a pretrained model when it is `None`. Default: `None`.
- `yes`: Takes no value. When set, every prompt from the program is accepted automatically, including the request to transform the audio sample rate. Default: `False`.
- `device`: Device on which to run inference. Default: the default PaddlePaddle device in the current environment.
- `verbose`: Show log information.
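Putting the arguments together, an explicit version of the recommended command looks like this (the `--device` value is an assumption for a single-GPU machine):

```bash
paddlespeech ssl --task asr --lang en --model wav2vec2ASR_librispeech \
    --sample_rate 16000 --input ./en.wav --yes --device gpu:0
```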

- Python API

```python
import paddle
from paddlespeech.cli.ssl import SSLExecutor

ssl_executor = SSLExecutor()

# to recognize text
text = ssl_executor(
    model='wav2vec2ASR_librispeech',
    task='asr',
    lang='en',
    sample_rate=16000,
    config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
    ckpt_path=None,
    audio_file='./en.wav',
    device=paddle.get_device())
print('ASR Result: \n{}'.format(text))

# to get acoustic representation
feature = ssl_executor(
    model='wav2vec2',
    task='vector',
    lang='en',
    sample_rate=16000,
    config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
    ckpt_path=None,
    audio_file='./en.wav',
    device=paddle.get_device())
print('Representation: \n{}'.format(feature))
```

Output:

```bash
ASR Result:
i knocked at the door on the ancient side of the building

Representation:
Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
       [[[ 0.02351918, -0.12980647,  0.17868176, ...,  0.10118122,
          -0.04614586,  0.17853957],
         [ 0.02361383, -0.12978461,  0.17870593, ...,  0.10103855,
          -0.04638699,  0.17855372],
         [ 0.02345137, -0.12982975,  0.17883906, ...,  0.10104341,
          -0.04643029,  0.17856732],
         ...,
         [ 0.02313030, -0.12918393,  0.17845058, ...,  0.10073373,
          -0.04701405,  0.17862988],
         [ 0.02176583, -0.12929161,  0.17797582, ...,  0.10097728,
          -0.04687393,  0.17864393],
         [ 0.05269200,  0.01297141, -0.23336855, ..., -0.11257174,
          -0.17227529,  0.20338398]]])
```
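The representation is a regular Paddle tensor, so it can be persisted for downstream use; a minimal sketch (the output file name is illustrative):

```python
import numpy as np

# `feature` is the Tensor returned by ssl_executor(task='vector', ...) above.
np.save('en_feature.npy', feature.numpy())  # saves the [1, T, 1024] array
```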

@@ -0,0 +1,10 @@
#!/bin/bash

# audio download
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

# to recognize text
paddlespeech ssl --task asr --lang en --input ./en.wav

# to get acoustic representation
paddlespeech ssl --task vector --lang en --input ./en.wav

@@ -1,10 +1,13 @@
#!/bin/bash

# audio download
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

# to recognize text
paddlespeech whisper --task transcribe --input ./zh.wav

# to recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav

+# to switch to the English-only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav

@@ -0,0 +1,32 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=pwgan_aishell3 \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=hifigan_aishell3 \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0
fi

@@ -0,0 +1,43 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
fi

@@ -0,0 +1,40 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

@@ -0,0 +1,47 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../lite_predict_streaming.py \
        --inference_dir=${train_output_path}/pdlite_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

@@ -1,8 +1,8 @@
# LibriSpeech

## Wav2VecASR
-train: Epoch 1, 1*V100-32G, batchsize:10
+train: Epoch 1, 1*V100-32G, batchsize: 6

| Model | Params | Config | Augmentation | Test set | Decode method | WER |
| --- | --- | --- | --- | --- | --- | --- |
-| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
+| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018906 |

@@ -0,0 +1,30 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=pwgan_ljspeech \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=hifigan_ljspeech \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
fi

@@ -0,0 +1,34 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=pwgan_vctk \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --lang=en
fi

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=hifigan_vctk \
        --text=${BIN_DIR}/../sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --lang=en
fi

@@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .infer import SSLExecutor

@@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .wav2vec2_ASR import Wav2vec2ASR
from .wav2vec2_ASR import Wav2vec2Base

__all__ = ["Wav2vec2ASR", "Wav2vec2Base"]

@@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,97 @@
# Authors
# * Mirco Ravanelli 2020
# * Guillermo Cámbara 2021
# * Sarthak Yadav 2022
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/normalization.py)
import paddle.nn as nn

from paddlespeech.s2t.modules.align import BatchNorm1D


class BatchNorm1d(nn.Layer):
    """Applies 1d batch normalization to the input tensor.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to the std deviation estimation to improve the
        numerical stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    affine : bool
        When set to True, the affine parameters are learned.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    combine_batch_time : bool
        When true, it combines the batch and time axes.

    Example
    -------
    >>> input = paddle.randn([100, 10])
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    [100, 10]
    """

    def __init__(
            self,
            input_shape=None,
            input_size=None,
            eps=1e-05,
            momentum=0.9,
            combine_batch_time=False,
            skip_transpose=False, ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        if input_size is None and skip_transpose:
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        self.norm = BatchNorm1D(input_size, momentum=momentum, epsilon=eps)

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : paddle.Tensor (batch, time, [channels])
            Input to normalize. 2d or 3d tensors are expected in input;
            4d tensors can be used when combine_dims=True.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            if x.ndim == 3:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
            else:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[3],
                              shape_or[2])

        elif not self.skip_transpose:
            x = x.transpose([0, 2, 1])

        x_n = self.norm(x)
        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose([0, 2, 1])

        return x_n
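
# --- Illustrative usage (editor's note, not part of the original file) ---
# A minimal sketch following the docstring example above; assumes paddle is
# available and this module is importable:
#
#     import paddle
#     norm = BatchNorm1d(input_size=80)      # e.g. 80 mel channels
#     x = paddle.randn([4, 100, 80])         # (batch, time, channels)
#     y = norm(x)                            # same shape, normalized over channels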

@@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,168 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=[
            'speedyspeech_csmsc',
            'fastspeech2_csmsc',
            'fastspeech2_aishell3',
            'fastspeech2_ljspeech',
            'fastspeech2_vctk',
            'fastspeech2_mix',
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # voc
    parser.add_argument(
        '--voc',
        type=str,
        default='pwgan_csmsc',
        choices=[
            'pwgan_csmsc',
            'pwgan_aishell3',
            'pwgan_ljspeech',
            'pwgan_vctk',
            'mb_melgan_csmsc',
            'hifigan_csmsc',
            'hifigan_aishell3',
            'hifigan_ljspeech',
            'hifigan_vctk',
        ],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en or mix')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    # frontend
    frontend = get_frontend(
        lang=args.lang,
        phones_dict=args.phones_dict,
        tones_dict=args.tones_dict)

    # am_predictor
    am_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    # voc_predictor
    voc_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True
    fs = 24000 if am_dataset != 'ljspeech' else 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            mel = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id, )
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    for utt_id, sentence in sentences:
        with timer() as t:
            mel = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id, )
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)

        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")


if __name__ == "__main__":
    main()
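
# Example invocation (illustrative; the flags mirror the shell scripts above,
# and the exp/default paths are assumptions for a typical training layout):
#     python3 lite_predict.py \
#         --inference_dir=exp/default/pdlite \
#         --am=fastspeech2_csmsc \
#         --voc=pwgan_csmsc \
#         --text=sentences.txt \
#         --output_dir=exp/default/lite_infer_out \
#         --phones_dict=dump/phone_id_map.txt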

@@ -0,0 +1,230 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import numpy as np
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_lite_am_sublayer_output
from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_lite_streaming_am_output
from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.utils import str2bool


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='fastspeech2_csmsc',
        choices=['fastspeech2_csmsc'],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--am_stat",
        type=str,
        default=None,
        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
    )
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # voc
    parser.add_argument(
        '--voc',
        type=str,
        default='pwgan_csmsc',
        choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")
    # inference

    # streaming related
    parser.add_argument(
        "--am_streaming",
        type=str2bool,
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    # frontend
    frontend = get_frontend(
        lang=args.lang,
        phones_dict=args.phones_dict,
        tones_dict=args.tones_dict)

    # am_predictor
    am_encoder_infer_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_encoder_infer" + "_x86.nb")
    am_decoder_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_decoder" + "_x86.nb")
    am_postnet_predictor = get_lite_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + "_am_postnet" + "_x86.nb")
    am_mu, am_std = np.load(args.am_stat)
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    # voc_predictor
    voc_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True

    fs = 24000 if am_dataset != 'ljspeech' else 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            normalized_mel = get_lite_streaming_am_output(
                input=sentence,
                am_encoder_infer_predictor=am_encoder_infer_predictor,
                am_decoder_predictor=am_decoder_predictor,
                am_postnet_predictor=am_postnet_predictor,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences, )
            mel = denorm(normalized_mel, am_mu, am_std)
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    block_size = args.block_size
    pad_size = args.pad_size
    get_tone_ids = False
    for utt_id, sentence in sentences:
        with timer() as t:
            # frontend
            frontend_dict = run_frontend(
                frontend=frontend,
                text=sentence,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids,
                lang=args.lang)
            phone_ids = frontend_dict['phone_ids']
            phones = phone_ids[0].numpy()
            # acoustic model
            orig_hs = get_lite_am_sublayer_output(
                am_encoder_infer_predictor, input=phones)

            if args.am_streaming:
                hss = get_chunks(orig_hs, block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
                    am_decoder_output = get_lite_am_sublayer_output(
                        am_decoder_predictor, input=hs)
                    am_postnet_output = get_lite_am_sublayer_output(
                        am_postnet_predictor,
                        input=np.transpose(am_decoder_output, (0, 2, 1)))
                    am_output_data = am_decoder_output + np.transpose(
                        am_postnet_output, (0, 2, 1))
                    normalized_mel = am_output_data[0]

                    sub_mel = denorm(normalized_mel, am_mu, am_std)
                    # clip the padded part of the output
                    if i == 0:
                        sub_mel = sub_mel[:-pad_size]
                    elif i == chunk_num - 1:
                        # the right side of the last chunk is never fully padded
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # the right side of the trailing chunks may not be fully padded either
                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)

            else:
                am_decoder_output = get_lite_am_sublayer_output(
                    am_decoder_predictor, input=orig_hs)
                am_postnet_output = get_lite_am_sublayer_output(
                    am_postnet_predictor,
                    input=np.transpose(am_decoder_output, (0, 2, 1)))
                am_output_data = am_decoder_output + np.transpose(
                    am_postnet_output, (0, 2, 1))
                normalized_mel = am_output_data[0]
                mel = denorm(normalized_mel, am_mu, am_std)
            # vocoder
            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)

        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")


if __name__ == "__main__":
    main()
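
# Example invocation (illustrative; the flags mirror the streaming shell script
# above, and the exp/default paths are assumptions for a typical training layout):
#     python3 lite_predict_streaming.py \
#         --inference_dir=exp/default/pdlite_streaming \
#         --am=fastspeech2_csmsc \
#         --am_stat=dump/train/speech_stats.npy \
#         --voc=pwgan_csmsc \
#         --text=sentences.txt \
#         --output_dir=exp/default/lite_infer_out_streaming \
#         --phones_dict=dump/phone_id_map.txt \
#         --am_streaming=True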