Merge branch 'develop' into test_ci

pull/3901/head
zxcd 9 months ago committed by GitHub
commit 9967cb3f50

@ -177,8 +177,9 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns:
float: Accuracy value (0.0 - 1.0).
"""
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2)
pad_pred = pad_outputs.reshape(
[pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type
# numerator = paddle.sum(
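For orientation, a minimal sketch of how the rest of this helper typically goes (not copied from the file; it follows the reshape/argmax/mask lines above and works around the bool-sum limitation noted in the TODO by casting to int64):

```python
import paddle

def th_accuracy_sketch(pad_outputs: paddle.Tensor,
                       pad_targets: paddle.Tensor,
                       ignore_label: int) -> float:
    # [B*T, V] -> [B, T, V], then pick the most likely token per frame
    pad_pred = pad_outputs.reshape(
        [pad_targets.shape[0], pad_targets.shape[1],
         pad_outputs.shape[1]]).argmax(2)
    mask = pad_targets != ignore_label
    # cast before summing, since summing a bool tensor is not supported here
    numerator = paddle.sum(
        (pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
         ).astype('int64'))
    denominator = paddle.sum(mask.astype('int64'))
    return float(numerator) / float(denominator)
```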

@ -1 +1 @@
../../TTSCppFrontend/
../../TTSCppFrontend/

@ -19,7 +19,7 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t
- If you are new to `PaddleSpeech` and want to try it easily without using your own machine, we recommend [AI Studio](https://aistudio.baidu.com/aistudio/index). There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use its basic functions on a free machine.
- If you want to use the command-line functions of `PaddleSpeech`, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command-line functions, see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli) documentation.
### Install Conda
Conda is an environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
Then install the conda dependencies for `paddlespeech`:
```bash
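# conda dependencies for paddlespeech (same command as in the Chinese version of this guide below)
conda install -y -c conda-forge sox libsndfile bzip2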

@ -17,7 +17,7 @@
- If you are new to `PaddleSpeech` and just want to try the project conveniently, we recommend trying [AI Studio](https://aistudio.baidu.com/aistudio/index). We have built a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) on AI Studio that walks you through using `PaddleSpeech`.
- If you want to use the command-line functions of `PaddleSpeech`, follow the steps below to install `PaddleSpeech`. For more information about the command-line functions, see [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli).
### Install Conda
Conda is a package and environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to download and install conda (please download a version with py>=3.7).
Conda is a package and environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to download and install conda (please download a version with py>=3.7).
Then install the conda dependencies for `paddlespeech`:
```bash
conda install -y -c conda-forge sox libsndfile bzip2

@ -1,5 +1,5 @@
# Models introduction
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model`, and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable.
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model`, and `Vocoder`. We introduce a rule-based Chinese text frontend in [zh_text_frontend](./zh_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable.
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through the `text frontend` module.
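A rough sketch of how these three modules chain together (the function names below are illustrative only, not the actual PaddleSpeech API):

```python
# illustrative pipeline: text frontend -> acoustic model -> vocoder
def tts_pipeline(text, frontend, acoustic_model, vocoder):
    phonemes = frontend(text)        # text -> characters/phonemes
    mel = acoustic_model(phonemes)   # phonemes -> acoustic features (e.g. mel-spectrogram)
    wav = vocoder(mel)               # acoustic features -> waveform
    return wav
```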

@ -22,7 +22,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -39,20 +39,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
# format the hyp file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp ${ckpt_prefix}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref_sclite data/manifest.test.text.sclite
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite

@ -1 +0,0 @@
../../../utils/

@ -34,7 +34,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -63,10 +63,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
# format the hyp file
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp ${output_dir}/${type}.rsl.text
python utils/compute-wer.py --char=1 --v=1 \
python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error
done
@ -89,10 +89,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp ${output_dir}/${type}.rsl.text
python utils/compute-wer.py --char=1 --v=1 \
python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error
done
fi
@ -100,13 +100,13 @@ fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref_sclite data/manifest.test.text.sclite
output_dir=${ckpt_prefix}
for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite

@ -22,7 +22,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -43,11 +43,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -68,11 +68,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -223,6 +223,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
The static model can be downloaded here:
- [fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [fastspeech2_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
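To run one of the PIR static models above, the flag from the note can be exported before launching your static-graph inference program; a minimal sketch (the script name and arguments below are placeholders, not a specific PaddleSpeech entry point):

```bash
# requires paddlepaddle>=3.0.0b2
export FLAGS_enable_pir_api=1
python3 your_static_inference.py --model_dir fastspeech2_aishell3_static_pir_1.1.0 ...
```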

@ -136,6 +136,9 @@ Pretrained models can be downloaded here:
The static model can be downloaded here:
- [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [pwgan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)

@ -119,6 +119,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [hifigan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)

@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168
## Dataset
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
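A minimal sketch of putting the data in place, assuming the archive downloaded from the website is named `BZNSYP.rar` (the actual file name and packaging may differ):

```bash
mkdir -p ~/datasets
# extract the downloaded archive into ~/datasets (use unzip/tar instead if packaged differently)
unrar x BZNSYP.rar ~/datasets/
ls ~/datasets/BZNSYP   # expect Wave/ PhoneLabeling/ ProsodyLabeling/
```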
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.

@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
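For example, to fetch and unpack the precomputed alignments linked above (plain wget/tar usage; extract them wherever your workflow expects the alignment files):

```bash
wget https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz
tar xzvf baker_alignment_tone.tar.gz
```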

@ -4,6 +4,18 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
After processing the data, the ``BZNSYP`` directory will look like this:
```text
BZNSYP
├── Wave
│ └─ *.wav files (audio speech)
├── PhoneLabeling
│ └─ *.interval files (alignment between phoneme and duration)
└── ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
This experiment only uses the *.wav files from the `Wave` directory.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -17,6 +29,7 @@ Run the command below to
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
@ -94,6 +107,18 @@ benchmark:
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
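# download the checkpoint first (same URL as linked above)
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip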
unzip pwg_baker_ckpt_0.4.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
@ -126,18 +151,97 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
We use [Fastspeech2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3) as the acoustic model.
Download the pretrained fastspeech2_nosil model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
```
Fastspeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
[--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
[--text TEXT] [--output_dir OUTPUT_DIR]
Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
--phones_dict PHONES_DICT
phone vocabulary file.
--tones_dict TONES_DICT
tone vocabulary file.
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
--lang LANG Choose model language. zh or en
--inference_dir INFERENCE_DIR
dir to save inference models
--ngpu NGPU if ngpu == 0, use cpu.
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output_dir OUTPUT_DIR
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
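Putting the arguments above together, a typical direct invocation with the two pretrained checkpoints looks roughly like this (a sketch assuming both archives were unzipped in the current directory; compare with the `local/synthesize_e2e.sh` shown further below):

```bash
python3 ${BIN_DIR}/../../synthesize_e2e.py \
    --am=fastspeech2_csmsc \
    --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
    --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
    --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
    --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
    --voc=pwgan_csmsc \
    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --lang=zh \
    --text=${BIN_DIR}/../../assets/sentences.txt \
    --output_dir=output
```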
## Pretrained Models
The pretrained model can be downloaded here:
- [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)
- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
The static model can be downloaded here:
- [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)
- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
The ONNX model can be downloaded here:
- [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
The Paddle-Lite model can be downloaded here:
- [pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)
- [fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss | eval/spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:
@ -151,5 +255,16 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -0,0 +1,22 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference

@ -31,7 +31,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi

@ -161,6 +161,9 @@ The finetuned model can be downloaded here:
The static model can be downloaded here:
- [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
The PIR static model can be downloaded here:
- [mb_melgan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_pir_0.1.1.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)

@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -118,6 +129,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)
The PIR static model can be downloaded here:
- [hifigan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_pir_0.1.1.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)

@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.

@ -144,7 +144,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1
```
## Stage 4: Static graph model Export
This stage is to transform dygraph to static graph.
@ -185,5 +185,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You can train a model by yourself; then you need to prepare an audio file or use the audio demo above. Please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
```

@ -22,7 +22,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -38,20 +38,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp ${ckpt_prefix}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref_sclite data/manifest.test.text-clean.sclite
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite

@ -148,7 +148,7 @@ or you can run these scripts in the command line (only use CPU).
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +163,7 @@ source path.sh
# If you have process the data and get the manifest file you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
The performance of the released models are shown in [here](./RESULTS.md).
@ -192,8 +192,8 @@ bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Stage 5: Single Audio File Inference
In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below
@ -214,5 +214,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
```

@ -43,7 +43,7 @@ echo "chunk mode ${chunk_mode}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -68,11 +68,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -98,7 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
@ -125,25 +125,25 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref_sclite data/manifest.test.text-clean.sclite
output_dir=${ckpt_prefix}
for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite

@ -1 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps/
../../../tools/kaldi/egs/wsj/s5/steps/

@ -1 +0,0 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -24,7 +24,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -45,11 +45,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -70,11 +70,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -44,11 +44,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -69,11 +69,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -44,7 +44,7 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
@ -69,7 +69,7 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text

@ -27,7 +27,6 @@ The document below will describe the scripts in `run.sh` in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first. And another script is also needed:
```bash
@ -67,7 +66,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the `data` directory will look like this:
@ -103,7 +101,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```
@ -124,7 +121,6 @@ or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
@ -144,11 +140,10 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +158,7 @@ source path.sh
# If you have process the data and get the manifest file you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
The performance of the released models are shown in [here](./RESULTS.md).
@ -186,5 +181,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
```

@ -0,0 +1,96 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1 # sublayer output dropout
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1 # sublayer output dropout
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
init_type: 'kaiming_uniform' # !Warning: needed for convergence
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: 'data/lang_char/bpe_bpe_11297'
unit_type: 'spm'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 20.0
window_ms: 30.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 100
accum_grad: 4
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5

@ -127,7 +127,7 @@ source path.h
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer_mtl_noam.yaml transformer_mtl_noam
avg.sh latest exp/transformer_mtl_noam/checkpoints 5
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml exp/transformer_mtl_noam/checkpoints/avg_5
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml conf/tuning/decode.yaml exp/transformer_mtl_noam/checkpoints/avg_5
```
The performance of the released models are shown below:
### Transformer

@ -203,7 +203,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Format the Json Data"
for (( i=0; i<${#x[*]}; ++i)); do
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json \
--manifest-file data/manifest.${y[$i]}
done
fi

@ -2,6 +2,4 @@
asr model with phone unit
* ~~asr0 - deepspeech2 Streaming/Non-Streaming~~
* asr1 - transformer/conformer Streaming/Non-Streaming
* ~~asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature~~
* asr1 - transformer Streaming/Non-Streaming

@ -0,0 +1,195 @@
# Transformer ASR with Timit
TIMIT is a phoneme-based continuous speech corpus created through a collaboration between Texas Instruments, MIT, and SRI International. The [Timit](https://catalog.ldc.upenn.edu/docs/LDC93S1/) dataset has a sampling frequency of 16 kHz and contains a total of 6,300 sentences: 630 speakers from 8 major U.S. dialect regions each read 10 given sentences, and all sentences are manually segmented and labeled at the phone level. Seventy percent of the speakers are male; most of the speakers are white adults.
## Dataset
### Download and Extract
Download TIMIT from its [official website](https://catalog.ldc.upenn.edu/LDC93S1) and extract it to `~/datasets`. Assume the dataset is unzipped into the directory `~/datasets/timit`.
## Overview
All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
| Stage | Function |
|:---- |:----------------------------------------------------------- |
| 0 | Process data. It includes: <br> (1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br> (4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
You can choose to run a range of stages by setting `stage` and `stop_stage`.
For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set `stage` equal to `stop_stage` to run only one stage.
For example, if you only want to run `stage 0`, you can use the script below:
```bash
bash run.sh --stage 0 --stop_stage 0
```
The document below will describe the scripts in `run.sh` in detail.
## The Environment Variables
The path.sh contains the environment variables.
```bash
source path.sh
```
This script needs to be run first. And another script is also needed:
```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```
It enables passing options of the form `--variable value` to the shell scripts.
## The Local Variables
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of the stage you want to start from in the experiments.
`stop_stage` denotes the number of the stage you want to end at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`audio_file` denotes the file path of the single audio file you want to infer in stage 5.
`ckpt` denotes the checkpoint prefix of the model, e.g. "conformer".
You can set the local variables (except `ckpt`) when you use `run.sh`.
For example, you can set `gpus` and `avg_num` on the command line:
```bash
bash run.sh --gpus 0,1,2,3 --avg_num 10
```
## Stage 0: Data Processing
To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh || exit -1
fi
```
Stage 0 is for processing the data.
If you only want to process the data, you can run
```bash
bash run.sh --stage 0 --stop_stage 0
```
You can also just run these scripts in your command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
```
After processing the data, the ``data`` directory will look like this:
```bash
data/
|-- lang_char
| `-- vocab.txt
|-- local
| `-- dev_sph.flist
| `-- dev_sph.scp
| `-- dev.text
| `-- dev.trans
| `-- dev.uttids
| `-- test_sph.flist
| `-- test_sph.scp
| `-- test.text
| `-- test.trans
| `-- test.uttids
| `-- train_sph.flist
| `-- train_sph.scp
| `-- train.text
| `-- train.trans
| `-- train.uttids
|-- manifest.dev
|-- manifest.dev.raw
|-- manifest.test
|-- manifest.test.raw
|-- manifest.train
|-- manifest.train.raw
|-- mean_std.json
|-- test.meta
```
## Stage 1: Model Training
If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
```
If you want to train the model, you can use the script below to execute stage 0 and stage 1:
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
```
## Stage 2: Top-k Models Averaging
After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
```bash
bash run.sh --stage 0 --stop_stage 2
```
or you can run these scripts in the command line.
```bash
bash ./local/timit_data_prep.sh ${TIMIT_path}
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
```
## Stage 3: Model Testing
The test stage is to evaluate the model performance. The code of the test stage is shown below:
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
```bash
bash run.sh --stage 0 --stop_stage 3
```
or you can run these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
```
## Stage 4: CTC Alignment
If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:
```bash
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train the model, test it, and do the alignment, you can use the script below to execute stages 0 through 4:
```bash
bash run.sh --stage 0 --stop_stage 4
```
or, if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):
```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```
or you can also use these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
# test stage is optional
CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES=0 ./local/align.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
```

@ -9,7 +9,7 @@ stop_stage=50
conf_path=conf/transformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=10
TIMIT_path=/path/to/TIMIT
TIMIT_path=~/datasets/timit/data/lisa/data/timit/raw/TIMIT
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@ -26,7 +26,6 @@ The document below will describe the scripts in ```run.sh```in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first. And another script is also needed:
```bash
@ -64,7 +63,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the ``data`` directory will look like this:
@ -100,7 +98,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
```
## Stage 2: Top-k Models Averaging
@ -119,7 +116,6 @@ bash run.sh --stage 0 --stop_stage 2
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -139,7 +135,6 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -166,7 +161,6 @@ bash run.sh --stage 4 --stop_stage 4
or you can also use these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1

@ -124,6 +124,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [hifigan_vctk_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)

@ -1 +1 @@
../../../utils/
../../../utils/

@ -15,12 +15,15 @@ import argparse
import os
import numpy as np
import paddle
from paddle import inference
from paddle.audio.datasets import ESC50
from paddle.audio.features import LogMelSpectrogram
from paddleaudio.backends import soundfile_load as load_audio
from scipy.special import softmax
import paddlespeech.utils
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
@ -56,7 +59,6 @@ def extract_features(files: str, **kwargs):
feature_extractor = LogMelSpectrogram(sr, **kwargs)
feat = feature_extractor(paddle.to_tensor(waveforms[i]))
feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0)
feats.append(feat)
return np.stack(feats, axis=0)
@ -73,13 +75,18 @@ class Predictor(object):
enable_mkldnn=False):
self.batch_size = batch_size
model_file = os.path.join(model_dir, "inference.pdmodel")
params_file = os.path.join(model_dir, "inference.pdiparams")
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
config = inference.Config(model_dir, 'inference')
config.disable_mkldnn()
else:
model_file = os.path.join(model_dir, 'inference.pdmodel')
params_file = os.path.join(model_dir, "inference.pdiparams")
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
config = inference.Config(model_file, params_file)
config = inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as intialize the gpu memory, enable tensorrt

@ -39,7 +39,8 @@ if __name__ == '__main__':
input_spec=[
paddle.static.InputSpec(
shape=[None, None, 64], dtype=paddle.float32)
])
],
full_graph=True)
# Save in static graph model.
paddle.jit.save(model, os.path.join(args.output_dir, "inference"))

@ -86,7 +86,7 @@ class CTCPrefixScorePD():
dtype=self.dtype, ) # (T, 2, B, W)
r_prev[:, 1] = paddle.cumsum(self.x[0, :, :, self.blank],
0).unsqueeze(2)
r_prev = r_prev.view(-1, 2, n_bh) # (T, 2, BW)
r_prev = r_prev.reshape([-1, 2, n_bh]) # (T, 2, BW)
s_prev = 0.0 # score
f_min_prev = 0 # eq. 22-23
f_max_prev = 1 # eq. 22-23
@ -100,23 +100,23 @@ class CTCPrefixScorePD():
(n_bh, self.odim), -1, dtype=paddle.long)
snum = self.scoring_num
if self.idx_bh is None or n_bh > len(self.idx_bh):
self.idx_bh = paddle.arange(n_bh).view(-1, 1) # (BW, 1)
self.idx_bh = paddle.arange(n_bh).reshape([-1, 1]) # (BW, 1)
scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = paddle.arange(snum)
scoring_idx = (
scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1,
1) # (BW,1)
).view(-1) # (BWO)
scoring_ids + self.idx_bo.repeat(1, n_hyps).reshape(
[-1, 1]) # (BW,1)
).reshape([-1]) # (BWO)
# x_ shape (2, T, B*W, O)
x_ = paddle.index_select(
self.x.view(2, -1, self.batch * self.odim), scoring_idx,
2).view(2, -1, n_bh, snum)
self.x.reshape([2, -1, self.batch * self.odim]), scoring_idx,
2).reshape([2, -1, n_bh, snum])
else:
scoring_ids = None
scoring_idmap = None
snum = self.odim
# x_ shape (2, T, B*W, O)
x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1,
n_bh, snum)
x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).reshape(
[2, -1, n_bh, snum])
# new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
# that corresponds to r_t^n(h) and r_t^b(h) in a batch.
@ -154,8 +154,8 @@ class CTCPrefixScorePD():
# compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
for t in range(start, end):
rp = r[t - 1] # (2 x BW x O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
2, 2, n_bh, snum) # (2,2,BW,O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).reshape(
[2, 2, n_bh, snum]) # (2,2,BW,O')
r[t] = paddle.logsumexp(rr, 1) + x_[:, t]
# compute log prefix probabilities log(psi)
@ -197,25 +197,27 @@ class CTCPrefixScorePD():
# convert ids to BHO space
n_bh = len(s)
n_hyps = n_bh // self.batch
vidx = (best_ids + (self.idx_b *
(n_hyps * self.odim)).view(-1, 1)).view(-1)
vidx = (best_ids +
(self.idx_b *
(n_hyps * self.odim)).reshape([-1, 1])).reshape([-1])
# select hypothesis scores
s_new = paddle.index_select(s.view(-1), vidx, 0)
s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
s_new = paddle.index_select(s.reshape([-1]), vidx, 0)
s_new = s_new.reshape([-1, 1]).repeat(1, self.odim).reshape(
[n_bh, self.odim])
# convert ids to BHS space (S: scoring_num)
if scoring_idmap is not None:
snum = self.scoring_num
hyp_idx = (best_ids // self.odim +
(self.idx_b * n_hyps).view(-1, 1)).view(-1)
label_ids = paddle.fmod(best_ids, self.odim).view(-1)
(self.idx_b * n_hyps).reshape([-1, 1])).reshape([-1])
label_ids = paddle.fmod(best_ids, self.odim).reshape([-1])
score_idx = scoring_idmap[hyp_idx, label_ids]
score_idx[score_idx == -1] = 0
vidx = score_idx + hyp_idx * snum
else:
snum = self.odim
# select forward probabilities
r_new = paddle.index_select(r.view(-1, 2, n_bh * snum), vidx, 2).view(
-1, 2, n_bh)
r_new = paddle.index_select(r.reshape([-1, 2, n_bh * snum]), vidx,
2).reshape([-1, 2, n_bh])
return r_new, s_new, f_min, f_max
def extend_prob(self, x):

@ -135,7 +135,7 @@ class BatchScorerInterface(ScorerInterface):
score, outstate = self.score(y, state, x)
outstates.append(outstate)
scores.append(score)
scores = paddle.cat(scores, 0).view(ys.shape[0], -1)
scores = paddle.cat(scores, 0).reshape([ys.shape[0], -1])
return scores, outstates
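
The hunks above (and many below) replace torch-style `.view(d0, d1, ...)` calls with Paddle's `.reshape([d0, d1, ...])`, which takes the target shape as an explicit list. A minimal sketch of the pattern, with an illustrative tensor:

```python
import paddle

x = paddle.arange(6, dtype='float32')   # illustrative tensor
# torch-style varargs: x.view(2, 3)  ->  Paddle takes the shape as a list:
y = x.reshape([2, 3])
print(y.shape)                          # [2, 3]
```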

@ -75,7 +75,7 @@ class DeepSpeech2Tester_hub():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
audio_len = paddle.to_tensor(feat.shape[0])
audio_len = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
result_transcripts = self.compute_result_transcripts(

@ -23,6 +23,7 @@ import paddle
from paddle import distributed as dist
from paddle import inference
import paddlespeech.utils
from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
@ -421,7 +422,6 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
else:
raise Exception("wrong model type")
self.predictor.clear_intermediate_tensor()
self.predictor.try_shrink_memory()
#replace the <space> with ' '
@ -629,9 +629,19 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup_model(self):
super().setup_model()
deepspeech_config = inference.Config(
self.args.export_path + ".pdmodel",
self.args.export_path + ".pdiparams")
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
model_dir = os.path.dirname(self.args.export_path)
model_prefix = os.path.basename(self.args.export_path)
deepspeech_config = inference.Config(model_dir, model_prefix)
else:
deepspeech_config = inference.Config(
self.args.export_path + ".pdmodel",
self.args.export_path + ".pdiparams")
deepspeech_config.disable_mkldnn()
if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
deepspeech_config.enable_use_gpu(100, 0)
deepspeech_config.enable_memory_optim()
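
The version gate above switches how the Paddle Inference config is built. A standalone sketch mirroring that hunk; the export path here is hypothetical:

```python
import os
from paddle import inference
import paddlespeech.utils

export_path = "exp/deepspeech2/checkpoints/avg_1.jit"  # hypothetical export prefix

if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
    # Paddle >= 3.0: pass the model directory and the file prefix.
    config = inference.Config(
        os.path.dirname(export_path), os.path.basename(export_path))
else:
    # older Paddle: pass the explicit .pdmodel / .pdiparams pair.
    config = inference.Config(
        export_path + ".pdmodel", export_path + ".pdiparams")
```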

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@ -37,8 +37,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -97,11 +97,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -75,7 +75,7 @@ class U2Infer():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -78,7 +78,7 @@ class U2Infer():
if self.args.debug:
np.savetxt("feat.transform.txt", feat)
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -34,9 +34,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -37,8 +37,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -104,11 +104,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -18,7 +18,8 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wavlm.model import WavLMASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.utils.argparse import print_arguments, add_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@ -37,8 +38,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -105,10 +105,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -33,7 +33,7 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline
from paddlespeech.s2t.io.speechbrain import dataio
from paddlespeech.s2t.io.speechbrain import dataset
from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
from paddlespeech.s2t.models.wavlm.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
@ -211,7 +211,7 @@ class WavLMASRTrainer(Trainer):
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# NOTE: the code below asserts that backward() is problematic; as more steps are accumulated, the output from wavlm alone will be the same for all frames
# optimizer step old
if (batch_index + 1) % train_conf.accum_grad == 0:
@ -428,8 +428,7 @@ class WavLMASRTrainer(Trainer):
report("epoch", self.epoch)
report('step', self.iteration)
report("model_lr", self.model_optimizer.get_lr())
report("wavlm_lr",
self.wavlm_optimizer.get_lr())
report("wavlm_lr", self.wavlm_optimizer.get_lr())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
@ -680,8 +679,7 @@ class WavLMASRTrainer(Trainer):
logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
wavlm_optim_type = train_config.wavlm_optim
wavlm_optim_conf = train_config.wavlm_optim_conf
logger.info("optim_model:{},{}", wavlm_optim_type,
wavlm_optim_conf)
logger.info("optim_model:{},{}", wavlm_optim_type, wavlm_optim_conf)
model_scheduler_type = train_config.model_scheduler
model_scheduler_conf = train_config.model_scheduler_conf
@ -698,8 +696,8 @@ class WavLMASRTrainer(Trainer):
model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
model_scheduler_args)
wavlm_lr_scheduler = LRSchedulerFactory.from_args(
wavlm_scheduler_type, wavlm_scheduler_args)
wavlm_lr_scheduler = LRSchedulerFactory.from_args(wavlm_scheduler_type,
wavlm_scheduler_args)
def optimizer_args(
config,
@ -716,24 +714,31 @@ class WavLMASRTrainer(Trainer):
})
return optim_arg
model_optimizer_args = optimizer_args(
config, model_optim_type,
model_optim_conf,
[{'params': model._layers.enc.parameters()}, {'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.enc.parameters()}, {'params': model.ctc.parameters()}],
model_lr_scheduler
)
# [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler)
model_optimizer_args = optimizer_args(config, model_optim_type,
model_optim_conf, [{
'params':
model._layers.enc.parameters()
}, {
'params':
model._layers.ctc.parameters()
}] if self.parallel else [{
'params':
model.enc.parameters()
}, {
'params':
model.ctc.parameters()
}], model_lr_scheduler)
# [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler)
wavlm_optimizer_args = optimizer_args(
config, wavlm_optim_type, wavlm_optim_conf,
model._layers.wavlm.parameters() if self.parallel else
model.wavlm.parameters(), wavlm_lr_scheduler)
model._layers.wavlm.parameters()
if self.parallel else model.wavlm.parameters(), wavlm_lr_scheduler)
model_optimizer = OptimizerFactory.from_args(model_optim_type,
model_optimizer_args)
wavlm_optimizer = OptimizerFactory.from_args(wavlm_optim_type,
wavlm_optimizer_args)
wavlm_optimizer_args)
self.model_optimizer = model_optimizer
self.wavlm_optimizer = wavlm_optimizer

@ -115,6 +115,10 @@ class TextFeaturizer():
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
# unwrap `idxs` like `[[1,2,3]]`
if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
idxs = idxs[0]
for idx in idxs:
if idx == self.eos_id:
break
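
The guard added to `TextFeaturizer` above flattens a batch-of-one id list before the id-to-token lookup. A framework-free sketch of the same check; the function name is illustrative:

```python
def unwrap_single_batch(idxs):
    # a nested batch-of-one like [[1, 2, 3]] becomes [1, 2, 3]
    if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
        return list(idxs[0])
    return list(idxs)

assert unwrap_single_batch([[1, 2, 3]]) == [1, 2, 3]
assert unwrap_single_batch([1, 2, 3]) == [1, 2, 3]
```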

@ -404,6 +404,12 @@ class DataLoaderFactory():
config['subsampling_factor'] = 1
config['num_encs'] = 1
config['shortest_first'] = False
config['minibatches'] = 0
config['batch_count'] = 'auto'
config['batch_bins'] = 0
config['batch_frames_in'] = 0
config['batch_frames_out'] = 0
config['batch_frames_inout'] = 0
elif mode == 'valid':
config['manifest'] = config.dev_manifest
config['train_mode'] = False

@ -398,14 +398,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
paddle.static.InputSpec(
shape=[None, None, self.encoder.feat_size
], #[B, chunk_size, feat_dim]
dtype='float32'),
dtype='float32', ),
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
],
full_graph=True)
elif self.encoder.rnn_direction == "bidirect":
static_model = paddle.jit.to_static(
self,
@ -415,7 +416,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
dtype='float32'), # audio, [B,T,D]
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
])
],
full_graph=True)
else:
raise Exception("wrong model type")
return static_model
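
The export hunks above add `full_graph=True` to `paddle.jit.to_static`. A minimal sketch of the call shape on a toy layer, assuming a Paddle release that accepts this flag (it is the flag the hunks add); the layer itself is a stand-in, not the ASR model:

```python
import paddle
from paddle.static import InputSpec

class TinyNet(paddle.nn.Layer):        # illustrative stand-in for the export model
    def __init__(self):
        super().__init__()
        self.proj = paddle.nn.Linear(8, 4)

    def forward(self, x):
        return self.proj(x)

static_net = paddle.jit.to_static(
    TinyNet(),
    input_spec=[InputSpec(shape=[None, 8], dtype='float32')],
    full_graph=True)                   # trace the whole forward as one static graph
```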

@ -213,7 +213,7 @@ class HubertASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
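
The greedy decode above takes the per-frame argmax, then collapses repeats and drops the blank id (`remove_duplicates_and_blank`). A framework-free sketch of that collapse; the function name and blank id are illustrative:

```python
def collapse_ctc(frame_ids, blank_id=0):
    out, prev = [], None
    for idx in frame_ids:
        # keep a frame only when it differs from the previous one and is not blank
        if idx != prev and idx != blank_id:
            out.append(idx)
        prev = idx
    return out

assert collapse_ctc([0, 3, 3, 0, 0, 5, 5, 5, 0]) == [3, 5]
```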

@ -122,10 +122,12 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
h, _ = self.encoder(emb, xlen)
y = self.decoder(h)
loss = F.cross_entropy(
y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
y.reshape([-1, paddle.shape(y)[-1]]),
t.reshape([-1]),
reduction="none")
mask = xm.to(loss.dtype)
logp = loss * mask.view(-1)
nll = logp.view(batch_size, -1).sum(-1)
logp = loss * mask.reshape([-1])
nll = logp.reshape([batch_size, -1]).sum(-1)
nll_count = mask.sum(-1)
logp = logp.sum()
count = mask.sum()
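
The language-model hunk above computes a token-masked cross entropy and a per-utterance negative log-likelihood. A toy version with made-up shapes, assuming the same reshape-and-mask convention:

```python
import paddle
import paddle.nn.functional as F

logits = paddle.randn([2, 4, 10])            # (batch, time, vocab), toy values
targets = paddle.randint(0, 10, [2, 4])      # (batch, time)
mask = paddle.ones([2, 4], dtype='float32')  # 1 for real tokens, 0 for padding

loss = F.cross_entropy(
    logits.reshape([-1, logits.shape[-1]]), targets.reshape([-1]),
    reduction="none")
logp = loss * mask.reshape([-1])             # zero out padded positions
nll = logp.reshape([2, -1]).sum(-1)          # per-utterance negative log-likelihood
```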

@ -170,13 +170,13 @@ class U2STBaseModel(nn.Layer):
ys_in_lens = ys_pad_lens + 1
# 1. Forward decoder
decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
decoder_out, *_ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
# 2. Compute attention loss
loss_att = self.criterion_att(decoder_out, ys_out_pad)
acc_att = th_accuracy(
decoder_out.view(-1, self.vocab_size),
decoder_out.reshape([-1, self.vocab_size]),
ys_out_pad,
ignore_label=self.ignore_id, )
return loss_att, acc_att
@ -203,13 +203,13 @@ class U2STBaseModel(nn.Layer):
ys_in_lens = ys_pad_lens + 1
# 1. Forward decoder
decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
decoder_out, *_ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
# 2. Compute attention loss
loss_att = self.criterion_att(decoder_out, ys_out_pad)
acc_att = th_accuracy(
decoder_out.view(-1, self.vocab_size),
decoder_out.reshape([-1, self.vocab_size]),
ys_out_pad,
ignore_label=self.ignore_id, )
return loss_att, acc_att

@ -129,7 +129,7 @@ def _compute_mask_indices(
[sequence_length for _ in range(batch_size)])
# SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple,
sampled_negative_indices = np.zeros(
shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
mask_time_indices = (mask_time_indices.astype(np.bool)
mask_time_indices = (mask_time_indices.astype(np.bool_)
if mask_time_indices is not None else
np.ones(features_shape, dtype=np.bool))
np.ones(features_shape, dtype=np.bool_))
for batch_idx in range(batch_size):
high = mask_time_indices[batch_idx].sum() - 1

@ -714,13 +714,13 @@ class MultiheadAttention(nn.Layer):
else:
if self.beam_size > 1 and bsz == key.size(1):
# key is [T, bsz*beam_size, C], reduce to [T, bsz, C]
key = key.view(
key.size(0), -1, self.beam_size,
key.size(2))[:, :, 0, :]
key = key.reshape(
[key.size(0), -1, self.beam_size,
key.size(2)])[:, :, 0, :]
if key_padding_mask is not None:
key_padding_mask = key_padding_mask.view(
-1, self.beam_size,
key_padding_mask.size(1))[:, 0, :]
key_padding_mask = key_padding_mask.reshape(
[-1, self.beam_size,
key_padding_mask.size(1)])[:, 0, :]
k = self.k_proj(key)
v = self.v_proj(key)
@ -1267,7 +1267,7 @@ class TransposeLast(nn.Layer):
def forward(self, x):
if self.deconstruct_idx is not None:
x = x[self.deconstruct_idx]
trans_dim = paddle.arange(x.dim())
trans_dim = np.arange(x.dim())
trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1]
return x.transpose(trans_dim)
@ -1476,7 +1476,7 @@ def compute_mask_indices(
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int, )
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break
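
The `np.bool` -> `np.bool_` and `np.int` -> `np.int_` replacements above track NumPy's removal of those built-in aliases (deprecated in 1.20, removed in 1.24). A quick check of the surviving scalar types:

```python
import numpy as np

spec_aug_mask = np.zeros((2, 5), dtype=np.bool_)                   # boolean mask
lens = np.fromiter((e - s for s, e in [(0, 3), (4, 4)]), np.int_)  # span lengths
assert spec_aug_mask.dtype == np.bool_ and lens.dtype == np.int_
```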

@ -88,7 +88,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
out = wav_sum / lengths.astype(wav_sum.dtype)
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0]
else:
@ -248,4 +248,4 @@ def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
hhpf[pad] += 1
# Adding filters creates notch filter
return (hlpf + hhpf).view(1, -1, 1)
return (hlpf + hhpf).reshape([1, -1, 1])
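
The `compute_amplitude` fix above casts the integer lengths to the waveform dtype before dividing, avoiding an int/float mismatch in the elementwise division. A toy sketch of the same cast; shapes and values are illustrative:

```python
import paddle

waveforms = paddle.rand([2, 16000])              # toy batch of waveforms
lengths = paddle.to_tensor([[16000], [12000]])   # int64 lengths, shape (B, 1)

wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
# cast the integer lengths to the float dtype of wav_sum before dividing
avg_amp = wav_sum / lengths.astype(wav_sum.dtype)
```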

@ -743,7 +743,7 @@ class SpecAugment(paddle.nn.Layer):
time = x.shape[2]
if time - window <= window:
return x.view(*original_size)
return x.reshape([*original_size])
# compute center and corresponding window
c = paddle.randint(window, time - window, (1, ))[0]
@ -762,7 +762,7 @@ class SpecAugment(paddle.nn.Layer):
x[:, :, :w] = left
x[:, :, w:] = right
return x.view(*original_size)
return x.reshape([*original_size])
def mask_along_axis(self, x, dim):
"""Mask along time or frequency axis.
@ -775,7 +775,7 @@ class SpecAugment(paddle.nn.Layer):
"""
original_size = x.shape
if x.dim() == 4:
x = x.view(-1, x.shape[2], x.shape[3])
x = x.reshape([-1, x.shape[2], x.shape[3]])
batch, time, fea = x.shape
@ -795,7 +795,7 @@ class SpecAugment(paddle.nn.Layer):
(batch, n_mask)).unsqueeze(2)
# compute masks
arange = paddle.arange(end=D).view(1, 1, -1)
arange = paddle.arange(end=D).reshape([1, 1, -1])
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
mask = mask.any(axis=1)
@ -811,7 +811,7 @@ class SpecAugment(paddle.nn.Layer):
# same to x.masked_fill_(mask, val)
y = paddle.full(x.shape, val, x.dtype)
x = paddle.where(mask, y, x)
return x.view(*original_size)
return x.reshape([*original_size])
class TimeDomainSpecAugment(nn.Layer):
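
`mask_along_axis` above builds its mask by comparing an `arange` row against broadcast start positions and span lengths. A small sketch of that trick with illustrative sizes:

```python
import paddle

D = 10
mask_pos = paddle.to_tensor([[[1], [6]]])          # (1, n_mask, 1) start indices
mask_len = paddle.to_tensor([[[3], [2]]])          # (1, n_mask, 1) span lengths

arange = paddle.arange(end=D).reshape([1, 1, -1])  # (1, 1, D)
mask = paddle.logical_and(mask_pos <= arange, arange < (mask_pos + mask_len))
mask = mask.any(axis=1)                            # (1, D): True inside any span
# positions 1-3 and 6-7 are masked in this example
```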

@ -6,17 +6,18 @@
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import warnings
from typing import Dict, Optional, Tuple
from .functional import multi_head_attention_forward_paddle
from typing import Dict
from typing import Optional
from typing import Tuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import Tensor
from .functional import multi_head_attention_forward_paddle
class TransposeLast(nn.Layer):
@ -40,8 +41,7 @@ class Fp32LayerNorm(nn.LayerNorm):
self.normalized_shape,
self.weight.float() if self.weight is not None else None,
self.bias.float() if self.bias is not None else None,
self.eps,
)
self.eps, )
return output.type_as(input)
@ -55,12 +55,10 @@ class Fp32GroupNorm(nn.GroupNorm):
self.num_groups,
self.weight.float() if self.weight is not None else None,
self.bias.float() if self.bias is not None else None,
self.eps,
)
self.eps, )
return output.type_as(input)
class SamePad(nn.Layer):
def __init__(self, kernel_size, causal=False):
super().__init__()
@ -71,7 +69,7 @@ class SamePad(nn.Layer):
def forward(self, x):
if self.remove > 0:
x = x[:, :, : -self.remove]
x = x[:, :, :-self.remove]
return x
@ -89,7 +87,11 @@ class Swish(nn.Layer):
class GLU_Linear(nn.Layer):
def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
def __init__(self,
input_dim,
output_dim,
glu_type="sigmoid",
bias_in_glu=True):
super(GLU_Linear, self).__init__()
self.glu_type = glu_type
@ -114,9 +116,11 @@ class GLU_Linear(nn.Layer):
x = self.linear(x)
if self.glu_type == "bilinear":
x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2])
x = (x[:, :, 0:self.output_dim] *
x[:, :, self.output_dim:self.output_dim * 2])
else:
x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
x = (x[:, :, 0:self.output_dim] *
self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
return x
@ -124,9 +128,8 @@ class GLU_Linear(nn.Layer):
def gelu_accurate(x):
if not hasattr(gelu_accurate, "_a"):
gelu_accurate._a = math.sqrt(2 / math.pi)
return (
0.5 * x * (1 + paddle.tanh(gelu_accurate._a * (x + 0.044715 * paddle.pow(x, 3))))
)
return (0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
(x + 0.044715 * paddle.pow(x, 3)))))
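
`gelu_accurate` above is the tanh approximation of GELU; a quick plain-Python check against the exact erf form at a single point (no Paddle needed):

```python
import math

x = 1.0
approx = 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
exact = 0.5 * x * (1 + math.erf(x / math.sqrt(2)))
assert abs(approx - exact) < 1e-3   # ~0.8412 vs ~0.8413
```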
def gelu(x: Tensor) -> Tensor:
@ -142,8 +145,7 @@ def get_activation_fn(activation: str):
return gelu
elif activation == "gelu_fast":
warnings.warn(
"--activation-fn=gelu_fast has been renamed to gelu_accurate"
)
"--activation-fn=gelu_fast has been renamed to gelu_accurate")
return gelu_accurate
elif activation == "gelu_accurate":
return gelu_accurate
@ -154,7 +156,8 @@ def get_activation_fn(activation: str):
elif activation == "glu":
return lambda x: x
else:
raise RuntimeError("--activation-fn {} not supported".format(activation))
raise RuntimeError(
"--activation-fn {} not supported".format(activation))
def quant_noise(module, p, block_size):
@ -190,16 +193,15 @@ def quant_noise(module, p, block_size):
# 2D matrix
if not is_conv:
assert (
module.weight.size(1) % block_size == 0
), "Input features must be a multiple of block sizes"
module.weight.size(1) %
block_size == 0), "Input features must be a multiple of block sizes"
# 4D matrix
else:
# 1x1 convolutions
if module.kernel_size == (1, 1):
assert (
module.in_channels % block_size == 0
), "Input channels must be a multiple of block sizes"
assert (module.in_channels % block_size == 0
), "Input channels must be a multiple of block sizes"
# regular convolutions
else:
k = module.kernel_size[0] * module.kernel_size[1]
@ -216,10 +218,11 @@ def quant_noise(module, p, block_size):
# split weight matrix into blocks and randomly drop selected blocks
mask = paddle.zeros(
in_features // block_size * out_features, device=weight.device
)
in_features // block_size * out_features,
device=weight.device)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
mask = mask.repeat_interleave(block_size, -1).reshape(
[-1, in_features])
else:
# gather weight and sizes
@ -231,26 +234,21 @@ def quant_noise(module, p, block_size):
if mod.kernel_size == (1, 1):
mask = paddle.zeros(
int(in_channels // block_size * out_channels),
device=weight.device,
)
device=weight.device, )
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
mask = mask.repeat_interleave(block_size, -1).reshape(
[-1, in_channels])
else:
mask = paddle.zeros(
weight.size(0), weight.size(1), device=weight.device
)
weight.size(0), weight.size(1), device=weight.device)
mask.bernoulli_(p)
mask = (
mask.unsqueeze(2)
.unsqueeze(3)
.repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
)
mask.unsqueeze(2).unsqueeze(3)
.repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]))
# scale weights and apply mask
mask = mask.to(
paddle.bool
)
mask = mask.to(paddle.bool)
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
@ -282,8 +280,7 @@ class MultiheadAttention(nn.Layer):
num_buckets=32,
max_distance=128,
gru_rel_pos=True,
rescale_init=False,
):
rescale_init=False, ):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
@ -302,17 +299,16 @@ class MultiheadAttention(nn.Layer):
self.head_dim = embed_dim // num_heads
self.q_head_dim = self.head_dim
self.k_head_dim = self.head_dim
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5
assert (self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5
self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention
assert not self.self_attention or self.qkv_same_dim, (
"Self-attention requires query, key and " "value to be of the same size"
)
"Self-attention requires query, key and "
"value to be of the same size")
k_bias = True
if rescale_init:
@ -322,26 +318,24 @@ class MultiheadAttention(nn.Layer):
q_embed_dim = embed_dim
self.k_proj = quant_noise(
nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise, qn_block_size
)
nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise,
qn_block_size)
self.v_proj = quant_noise(
nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise,
qn_block_size)
self.q_proj = quant_noise(
nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise,
qn_block_size)
self.out_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise,
qn_block_size)
if add_bias_kv:
self.bias_k = self.create_parameter(
shape=[1, 1, embed_dim], dtype="float32"
)
shape=[1, 1, embed_dim], dtype="float32")
self.bias_v = self.create_parameter(
shape=[1, 1, embed_dim], dtype="float32"
)
shape=[1, 1, embed_dim], dtype="float32")
else:
self.bias_k = self.bias_v = None
@ -352,40 +346,41 @@ class MultiheadAttention(nn.Layer):
if self.gru_rel_pos:
self.grep_linear = nn.Linear(self.q_head_dim, 8)
self.grep_a = self.create_parameter(
shape=[1, num_heads, 1, 1], dtype="float32"
)
shape=[1, num_heads, 1, 1], dtype="float32")
self.reset_parameters()
def reset_parameters(self):
pass
def _relative_positions_bucket(self, relative_positions, bidirectional=True):
def _relative_positions_bucket(self, relative_positions,
bidirectional=True):
num_buckets = self.num_buckets
max_distance = self.max_distance
relative_buckets = 0
if bidirectional:
num_buckets = num_buckets // 2
relative_buckets += (relative_positions > 0).astype("int64") * num_buckets
relative_buckets += (
relative_positions > 0).astype("int64") * num_buckets
relative_positions = paddle.abs(relative_positions)
else:
relative_positions = -paddle.minimum(relative_positions, paddle.zeros_like(relative_positions))
relative_positions = -paddle.minimum(
relative_positions, paddle.zeros_like(relative_positions))
max_exact = num_buckets // 2
is_small = relative_positions < max_exact
relative_postion_if_large = max_exact + (
paddle.log(relative_positions.astype("float32") / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).astype("int64")
paddle.log(relative_positions.astype("float32") /
max_exact) / math.log(max_distance / max_exact) *
(num_buckets - max_exact)).astype("int64")
relative_postion_if_large = paddle.minimum(
relative_postion_if_large, paddle.full_like(relative_postion_if_large, num_buckets - 1)
)
relative_postion_if_large,
paddle.full_like(relative_postion_if_large, num_buckets - 1))
relative_buckets += paddle.where(is_small, relative_positions, relative_postion_if_large)
relative_buckets += paddle.where(is_small, relative_positions,
relative_postion_if_large)
return relative_buckets
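
`_relative_positions_bucket` above follows the T5-style scheme: half the buckets per direction, one exact bucket per small offset, and log-spaced buckets up to `max_distance` for larger offsets. A standalone sketch using the defaults shown in `__init__` above (num_buckets=32, max_distance=128), with the expected values worked out by hand:

```python
import math

def relative_position_bucket(rel_pos, num_buckets=32, max_distance=128):
    buckets = 0
    num_buckets //= 2                       # half the buckets for each direction
    if rel_pos > 0:
        buckets += num_buckets
    rel_pos = abs(rel_pos)
    max_exact = num_buckets // 2            # small offsets get one bucket each
    if rel_pos < max_exact:
        return buckets + rel_pos
    # larger offsets are binned logarithmically up to max_distance
    large = max_exact + int(
        math.log(rel_pos / max_exact) / math.log(max_distance / max_exact)
        * (num_buckets - max_exact))
    return buckets + min(large, num_buckets - 1)

assert relative_position_bucket(3) == 19    # 16 (positive side) + exact bucket 3
assert relative_position_bucket(-3) == 3
assert relative_position_bucket(50) == 29   # 16 + log-spaced bucket 13
```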
def compute_bias(self, query_length, key_length):
@ -393,28 +388,26 @@ class MultiheadAttention(nn.Layer):
memory_position = paddle.arange(key_length, dtype="int64")[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_positions_bucket(
relative_position,
bidirectional=True
)
relative_position, bidirectional=True)
# relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
values = self.relative_attention_bias(relative_position_bucket)
values = values.transpose([2, 0, 1])
return values
def forward(
self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
need_weights: bool = True,
static_kv: bool = False,
attn_mask: Optional[Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
position_bias: Optional[Tensor] = None
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
def forward(self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor]=None,
incremental_state: Optional[Dict[str, Dict[str, Optional[
Tensor]]]]=None,
need_weights: bool=True,
static_kv: bool=False,
attn_mask: Optional[Tensor]=None,
before_softmax: bool=False,
need_head_weights: bool=False,
position_bias: Optional[Tensor]=None
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
"""Input shape: Time x Batch x Channel
Args:
@ -441,17 +434,16 @@ class MultiheadAttention(nn.Layer):
assert list(query.shape) == [tgt_len, bsz, embed_dim]
if key is not None:
src_len, key_bsz, _ = key.shape
if self.has_relative_attention_bias and position_bias is None:
position_bias = self.compute_bias(tgt_len, src_len)
position_bias_ = position_bias.unsqueeze(0)
position_bias = paddle.concat([position_bias_ for _ in range(bsz)], axis=0)
position_bias = position_bias.reshape([bsz * self.num_heads, tgt_len, src_len])
if (
incremental_state is None
and not static_kv
and self.q_head_dim == self.head_dim
):
position_bias = paddle.concat(
[position_bias_ for _ in range(bsz)], axis=0)
position_bias = position_bias.reshape(
[bsz * self.num_heads, tgt_len, src_len])
if (incremental_state is None and not static_kv and
self.q_head_dim == self.head_dim):
assert key is not None and value is not None
assert attn_mask is None
@ -465,17 +457,21 @@ class MultiheadAttention(nn.Layer):
query_layer = query_layer.transpose([0, 2, 1, 3])
_B, _H, _L, __ = query_layer.shape
gate_a, gate_b = paddle.nn.functional.sigmoid(self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(2, axis=-1)
gate_a, gate_b = paddle.nn.functional.sigmoid(
self.grep_linear(query_layer).reshape(
[_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(
2, axis=-1)
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
attn_mask_rel_pos = gate_a_1.reshape([bsz * self.num_heads, -1, 1]) * position_bias
attn_mask_rel_pos = gate_a_1.reshape(
[bsz * self.num_heads, -1, 1]) * position_bias
attn_mask_rel_pos = attn_mask_rel_pos.reshape((-1, tgt_len, tgt_len))
attn_mask_rel_pos = attn_mask_rel_pos.reshape(
(-1, tgt_len, tgt_len))
k_proj_bias = self.k_proj.bias
if k_proj_bias is None:
k_proj_bias = paddle.zeros_like(self.q_proj.bias)
x, attn = multi_head_attention_forward_paddle(
query,
key,
@ -483,7 +479,9 @@ class MultiheadAttention(nn.Layer):
self.embed_dim,
self.num_heads,
paddle.empty([0]),
paddle.concat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias), axis=0),
paddle.concat(
(self.q_proj.bias, self.k_proj.bias, self.v_proj.bias),
axis=0),
self.bias_k,
self.bias_v,
self.add_zero_attn,
@ -497,9 +495,8 @@ class MultiheadAttention(nn.Layer):
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
)
v_proj_weight=self.v_proj.weight, )
return x, attn, position_bias
if incremental_state is not None:
@ -540,8 +537,8 @@ class MultiheadAttention(nn.Layer):
v = paddle.concat([v, self.bias_v.repeat(1, bsz, 1)], axis=0)
if attn_mask is not None:
attn_mask = paddle.concat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
)
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
@ -549,33 +546,27 @@ class MultiheadAttention(nn.Layer):
key_padding_mask,
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
],
axis=1,
)
q = (
q.contiguous()
.view(tgt_len, bsz * self.num_heads, self.q_head_dim)
.transpose([1, 0, 2])
)
axis=1, )
q = (q.contiguous()
.reshape([tgt_len, bsz * self.num_heads, self.q_head_dim])
.transpose([1, 0, 2]))
if k is not None:
k = (
k.contiguous()
.view(-1, bsz * self.num_heads, self.k_head_dim)
.transpose([1, 0, 2])
)
k = (k.contiguous()
.reshape([-1, bsz * self.num_heads, self.k_head_dim])
.transpose([1, 0, 2]))
if v is not None:
v = (
v.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose([1, 0, 2])
)
v = (v.contiguous()
.reshape([-1, bsz * self.num_heads, self.head_dim])
.transpose([1, 0, 2]))
if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if "prev_key" in saved_state:
_prev_key = saved_state["prev_key"]
assert _prev_key is not None
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
prev_key = _prev_key.reshape(
[bsz * self.num_heads, -1, self.head_dim])
if static_kv:
k = prev_key
else:
@ -585,7 +576,8 @@ class MultiheadAttention(nn.Layer):
if "prev_value" in saved_state:
_prev_value = saved_state["prev_value"]
assert _prev_value is not None
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
prev_value = _prev_value.reshape(
[bsz * self.num_heads, -1, self.head_dim])
if static_kv:
v = prev_value
else:
@ -600,15 +592,17 @@ class MultiheadAttention(nn.Layer):
prev_key_padding_mask=prev_key_padding_mask,
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)
static_kv=static_kv, )
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_key"] = k.reshape(
[bsz, self.num_heads, -1, self.head_dim])
saved_state["prev_value"] = v.reshape(
[bsz, self.num_heads, -1, self.head_dim])
saved_state["prev_key_padding_mask"] = key_padding_mask
# In this branch incremental_state is never None
assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state, saved_state)
incremental_state = self._set_input_buffer(incremental_state,
saved_state)
assert k is not None
assert k.size(1) == src_len
@ -624,30 +618,31 @@ class MultiheadAttention(nn.Layer):
if self.add_zero_attn:
assert v is not None
src_len += 1
k = paddle.concat([k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1)
v = paddle.concat([v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1)
k = paddle.concat(
[k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1)
v = paddle.concat(
[v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1)
if attn_mask is not None:
attn_mask = paddle.concat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
)
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
[
key_padding_mask,
paddle.zeros(key_padding_mask.size(0), 1).type_as(
key_padding_mask
),
paddle.zeros(key_padding_mask.size(0),
1).type_as(key_padding_mask),
],
axis=1,
)
axis=1, )
attn_weights = paddle.matmul(q, k.transpose([0, 2, 1]))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
bsz)
assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
assert list(
attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
@ -655,46 +650,49 @@ class MultiheadAttention(nn.Layer):
if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.reshape(
[bsz, self.num_heads, tgt_len, src_len])
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool),
float("-inf"),
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
float("-inf"), )
attn_weights = attn_weights.reshape(
[bsz * self.num_heads, tgt_len, src_len])
if before_softmax:
return attn_weights, v, position_bias
if position_bias is not None:
if self.gru_rel_pos == 1:
query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
query_layer = q.reshape(
[bsz, self.num_heads, tgt_len, self.q_head_dim])
_B, _H, _L, __ = query_layer.shape
gate_a, gate_b = paddle.sigmoid(self.grep_linear(query_layer).view(
_B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, axis=-1)
gate_a, gate_b = paddle.sigmoid(
self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4])
.sum(-1, keepdim=False)).chunk(
2, axis=-1)
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
position_bias = gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias
position_bias = gate_a_1.reshape(
[bsz * self.num_heads, -1, 1]) * position_bias
position_bias = position_bias.view(attn_weights.shape)
position_bias = position_bias.reshape(attn_weights.shape)
attn_weights = attn_weights + position_bias
attn_weights_float = F.softmax(
attn_weights, dim=-1
)
attn_weights_float = F.softmax(attn_weights, dim=-1)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = self.dropout_module(attn_weights)
assert v is not None
attn = paddle.bmm(attn_probs, v)
assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
assert list(
attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
attn = attn.transpose([1, 0, 2]).reshape([tgt_len, bsz, embed_dim])
attn = self.out_proj(attn)
attn_weights: Optional[Tensor] = None
if need_weights:
attn_weights = attn_weights_float.view(
bsz, self.num_heads, tgt_len, src_len
).transpose([1, 0, 2, 3])
attn_weights = attn_weights_float.reshape(
[bsz, self.num_heads, tgt_len, src_len]).transpose([1, 0, 2, 3])
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
@ -707,15 +705,14 @@ class MultiheadAttention(nn.Layer):
prev_key_padding_mask: Optional[Tensor],
batch_size: int,
src_len: int,
static_kv: bool,
) -> Optional[Tensor]:
static_kv: bool, ) -> Optional[Tensor]:
# saved key padding masks have shape (bsz, seq_len)
if prev_key_padding_mask is not None and static_kv:
new_key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), key_padding_mask.float()], axis=1
)
[prev_key_padding_mask.float(), key_padding_mask.float()],
axis=1)
# During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current
# is None
@ -723,11 +720,9 @@ class MultiheadAttention(nn.Layer):
if src_len > prev_key_padding_mask.size(1):
filler = paddle.zeros(
(batch_size, src_len - prev_key_padding_mask.size(1)),
device=prev_key_padding_mask.device,
)
device=prev_key_padding_mask.device, )
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), filler.float()], axis=1
)
[prev_key_padding_mask.float(), filler.float()], axis=1)
else:
new_key_padding_mask = prev_key_padding_mask.float()
@ -735,11 +730,9 @@ class MultiheadAttention(nn.Layer):
if src_len > key_padding_mask.size(1):
filler = paddle.zeros(
(batch_size, src_len - key_padding_mask.size(1)),
device=key_padding_mask.device,
)
device=key_padding_mask.device, )
new_key_padding_mask = paddle.concat(
[filler.float(), key_padding_mask.float()], axis=1
)
[filler.float(), key_padding_mask.float()], axis=1)
else:
new_key_padding_mask = key_padding_mask.float()
@ -748,7 +741,8 @@ class MultiheadAttention(nn.Layer):
return new_key_padding_mask
def _get_input_buffer(
self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
self,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
) -> Dict[str, Optional[Tensor]]:
result = self.get_incremental_state(incremental_state, "attn_state")
if result is not None:
@ -760,9 +754,13 @@ class MultiheadAttention(nn.Layer):
def _set_input_buffer(
self,
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
buffer: Dict[str, Optional[Tensor]],
):
return self.set_incremental_state(incremental_state, "attn_state", buffer)
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
return attn_weights
buffer: Dict[str, Optional[Tensor]], ):
return self.set_incremental_state(incremental_state, "attn_state",
buffer)
def apply_sparse_mask(self,
attn_weights,
tgt_len: int,
src_len: int,
bsz: int):
return attn_weights

@ -188,7 +188,7 @@ class WavLMASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]

@ -6,40 +6,38 @@
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import logging
from typing import List, Optional, Tuple
import math
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import LayerNorm
from paddle import Tensor
from .modules.modules import (
MultiheadAttention,
SamePad,
get_activation_fn,
TransposeLast,
GLU_Linear,
)
from paddle.nn import LayerNorm
from .modules.modules import get_activation_fn
from .modules.modules import GLU_Linear
from .modules.modules import MultiheadAttention
from .modules.modules import SamePad
from .modules.modules import TransposeLast
logger = logging.getLogger(__name__)
def compute_mask_indices(
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str = "static",
mask_other: float = 0.0,
min_masks: int = 0,
no_overlap: bool = False,
min_space: int = 0,
) -> np.ndarray:
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str="static",
mask_other: float=0.0,
min_masks: int=0,
no_overlap: bool=False,
min_space: int=0, ) -> np.ndarray:
"""
Computes random mask spans for a given shape
@ -65,9 +63,7 @@ def compute_mask_indices(
all_num_mask = int(
# add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
mask_prob * all_sz / float(mask_length) + np.random.rand())
all_num_mask = max(min_masks, all_num_mask)
@ -77,9 +73,7 @@ def compute_mask_indices(
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
# add a random number for probabilistic rounding
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
mask_prob * sz / float(mask_length) + np.random.rand())
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
@ -88,7 +82,8 @@ def compute_mask_indices(
if mask_type == "static":
lengths = np.full(num_mask, mask_length)
elif mask_type == "uniform":
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
lengths = np.random.randint(
mask_other, mask_length * 2 + 1, size=num_mask)
elif mask_type == "normal":
lengths = np.random.normal(mask_length, mask_other, size=num_mask)
lengths = [max(1, int(round(x))) for x in lengths]
@ -119,9 +114,9 @@ def compute_mask_indices(
min_length = min(lengths)
for length in sorted(lengths, reverse=True):
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 for s, e in parts),
np.int,
)
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break
@ -137,13 +132,10 @@ def compute_mask_indices(
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray(
[
mask_idc[j] + offset
for j in range(len(mask_idc))
for offset in range(lengths[j])
]
)
mask_idc = np.asarray([
mask_idc[j] + offset
for j in range(len(mask_idc)) for offset in range(lengths[j])
])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
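
The "add a random number for probabilistic rounding" lines above turn a fractional expected span count into an integer without a systematic bias. A toy check with illustrative numbers:

```python
import numpy as np

mask_prob, mask_length, sz = 0.65, 10, 195
# the expected span count is 12.675; adding U[0,1) before int() yields 13 about
# 67.5% of the time and 12 otherwise, so the mean stays at 12.675
num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
assert num_mask in (12, 13)
```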
@ -158,54 +150,54 @@ def compute_mask_indices(
class WavLMConfig:
def __init__(self, cfg=None):
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.normalize: bool = False # normalize input to have 0 mean and unit variance during training
# dropouts
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
# masking
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
# channel masking
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
# positional embeddings
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
# relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
if cfg is not None:
self.update(cfg)
@ -216,9 +208,8 @@ class WavLMConfig:
class WavLM(nn.Layer):
def __init__(
self,
cfg: WavLMConfig,
) -> None:
self,
cfg: WavLMConfig, ) -> None:
super().__init__()
logger.info(f"WavLM Config: {cfg.__dict__}")
@ -230,14 +221,11 @@ class WavLM(nn.Layer):
conv_layers=feature_enc_layers,
dropout=0.0,
mode=cfg.extractor_mode,
conv_bias=cfg.conv_bias,
)
conv_bias=cfg.conv_bias, )
self.post_extract_proj = (
nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim
else None
)
self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim else
None)
self.mask_prob = cfg.mask_prob
self.mask_selection = cfg.mask_selection
@ -260,8 +248,7 @@ class WavLM(nn.Layer):
self.mask_emb = self.create_parameter(
shape=[cfg.encoder_embed_dim],
default_initializer=nn.initializer.Uniform(),
)
default_initializer=nn.initializer.Uniform(), )
self.encoder = TransformerEncoder(cfg)
self.layer_norm = LayerNorm(self.embed)
@ -278,8 +265,7 @@ class WavLM(nn.Layer):
self.mask_other,
min_masks=2,
no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space,
)
min_space=self.mask_min_space, )
# mask_indices = torch.from_numpy(mask_indices).to(x.device)
mask_indices = paddle.to_tensor(mask_indices, dtype='int64')
x[mask_indices] = self.mask_emb
@ -295,40 +281,35 @@ class WavLM(nn.Layer):
self.mask_channel_selection,
self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space,
)
min_space=self.mask_channel_min_space, )
mask_channel_indices = (
# torch.from_numpy(mask_channel_indices)
paddle.to_tensor(mask_channel_indices, dtype='int64')
.to(x.device)
.unsqueeze(1)
.expand(-1, T, -1)
)
.to(x.device).unsqueeze(1).expand(-1, T, -1))
x[mask_channel_indices] = 0
return x, mask_indices
def forward_padding_mask(
self, features: Tensor, padding_mask: Tensor,
) -> Tensor:
self,
features: Tensor,
padding_mask: Tensor, ) -> Tensor:
extra = padding_mask.size(1) % features.size(1)
if extra > 0:
padding_mask = padding_mask[:, :-extra]
padding_mask = padding_mask.view(
padding_mask.size(0), features.size(1), -1
)
padding_mask = padding_mask.reshape(
[padding_mask.size(0), features.size(1), -1])
padding_mask = padding_mask.all(-1)
return padding_mask
def extract_features(
self,
source: Tensor,
padding_mask: Optional[Tensor] = None,
mask: bool = False,
ret_conv: bool = False,
output_layer: Optional[int] = None,
ret_layer_results: bool = False,
):
self,
source: Tensor,
padding_mask: Optional[Tensor]=None,
mask: bool=False,
ret_conv: bool=False,
output_layer: Optional[int]=None,
ret_layer_results: bool=False, ):
if self.feature_grad_mult > 0:
features = self.feature_extractor(source)
@ -339,7 +320,7 @@ class WavLM(nn.Layer):
with paddle.no_grad():
features = self.feature_extractor(source)
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = self.layer_norm(features)
if padding_mask is not None:
@ -351,9 +332,7 @@ class WavLM(nn.Layer):
features = self.dropout_input(features)
if mask:
x, mask_indices = self.apply_mask(
features, padding_mask
)
x, mask_indices = self.apply_mask(features, padding_mask)
else:
x = features
@ -362,33 +341,35 @@ class WavLM(nn.Layer):
# x: (B, T, D), float
# padding_mask: (B, T), bool
# mask_indices: (B, T), bool
x, layer_results = self.encoder(
x,
padding_mask=padding_mask,
layer=None if output_layer is None else output_layer - 1
)
layer=None if output_layer is None else output_layer - 1)
# print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}")
res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results}
res = {
"x": x,
"padding_mask": padding_mask,
"features": features,
"layer_results": layer_results
}
feature = res["features"] if ret_conv else res["x"]
if ret_layer_results:
feature = (feature, res["layer_results"])
return feature, res["padding_mask"]
def forward(self, x):
return self.extract_features(x)[0]
class ConvFeatureExtractionModel(nn.Layer):
def __init__(
self,
conv_layers: List[Tuple[int, int, int]],
dropout: float = 0.0,
mode: str = "default",
conv_bias: bool = False,
conv_type: str = "default"
):
def __init__(self,
conv_layers: List[Tuple[int, int, int]],
dropout: float=0.0,
mode: str="default",
conv_bias: bool=False,
conv_type: str="default"):
super().__init__()
assert mode in {"default", "layer_norm"}
@ -400,17 +381,20 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=False,
is_group_norm=False,
conv_bias=False,
):
conv_bias=False, ):
def make_conv():
conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
conv = nn.Conv1D(
n_in,
n_out,
k,
stride=stride,
bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
# nn.init.kaiming_normal_(conv.weight)
return conv
assert (
is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
assert (is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
if is_layer_norm:
return nn.Sequential(
@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer):
nn.Sequential(
TransposeLast(),
nn.LayerNorm(normalized_shape=dim, epsilon=1e-5),
TransposeLast(),
),
nn.GELU(),
)
TransposeLast(), ),
nn.GELU(), )
elif is_group_norm:
return nn.Sequential(
make_conv(),
nn.Dropout(p=dropout),
nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(),
)
nn.GroupNorm(
num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(), )
else:
return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
return nn.Sequential(
make_conv(), nn.Dropout(p=dropout), nn.GELU())
self.conv_type = conv_type
if self.conv_type == "default":
@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=mode == "layer_norm",
is_group_norm=mode == "default" and i == 0,
conv_bias=conv_bias,
)
)
conv_bias=conv_bias, ))
in_d = dim
elif self.conv_type == "conv2d":
in_d = 1
@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride)
)
self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
elif self.conv_type == "custom":
@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)
)
self.conv_layers.append(
paddle.nn.LayerNorm([dim, idim])
)
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1))
self.conv_layers.append(paddle.nn.LayerNorm([dim, idim]))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
if (i + 1) % 2 == 0:
self.conv_layers.append(
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)
)
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True))
idim = int(math.ceil(idim / 2))
else:
pass
@ -500,14 +475,15 @@ class ConvFeatureExtractionModel(nn.Layer):
else:
x = conv(x)
x = x.transpose([0, 1, 3, 2]).contiguous()
x = x.view(x.size(0), -1, x.size(-1))
x = x.reshape([x.size(0), -1, x.size(-1)])
else:
for conv in self.conv_layers:
x = conv(x)
if self.conv_type == "conv2d":
b, c, t, f = x.size()
# x = x.transpose(2, 3).contiguous().view(b, c * f, t)
x = x.transpose([0, 1, 3, 2]).contiguous().view(b, c * f, t)
# x = x.transpose(2, 3).contiguous().reshape([b, c * f, t])
x = x.transpose([0, 1, 3, 2]).contiguous().reshape(
[b, c * f, t])
return x
@ -518,8 +494,8 @@ class TransformerEncoder(nn.Layer):
self.dropout = args.dropout
self.embedding_dim = args.encoder_embed_dim
dropout = 0
std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
std = math.sqrt(
(4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
self.pos_conv = nn.Conv1D(
self.embedding_dim,
@ -528,15 +504,16 @@ class TransformerEncoder(nn.Layer):
padding=args.conv_pos // 2,
groups=args.conv_pos_groups,
weight_attr=nn.initializer.Normal(mean=0, std=std),
bias_attr=True
)
bias_attr=True)
# nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
# nn.init.constant_(self.pos_conv.bias, 0)
# self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
# self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0)
self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
self.pos_conv = nn.utils.weight_norm(
self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv,
SamePad(args.conv_pos), nn.GELU())
if hasattr(args, "relative_position_embedding"):
self.relative_position_embedding = args.relative_position_embedding
@ -547,25 +524,23 @@ class TransformerEncoder(nn.Layer):
self.num_buckets = 0
self.max_distance = 0
self.layers = nn.LayerList(
[
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos,
)
for i in range(args.encoder_layers)
]
)
self.layers = nn.LayerList([
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(
self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos, )
for i in range(args.encoder_layers)
])
self.layer_norm_first = args.layer_norm_first
self.layer_norm = LayerNorm(self.embedding_dim)
@ -574,14 +549,19 @@ class TransformerEncoder(nn.Layer):
# self.apply(init_bert_params)
def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)
x, layer_results = self.extract_features(x, padding_mask,
streaming_mask, layer)
# print("x.shape", x.shape)
if self.layer_norm_first and layer is None:
x = self.layer_norm(x)
return x, layer_results
def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None):
def extract_features(self,
x,
padding_mask=None,
streaming_mask=None,
tgt_layer=None):
if padding_mask is not None:
x[padding_mask] = 0
@ -598,7 +578,6 @@ class TransformerEncoder(nn.Layer):
# x = x.transpose(0, 1)
x = x.transpose([1, 0, 2])
layer_results = []
z = None
if tgt_layer is not None:
@ -608,7 +587,12 @@ class TransformerEncoder(nn.Layer):
for i, layer in enumerate(self.layers):
dropout_probability = np.random.random()
if not self.training or (dropout_probability > self.layerdrop):
x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias)
x, z, pos_bias = layer(
x,
self_attn_padding_mask=padding_mask,
need_weights=False,
self_attn_mask=streaming_mask,
pos_bias=pos_bias)
if tgt_layer is not None:
layer_results.append((x, z))
if i == tgt_layer:
@ -633,20 +617,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
def __init__(
self,
embedding_dim: float = 768,
ffn_embedding_dim: float = 3072,
num_attention_heads: float = 8,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
activation_fn: str = "relu",
layer_norm_first: bool = False,
has_relative_attention_bias: bool = True,
num_buckets: int = 0,
max_distance: int = 0,
rescale_init: bool = False,
gru_rel_pos: bool = True,
) -> None:
embedding_dim: float=768,
ffn_embedding_dim: float=3072,
num_attention_heads: float=8,
dropout: float=0.1,
attention_dropout: float=0.1,
activation_dropout: float=0.1,
activation_fn: str="relu",
layer_norm_first: bool=False,
has_relative_attention_bias: bool=True,
num_buckets: int=0,
max_distance: int=0,
rescale_init: bool=False,
gru_rel_pos: bool=True, ) -> None:
super().__init__()
# Initialize parameters
@ -666,8 +649,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
num_buckets=num_buckets,
max_distance=max_distance,
rescale_init=rescale_init,
gru_rel_pos=gru_rel_pos,
)
gru_rel_pos=gru_rel_pos, )
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(self.activation_dropout)
@ -679,7 +661,8 @@ class TransformerSentenceEncoderLayer(nn.Layer):
self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
if self.activation_name == "glu":
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
"swish")
else:
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
@ -687,21 +670,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
# layer norm associated with the position wise feed-forward NN
self.final_layer_norm = LayerNorm(self.embedding_dim)
def forward(
self,
x: Tensor,
self_attn_mask: Tensor = None,
self_attn_padding_mask: Tensor = None,
need_weights: bool = False,
pos_bias=None
):
def forward(self,
x: Tensor,
self_attn_mask: Tensor=None,
self_attn_padding_mask: Tensor=None,
need_weights: bool=False,
pos_bias=None):
"""
LayerNorm is applied either before or after the self-attention/ffn
modules similar to the original Transformer implementation.
"""
residual = x
if self.layer_norm_first:
x = self.self_attn_layer_norm(x)
x, attn, pos_bias = self.self_attn(
query=x,
@ -710,8 +691,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
# import pdb; pdb.set_trace()
x = self.dropout1(x)
x = residual + x
@ -734,8 +714,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=need_weights,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
x = self.dropout1(x)
x = residual + x

@ -109,11 +109,11 @@ class MultiHeadAttention(nn.Layer):
n_batch, n_ctx, n_state = q.shape
scale = (n_state // self.n_head)**-0.25
q = paddle.transpose(
q.view(*q.shape[:2], self.n_head, -1), (0, 2, 1, 3)) * scale
q.reshape([*q.shape[:2], self.n_head, -1]), (0, 2, 1, 3)) * scale
k = paddle.transpose(
k.view(*k.shape[:2], self.n_head, -1), (0, 2, 3, 1)) * scale
k.reshape([*k.shape[:2], self.n_head, -1]), (0, 2, 3, 1)) * scale
v = paddle.transpose(
v.view(*v.shape[:2], self.n_head, -1), (0, 2, 1, 3))
v.reshape([*v.shape[:2], self.n_head, -1]), (0, 2, 1, 3))
qk = q @ k
if mask is not None:
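
A standalone sketch of the head split that the hunk above rewrites with `reshape` (toy sizes assumed): the hidden dimension is split into `n_head` slices and moved next to the batch axis before the scaled dot product.

```python
import paddle

n_batch, n_ctx, n_state, n_head = 2, 6, 64, 8
q = paddle.randn([n_batch, n_ctx, n_state])
scale = (n_state // n_head)**-0.25
# [B, T, C] -> [B, T, H, C // H] -> [B, H, T, C // H]
q = paddle.transpose(q.reshape([*q.shape[:2], n_head, -1]), (0, 2, 1, 3)) * scale
print(q.shape)  # [2, 8, 6, 8]
```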
@ -823,7 +823,7 @@ class BeamSearchDecoder(TokenDecoder):
if self.finished_sequences is None: # for the first update
self.finished_sequences = [{} for _ in range(batch_size)]
logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32)
logprobs = F.log_softmax(logits, axis=-1, dtype='float32')
next_tokens, source_indices, finished_sequences = [], [], []
for i in range(batch_size):
scores, sources, finished = {}, {}, {}
@ -969,7 +969,7 @@ class ApplyTimestampRules(LogitFilter):
logits[:, last_allowed + 1:] = -np.inf
# if sum of probability over timestamps is above any other token, sample timestamp
logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32)
logprobs = F.log_softmax(logits, axis=-1, dtype='float32')
for k in range(tokens.shape[0]):
# When using paddle.logsumexp on a 32GB Tesla-V100 GPU, we encountered CUDA error 700.
# To bypass this issue in CI, we have decomposed the operation into separate steps.

@ -110,14 +110,14 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after=concat_after, ) for _ in range(num_blocks)
])
def forward(
self,
memory: paddle.Tensor,
memory_mask: paddle.Tensor,
ys_in_pad: paddle.Tensor,
ys_in_lens: paddle.Tensor,
r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]:
def forward(self,
memory: paddle.Tensor,
memory_mask: paddle.Tensor,
ys_in_pad: paddle.Tensor,
ys_in_lens: paddle.Tensor,
r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
reverse_weight: float=0.0
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Forward decoder.
Args:
memory: encoded memory, float32 (batch, maxlen_in, feat)

@ -181,8 +181,9 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns:
float: Accuracy value (0.0 - 1.0).
"""
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2)
pad_pred = pad_outputs.reshape(
[pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != ignore_label
numerator = paddle.sum(

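For illustration, a self-contained sketch of the masked accuracy computed here; `IGNORE_ID = -1` and the toy shapes are assumptions, and the casts to int64 avoid summing bool tensors directly:

```python
import paddle

IGNORE_ID = -1  # assumed ignore label for this example
pad_targets = paddle.to_tensor([[1, 4, IGNORE_ID], [0, 2, IGNORE_ID]])  # (B, Lmax)
pad_outputs = paddle.randn([2 * 3, 5])  # (B * Lmax, vocab)

pad_pred = pad_outputs.reshape(
    [pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != IGNORE_ID
numerator = paddle.sum(
    paddle.cast(pad_pred == pad_targets, 'int64') * paddle.cast(mask, 'int64'))
denominator = paddle.sum(paddle.cast(mask, 'int64'))
accuracy = float(numerator) / float(denominator)  # in [0.0, 1.0]
```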
@ -138,7 +138,7 @@ class Pitch():
input: np.ndarray,
use_continuous_f0: bool=True,
use_log_f0: bool=True) -> np.ndarray:
input = input.astype(np.float)
input = input.astype(np.float_)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(
input,

@ -203,9 +203,9 @@ def main():
sentences, speaker_set = get_phn_dur(dur_file)
merge_silence(sentences)
# split data into 3 sections
if args.dataset == "baker":
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]

@ -18,6 +18,7 @@ from pathlib import Path
import soundfile as sf
from paddle import inference
import paddlespeech.utils
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@ -48,16 +49,27 @@ def main():
phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
print("frontend done!")
speedyspeech_config = inference.Config(
str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
speedyspeech_config = inference.Config(
str(Path(args.inference_dir)), "speedyspeech")
else:
speedyspeech_config = inference.Config(
str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
speedyspeech_config.enable_use_gpu(100, 0)
speedyspeech_config.enable_memory_optim()
speedyspeech_predictor = inference.create_predictor(speedyspeech_config)
pwg_config = inference.Config(
str(Path(args.inference_dir) / "pwg.pdmodel"),
str(Path(args.inference_dir) / "pwg.pdiparams"))
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
pwg_config = inference.Config(str(Path(args.inference_dir)), "pwg")
else:
pwg_config = inference.Config(
str(Path(args.inference_dir) / "pwg.pdmodel"),
str(Path(args.inference_dir) / "pwg.pdiparams"))
pwg_config.enable_use_gpu(100, 0)
pwg_config.enable_memory_optim()
pwg_predictor = inference.create_predictor(pwg_config)
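
The same version check appears twice above; a hypothetical helper (the name `make_infer_config` and the prefix handling are assumptions, not part of the repo) that factors out the branch:

```python
from pathlib import Path

from paddle import inference

import paddlespeech.utils


def make_infer_config(inference_dir: str, prefix: str) -> inference.Config:
    # Paddle >= 3.0.0-beta: point Config at the export directory plus a model prefix;
    # older Paddle: pass the explicit .pdmodel / .pdiparams files.
    if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
        return inference.Config(str(Path(inference_dir)), prefix)
    return inference.Config(
        str(Path(inference_dir) / f"{prefix}.pdmodel"),
        str(Path(inference_dir) / f"{prefix}.pdiparams"))
```

Usage would then be, for example, `make_infer_config(args.inference_dir, "speedyspeech")`.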

@ -230,15 +230,17 @@ def train_sp(args, config):
output_dir=output_dir)
trainer = Trainer(
updater, stop_trigger=(config.max_epoch, 'epoch'), out=output_dir)
updater,
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir)
if dist.get_rank() == 0:
trainer.extend(
evaluator, trigger=(config.eval_interval_epochs, 'epoch'))
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots),
trigger=(config.save_interval_epochs, 'epoch'))
trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()

@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# during synthesis, a single utterance gives a 1-D `spk_emb`, so add a batch dim
if spk_emb.dim() == 1:
spk_emb = spk_emb.unsqueeze(0)
# concat hidden states with spk embeds and then apply projection
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, paddle.shape(hs)[1], -1])
@ -900,14 +903,14 @@ class FastSpeech2(nn.Layer):
# initialize alpha in scaled positional encoding
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
init_enc_alpha = paddle.to_tensor(init_enc_alpha).reshape([1])
self.encoder.embed[-1].alpha = paddle.create_parameter(
shape=init_enc_alpha.shape,
dtype=str(init_enc_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_enc_alpha))
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
init_dec_alpha = paddle.to_tensor(init_dec_alpha).reshape([1])
self.decoder.embed[-1].alpha = paddle.create_parameter(
shape=init_dec_alpha.shape,
dtype=str(init_dec_alpha.numpy().dtype),

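A minimal sketch of the pattern above, assuming only that the scalar alpha must be a 1-element tensor before being wrapped as a learnable parameter:

```python
import paddle

init_alpha = paddle.to_tensor(1.0).reshape([1])  # scalar -> shape [1]
alpha = paddle.create_parameter(
    shape=init_alpha.shape,
    dtype=str(init_alpha.numpy().dtype),
    default_initializer=paddle.nn.initializer.Assign(init_alpha))
print(alpha.shape)  # [1]
```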
@ -751,10 +751,10 @@ class JETSGenerator(nn.Layer):
# integrate with SID and LID embeddings
if self.spks is not None:
sid_embs = self.sid_emb(sids.view(-1))
sid_embs = self.sid_emb(sids.reshape([-1]))
hs = hs + sid_embs.unsqueeze(1)
if self.langs is not None:
lid_embs = self.lid_emb(lids.view(-1))
lid_embs = self.lid_emb(lids.reshape([-1]))
hs = hs + lid_embs.unsqueeze(1)
# integrate speaker embedding

@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer):
if h_masks is not None:
t = t * paddle.to_tensor(h_masks, dtype="float32")
c = ds.cumsum(axis=-1) - ds / 2
ds_cumsum = ds.cumsum(axis=-1)
ds_half = ds / 2
c = ds_cumsum.astype(ds_half.dtype) - ds_half
energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2
if d_masks is not None:
d_masks = ~(d_masks.unsqueeze(1))
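
A worked toy example of the centre computation that this hunk splits into dtype-aligned steps: with durations `[2, 3, 1]`, the cumulative sum is `[2, 5, 6]` and each token centre sits half a duration earlier.

```python
import paddle

ds = paddle.to_tensor([[2.0, 3.0, 1.0]])  # frames per token, [B, T_text]
ds_cumsum = ds.cumsum(axis=-1)            # [[2., 5., 6.]]
c = ds_cumsum - ds / 2                    # token centres: [[1., 3.5, 5.5]]
print(c.numpy())
```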

@ -577,8 +577,9 @@ class VITSGenerator(nn.Layer):
# decoder
z_p = m_p + paddle.randn(
paddle.shape(m_p)) * paddle.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=g, inverse=True)
wav = self.decoder((z * y_mask)[:, :, :max_len], g=g)
z = self.flow(z_p, y_mask.astype(z_p.dtype), g=g, inverse=True)
wav = self.decoder(
(z * y_mask.astype(z.dtype))[:, :, :max_len], g=g)
return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)
@ -695,4 +696,5 @@ class VITSGenerator(nn.Layer):
path = paddle.cast(path, dtype='float32')
pad_tmp = self.pad1d(path)[:, :-1]
path = path - pad_tmp
return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask
return path.unsqueeze(1).transpose(
[0, 1, 3, 2]) * mask.astype(path.dtype)

@ -129,6 +129,7 @@ class PosteriorEncoder(nn.Layer):
"""
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
x_mask = x_mask.astype(x.dtype)
x = self.input_conv(x) * x_mask
x = self.encoder(x, x_mask, g=g)
stats = self.proj(x) * x_mask

@ -155,6 +155,7 @@ class TextEncoder(nn.Layer):
"""
x = self.emb(x) * math.sqrt(self.attention_dim)
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
x_mask = x_mask.astype(x.dtype)
# encoder assume the channel last (B, T_text, attention_dim)
# but mask shape should be (B, 1, T_text)
x, _ = self.encoder(x, x_mask)

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .conv import *
from .fftconv1d import *
from .geometry import *
from .losses import *
from .positional_encoding import *

@ -120,7 +120,11 @@ class SinusoidalPosEmb(nn.Layer):
self.dim = dim
def forward(self, x: paddle.Tensor):
x = paddle.cast(x, 'float32')
# if x is a 0-dim tensor, add a dimension
if x.ndim == 0:
x = paddle.cast(x.unsqueeze(0), 'float32')
else:
x = paddle.cast(x, 'float32')
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = paddle.exp(paddle.arange(half_dim) * -emb)
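
A self-contained sketch of how the 0-dim guard above lets a scalar timestep pass through the embedding; it assumes the standard sinusoidal form, and the function name and `dim=8` are illustrative only:

```python
import math

import paddle


def sinusoidal_pos_emb(x: paddle.Tensor, dim: int=8) -> paddle.Tensor:
    if x.ndim == 0:  # scalar timestep: add a batch dimension first
        x = x.unsqueeze(0)
    x = paddle.cast(x, 'float32')
    half_dim = dim // 2
    emb = math.log(10000) / (half_dim - 1)
    emb = paddle.exp(paddle.arange(half_dim, dtype='float32') * -emb)
    emb = x.unsqueeze(-1) * emb.unsqueeze(0)
    return paddle.concat([emb.sin(), emb.cos()], axis=-1)


print(sinusoidal_pos_emb(paddle.to_tensor(3)).shape)  # [1, 8]
```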

@ -0,0 +1,214 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import typing
from typing import Optional
from typing import Sequence
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ...utils import satisfy_paddle_version
__all__ = [
"fft_conv1d",
"FFTConv1D",
]
def __unfold(x, kernel_size: int, stride: int):
"""1D only unfolding similar to the one from Paddlepaddle.
Notes
------
Given a tensor `x` of size `[*, T]` this will return
a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
This will automatically pad `x` so that every entry is covered by at least one frame.
Args:
x (Tensor):
tensor for which to return the frames.
kernel_size (int):
size of each frame.
stride (int):
stride between each frame.
"""
shape = list(x.shape)
length = shape.pop(-1)
n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
tgt_length = (n_frames - 1) * stride + kernel_size
padded = F.pad(x, (0, tgt_length - length), data_format="NCL")
strides: typing.List[int] = []
for dim in range(padded.dim()):
strides.append(padded.strides[dim])
assert strides.pop(-1) == 1, "data should be contiguous"
strides = strides + [stride, 1]
return padded.as_strided(shape + [n_frames, kernel_size], strides)
def fft_conv1d(
x: paddle.Tensor,
weight: paddle.Tensor,
bias: Optional[paddle.Tensor]=None,
stride: int=1,
padding: int=0,
block_ratio: float=5, ):
"""
Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
Please check PaddlePaddle documentation for more information.
Notes
------
This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
Typically, the kernel size should be of the order of 256 to see any real gain,
for a stride of 1.
Dilation and groups are not supported at the moment. This function might use
more memory than the default Conv1d implementation.
Args:
x (Tensor):
x signal of shape `[B, C, T]`.
weight (Tensor):
weight of the convolution `[D, C, K]` with `D` the number of output channels.
bias (Tensor or None):
if not None, bias term for the convolution.
stride (int):
stride of convolution.
padding (int):
padding to apply to x.
block_ratio (float):
can be tuned for speed. `x` is split into chunks of size `int(block_ratio * kernel_size)`.
Shape:
- Inputs: `x` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
- Output: `[B, D, T_out]`
"""
x = F.pad(x, (padding, padding), data_format="NCL")
batch, _, length = x.shape
out_channels, _, kernel_size = weight.shape
if length < kernel_size:
raise RuntimeError(
f"Input should be at least as large as the kernel size {kernel_size}, "
f"but it is only {length} samples long.")
if block_ratio < 1:
raise RuntimeError("Block ratio must be greater than 1.")
block_size: int = min(int(kernel_size * block_ratio), length)
fold_stride = block_size - kernel_size + 1
# weight = pad_to(weight, block_size)
weight = F.pad(
weight, (0, block_size - weight.shape[-1]),
mode="constant",
value=0.0,
data_format="NCL")
weight_z = paddle.fft.rfft(weight, axis=-1)
# We pad `x` and split it into overlapping frames, on which the block-wise FFT correlation is applied.
frames = __unfold(x, block_size, fold_stride)
frames_z = paddle.fft.rfft(frames, axis=-1)
weight_z_coml = paddle.conj(weight_z)
out_z = paddle.einsum("bcft,dct->bdft", frames_z, weight_z_coml)
out = paddle.fft.irfft(out_z, n=block_size, axis=-1)
# The last bit is invalid, because FFT will do a circular convolution.
out = out[..., :-kernel_size + 1]
out = out.reshape([batch, out_channels, -1])
out = out[..., ::stride]
target_length = (length - kernel_size) // stride + 1
out = out[..., :target_length]
if bias is not None:
out += bias[:, None]
return out
class FFTConv1D(paddle.nn.Layer):
"""
Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
Notes
------
This module is faster than `paddle.nn.Conv1D` only in specific cases.
Typically, `kernel_size` should be of the order of 256 to see any real gain,
for a stride of 1.
Dilation and groups are not supported at the moment. This module might use
more memory than the default Conv1D implementation.
Args:
in_channels (int):
number of `x` channels.
out_channels (int):
number of output channels.
kernel_size (int):
kernel size of convolution.
stride (int):
stride of convolution.
padding (int):
padding to apply to `x`.
bias_attr (bool):
if True, use a bias term.
Examples:
>>> fftconv = FFTConv1D(12, 24, 128, 4)
>>> x = paddle.randn([4, 12, 1024])
>>> print(list(fftconv(x).shape))
[4, 24, 225]
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int=1,
padding: int=0,
bias_attr: bool=True, ):
super(FFTConv1D, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
# Create a Conv1D layer to initialize weights and bias
conv = paddle.nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
bias_attr=bias_attr)
self.weight = conv.weight
if bias_attr:
self.bias = conv.bias
else:
self.bias = None
def forward(self, x: paddle.Tensor):
return fft_conv1d(x, self.weight, self.bias, self.stride, self.padding)
# Currently, the API unfold in Paddle is extremely slow, so __unfold is implemented
# using the `.strides` and `.as_strided` APIs. However, these are only supported in
# Paddle 2.6 and above; on older versions, F.conv1d and nn.Conv1D are used as drop-in replacements.
if not satisfy_paddle_version('2.6'):
fft_conv1d = F.conv1d
FFTConv1D = nn.Conv1D
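
A small numeric check of the idea behind `fft_conv1d`, written as a toy single-channel case: a product of rFFTs with a conjugated kernel is a circular correlation, so only the first `T - K + 1` samples match the linear result, which is why each block above drops its trailing `kernel_size - 1` samples.

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 1, 64])   # [B, C, T]
w = paddle.randn([1, 1, 5])    # [D, C, K]
direct = F.conv1d(x, w)        # valid linear correlation, length T - K + 1

n = x.shape[-1]
w_pad = F.pad(w, (0, n - w.shape[-1]), data_format="NCL")
circ = paddle.fft.irfft(
    paddle.fft.rfft(x, axis=-1) * paddle.conj(paddle.fft.rfft(w_pad, axis=-1)),
    n=n, axis=-1)
# expected: True
print(paddle.allclose(direct, circ[..., :n - w.shape[-1] + 1], atol=1e-5))
```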

@ -1114,8 +1114,10 @@ class MLMLoss(nn.Layer):
paddle.reshape(after_outs, (-1, self.odim)),
paddle.reshape(xs_pad, (-1, self.odim))),
axis=-1)
mlm_loss_pos = (mlm_loss_pos).astype(loss.dtype)
mlm_loss = paddle.sum((loss * paddle.reshape(
mlm_loss_pos, [-1]))) / paddle.sum((mlm_loss_pos) + 1e-10)
mlm_loss_pos,
[-1]).astype(loss.dtype))) / paddle.sum((mlm_loss_pos) + 1e-10)
text_mlm_loss = None

@ -29,7 +29,27 @@ def is_broadcastable(shp1, shp2):
def broadcast_shape(shp1, shp2):
result = []
for a, b in zip(shp1[::-1], shp2[::-1]):
result.append(max(a, b))
is_a_int = isinstance(a, int)
is_b_int = isinstance(b, int)
if is_a_int and is_b_int:
result.append(max(a, b))
else:
dtype = None
if hasattr(a, 'dtype'):
dtype = a.dtype
if hasattr(b, 'dtype'):
dtype = b.dtype
if (is_a_int):
a = paddle.full((), a, dtype=dtype)
if (is_b_int):
b = paddle.full((), b, dtype=dtype)
result.append(paddle.maximum(a, b))
return result[::-1]
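
A short illustration of the case this rewrite handles: dims coming from `paddle.shape(...)` are 0-dim tensors rather than Python ints, so the per-axis maximum is taken with `paddle.maximum` after lifting the int side to a tensor of the same dtype.

```python
import paddle

a = 3                                      # plain Python int
b = paddle.shape(paddle.zeros([5, 7]))[1]  # int32 tensor holding 7
a_t = paddle.full((), a, dtype=b.dtype)    # lift the int to a 0-dim tensor
print(paddle.maximum(a_t, b))              # 7
```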

@ -181,7 +181,12 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
bs = paddle.shape(lengths)
# if lengths is a 0-dim tensor, add a dimension
if lengths.ndim == 0:
bs = paddle.shape(lengths.unsqueeze(0))
else:
bs = paddle.shape(lengths)
if xs is None:
maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
else:
@ -348,7 +353,9 @@ def get_random_segments(
"""
b, c, t = paddle.shape(x)
max_start_idx = x_lengths - segment_size
start_idxs = paddle.cast(paddle.rand([b]) * max_start_idx, 'int64')
rand_number = paddle.rand([b])
start_idxs = paddle.cast(rand_number *
max_start_idx.astype(rand_number.dtype), 'int64')
segments = get_segments(x, start_idxs, segment_size)
return segments, start_idxs
@ -459,7 +466,7 @@ def phones_masking(xs_pad: paddle.Tensor,
for s, e in zip(masked_start, masked_end):
masked_pos[idx, s:e] = 1
non_eos_mask = paddle.reshape(src_mask, paddle.shape(xs_pad)[:2])
masked_pos = masked_pos * non_eos_mask
masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype)
masked_pos = paddle.cast(masked_pos, 'bool')
return masked_pos
@ -543,10 +550,11 @@ def phones_text_masking(xs_pad: paddle.Tensor,
for s, e in zip(masked_start, masked_end):
masked_pos[idx, s:e] = 1
non_eos_mask = paddle.reshape(src_mask, shape=paddle.shape(xs_pad)[:2])
masked_pos = masked_pos * non_eos_mask
masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype)
non_eos_text_mask = paddle.reshape(
text_mask, shape=paddle.shape(text_pad)[:2])
text_masked_pos = text_masked_pos * non_eos_text_mask
text_masked_pos = text_masked_pos * non_eos_text_mask.astype(
text_masked_pos.dtype)
masked_pos = paddle.cast(masked_pos, 'bool')
text_masked_pos = paddle.cast(text_masked_pos, 'bool')

@ -171,7 +171,8 @@ class AttLoc(nn.Layer):
if paddle.sum(att_prev) == 0:
# no previous attention yet: initialize to a uniform distribution over non-padded frames (padding stays 0)
att_prev = 1.0 - make_pad_mask(enc_hs_len)
att_prev = att_prev / enc_hs_len.unsqueeze(-1)
att_prev = att_prev / enc_hs_len.unsqueeze(-1).astype(
att_prev.dtype)
# att_prev: (utt, frame) -> (utt, 1, 1, frame)
# -> (utt, att_conv_chans, 1, frame)

@ -162,6 +162,8 @@ class Encoder(nn.Layer):
return xs.transpose([0, 2, 1])
if not isinstance(ilens, paddle.Tensor):
ilens = paddle.to_tensor(ilens)
if ilens.ndim == 0:
ilens = ilens.unsqueeze(0)
xs = xs.transpose([0, 2, 1])
# for dygraph to static graph
# self.blstm.flatten_parameters()

@ -67,7 +67,7 @@ class PositionalEncoding(nn.Layer):
pe[:, 0::2] = paddle.sin(position * div_term)
pe[:, 1::2] = paddle.cos(position * div_term)
pe = pe.unsqueeze(0)
self.pe = pe
self.pe = paddle.assign(pe)
def forward(self, x: paddle.Tensor):
"""Add positional encoding.

@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype):
elif dtype is core.VarDesc.VarType.FP16:
return np.float16
elif dtype is core.VarDesc.VarType.BOOL:
return np.bool
return np.bool_
elif dtype is core.VarDesc.VarType.INT32:
return np.int32
elif dtype is core.VarDesc.VarType.INT64:

@ -11,3 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from packaging.version import Version
def satisfy_version(source: str, target: str, dev_allowed: bool=True) -> bool:
if dev_allowed and source.startswith('0.0.0'):
target_version = Version('0.0.0')
else:
target_version = Version(target)
source_version = Version(source)
return source_version >= target_version
def satisfy_paddle_version(target: str, dev_allowed: bool=True) -> bool:
import paddle
return satisfy_version(paddle.__version__, target, dev_allowed)
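
A few comparisons showing why the dev-build special case exists: `packaging` orders pre-releases below final releases, and a develop build of Paddle typically reports `0.0.0`, which would otherwise fail every check.

```python
from packaging.version import Version

print(Version('2.6.1') >= Version('2.6'))         # True
print(Version('3.0.0-beta') >= Version('3.0.0'))  # False: beta sorts below the final release
print(Version('0.0.0') >= Version('3.0.0'))       # False, hence the startswith('0.0.0') branch above
```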

@ -39,7 +39,12 @@ class MultiSpeakerMelDataset(Dataset):
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
speaker_dirs = []
for f in self.root.glob("*"):
if f.is_dir():
assert list(f.glob(
"*.npy")), "This folder NOT includes any npy data file."
speaker_dirs.append(f)
speaker_utterances = {
speaker_dir: list(speaker_dir.glob("*.npy"))

@ -37,7 +37,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
out = wav_sum / lengths.astype(wav_sum.dtype)
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
else:

@ -1 +1 @@
../../../utils/
../../../utils/

@ -1 +1 @@
../../../../utils/
../../../../utils/

@ -14,6 +14,7 @@ function main(){
cd ${speech_ci_path}/tts
python test_data_table.py
python test_enfrontend.py
python test_fftconv1d.py
python test_mixfrontend.py
echo "End TTS"

@ -0,0 +1,128 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
import numpy as np
import paddle
from paddle.nn import Conv1D
from paddlespeech.t2s.modules import fft_conv1d
from paddlespeech.t2s.modules import FFTConv1D
class TestFFTConv1D(unittest.TestCase):
def setUp(self):
self.batch_size = 4
self.in_channels = 3
self.out_channels = 16
self.kernel_size = 5
self.stride = 1
self.padding = 1
self.input_length = 32
def _init_models(self, in_channels, out_channels, kernel_size, stride,
padding):
x = paddle.randn([self.batch_size, in_channels, self.input_length])
conv1d = paddle.nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding)
fft_conv1d = FFTConv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding)
fft_conv1d.weight.set_value(conv1d.weight.numpy())
if conv1d.bias is not None:
fft_conv1d.bias.set_value(conv1d.bias.numpy())
return x, conv1d, fft_conv1d
def test_fft_conv1d_vs_conv1d_default(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
self.padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_no_padding(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
0)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_large_kernel(self):
kernel_size = 256
padding = kernel_size - 1
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, kernel_size, self.stride,
padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_stride_2(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, 2,
self.padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_different_input_length(self):
input_length = 1024
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
self.padding)
x = paddle.randn([self.batch_size, self.in_channels, input_length])
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_no_bias(self):
conv1d = paddle.nn.Conv1D(
self.in_channels,
self.out_channels,
self.kernel_size,
stride=self.stride,
padding=self.padding,
bias_attr=False)
fft_conv1d = FFTConv1D(
self.in_channels,
self.out_channels,
self.kernel_size,
stride=self.stride,
padding=self.padding,
bias_attr=False)
fft_conv1d.weight.set_value(conv1d.weight.numpy())
x = paddle.randn([self.batch_size, self.in_channels, self.input_length])
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
if __name__ == '__main__':
unittest.main()
