diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend index 25953976d..820985404 120000 --- a/demos/TTSArmLinux/src/TTSCppFrontend +++ b/demos/TTSArmLinux/src/TTSCppFrontend @@ -1 +1 @@ -../../TTSCppFrontend/ \ No newline at end of file +../../TTSCppFrontend/ diff --git a/examples/aishell/asr0/utils b/examples/aishell/asr0/utils index 256f914ab..94d118d25 120000 --- a/examples/aishell/asr0/utils +++ b/examples/aishell/asr0/utils @@ -1 +1 @@ -../../../utils/ \ No newline at end of file +../../../utils/ diff --git a/examples/csmsc/jets/README.md b/examples/csmsc/jets/README.md index 07dade0e6..20314cec0 100644 --- a/examples/csmsc/jets/README.md +++ b/examples/csmsc/jets/README.md @@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168 ## Dataset ### Download and Extract -Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). +Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. + +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 969567762..7f7cdde0e 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2 ### Download and Extract Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 3347c6473..e4d100619 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010. ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. 
+ +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/csmsc/voc5/iSTFTNet.md b/examples/csmsc/voc5/iSTFTNet.md index 8f121938a..693950c54 100644 --- a/examples/csmsc/voc5/iSTFTNet.md +++ b/examples/csmsc/voc5/iSTFTNet.md @@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203 ### Download and Extract Download CSMSC from it's [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. +The structure of the folder is listed below. + +```text +└─ Wave + └─ .wav files (audio speech) +└─ PhoneLabeling + └─ .interval files (alignment between phoneme and duration) +└─ ProsodyLabeling + └─ 000001-010000.txt (text with prosodic by pinyin) +``` + ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. diff --git a/examples/librispeech/asr0/README.md b/examples/librispeech/asr0/README.md index 2d3836c6b..a097dd99f 100644 --- a/examples/librispeech/asr0/README.md +++ b/examples/librispeech/asr0/README.md @@ -144,7 +144,7 @@ source path.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2 avg.sh best exp/deepspeech2/checkpoints 1 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 ``` ## Stage 4: Static graph model Export This stage is to transform dygraph to static graph. @@ -185,5 +185,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w ``` You can train a model by yourself, then you need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav ``` diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md index ca0081444..1b02698c7 100644 --- a/examples/librispeech/asr1/README.md +++ b/examples/librispeech/asr1/README.md @@ -148,7 +148,7 @@ or you can run these scripts in the command line (only use CPU). 
bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` ## Pretrained Model You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md). @@ -163,7 +163,7 @@ source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 bash local/data.sh --stage 2 --stop_stage 2 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` The performance of the released models are shown in [here](./RESULTS.md). @@ -192,8 +192,8 @@ bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 20 # test stage is optional -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 -CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 +CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 ``` ## Stage 5: Single Audio File Inference In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below @@ -214,5 +214,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w ``` You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav ``` diff --git a/examples/librispeech/asr2/steps b/examples/librispeech/asr2/steps index 995eeccb7..7cb6e568e 120000 --- a/examples/librispeech/asr2/steps +++ b/examples/librispeech/asr2/steps @@ -1 +1 @@ -../../../tools/kaldi/egs/wsj/s5/steps/ \ No newline at end of file +../../../tools/kaldi/egs/wsj/s5/steps/ diff --git a/examples/tal_cs/asr1/README.md b/examples/tal_cs/asr1/README.md index 83a27ac1e..176925190 100644 --- a/examples/tal_cs/asr1/README.md +++ b/examples/tal_cs/asr1/README.md @@ -27,7 +27,6 @@ The document below will describe the scripts in `run.sh` in detail. The path.sh contains the environment variables. ```bash . ./path.sh -. ./cmd.sh ``` This script needs to be run first. And another script is also needed: ```bash @@ -67,7 +66,6 @@ bash run.sh --stage 0 --stop_stage 0 You can also just run these scripts in your command line. ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh ``` After processing the data, the `data` directory will look like this: @@ -103,7 +101,6 @@ bash run.sh --stage 0 --stop_stage 1 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. 
./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer ``` @@ -124,7 +121,6 @@ or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 10 @@ -144,11 +140,10 @@ bash run.sh --stage 0 --stop_stage 3 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer avg.sh best exp/conformer/checkpoints 10 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 ``` ## Pretrained Model You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md). @@ -163,7 +158,7 @@ source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 bash local/data.sh --stage 2 --stop_stage 2 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 ``` The performance of the released models are shown in [here](./RESULTS.md). @@ -186,5 +181,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa ``` You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below. ```bash -CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav +CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav ``` diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md index 489f5bc3e..8eb45ce5e 100644 --- a/examples/tiny/asr1/README.md +++ b/examples/tiny/asr1/README.md @@ -26,7 +26,6 @@ The document below will describe the scripts in ```run.sh```in detail. The path.sh contains the environment variables. ```bash . ./path.sh -. ./cmd.sh ``` This script needs to be run first. And another script is also needed: ```bash @@ -64,7 +63,6 @@ bash run.sh --stage 0 --stop_stage 0 You can also just run these scripts in your command line. ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh ``` After processing the data, the ``data`` directory will look like this: @@ -100,7 +98,6 @@ bash run.sh --stage 0 --stop_stage 1 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer ```## Stage 2: Top-k Models Averaging @@ -119,7 +116,6 @@ bash run.sh --stage 0 --stop_stage 2 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 @@ -139,7 +135,6 @@ bash run.sh --stage 0 --stop_stage 3 or you can run these scripts in the command line (only use CPU). ```bash . ./path.sh -. 
./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 @@ -166,7 +161,6 @@ bash run.sh --stage 4 --stop_stage 4 or you can also use these scripts in the command line (only use CPU). ```bash . ./path.sh -. ./cmd.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer avg.sh best exp/transformer/checkpoints 1 diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils index 256f914ab..94d118d25 120000 --- a/examples/voxceleb/sv0/utils +++ b/examples/voxceleb/sv0/utils @@ -1 +1 @@ -../../../utils/ \ No newline at end of file +../../../utils/ diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index d087405d5..0b763684f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -75,7 +75,7 @@ class DeepSpeech2Tester_hub(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") - audio_len = paddle.to_tensor(feat.shape[0]) + audio_len = paddle.to_tensor(feat.shape[0]).unsqueeze(0) audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) result_transcripts = self.compute_result_transcripts( diff --git a/paddlespeech/s2t/exps/hubert/bin/test.py b/paddlespeech/s2t/exps/hubert/bin/test.py index e0ad09f0a..b08b0209a 100644 --- a/paddlespeech/s2t/exps/hubert/bin/test.py +++ b/paddlespeech/s2t/exps/hubert/bin/test.py @@ -18,7 +18,7 @@ from yacs.config import CfgNode from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser -from paddlespeech.s2t.utils.utility import print_arguments +from paddlespeech.utils.argparse import print_arguments def main_sp(config, args): diff --git a/paddlespeech/s2t/exps/hubert/bin/train.py b/paddlespeech/s2t/exps/hubert/bin/train.py index b7c0a924f..391405674 100644 --- a/paddlespeech/s2t/exps/hubert/bin/train.py +++ b/paddlespeech/s2t/exps/hubert/bin/train.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser -from paddlespeech.s2t.utils.utility import print_arguments +from paddlespeech.utils.argparse import print_arguments def main_sp(config, args): diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py index 73a9794fc..72c64e467 100755 --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -75,7 +75,7 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") - ilen = paddle.to_tensor(feat.shape[0]) + ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: {decode_config}") diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index a6228a128..0d1a3b3cc 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -78,7 +78,7 @@ class U2Infer(): if self.args.debug: np.savetxt("feat.transform.txt", feat) - ilen = paddle.to_tensor(feat.shape[0]) + ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) decode_config = self.config.decode logger.info(f"decode cfg: 
{decode_config}") diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py index c17cee0fd..55a241ffc 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py @@ -37,8 +37,6 @@ if __name__ == "__main__": # save asr result to parser.add_argument( '--dict-path', type=str, default=None, help='dict path.') - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py index 0295713ff..7747b868b 100644 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py @@ -104,11 +104,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/paddlespeech/s2t/exps/wavlm/model.py b/paddlespeech/s2t/exps/wavlm/model.py index 6ed2c5d87..606867eae 100644 --- a/paddlespeech/s2t/exps/wavlm/model.py +++ b/paddlespeech/s2t/exps/wavlm/model.py @@ -33,7 +33,7 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline from paddlespeech.s2t.io.speechbrain import dataio from paddlespeech.s2t.io.speechbrain import dataset from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader -from paddlespeech.s2t.models.wavlm.processing.speech_augmentation import TimeDomainSpecAugment +from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -211,7 +211,7 @@ class WavLMASRTrainer(Trainer): loss.backward() layer_tools.print_grads(self.model, print_func=None) - + # NOTE: the code below asserted that the backward() is problematic, and as more steps are accumulated, the output from wavlm alone will be the same for all frames # optimizer step old if (batch_index + 1) % train_conf.accum_grad == 0: @@ -428,8 +428,7 @@ class WavLMASRTrainer(Trainer): report("epoch", self.epoch) report('step', self.iteration) report("model_lr", self.model_optimizer.get_lr()) - report("wavlm_lr", - self.wavlm_optimizer.get_lr()) + report("wavlm_lr", self.wavlm_optimizer.get_lr()) self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) @@ -680,8 +679,7 @@ class WavLMASRTrainer(Trainer): logger.info("optim_model:{},{}", model_optim_type, model_optim_conf) wavlm_optim_type = train_config.wavlm_optim wavlm_optim_conf = train_config.wavlm_optim_conf - logger.info("optim_model:{},{}", wavlm_optim_type, - wavlm_optim_conf) + logger.info("optim_model:{},{}", wavlm_optim_type, wavlm_optim_conf) model_scheduler_type = train_config.model_scheduler model_scheduler_conf = train_config.model_scheduler_conf @@ -698,8 +696,8 @@ class WavLMASRTrainer(Trainer): model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type, model_scheduler_args) - wavlm_lr_scheduler = LRSchedulerFactory.from_args( - wavlm_scheduler_type, wavlm_scheduler_args) + wavlm_lr_scheduler = LRSchedulerFactory.from_args(wavlm_scheduler_type, + wavlm_scheduler_args) def optimizer_args( 
config, @@ -716,24 +714,31 @@ class WavLMASRTrainer(Trainer): }) return optim_arg - model_optimizer_args = optimizer_args( - config, model_optim_type, - model_optim_conf, - [{'params': model._layers.enc.parameters()}, {'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.enc.parameters()}, {'params': model.ctc.parameters()}], - model_lr_scheduler - ) - # [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler) - + model_optimizer_args = optimizer_args(config, model_optim_type, + model_optim_conf, [{ + 'params': + model._layers.enc.parameters() + }, { + 'params': + model._layers.ctc.parameters() + }] if self.parallel else [{ + 'params': + model.enc.parameters() + }, { + 'params': + model.ctc.parameters() + }], model_lr_scheduler) + # [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler) wavlm_optimizer_args = optimizer_args( config, wavlm_optim_type, wavlm_optim_conf, - model._layers.wavlm.parameters() if self.parallel else - model.wavlm.parameters(), wavlm_lr_scheduler) + model._layers.wavlm.parameters() + if self.parallel else model.wavlm.parameters(), wavlm_lr_scheduler) model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer_args) wavlm_optimizer = OptimizerFactory.from_args(wavlm_optim_type, - wavlm_optimizer_args) + wavlm_optimizer_args) self.model_optimizer = model_optimizer self.wavlm_optimizer = wavlm_optimizer diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py index 688bf5f84..797c23a0f 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py @@ -129,7 +129,7 @@ def _compute_mask_indices( [sequence_length for _ in range(batch_size)]) # SpecAugment mask to fill - spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool) + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_) spec_aug_mask_idxs = [] max_num_masked_span = compute_num_masked_span(sequence_length) @@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple, sampled_negative_indices = np.zeros( shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - mask_time_indices = (mask_time_indices.astype(np.bool) + mask_time_indices = (mask_time_indices.astype(np.bool_) if mask_time_indices is not None else - np.ones(features_shape, dtype=np.bool)) + np.ones(features_shape, dtype=np.bool_)) for batch_idx in range(batch_size): high = mask_time_indices[batch_idx].sum() - 1 diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index 3fbb9426b..be78b516a 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -714,13 +714,13 @@ class MultiheadAttention(nn.Layer): else: if self.beam_size > 1 and bsz == key.size(1): # key is [T, bsz*beam_size, C], reduce to [T, bsz, C] - key = key.view( - key.size(0), -1, self.beam_size, - key.size(2))[:, :, 0, :] + key = key.reshape( + [key.size(0), -1, self.beam_size, + key.size(2)])[:, :, 0, :] if key_padding_mask is not None: - key_padding_mask = key_padding_mask.view( - -1, self.beam_size, - key_padding_mask.size(1))[:, 0, :] + key_padding_mask = key_padding_mask.reshape( + [-1, self.beam_size, + 
key_padding_mask.size(1)])[:, 0, :] k = self.k_proj(key) v = self.v_proj(key) @@ -1476,7 +1476,7 @@ def compute_mask_indices( lens = np.fromiter( (e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, ) + np.int_, ) l_sum = np.sum(lens) if l_sum == 0: break diff --git a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py index 7267e2211..a0e279c30 100644 --- a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py @@ -88,7 +88,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) else: wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) - out = wav_sum / lengths + out = wav_sum / lengths.astype(wav_sum.dtype) elif amp_type == "peak": out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0] else: @@ -248,4 +248,4 @@ def notch_filter(notch_freq, filter_width=101, notch_width=0.05): hhpf[pad] += 1 # Adding filters creates notch filter - return (hlpf + hhpf).view(1, -1, 1) + return (hlpf + hhpf).reshape([1, -1, 1]) diff --git a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py index 50a95f0b1..e8a605610 100644 --- a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py @@ -743,7 +743,7 @@ class SpecAugment(paddle.nn.Layer): time = x.shape[2] if time - window <= window: - return x.view(*original_size) + return x.reshape([*original_size]) # compute center and corresponding window c = paddle.randint(window, time - window, (1, ))[0] @@ -762,7 +762,7 @@ class SpecAugment(paddle.nn.Layer): x[:, :, :w] = left x[:, :, w:] = right - return x.view(*original_size) + return x.reshape([*original_size]) def mask_along_axis(self, x, dim): """Mask along time or frequency axis. 
@@ -775,7 +775,7 @@ class SpecAugment(paddle.nn.Layer): """ original_size = x.shape if x.dim() == 4: - x = x.view(-1, x.shape[2], x.shape[3]) + x = x.reshape([-1, x.shape[2], x.shape[3]]) batch, time, fea = x.shape @@ -795,7 +795,7 @@ class SpecAugment(paddle.nn.Layer): (batch, n_mask)).unsqueeze(2) # compute masks - arange = paddle.arange(end=D).view(1, 1, -1) + arange = paddle.arange(end=D).reshape([1, 1, -1]) mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) mask = mask.any(axis=1) @@ -811,7 +811,7 @@ class SpecAugment(paddle.nn.Layer): # same to x.masked_fill_(mask, val) y = paddle.full(x.shape, val, x.dtype) x = paddle.where(mask, y, x) - return x.view(*original_size) + return x.reshape([*original_size]) class TimeDomainSpecAugment(nn.Layer): diff --git a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py index 6ed9ecd0e..1a0fca531 100644 --- a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py +++ b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py @@ -6,40 +6,38 @@ # Based on fairseq code bases # https://github.com/pytorch/fairseq # -------------------------------------------------------- - -import math import logging -from typing import List, Optional, Tuple +import math +from typing import List +from typing import Optional +from typing import Tuple import numpy as np - import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.nn import LayerNorm from paddle import Tensor -from .modules.modules import ( - MultiheadAttention, - SamePad, - get_activation_fn, - TransposeLast, - GLU_Linear, -) +from paddle.nn import LayerNorm + +from .modules.modules import get_activation_fn +from .modules.modules import GLU_Linear +from .modules.modules import MultiheadAttention +from .modules.modules import SamePad +from .modules.modules import TransposeLast logger = logging.getLogger(__name__) def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[Tensor], - mask_prob: float, - mask_length: int, - mask_type: str = "static", - mask_other: float = 0.0, - min_masks: int = 0, - no_overlap: bool = False, - min_space: int = 0, -) -> np.ndarray: + shape: Tuple[int, int], + padding_mask: Optional[Tensor], + mask_prob: float, + mask_length: int, + mask_type: str="static", + mask_other: float=0.0, + min_masks: int=0, + no_overlap: bool=False, + min_space: int=0, ) -> np.ndarray: """ Computes random mask spans for a given shape @@ -65,9 +63,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() - ) + mask_prob * all_sz / float(mask_length) + np.random.rand()) all_num_mask = max(min_masks, all_num_mask) @@ -77,9 +73,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() - ) + mask_prob * sz / float(mask_length) + np.random.rand()) num_mask = max(min_masks, num_mask) else: sz = all_sz @@ -88,7 +82,8 @@ def compute_mask_indices( if mask_type == "static": lengths = np.full(num_mask, mask_length) elif mask_type == "uniform": - lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) + lengths = np.random.randint( + mask_other, mask_length * 2 + 1, size=num_mask) elif mask_type == "normal": lengths = np.random.normal(mask_length, mask_other, size=num_mask) lengths = [max(1, int(round(x))) for x in lengths] @@ -119,9 +114,9 @@ def compute_mask_indices( 
min_length = min(lengths) for length in sorted(lengths, reverse=True): lens = np.fromiter( - (e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, - ) + (e - s if e - s >= length + min_space else 0 + for s, e in parts), + np.int_, ) l_sum = np.sum(lens) if l_sum == 0: break @@ -137,13 +132,10 @@ def compute_mask_indices( mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) - mask_idc = np.asarray( - [ - mask_idc[j] + offset - for j in range(len(mask_idc)) - for offset in range(lengths[j]) - ] - ) + mask_idc = np.asarray([ + mask_idc[j] + offset + for j in range(len(mask_idc)) for offset in range(lengths[j]) + ]) mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) @@ -158,54 +150,54 @@ def compute_mask_indices( class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - self.encoder_layers: int = 12 # num encoder layers in the transformer + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + self.encoder_layers: int = 12 # num encoder layers in the transformer - self.encoder_embed_dim: int = 768 # encoder embedding dimension - self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN - self.encoder_attention_heads: int = 12 # num encoder attention heads - self.activation_fn: str = "gelu" # activation function to use + self.encoder_embed_dim: int = 768 # encoder embedding dimension + self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN + self.encoder_attention_heads: int = 12 # num encoder attention heads + self.activation_fn: str = "gelu" # activation function to use - self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - self.conv_bias: bool = False # include bias in conv encoder - self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this + self.layer_norm_first: bool = False # apply layernorm first in the transformer + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] 
+ self.conv_bias: bool = False # include bias in conv encoder + self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this self.normalize: bool = False # normalize input to have 0 mean and unit variance during training # dropouts - self.dropout: float = 0.1 # dropout probability for the transformer - self.attention_dropout: float = 0.1 # dropout probability for attention weights - self.activation_dropout: float = 0.0 # dropout probability after activation in FFN - self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer - self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) - self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr) + self.dropout: float = 0.1 # dropout probability for the transformer + self.attention_dropout: float = 0.1 # dropout probability for attention weights + self.activation_dropout: float = 0.0 # dropout probability after activation in FFN + self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer + self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr) # masking - self.mask_length: int = 10 # mask length - self.mask_prob: float = 0.65 # probability of replacing a token with mask - self.mask_selection: str = "static" # how to choose mask length - self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indicesh - self.no_mask_overlap: bool = False # whether to allow masks to overlap - self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled) + self.mask_length: int = 10 # mask length + self.mask_prob: float = 0.65 # probability of replacing a token with mask + self.mask_selection: str = "static" # how to choose mask length + self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indicesh + self.no_mask_overlap: bool = False # whether to allow masks to overlap + self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled) # channel masking - self.mask_channel_length: int = 10 # length of the mask for features (channels) - self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0 - self.mask_channel_selection: str = "static" # how to choose mask length for channel masking - self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices - self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap - self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled) + self.mask_channel_length: int = 10 # length of the mask for features (channels) + self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0 + self.mask_channel_selection: str = "static" # how to choose mask length for channel masking + self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices + self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap + self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled) # positional embeddings - self.conv_pos: int = 128 # number of filters for convolutional positional embeddings - self.conv_pos_groups: int = 16 # number of groups for convolutional positional 
embedding + self.conv_pos: int = 128 # number of filters for convolutional positional embeddings + self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding # relative position embedding - self.relative_position_embedding: bool = True # apply relative position embedding - self.num_buckets: int = 320 # number of buckets for relative position embedding - self.max_distance: int = 1280 # maximum distance for relative position embedding - self.gru_rel_pos: bool = True # apply gated relative position embedding + self.relative_position_embedding: bool = True # apply relative position embedding + self.num_buckets: int = 320 # number of buckets for relative position embedding + self.max_distance: int = 1280 # maximum distance for relative position embedding + self.gru_rel_pos: bool = True # apply gated relative position embedding if cfg is not None: self.update(cfg) @@ -216,9 +208,8 @@ class WavLMConfig: class WavLM(nn.Layer): def __init__( - self, - cfg: WavLMConfig, - ) -> None: + self, + cfg: WavLMConfig, ) -> None: super().__init__() logger.info(f"WavLM Config: {cfg.__dict__}") @@ -230,14 +221,11 @@ class WavLM(nn.Layer): conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, - conv_bias=cfg.conv_bias, - ) + conv_bias=cfg.conv_bias, ) - self.post_extract_proj = ( - nn.Linear(self.embed, cfg.encoder_embed_dim) - if self.embed != cfg.encoder_embed_dim - else None - ) + self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim else + None) self.mask_prob = cfg.mask_prob self.mask_selection = cfg.mask_selection @@ -260,8 +248,7 @@ class WavLM(nn.Layer): self.mask_emb = self.create_parameter( shape=[cfg.encoder_embed_dim], - default_initializer=nn.initializer.Uniform(), - ) + default_initializer=nn.initializer.Uniform(), ) self.encoder = TransformerEncoder(cfg) self.layer_norm = LayerNorm(self.embed) @@ -278,8 +265,7 @@ class WavLM(nn.Layer): self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - ) + min_space=self.mask_min_space, ) # mask_indices = torch.from_numpy(mask_indices).to(x.device) mask_indices = paddle.to_tensor(mask_indices, dtype='int64') x[mask_indices] = self.mask_emb @@ -295,40 +281,35 @@ class WavLM(nn.Layer): self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) + min_space=self.mask_channel_min_space, ) mask_channel_indices = ( # torch.from_numpy(mask_channel_indices) paddle.to_tensor(mask_channel_indices, dtype='int64') - .to(x.device) - .unsqueeze(1) - .expand(-1, T, -1) - ) + .to(x.device).unsqueeze(1).expand(-1, T, -1)) x[mask_channel_indices] = 0 return x, mask_indices def forward_padding_mask( - self, features: Tensor, padding_mask: Tensor, - ) -> Tensor: + self, + features: Tensor, + padding_mask: Tensor, ) -> Tensor: extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] padding_mask = padding_mask.view( - padding_mask.size(0), features.size(1), -1 - ) + padding_mask.size(0), features.size(1), -1) padding_mask = padding_mask.all(-1) return padding_mask def extract_features( - self, - source: Tensor, - padding_mask: Optional[Tensor] = None, - mask: bool = False, - ret_conv: bool = False, - output_layer: Optional[int] = None, - ret_layer_results: bool = False, - ): + self, + source: Tensor, + padding_mask: Optional[Tensor]=None, + mask: bool=False, + ret_conv: bool=False, + output_layer: 
Optional[int]=None, + ret_layer_results: bool=False, ): if self.feature_grad_mult > 0: features = self.feature_extractor(source) @@ -339,7 +320,7 @@ class WavLM(nn.Layer): with paddle.no_grad(): features = self.feature_extractor(source) - features = features.transpose([0, 2, 1]) # [1, 49, 512] + features = features.transpose([0, 2, 1]) # [1, 49, 512] features = self.layer_norm(features) if padding_mask is not None: @@ -351,9 +332,7 @@ class WavLM(nn.Layer): features = self.dropout_input(features) if mask: - x, mask_indices = self.apply_mask( - features, padding_mask - ) + x, mask_indices = self.apply_mask(features, padding_mask) else: x = features @@ -362,33 +341,35 @@ class WavLM(nn.Layer): # x: (B, T, D), float # padding_mask: (B, T), bool # mask_indices: (B, T), bool - + x, layer_results = self.encoder( x, padding_mask=padding_mask, - layer=None if output_layer is None else output_layer - 1 - ) + layer=None if output_layer is None else output_layer - 1) # print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}") - res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results} + res = { + "x": x, + "padding_mask": padding_mask, + "features": features, + "layer_results": layer_results + } feature = res["features"] if ret_conv else res["x"] if ret_layer_results: feature = (feature, res["layer_results"]) return feature, res["padding_mask"] - + def forward(self, x): return self.extract_features(x)[0] class ConvFeatureExtractionModel(nn.Layer): - def __init__( - self, - conv_layers: List[Tuple[int, int, int]], - dropout: float = 0.0, - mode: str = "default", - conv_bias: bool = False, - conv_type: str = "default" - ): + def __init__(self, + conv_layers: List[Tuple[int, int, int]], + dropout: float=0.0, + mode: str="default", + conv_bias: bool=False, + conv_type: str="default"): super().__init__() assert mode in {"default", "layer_norm"} @@ -400,17 +381,20 @@ class ConvFeatureExtractionModel(nn.Layer): stride, is_layer_norm=False, is_group_norm=False, - conv_bias=False, - ): + conv_bias=False, ): def make_conv(): - conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias, - weight_attr=nn.initializer.KaimingNormal()) + conv = nn.Conv1D( + n_in, + n_out, + k, + stride=stride, + bias_attr=conv_bias, + weight_attr=nn.initializer.KaimingNormal()) # nn.init.kaiming_normal_(conv.weight) return conv - assert ( - is_layer_norm and is_group_norm - ) == False, "layer norm and group norm are exclusive" + assert (is_layer_norm and is_group_norm + ) == False, "layer norm and group norm are exclusive" if is_layer_norm: return nn.Sequential( @@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer): nn.Sequential( TransposeLast(), nn.LayerNorm(normalized_shape=dim, epsilon=1e-5), - TransposeLast(), - ), - nn.GELU(), - ) + TransposeLast(), ), + nn.GELU(), ) elif is_group_norm: return nn.Sequential( make_conv(), nn.Dropout(p=dropout), - nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5), - nn.GELU(), - ) + nn.GroupNorm( + num_groups=dim, num_channels=dim, epsilon=1e-5), + nn.GELU(), ) else: - return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) + return nn.Sequential( + make_conv(), nn.Dropout(p=dropout), nn.GELU()) self.conv_type = conv_type if self.conv_type == "default": @@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer): stride, is_layer_norm=mode == "layer_norm", is_group_norm=mode == "default" and i == 0, - conv_bias=conv_bias, - ) - ) + conv_bias=conv_bias, )) in_d = dim elif self.conv_type 
== "conv2d": in_d = 1 @@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer): assert len(cl) == 3 (dim, k, stride) = cl - self.conv_layers.append( - paddle.nn.Conv2D(in_d, dim, k, stride) - ) + self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride)) self.conv_layers.append(paddle.nn.ReLU()) in_d = dim elif self.conv_type == "custom": @@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer): assert len(cl) == 3 (dim, k, stride) = cl self.conv_layers.append( - paddle.nn.Conv2D(in_d, dim, k, stride, padding=1) - ) - self.conv_layers.append( - paddle.nn.LayerNorm([dim, idim]) - ) + paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)) + self.conv_layers.append(paddle.nn.LayerNorm([dim, idim])) self.conv_layers.append(paddle.nn.ReLU()) in_d = dim if (i + 1) % 2 == 0: self.conv_layers.append( - paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True) - ) + paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)) idim = int(math.ceil(idim / 2)) else: pass @@ -518,8 +493,8 @@ class TransformerEncoder(nn.Layer): self.dropout = args.dropout self.embedding_dim = args.encoder_embed_dim dropout = 0 - std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) - + std = math.sqrt( + (4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) self.pos_conv = nn.Conv1D( self.embedding_dim, @@ -528,15 +503,16 @@ class TransformerEncoder(nn.Layer): padding=args.conv_pos // 2, groups=args.conv_pos_groups, weight_attr=nn.initializer.Normal(mean=0, std=std), - bias_attr=True - ) + bias_attr=True) # nn.init.normal_(self.pos_conv.weight, mean=0, std=std) # nn.init.constant_(self.pos_conv.bias, 0) # self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) # self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0) - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) - self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) + self.pos_conv = nn.utils.weight_norm( + self.pos_conv, name="weight", dim=2) + self.pos_conv = nn.Sequential(self.pos_conv, + SamePad(args.conv_pos), nn.GELU()) if hasattr(args, "relative_position_embedding"): self.relative_position_embedding = args.relative_position_embedding @@ -547,25 +523,23 @@ class TransformerEncoder(nn.Layer): self.num_buckets = 0 self.max_distance = 0 - self.layers = nn.LayerList( - [ - TransformerSentenceEncoderLayer( - embedding_dim=self.embedding_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=self.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - activation_fn=args.activation_fn, - layer_norm_first=args.layer_norm_first, - has_relative_attention_bias=(self.relative_position_embedding and i == 0), - num_buckets=self.num_buckets, - max_distance=self.max_distance, - gru_rel_pos=args.gru_rel_pos, - ) - for i in range(args.encoder_layers) - ] - ) + self.layers = nn.LayerList([ + TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + has_relative_attention_bias=( + self.relative_position_embedding and i == 0), + num_buckets=self.num_buckets, + max_distance=self.max_distance, + gru_rel_pos=args.gru_rel_pos, ) + for i in range(args.encoder_layers) + 
]) self.layer_norm_first = args.layer_norm_first self.layer_norm = LayerNorm(self.embedding_dim) @@ -574,14 +548,19 @@ class TransformerEncoder(nn.Layer): # self.apply(init_bert_params) def forward(self, x, padding_mask=None, streaming_mask=None, layer=None): - x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer) + x, layer_results = self.extract_features(x, padding_mask, + streaming_mask, layer) # print("x.shape", x.shape) if self.layer_norm_first and layer is None: x = self.layer_norm(x) return x, layer_results - def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None): + def extract_features(self, + x, + padding_mask=None, + streaming_mask=None, + tgt_layer=None): if padding_mask is not None: x[padding_mask] = 0 @@ -598,7 +577,6 @@ class TransformerEncoder(nn.Layer): # x = x.transpose(0, 1) x = x.transpose([1, 0, 2]) - layer_results = [] z = None if tgt_layer is not None: @@ -608,7 +586,12 @@ class TransformerEncoder(nn.Layer): for i, layer in enumerate(self.layers): dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): - x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias) + x, z, pos_bias = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + self_attn_mask=streaming_mask, + pos_bias=pos_bias) if tgt_layer is not None: layer_results.append((x, z)) if i == tgt_layer: @@ -633,20 +616,19 @@ class TransformerSentenceEncoderLayer(nn.Layer): def __init__( self, - embedding_dim: float = 768, - ffn_embedding_dim: float = 3072, - num_attention_heads: float = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - layer_norm_first: bool = False, - has_relative_attention_bias: bool = True, - num_buckets: int = 0, - max_distance: int = 0, - rescale_init: bool = False, - gru_rel_pos: bool = True, - ) -> None: + embedding_dim: float=768, + ffn_embedding_dim: float=3072, + num_attention_heads: float=8, + dropout: float=0.1, + attention_dropout: float=0.1, + activation_dropout: float=0.1, + activation_fn: str="relu", + layer_norm_first: bool=False, + has_relative_attention_bias: bool=True, + num_buckets: int=0, + max_distance: int=0, + rescale_init: bool=False, + gru_rel_pos: bool=True, ) -> None: super().__init__() # Initialize parameters @@ -666,8 +648,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): num_buckets=num_buckets, max_distance=max_distance, rescale_init=rescale_init, - gru_rel_pos=gru_rel_pos, - ) + gru_rel_pos=gru_rel_pos, ) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(self.activation_dropout) @@ -679,7 +660,8 @@ class TransformerSentenceEncoderLayer(nn.Layer): self.self_attn_layer_norm = LayerNorm(self.embedding_dim) if self.activation_name == "glu": - self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish") + self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, + "swish") else: self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) @@ -687,21 +669,19 @@ class TransformerSentenceEncoderLayer(nn.Layer): # layer norm associated with the position wise feed-forward NN self.final_layer_norm = LayerNorm(self.embedding_dim) - def forward( - self, - x: Tensor, - self_attn_mask: Tensor = None, - self_attn_padding_mask: Tensor = None, - need_weights: bool = False, - pos_bias=None - ): + def forward(self, + x: 
Tensor, + self_attn_mask: Tensor=None, + self_attn_padding_mask: Tensor=None, + need_weights: bool=False, + pos_bias=None): """ LayerNorm is applied either before or after the self-attention/ffn modules similar to the original Transformer imlementation. """ residual = x if self.layer_norm_first: - + x = self.self_attn_layer_norm(x) x, attn, pos_bias = self.self_attn( query=x, @@ -710,8 +690,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): key_padding_mask=self_attn_padding_mask, need_weights=False, attn_mask=self_attn_mask, - position_bias=pos_bias - ) + position_bias=pos_bias) # import pdb; pdb.set_trace() x = self.dropout1(x) x = residual + x @@ -734,8 +713,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): key_padding_mask=self_attn_padding_mask, need_weights=need_weights, attn_mask=self_attn_mask, - position_bias=pos_bias - ) + position_bias=pos_bias) x = self.dropout1(x) x = residual + x diff --git a/paddlespeech/s2t/models/whisper/whisper.py b/paddlespeech/s2t/models/whisper/whisper.py index 9925e7cd5..d20cc04b6 100644 --- a/paddlespeech/s2t/models/whisper/whisper.py +++ b/paddlespeech/s2t/models/whisper/whisper.py @@ -109,11 +109,11 @@ class MultiHeadAttention(nn.Layer): n_batch, n_ctx, n_state = q.shape scale = (n_state // self.n_head)**-0.25 q = paddle.transpose( - q.view(*q.shape[:2], self.n_head, -1), (0, 2, 1, 3)) * scale + q.reshape([*q.shape[:2], self.n_head, -1]), (0, 2, 1, 3)) * scale k = paddle.transpose( - k.view(*k.shape[:2], self.n_head, -1), (0, 2, 3, 1)) * scale + k.reshape([*k.shape[:2], self.n_head, -1]), (0, 2, 3, 1)) * scale v = paddle.transpose( - v.view(*v.shape[:2], self.n_head, -1), (0, 2, 1, 3)) + v.reshape([*v.shape[:2], self.n_head, -1]), (0, 2, 1, 3)) qk = q @ k if mask is not None: @@ -823,7 +823,7 @@ class BeamSearchDecoder(TokenDecoder): if self.finished_sequences is None: # for the first update self.finished_sequences = [{} for _ in range(batch_size)] - logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + logprobs = F.log_softmax(logits, axis=-1, dtype='float32') next_tokens, source_indices, finished_sequences = [], [], [] for i in range(batch_size): scores, sources, finished = {}, {}, {} @@ -969,7 +969,7 @@ class ApplyTimestampRules(LogitFilter): logits[:, last_allowed + 1:] = -np.inf # if sum of probability over timestamps is above any other token, sample timestamp - logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + logprobs = F.log_softmax(logits, axis=-1, dtype='float32') for k in range(tokens.shape[0]): # When using paddle.logsumexp on a 32GB Tesla-V100 GPU, we encountered CUDA error 700. # To bypass this issue in CI, we have decomposed the operation into separate steps. 
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py index ea273e245..116554350 100644 --- a/paddlespeech/t2s/datasets/get_feats.py +++ b/paddlespeech/t2s/datasets/get_feats.py @@ -138,7 +138,7 @@ class Pitch(): input: np.ndarray, use_continuous_f0: bool=True, use_log_f0: bool=True) -> np.ndarray: - input = input.astype(np.float) + input = input.astype(np.float_) frame_period = 1000 * self.hop_length / self.sr f0, timeaxis = pyworld.dio( input, diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index a2629a900..c1513e0c4 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -203,9 +203,9 @@ def main(): sentences, speaker_set = get_phn_dur(dur_file) merge_silence(sentences) - # split data into 3 sections if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections num_train = 9800 num_dev = 100 train_wav_files = wav_files[:num_train] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index a95a9b288..fcd54f0d2 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer): spk_emb = self.spk_projection(F.normalize(spk_emb)) hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": + # one wave `spk_emb` under synthesize, the dim is `1` + if spk_emb.dim() == 1: + spk_emb = spk_emb.unsqueeze(0) # concat hidden states with spk embeds and then apply projection spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, paddle.shape(hs)[1], -1]) diff --git a/paddlespeech/t2s/models/jets/length_regulator.py b/paddlespeech/t2s/models/jets/length_regulator.py index f7a395a64..f8629382c 100644 --- a/paddlespeech/t2s/models/jets/length_regulator.py +++ b/paddlespeech/t2s/models/jets/length_regulator.py @@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer): if h_masks is not None: t = t * paddle.to_tensor(h_masks, dtype="float32") - c = ds.cumsum(axis=-1) - ds / 2 + ds_cumsum = ds.cumsum(axis=-1) + ds_half = ds / 2 + c = ds_cumsum.astype(ds_half.dtype) - ds_half energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2 if d_masks is not None: d_masks = ~(d_masks.unsqueeze(1)) diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 427ae09ed..d82d78e7c 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -577,8 +577,9 @@ class VITSGenerator(nn.Layer): # decoder z_p = m_p + paddle.randn( paddle.shape(m_p)) * paddle.exp(logs_p) * noise_scale - z = self.flow(z_p, y_mask, g=g, inverse=True) - wav = self.decoder((z * y_mask)[:, :, :max_len], g=g) + z = self.flow(z_p, y_mask.astype(z_p.dtype), g=g, inverse=True) + wav = self.decoder( + (z * y_mask.astype(z.dtype))[:, :, :max_len], g=g) return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1) @@ -695,4 +696,5 @@ class VITSGenerator(nn.Layer): path = paddle.cast(path, dtype='float32') pad_tmp = self.pad1d(path)[:, :-1] path = path - pad_tmp - return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask + return path.unsqueeze(1).transpose( + [0, 1, 3, 2]) * mask.astype(path.dtype) diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py index 5e3d6b9ce..b0a071b23 100644 --- 
a/paddlespeech/t2s/models/vits/posterior_encoder.py +++ b/paddlespeech/t2s/models/vits/posterior_encoder.py @@ -129,6 +129,7 @@ class PosteriorEncoder(nn.Layer): """ x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) + x_mask = x_mask.astype(x.dtype) x = self.input_conv(x) * x_mask x = self.encoder(x, x_mask, g=g) stats = self.proj(x) * x_mask diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 015ed76c6..5b9de95a9 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -155,6 +155,7 @@ class TextEncoder(nn.Layer): """ x = self.emb(x) * math.sqrt(self.attention_dim) x_mask = make_non_pad_mask(x_lengths).unsqueeze(1) + x_mask = x_mask.astype(x.dtype) # encoder assume the channel last (B, T_text, attention_dim) # but mask shape shoud be (B, 1, T_text) x, _ = self.encoder(x, x_mask) diff --git a/paddlespeech/t2s/modules/masked_fill.py b/paddlespeech/t2s/modules/masked_fill.py index 1445a926a..d143fe62f 100644 --- a/paddlespeech/t2s/modules/masked_fill.py +++ b/paddlespeech/t2s/modules/masked_fill.py @@ -29,7 +29,27 @@ def is_broadcastable(shp1, shp2): def broadcast_shape(shp1, shp2): result = [] for a, b in zip(shp1[::-1], shp2[::-1]): - result.append(max(a, b)) + is_a_int = isinstance(a, int) + is_b_int = isinstance(b, int) + + if is_a_int and is_b_int: + result.append(max(a, b)) + + else: + dtype = None + if hasattr(a, 'dtype'): + dtype = a.dtype + if hasattr(b, 'dtype'): + dtype = b.dtype + + if (is_a_int): + a = paddle.full((), a, dtype=dtype) + + if (is_b_int): + b = paddle.full((), b, dtype=dtype) + + result.append(paddle.maximum(a, b)) + return result[::-1] diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 57c46e3a8..0a66a1c88 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -181,6 +181,10 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) + # check if ilens is 0-dim tensor, if so, add a dimension + if lengths.ndim == 0: + lengths = lengths.unsqueeze(0) + bs = paddle.shape(lengths) if xs is None: maxlen = paddle.cast(lengths.max(), dtype=bs.dtype) @@ -348,7 +352,9 @@ def get_random_segments( """ b, c, t = paddle.shape(x) max_start_idx = x_lengths - segment_size - start_idxs = paddle.cast(paddle.rand([b]) * max_start_idx, 'int64') + rand_number = paddle.rand([b]) + start_idxs = paddle.cast(rand_number * + max_start_idx.astype(rand_number.dtype), 'int64') segments = get_segments(x, start_idxs, segment_size) return segments, start_idxs diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index f90eb44a4..e4331cff0 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -67,7 +67,7 @@ class PositionalEncoding(nn.Layer): pe[:, 0::2] = paddle.sin(position * div_term) pe[:, 1::2] = paddle.cos(position * div_term) pe = pe.unsqueeze(0) - self.pe = pe + self.pe = paddle.assign(pe) def forward(self, x: paddle.Tensor): """Add positional encoding. 
diff --git a/paddlespeech/t2s/utils/internals.py b/paddlespeech/t2s/utils/internals.py index 830e8a80f..56b3ecaae 100644 --- a/paddlespeech/t2s/utils/internals.py +++ b/paddlespeech/t2s/utils/internals.py @@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype): elif dtype is core.VarDesc.VarType.FP16: return np.float16 elif dtype is core.VarDesc.VarType.BOOL: - return np.bool + return np.bool_ elif dtype is core.VarDesc.VarType.INT32: return np.int32 elif dtype is core.VarDesc.VarType.INT64: diff --git a/runtime/examples/text_lm/utils b/runtime/examples/text_lm/utils index 256f914ab..94d118d25 120000 --- a/runtime/examples/text_lm/utils +++ b/runtime/examples/text_lm/utils @@ -1 +1 @@ -../../../utils/ \ No newline at end of file +../../../utils/ diff --git a/runtime/examples/u2pp_ol/wenetspeech/utils b/runtime/examples/u2pp_ol/wenetspeech/utils index c2519a9dd..758320d41 120000 --- a/runtime/examples/u2pp_ol/wenetspeech/utils +++ b/runtime/examples/u2pp_ol/wenetspeech/utils @@ -1 +1 @@ -../../../../utils/ \ No newline at end of file +../../../../utils/