From 8eff478af3ba11eadb7f75c596cefe65e88ae723 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Tue, 14 Mar 2023 11:25:47 +0000
Subject: [PATCH] Cherry-pick: change optimizer and fix import error.

---
 README.md                                   |   1 +
 README_cn.md                                |   3 +-
 examples/aishell/asr3/README.md             |   4 +-
 examples/aishell/asr3/conf/wav2vec2ASR.yaml |   6 +-
 .../asr3/conf/wav2vec2ASR_adadelta.yaml     | 168 ++++++++++++++++++
 examples/aishell/asr3/local/test.sh         |   6 +-
 examples/aishell/asr3/local/test_wav.sh     |   2 +-
 examples/aishell/asr3/run.sh                |   4 +-
 paddlespeech/resource/pretrained_models.py  |   4 +-
 .../s2t/exps/wav2vec2/bin/test_wav.py       |  17 +-
 paddlespeech/t2s/exps/syn_utils.py          |   7 +-
 11 files changed, 197 insertions(+), 25 deletions(-)
 create mode 100755 examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml

diff --git a/README.md b/README.md
index 5c5dc3a0f..3c60db650 100644
--- a/README.md
+++ b/README.md
@@ -180,6 +180,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 ### Recent Update
 - 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
 - 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux).
+- 🔥 2023.03.03: Add Voice Conversion [StarGANv2-VC synthesis pipeline](./examples/vctk/vc3).
 - 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
 - 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
 - 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
diff --git a/README_cn.md b/README_cn.md
index fa013029c..29ee387c0 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -183,8 +183,9 @@
 - 🧩 级联模型应用: 作为传统语音任务的扩展，我们结合了自然语言处理、计算机视觉等任务，实现更接近实际需求的产业级应用。
 
 ### 近期更新
-- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3).
+- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
 - 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。
+- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
 - 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
 - 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
 - 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md
index f6fa60d7f..6b587e12f 100644
--- a/examples/aishell/asr3/README.md
+++ b/examples/aishell/asr3/README.md
@@ -190,9 +190,9 @@ tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
 ```
 You can download the audio demo:
 ```bash
-wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 ```
 You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav
 ```
diff --git a/examples/aishell/asr3/conf/wav2vec2ASR.yaml b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
index cdb04f8c1..4a1274688 100755
--- a/examples/aishell/asr3/conf/wav2vec2ASR.yaml
+++ b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
@@ -107,6 +107,7 @@ vocab_filepath: data/lang_char/vocab.txt
 ###########################################
 
 unit_type: 'char'
+tokenizer: bert-base-chinese
 mean_std_filepath:
 preprocess_config: conf/preprocess.yaml
 sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
@@ -139,12 +140,10 @@ n_epoch: 80
 accum_grad: 1
 global_grad_clip: 5.0
 
-model_optim: adadelta
+model_optim: sgd
 model_optim_conf:
   lr: 1.0
   weight_decay: 0.0
-  rho: 0.95
-  epsilon: 1.0e-8
 
 wav2vec2_optim: adam
 wav2vec2_optim_conf:
@@ -165,3 +164,4 @@ log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
+
diff --git a/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml
new file mode 100755
index 000000000..ec287f0c6
--- /dev/null
+++ b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml
@@ -0,0 +1,168 @@
+############################################
+# Network Architecture                     #
+############################################
+freeze_wav2vec2: False
+normalize_wav: True
+output_norm: True
+init_type: 'kaiming_uniform' # !Warning: needed for convergence
+enc:
+  input_shape: 1024
+  dnn_blocks: 3
+  dnn_neurons: 1024
+  activation: True
+  normalization: True
+  dropout_rate: [0.15, 0.15, 0.0]
+ctc:
+  enc_n_units: 1024
+  blank_id: 0
+  dropout_rate: 0.0
+
+audio_augment:
+  speeds: [90, 100, 110]
+
+spec_augment:
+  time_warp: True
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  freq_mask: True
+  n_freq_mask: 2
+  time_mask: True
+  n_time_mask: 2
+  replace_with_zero: False
+  freq_mask_width: 30
+  time_mask_width: 40
+wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
+
+
+############################################
+# Wav2Vec2.0                               #
+############################################
+# vocab_size: 1000000
+hidden_size: 1024
+num_hidden_layers: 24
+num_attention_heads: 16
+intermediate_size: 4096
+hidden_act: gelu
+hidden_dropout: 0.1
+activation_dropout: 0.0
+attention_dropout: 0.1
+feat_proj_dropout: 0.1
+feat_quantizer_dropout: 0.0
+final_dropout: 0.0
+layerdrop: 0.1
+initializer_range: 0.02
+layer_norm_eps: 1e-5
+feat_extract_norm: layer
+feat_extract_activation: gelu
+conv_dim: [512, 512, 512, 512, 512, 512, 512]
+conv_stride: [5, 2, 2, 2, 2, 2, 2]
+conv_kernel: [10, 3, 3, 3, 3, 2, 2]
+conv_bias: True
+num_conv_pos_embeddings: 128
+num_conv_pos_embedding_groups: 16
+do_stable_layer_norm: True
+apply_spec_augment: False
+mask_channel_length: 10
+mask_channel_min_space: 1
+mask_channel_other: 0.0
+mask_channel_prob: 0.0
+mask_channel_selection: static
+mask_feature_length: 10
+mask_feature_min_masks: 0
+mask_feature_prob: 0.0
+mask_time_length: 10
+mask_time_min_masks: 2
+mask_time_min_space: 1
+mask_time_other: 0.0
+mask_time_prob: 0.075
+mask_time_selection: static
+num_codevectors_per_group: 320
+num_codevector_groups: 2
+contrastive_logits_temperature: 0.1
+num_negatives: 100
+codevector_dim: 256
+proj_codevector_dim: 256
+diversity_loss_weight: 0.1
+use_weighted_layer_sum: False
+# pad_token_id: 0
+# bos_token_id: 1
+# eos_token_id: 2
+add_adapter: False
+adapter_kernel_size: 3
+adapter_stride: 2
+num_adapter_layers: 3
+output_hidden_size: None
+
+###########################################
+# Data                                    #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+vocab_filepath: data/lang_char/vocab.txt
+
+###########################################
+# Dataloader                              #
+###########################################
+
+unit_type: 'char'
+tokenizer: bert-base-chinese
+mean_std_filepath:
+preprocess_config: conf/preprocess.yaml
+sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 5 # Different batch_size may cause large differences in results
+maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced
+maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 6
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: True
+return_lens_rate: True
+
+###########################################
+# use speechbrain dataloader              #
+###########################################
+use_sb_pipeline: True # whether to use the speechbrain pipeline. Default is True.
+sb_pipeline_conf: conf/train_with_wav2vec.yaml
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+global_grad_clip: 5.0
+
+model_optim: adadelta
+model_optim_conf:
+  lr: 1.0
+  weight_decay: 0.0
+  rho: 0.95
+  epsilon: 1.0e-8
+
+wav2vec2_optim: adam
+wav2vec2_optim_conf:
+  lr: 0.0001
+  weight_decay: 0.0
+
+model_scheduler: newbobscheduler
+model_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.8
+  patient: 0
+wav2vec2_scheduler: newbobscheduler
+wav2vec2_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.9
+  patient: 0
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh
index 9d4b84291..91e1c5457 100755
--- a/examples/aishell/asr3/local/test.sh
+++ b/examples/aishell/asr3/local/test.sh
@@ -8,9 +8,7 @@ echo "using $ngpu gpus..."
 expdir=exp
 datadir=data
 
-train_set=train_960
-recog_set="test-clean test-other dev-clean dev-other"
-recog_set="test-clean"
+train_set=train
 
 config_path=$1
 decode_config_path=$2
@@ -75,7 +73,7 @@ for type in ctc_prefix_beam_search; do
         --trans_hyp ${ckpt_prefix}.${type}.rsl.text
 
     python3 utils/compute-wer.py --char=1 --v=1 \
-        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+        data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
     echo "decoding ${type} done."
 done
 
diff --git a/examples/aishell/asr3/local/test_wav.sh b/examples/aishell/asr3/local/test_wav.sh
index fdf3589f4..7ccef6945 100755
--- a/examples/aishell/asr3/local/test_wav.sh
+++ b/examples/aishell/asr3/local/test_wav.sh
@@ -14,7 +14,7 @@ ckpt_prefix=$3
 audio_file=$4
 
 mkdir -p data
-wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 if [ $? -ne 0 ]; then
     exit 1
 fi
diff --git a/examples/aishell/asr3/run.sh b/examples/aishell/asr3/run.sh
index 9b0a3c472..557ca0fcd 100755
--- a/examples/aishell/asr3/run.sh
+++ b/examples/aishell/asr3/run.sh
@@ -15,11 +15,11 @@ resume= # xx e.g. 30
 export FLAGS_cudnn_deterministic=1
 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
-audio_file=data/demo_002_en.wav
+audio_file=data/demo_01_03.wav
 
 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"git revert -v
+echo "checkpoint name ${ckpt}"
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 04df18623..3c5db64bb 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -102,13 +102,11 @@ ssl_dynamic_pretrained_models = {
             'params':
             'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
         },
-    },
-    "wav2vec2ASR_aishell1-zh-16k": {
         '1.4': {
             'url':
             'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz',
             'md5':
-            '9f0bc943adb822789bf61e674b229d17',
+            '150e51b8ea5d255ccce6b395de8d916a',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
index 0d66ac410..5efa82e60 100644
--- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
@@ -18,13 +18,13 @@
 from pathlib import Path
 
 import paddle
 import soundfile
-from yacs.config import CfgNode
-
+from paddlenlp.transformers import AutoTokenizer
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.s2t.utils.utility import UpdateConfig
+from yacs.config import CfgNode
 
 logger = Log(__name__).getlog()
@@ -34,8 +34,13 @@ class Wav2vec2Infer():
         self.config = config
         self.audio_file = args.audio_file
 
-        self.text_feature = TextFeaturizer(
-            unit_type=config.unit_type, vocab=config.vocab_filepath)
+        if self.config.tokenizer:
+            self.text_feature = AutoTokenizer.from_pretrained(
+                self.config.tokenizer)
+        else:
+            self.text_feature = TextFeaturizer(
+                unit_type=config.unit_type, vocab=config.vocab_filepath)
+
         paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
 
         # model
@@ -59,14 +64,14 @@
         audio, _ = soundfile.read(
             self.audio_file, dtype="int16", always_2d=True)
         logger.info(f"audio shape: {audio.shape}")
-
         xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
         decode_config = self.config.decode
         result_transcripts, result_tokenids = self.model.decode(
             xs,
             text_feature=self.text_feature,
             decoding_method=decode_config.decoding_method,
-            beam_size=decode_config.beam_size)
+            beam_size=decode_config.beam_size,
+            tokenizer=self.config.tokenizer, )
         rsl = result_transcripts[0]
         utt = Path(self.audio_file).name
         logger.info(f"hyp: {utt} {rsl}")
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 12b75615e..354636b48 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -27,8 +27,6 @@ from paddle import inference
 from paddle import jit
 from paddle.io import DataLoader
 from paddle.static import InputSpec
-from yacs.config import CfgNode
-
 from paddlespeech.t2s.datasets.am_batch_fn import *
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
@@ -38,6 +36,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.utils.dynamic_import import dynamic_import
+from yacs.config import CfgNode
 
 # remove [W:onnxruntime: xxx] from ort
 ort.set_default_logger_severity(3)
@@ -490,6 +489,7 @@ def get_predictor(
         device: str='cpu',
         # for gpu
         use_trt: bool=False,
+        device_id: int=0,
         # for trt
         use_dynamic_shape: bool=True,
         min_subgraph_size: int=5,
@@ -505,6 +505,7 @@ def get_predictor(
         params_file (os.PathLike): name of params_file.
         device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu.
         use_trt (bool): whether to use TensorRT or not in GPU.
+        device_id (int): Choose your device id, only valid when the device is gpu, default 0.
         use_dynamic_shape (bool): use dynamic shape or not in TensorRT.
         use_mkldnn (bool): whether to use MKLDNN or not in CPU.
         cpu_threads (int): num of thread when use CPU.
@@ -521,7 +522,7 @@ def get_predictor(
     config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
-        config.enable_use_gpu(100, 0)
+        config.enable_use_gpu(100, device_id)
     else:
         config.disable_gpu()
     config.set_cpu_math_library_num_threads(cpu_threads)
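
Review note (not part of the diff above): the `tokenizer` switch added to `test_wav.py` selects between a pretrained tokenizer and the old vocab-file featurizer. Below is a minimal standalone sketch of that selection logic, assuming a yacs `CfgNode` config in the shape of `conf/wav2vec2ASR.yaml` from this patch; the config path and the ad-hoc loading are illustrative, not an API this repo exposes.

```python
from paddlenlp.transformers import AutoTokenizer
from yacs.config import CfgNode

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer

# Load the recipe config; this patch sets `tokenizer: bert-base-chinese` there.
config = CfgNode(new_allowed=True)
config.merge_from_file('conf/wav2vec2ASR.yaml')  # hypothetical working directory

if config.tokenizer:
    # Token ids are mapped back to text with a pretrained tokenizer.
    text_feature = AutoTokenizer.from_pretrained(config.tokenizer)
else:
    # Fall back to the character/vocab-file featurizer, as before this patch.
    text_feature = TextFeaturizer(
        unit_type=config.unit_type, vocab=config.vocab_filepath)
```

The same pattern appears in `Wav2vec2Infer.__init__` above, and `tokenizer` is also threaded through `model.decode(...)` so decoding can detokenize with the pretrained tokenizer.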