From f451d880fff548bd90cf196be667ef25f64d8fb0 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 25 Nov 2021 15:06:44 +0800
Subject: [PATCH 1/4] Update quick_start.md

---
 docs/source/tts/quick_start.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md
index 1d7254d0..e6ad46fb 100644
--- a/docs/source/tts/quick_start.md
+++ b/docs/source/tts/quick_start.md
@@ -15,6 +15,7 @@ The models in PaddleSpeech TTS have the following mapping relationship:
 * voc2 - MelGAN
 * voc3 - MultiBand MelGAN
 * vc0 - Tactron2 Voice Clone with GE2E
+* vc1 - FastSpeech2 Voice Clone with GE2E
 
 ## Quick Start
 

From 789471bfca51bb7fde80c7ba02cc460828f10ade Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 25 Nov 2021 07:27:44 +0000
Subject: [PATCH 2/4] test wav for u2

---
 examples/wenetspeech/asr1/local/test_wav.sh |  45 +++++
 paddlespeech/s2t/exps/u2/bin/test_hub.py    | 187 --------------------
 paddlespeech/s2t/exps/u2/bin/test_wav.py    | 148 ++++++++++++++++
 3 files changed, 193 insertions(+), 187 deletions(-)
 create mode 100755 examples/wenetspeech/asr1/local/test_wav.sh
 delete mode 100644 paddlespeech/s2t/exps/u2/bin/test_hub.py
 create mode 100644 paddlespeech/s2t/exps/u2/bin/test_wav.py

diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh
new file mode 100755
index 00000000..13296af2
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/test_wav.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+audio_file=$3
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+for type in attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test_wav.py \
+    --nproc ${ngpu} \
+    --config ${config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} \
+    --opts decoding.batch_size ${batch_size} \
+    --audio_file ${audio_file}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+exit 0
diff --git a/paddlespeech/s2t/exps/u2/bin/test_hub.py b/paddlespeech/s2t/exps/u2/bin/test_hub.py
deleted file mode 100644
index 55a61d5c..00000000
--- a/paddlespeech/s2t/exps/u2/bin/test_hub.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for U2 model.""" -import cProfile -import os -import sys - -import paddle -import soundfile - -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.models.u2 import U2Model -from paddlespeech.s2t.training.cli import default_argument_parser -from paddlespeech.s2t.training.trainer import Trainer -from paddlespeech.s2t.utils import layer_tools -from paddlespeech.s2t.utils import mp_tools -from paddlespeech.s2t.utils.log import Log -from paddlespeech.s2t.utils.utility import print_arguments -from paddlespeech.s2t.utils.utility import UpdateConfig -logger = Log(__name__).getlog() - -# TODO(hui zhang): dynamic load - - -class U2Tester_Hub(Trainer): - def __init__(self, config, args): - # super().__init__(config, args) - self.args = args - self.config = config - self.audio_file = args.audio_file - self.collate_fn_test = SpeechCollator.from_config(config) - self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab_filepath=None, - spm_model_prefix=config.collator.spm_model_prefix) - - def setup_model(self): - config = self.config - model_conf = config.model - - with UpdateConfig(model_conf): - model_conf.input_dim = self.collate_fn_test.feature_size - model_conf.output_dim = self.collate_fn_test.vocab_size - - model = U2Model.from_config(model_conf) - - if self.parallel: - model = paddle.DataParallel(model) - - logger.info(f"{model}") - layer_tools.print_params(model, logger.info) - - self.model = model - logger.info("Setup model") - - @mp_tools.rank_zero_only - @paddle.no_grad() - def test(self): - self.model.eval() - cfg = self.config.decoding - audio_file = self.audio_file - collate_fn_test = self.collate_fn_test - audio, _ = collate_fn_test.process_utterance( - audio_file=audio_file, transcript="Hello") - audio_len = audio.shape[0] - audio = paddle.to_tensor(audio, dtype='float32') - audio_len = paddle.to_tensor(audio_len) - audio = paddle.unsqueeze(audio, axis=0) - vocab_list = collate_fn_test.vocab_list - - text_feature = self.collate_fn_test.text_feature - result_transcripts = self.model.decode( - audio, - audio_len, - text_feature=text_feature, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) - logger.info("The result_transcripts: " + result_transcripts[0][0]) - - def run_test(self): - self.resume() - try: - self.test() - except KeyboardInterrupt: - sys.exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - - #self.setup_output_dir() - #self.setup_checkpointer() - - #self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def resume(self): - """Resume from the checkpoint at checkpoints in the output - directory or load a specified checkpoint. 
- """ - params_path = self.args.checkpoint_path + ".pdparams" - model_dict = paddle.load(params_path) - self.model.set_state_dict(model_dict) - - -def check(audio_file): - logger.info("checking the audio file format......") - try: - sig, sample_rate = soundfile.read(audio_file) - except Exception as e: - logger.error(str(e)) - logger.error( - "can not open the wav file, please check the audio file format") - sys.exit(-1) - logger.info("The sample rate is %d" % sample_rate) - assert (sample_rate == 16000) - logger.info("The audio file format is right") - - -def main_sp(config, args): - exp = U2Tester_Hub(config, args) - with exp.eval(): - exp.setup() - exp.run_test() - - -def main(config, args): - main_sp(config, args) - - -if __name__ == "__main__": - parser = default_argument_parser() - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") - parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") - args = parser.parse_args() - print_arguments(args, globals()) - - if not os.path.isfile(args.audio_file): - print("Please input the right audio file path") - sys.exit(-1) - check(args.audio_file) - # https://yaml.org/type/float.html - config = get_cfg_defaults() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config) - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) - - # Setting for profiling - pr = cProfile.Profile() - pr.runcall(main, config, args) - pr.dump_stats('test.profile') diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py new file mode 100644 index 00000000..e118b481 --- /dev/null +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for U2 model.""" +import os +import sys +from pathlib import Path + +import paddle +import soundfile + +from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.models.u2 import U2Model +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import UpdateConfig +logger = Log(__name__).getlog() + +# TODO(hui zhang): dynamic load + + +class U2Infer(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + self.sr = config.collator.target_sample_rate + + self.preprocess_conf = config.collator.augmentation_config + self.preprocess_args = {"train": False} + self.preprocessing = Transformation(self.preprocess_conf) + + self.text_feature = TextFeaturizer( + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix) + + paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + + # model + model_conf = config.model + with UpdateConfig(model_conf): + model_conf.input_dim = config.collator.feat_dim + model_conf.output_dim = self.text_feature.vocab_size + model = U2Model.from_config(model_conf) + self.model = model + self.model.eval() + + # load model + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + def run(self): + check(args.audio_file) + + with paddle.no_grad(): + # read + audio, sample_rate = soundfile.read( + self.audio_file, dtype="int16", always_2d=True) + if sample_rate != self.sr: + logger.error( + f"sample rate error: {sample_rate}, need {self.sr} ") + sys.exit(-1) + + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + + # fbank + feat = self.preprocessing(audio, **self.preprocess_args) + logger.info(f"feat shape: {feat.shape}") + + ilen = paddle.to_tensor(feat.shape[0]) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) + + cfg = self.config.decoding + result_transcripts = self.model.decode( + xs, + ilen, + text_feature=self.text_feature, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch, + ctc_weight=cfg.ctc_weight, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + rsl = result_transcripts[0][0] + utt = Path(self.audio_file).name + logger.info(f"hyp: {utt} {result_transcripts[0][0]}") + return rsl + + +def check(audio_file): + if not os.path.isfile(audio_file): + print("Please input the right audio file path") + sys.exit(-1) + + logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main(config, args): + U2Infer(config, args).run() + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result 
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
+    parser.add_argument(
+        "--audio_file", type=str, help="path of the input audio file")
+    args = parser.parse_args()
+
+    config = get_cfg_defaults()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    main(config, args)

From 4a6493588a730b358f1bfb04fe0e20af15af377c Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 25 Nov 2021 15:36:18 +0800
Subject: [PATCH 3/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d39b0c33..532b24cc 100644
--- a/README.md
+++ b/README.md
@@ -335,7 +335,7 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an ove
 - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html)
 - [Released Models](./docs/source/released_model.md)
 
-The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components.
+The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html) is a good guideline for the pipeline components.
 
 ## FAQ and Contributing
 

From 733b0ce29af700f64da62e38ac6a8d818665c65f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 25 Nov 2021 07:37:14 +0000
Subject: [PATCH 4/4] rename to result.md

---
 examples/aishell/asr0/{README.md => RESULTS.md}     | 0
 examples/aishell/asr1/{README.md => RESULTS.md}     | 6 +++---
 examples/callcenter/asr1/{README.md => RESULTS.md}  | 0
 examples/librispeech/asr0/{README.md => RESULTS.md} | 0
 examples/librispeech/asr1/{README.md => RESULTS.md} | 0
 examples/librispeech/asr2/{README.md => RESULTS.md} | 1 +
 examples/ted_en_zh/st0/{README.md => RESULTS.md}    | 0
 examples/ted_en_zh/st1/{README.md => RESULTS.md}    | 0
 examples/timit/asr1/{README.md => RESULTS.md}       | 0
 9 files changed, 4 insertions(+), 3 deletions(-)
 rename examples/aishell/asr0/{README.md => RESULTS.md} (100%)
 rename examples/aishell/asr1/{README.md => RESULTS.md} (95%)
 rename examples/callcenter/asr1/{README.md => RESULTS.md} (100%)
 rename examples/librispeech/asr0/{README.md => RESULTS.md} (100%)
 rename examples/librispeech/asr1/{README.md => RESULTS.md} (100%)
 rename examples/librispeech/asr2/{README.md => RESULTS.md} (98%)
 rename examples/ted_en_zh/st0/{README.md => RESULTS.md} (100%)
 rename examples/ted_en_zh/st1/{README.md => RESULTS.md} (100%)
 rename examples/timit/asr1/{README.md => RESULTS.md} (100%)

diff --git a/examples/aishell/asr0/README.md b/examples/aishell/asr0/RESULTS.md
similarity index 100%
rename from examples/aishell/asr0/README.md
rename to examples/aishell/asr0/RESULTS.md
diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/RESULTS.md
similarity index 95%
rename from examples/aishell/asr1/README.md
rename to examples/aishell/asr1/RESULTS.md
index da753634..783e179e 100644
--- a/examples/aishell/asr1/README.md
+++ b/examples/aishell/asr1/RESULTS.md
@@ -2,7 +2,7 @@
 
 ## Conformer
 
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
 | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
@@ -13,7 +13,7 @@
 ## Chunk Conformer
 Need set `decoding.decoding_chunk_size=16` when decoding.
 
-| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
@@ -23,7 +23,7 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
 ## Transformer
 
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
 | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
 
diff --git a/examples/callcenter/asr1/README.md b/examples/callcenter/asr1/RESULTS.md
similarity index 100%
rename from examples/callcenter/asr1/README.md
rename to examples/callcenter/asr1/RESULTS.md
diff --git a/examples/librispeech/asr0/README.md b/examples/librispeech/asr0/RESULTS.md
similarity index 100%
rename from examples/librispeech/asr0/README.md
rename to examples/librispeech/asr0/RESULTS.md
diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/RESULTS.md
similarity index 100%
rename from examples/librispeech/asr1/README.md
rename to examples/librispeech/asr1/RESULTS.md
diff --git a/examples/librispeech/asr2/README.md b/examples/librispeech/asr2/RESULTS.md
similarity index 98%
rename from examples/librispeech/asr2/README.md
rename to examples/librispeech/asr2/RESULTS.md
index 9285a183..41655565 100644
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/RESULTS.md
@@ -7,6 +7,7 @@
 | --- | --- | --- | --- | --- | --- |
 | transformer | 32.52 M | 8 Tesla V100-SXM2-32GB | 10-best val_loss | conf/transformer.yaml | spec_aug | 6.3197922706604 |
 
+### Attention Rescore
 
 | Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
diff --git a/examples/ted_en_zh/st0/README.md b/examples/ted_en_zh/st0/RESULTS.md
similarity index 100%
rename from examples/ted_en_zh/st0/README.md
rename to examples/ted_en_zh/st0/RESULTS.md
diff --git a/examples/ted_en_zh/st1/README.md b/examples/ted_en_zh/st1/RESULTS.md
similarity index 100%
rename from examples/ted_en_zh/st1/README.md
rename to examples/ted_en_zh/st1/RESULTS.md
diff --git a/examples/timit/asr1/README.md b/examples/timit/asr1/RESULTS.md
similarity index 100%
rename from examples/timit/asr1/README.md
rename to examples/timit/asr1/RESULTS.md
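
Usage note (editorial addition, not part of the patches themselves): the sketch below shows how the new local/test_wav.sh entry point from PATCH 2/4 might be invoked once the series is applied. The config, checkpoint, and wav paths are illustrative placeholders, and it assumes the recipe's usual path.sh exports the BIN_DIR variable used inside the script; treat it as a sketch under those assumptions, not a documented command.

    # hypothetical invocation of the single-wav decoding script added in PATCH 2/4;
    # test_wav.sh takes exactly three positional arguments: config_path ckpt_path_prefix audio_file
    cd examples/wenetspeech/asr1
    source path.sh                 # assumed to export BIN_DIR referenced by the script
    export CUDA_VISIBLE_DEVICES=0  # the script derives the GPU count from this variable
    bash local/test_wav.sh \
        conf/conformer.yaml \
        exp/conformer/checkpoints/avg_10 \
        data/demo_01.wav           # placeholder 16 kHz wav (test_wav.py asserts a 16000 Hz sample rate)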