From 54341c88a6e5d7595d20bfbb3a21cd84ecdaebfc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 28 Feb 2022 10:39:19 +0000 Subject: [PATCH 1/3] cli batch and shell pipe, test=doc --- README.md | 15 +++++++++++++-- README_cn.md | 11 +++++++++++ demos/speech_recognition/.gitignore | 1 + demos/speech_recognition/README.md | 2 ++ demos/speech_recognition/README_cn.md | 2 ++ demos/speech_recognition/run.sh | 6 ++++++ demos/text_to_speech/README.md | 5 ++++- demos/text_to_speech/README_cn.md | 4 ++++ demos/text_to_speech/run.sh | 4 ++++ 9 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 demos/speech_recognition/.gitignore diff --git a/README.md b/README.md index 46730797..a142cb5e 100644 --- a/README.md +++ b/README.md @@ -196,16 +196,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl ```shell paddlespeech cls --input input.wav ``` + **Automatic Speech Recognition** ```shell paddlespeech asr --lang zh --input input_16k.wav ``` -**Speech Translation** (English to Chinese) +**Speech Translation** (English to Chinese) (not supported on Mac and Windows yet) ```shell paddlespeech st --input input_16k.wav ``` + **Text-to-Speech** ```shell paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav @@ -218,7 +220,16 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` - +**Batch Process** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell Pipeline** +ASR + Punc: +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) diff --git a/README_cn.md b/README_cn.md index 9782240a..366d9a02 100644 --- a/README_cn.md +++ b/README_cn.md @@ -216,6 +216,17 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! 
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +**批处理** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell管道** +ASR + Punc: +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` + 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。 diff --git a/demos/speech_recognition/.gitignore b/demos/speech_recognition/.gitignore new file mode 100644 index 00000000..d8dd7532 --- /dev/null +++ b/demos/speech_recognition/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c49afa35..5d964fce 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # Chinese ASR + Punctuation Restoration + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) 
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index c2e38c91..ba1f1d65 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # 中文 + 标点恢复 + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index 5efc8b81..06466928 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -1,4 +1,10 @@ #!/bin/bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# asr paddlespeech asr --input ./zh.wav + + +# asr + punc +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc \ No newline at end of file diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 9d3c4ac5..2df72a82 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas ### 3. Usage - Command Line (Recommended) - Chinese - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - Batch Process + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - Chinese, use `SpeedySpeech` as the acoustic model ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index f075efda..7e02b962 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -24,6 +24,10 @@ ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - 批处理 + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - 中文,使用 `SpeedySpeech` 作为声学模型 ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh index c2487aee..b1340241 100755 --- a/demos/text_to_speech/run.sh +++ b/demos/text_to_speech/run.sh @@ -1,3 +1,7 @@ #!/bin/bash +# single process paddlespeech tts --input 今天的天气不错啊 + +# Batch process +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts \ No newline at end of file From 75098698d8eae48d1d0343cd683c7b315ea4a02d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 28 Feb 2022 10:45:39 +0000 Subject: [PATCH 2/3] format,test=doc --- paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/s2t/models/u2_st/u2_st.py | 4 +-- .../t2s/modules/transformer/repeat.py | 2 +- .../unit/asr/deepspeech2_online_model_test.py | 36 ++++++++----------- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 89752bb9..ac55af12 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index f7b05714..999723e5 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ 
b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder from paddlespeech.s2t.modules.loss import LabelSmoothingLoss -from paddlespeech.s2t.modules.mask import mask_finished_preds -from paddlespeech.s2t.modules.mask import mask_finished_scores from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools @@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer): device = speech.place # Let's assume B = batch_size and N = beam_size - # 1. Encoder and init hypothesis + # 1. Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index 2073a78b..1e946adf 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -36,4 +36,4 @@ def repeat(N, fn): Returns: MultiSequential: Repeated model instance. """ - return MultiSequential(*[fn(n) for n in range(N)]) + return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index d26e5b15..f23c4926 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -11,16 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import pickle import unittest import numpy as np import paddle -import pickle -import os from paddle import inference -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline +from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline + class TestDeepSpeech2ModelOnline(unittest.TestCase): def setUp(self): @@ -185,15 +186,12 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): paddle.allclose(final_state_c_box, final_state_c_box_chk), True) - - class TestDeepSpeech2StaticModelOnline(unittest.TestCase): - def setUp(self): export_prefix = "exp/deepspeech2_online/checkpoints/test_export" if not os.path.exists(os.path.dirname(export_prefix)): os.makedirs(os.path.dirname(export_prefix), mode=0o755) - infer_model = DeepSpeech2InferModelOnline( + infer_model = DeepSpeech2InferModelOnline( feat_size=161, dict_size=4233, num_conv_layers=2, @@ -207,27 +205,25 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): with open("test_data/static_ds2online_inputs.pickle", "rb") as f: self.data_dict = pickle.load(f) - + self.setup_model(export_prefix) - def setup_model(self, export_prefix): - deepspeech_config = inference.Config( - export_prefix + ".pdmodel", - export_prefix + ".pdiparams") - if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): + deepspeech_config = inference.Config(export_prefix + ".pdmodel", + export_prefix + ".pdiparams") + if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and + os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): deepspeech_config.enable_use_gpu(100, 0) deepspeech_config.enable_memory_optim() deepspeech_predictor = inference.create_predictor(deepspeech_config) self.predictor = deepspeech_predictor - + def test_unit(self): input_names = self.predictor.get_input_names() audio_handle = self.predictor.get_input_handle(input_names[0]) audio_len_handle = 
self.predictor.get_input_handle(input_names[1]) h_box_handle = self.predictor.get_input_handle(input_names[2]) c_box_handle = self.predictor.get_input_handle(input_names[3]) - x_chunk = self.data_dict["audio_chunk"] x_chunk_lens = self.data_dict["audio_chunk_lens"] @@ -246,13 +242,9 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): c_box_handle.reshape(chunk_state_c_box.shape) c_box_handle.copy_from_cpu(chunk_state_c_box) - - output_names = self.predictor.get_output_names() - output_handle = self.predictor.get_output_handle( - output_names[0]) - output_lens_handle = self.predictor.get_output_handle( - output_names[1]) + output_handle = self.predictor.get_output_handle(output_names[0]) + output_lens_handle = self.predictor.get_output_handle(output_names[1]) output_state_h_handle = self.predictor.get_output_handle( output_names[2]) output_state_c_handle = self.predictor.get_output_handle( @@ -264,7 +256,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): chunk_state_h_box = output_state_h_handle.copy_to_cpu() chunk_state_c_box = output_state_c_handle.copy_to_cpu() return True - + if __name__ == '__main__': unittest.main() From 335638ba1877a72d94b39f964e999acd6e18f26a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 28 Feb 2022 11:01:50 +0000 Subject: [PATCH 3/3] update gitignore, test=doct --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index cc8fff87..778824f5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc .vscode *log +*.wav *.pdmodel *.pdiparams* *.zip @@ -30,5 +31,8 @@ tools/OpenBLAS/ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh +tools/CRF++-0.58/ + +speechx/fc_patch/ *output/