diff --git a/audio/README.md b/audio/README.md index f336ac9ae..bfd8625f0 100644 --- a/audio/README.md +++ b/audio/README.md @@ -29,7 +29,6 @@ MAC:test build whl envrioment: * gcc/g++/gfortran 12.2.0 * cpu Intel Xeon E5 x86_64 - Windows: not support: paddleaudio C++ extension lib (sox io, kaldi native fbank) -python setup.py bdist_wheel \ No newline at end of file +python setup.py bdist_wheel diff --git a/audio/paddleaudio/src/pybind/pybind.cpp b/audio/paddleaudio/src/pybind/pybind.cpp index c4dfa8d51..692e80995 100644 --- a/audio/paddleaudio/src/pybind/pybind.cpp +++ b/audio/paddleaudio/src/pybind/pybind.cpp @@ -1,7 +1,9 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +#ifdef INCLUDE_KALDI #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h" #include "paddleaudio/third_party/kaldi/feat/feature-fbank.h" +#endif #ifdef INCLUDE_SOX #include "paddleaudio/src/pybind/sox/io.h" diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 016087d68..31defbbaf 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -24,9 +24,9 @@ from typing import Tuple import paddle from paddle import jit from paddle import nn -from paddleaudio.utils.tensor_utils import add_sos_eos -from paddleaudio.utils.tensor_utils import th_accuracy +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index ba9983338..63cafbdb7 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -16,7 +16,6 @@ from typing import Union import numpy as np import paddle -import paddle.fluid as fluid import paddle.nn.functional as F import soundfile import tqdm @@ -231,8 +230,8 @@ class TextDecoder(nn.Layer): ]) self.ln = LayerNorm(n_state) - mask = fluid.layers.fill_constant( - shape=[n_ctx, n_state], value=-np.inf, dtype='float32') + mask = paddle.full( + shape=[n_ctx, n_state], fill_value=-np.inf, dtype='float32') mask = paddle.triu(mask, diagonal=1) self.register_buffer("mask", mask, persistable=False) diff --git a/speechx/build.sh b/speechx/build.sh index e0a386752..7655f9635 100755 --- a/speechx/build.sh +++ b/speechx/build.sh @@ -20,4 +20,4 @@ fi mkdir -p build cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -cmake --build build +cmake --build build -j diff --git a/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md b/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md index 09584fd57..ef88357ee 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md +++ b/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md @@ -6,8 +6,11 @@ > Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, support `avx512_vnni` > RTF with feature and decoder which is more end to end. + ### FP32 +`local/recognizer.sh` + #### CER ``` @@ -27,6 +30,8 @@ I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318 ### INT8 +`local/recognizer_quant.sh` + > RTF relative improve 12.8%, which count feature and decoder time. > Test under Paddle commit c331e2ce2031d68a553bc9469a07c30d718438f3 @@ -46,3 +51,17 @@ I1110 09:59:52.551712 37249 u2_recognizer_main.cc:122] total wav duration is: 36 I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63 sec I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674 ``` + +### CTC Prefix Beam Search + +`local/decode.sh` + +#### CER + +``` +Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401 +Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401 +English -> 0.00 % N=0 C=0 S=0 D=0 I=0 +Other -> 100.00 % N=3 C=0 S=3 D=0 I=0 + +```