Merge branch 'PaddlePaddle:develop' into speechx_refactor

pull/2746/head
YangZhou 3 years ago committed by GitHub
commit c2d9c0c51d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -29,7 +29,6 @@ MACtest build whl environment
* gcc/g++/gfortran 12.2.0
* cpu Intel Xeon E5 x86_64
Windows
Does not support the paddleaudio C++ extension libs (sox io, kaldi native fbank)
python setup.py bdist_wheel
python setup.py bdist_wheel

@ -1,7 +1,9 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#ifdef INCLUDE_KALDI
#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
#endif
#ifdef INCLUDE_SOX
#include "paddleaudio/src/pybind/sox/io.h"

@ -24,9 +24,9 @@ from typing import Tuple
import paddle
from paddle import jit
from paddle import nn
from paddleaudio.utils.tensor_utils import add_sos_eos
from paddleaudio.utils.tensor_utils import th_accuracy
from paddlespeech.audio.utils.tensor_utils import add_sos_eos
from paddlespeech.audio.utils.tensor_utils import th_accuracy
from paddlespeech.s2t.frontend.utility import IGNORE_ID
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.modules.cmvn import GlobalCMVN

@ -16,7 +16,6 @@ from typing import Union
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
import soundfile
import tqdm
@ -231,8 +230,8 @@ class TextDecoder(nn.Layer):
])
self.ln = LayerNorm(n_state)
mask = fluid.layers.fill_constant(
shape=[n_ctx, n_state], value=-np.inf, dtype='float32')
mask = paddle.full(
shape=[n_ctx, n_state], fill_value=-np.inf, dtype='float32')
mask = paddle.triu(mask, diagonal=1)
self.register_buffer("mask", mask, persistable=False)

@ -20,4 +20,4 @@ fi
mkdir -p build
cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
cmake --build build
cmake --build build -j

@ -6,8 +6,11 @@
> Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, support `avx512_vnni`
> RTF is measured with feature and decoder time included, which is more end-to-end.
### FP32
`local/recognizer.sh`
#### CER
```
@ -27,6 +30,8 @@ I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318
### INT8
`local/recognizer_quant.sh`
> RTF improves by 12.8% relative, counting feature and decoder time.
> Test under Paddle commit c331e2ce2031d68a553bc9469a07c30d718438f3
@ -46,3 +51,17 @@ I1110 09:59:52.551712 37249 u2_recognizer_main.cc:122] total wav duration is: 36
I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63 sec
I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
```
### CTC Prefix Beam Search
`local/decode.sh`
#### CER
```
Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401
Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
```

Loading…
Cancel
Save