pull/2331/head
TianYuan 3 years ago
commit d01cc6fee4

@ -226,6 +226,12 @@ recall and elapsed time statistics are shown in the following figure
The retrieval framework based on Milvus takes about 2.9 milliseconds to retrieve on the premise of 90% recall rate, and it takes about 500 milliseconds for feature extraction (testing audio takes about 5 seconds), that is, a single audio test takes about 503 milliseconds in total, which can meet most application scenarios. The retrieval framework based on Milvus takes about 2.9 milliseconds to retrieve on the premise of 90% recall rate, and it takes about 500 milliseconds for feature extraction (testing audio takes about 5 seconds), that is, a single audio test takes about 503 milliseconds in total, which can meet most application scenarios.
* compute embeding takes 500 ms
* retrieval with cosine takes 2.9 ms
* total takes 503 ms
> test audio is 5 sec
### 6.Pretrained Models ### 6.Pretrained Models
Here is a list of pretrained models released by PaddleSpeech : Here is a list of pretrained models released by PaddleSpeech :

@ -19,6 +19,7 @@ The input of this cli demo should be a WAV file(`.wav`), and the sample rate mus
Here are sample files for this demo that can be downloaded: Here are sample files for this demo that can be downloaded:
```bash ```bash
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav
``` ```
### 3. Usage ### 3. Usage

@ -19,6 +19,7 @@
```bash ```bash
# 该音频的内容是数字串 85236145389 # 该音频的内容是数字串 85236145389
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav
``` ```
### 3. 使用方法 ### 3. 使用方法
- 命令行 (推荐使用) - 命令行 (推荐使用)

@ -113,8 +113,6 @@ async def speech2textOffline(files: List[UploadFile]):
# 返回ASR识别结果 # 返回ASR识别结果
asr_res = chatbot.speech2text(out_file_path) asr_res = chatbot.speech2text(out_file_path)
return SuccessRequest(result=asr_res) return SuccessRequest(result=asr_res)
# else:
# return ErrorRequest(message="文件不是.wav格式")
return ErrorRequest(message="上传文件为空") return ErrorRequest(message="上传文件为空")
@ -433,10 +431,6 @@ async def vpr_recog(request: Request,
# Sort results by distance metric, closest distances first # Sort results by distance metric, closest distances first
res = sorted(res.items(), key=lambda item: item[1][1], reverse=True) res = sorted(res.items(), key=lambda item: item[1][1], reverse=True)
return res return res
# except Exception as e:
# return {'status': False, 'msg': e}, 400
@app.post('/vpr/del') @app.post('/vpr/del')

@ -5,3 +5,7 @@
| Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm | | Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm |
| --- | --- | --- | --- | --- | --- | --- | ---- | | --- | --- | --- | --- | --- | --- | --- | ---- |
| ECAPA-TDNN | 85M | 0.2.1 | conf/ecapa_tdnn.yaml | 192 | test | 0.8188 | 0.7815| | ECAPA-TDNN | 85M | 0.2.1 | conf/ecapa_tdnn.yaml | 192 | test | 0.8188 | 0.7815|
> [SpeechBrain result](https://github.com/speechbrain/speechbrain/tree/develop/recipes/VoxCeleb/SpeakerRec#speaker-verification-using-ecapa-tdnn-embeddings):
> EER = 0.90% (voxceleb1 + voxceleb2) without s-norm
> EER = 0.80% (voxceleb1 + voxceleb2) with s-norm.

@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
xs: paddle.Tensor, xs: paddle.Tensor,
offset: int, offset: int,
required_cache_size: int, required_cache_size: int,
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return """ Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk. output from time 0 to current chunk.

@ -86,7 +86,7 @@ class MultiHeadedAttention(nn.Layer):
self, self,
value: paddle.Tensor, value: paddle.Tensor,
scores: paddle.Tensor, scores: paddle.Tensor,
mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
) -> paddle.Tensor: ) -> paddle.Tensor:
"""Compute attention context vector. """Compute attention context vector.
Args: Args:
@ -127,14 +127,15 @@ class MultiHeadedAttention(nn.Layer):
return self.linear_out(x) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model)
def forward(self, def forward(
query: paddle.Tensor, self,
key: paddle.Tensor, query: paddle.Tensor,
value: paddle.Tensor, key: paddle.Tensor,
mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), value: paddle.Tensor,
pos_emb: paddle.Tensor=paddle.empty([0]), mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) pos_emb: paddle.Tensor, # paddle.empty([0])
) -> Tuple[paddle.Tensor, paddle.Tensor]: cache: paddle.Tensor # paddle.zeros([0,0,0,0])
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute scaled dot product attention. """Compute scaled dot product attention.
Args: Args:
query (paddle.Tensor): Query tensor (#batch, time1, size). query (paddle.Tensor): Query tensor (#batch, time1, size).
@ -243,14 +244,15 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
return x return x
def forward(self, def forward(
query: paddle.Tensor, self,
key: paddle.Tensor, query: paddle.Tensor,
value: paddle.Tensor, key: paddle.Tensor,
mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), value: paddle.Tensor,
pos_emb: paddle.Tensor=paddle.empty([0]), mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) pos_emb: paddle.Tensor, # paddle.empty([0])
) -> Tuple[paddle.Tensor, paddle.Tensor]: cache: paddle.Tensor # paddle.zeros([0,0,0,0])
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding. """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args: Args:
query (paddle.Tensor): Query tensor (#batch, time1, size). query (paddle.Tensor): Query tensor (#batch, time1, size).

@ -108,8 +108,8 @@ class ConvolutionModule(nn.Layer):
def forward( def forward(
self, self,
x: paddle.Tensor, x: paddle.Tensor,
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
cache: paddle.Tensor=paddle.zeros([0, 0, 0]), cache: paddle.Tensor # paddle.zeros([0,0,0,0])
) -> Tuple[paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute convolution module. """Compute convolution module.
Args: Args:

@ -121,11 +121,16 @@ class DecoderLayer(nn.Layer):
if self.concat_after: if self.concat_after:
tgt_concat = paddle.cat( tgt_concat = paddle.cat(
(tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
paddle.empty([0]),
paddle.zeros([0, 0, 0, 0]))[0]),
dim=-1)
x = residual + self.concat_linear1(tgt_concat) x = residual + self.concat_linear1(tgt_concat)
else: else:
x = residual + self.dropout( x = residual + self.dropout(
self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[
0])
if not self.normalize_before: if not self.normalize_before:
x = self.norm1(x) x = self.norm1(x)
@ -134,11 +139,15 @@ class DecoderLayer(nn.Layer):
x = self.norm2(x) x = self.norm2(x)
if self.concat_after: if self.concat_after:
x_concat = paddle.cat( x_concat = paddle.cat(
(x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) (x, self.src_attn(x, memory, memory, memory_mask,
paddle.empty([0]),
paddle.zeros([0, 0, 0, 0]))[0]),
dim=-1)
x = residual + self.concat_linear2(x_concat) x = residual + self.concat_linear2(x_concat)
else: else:
x = residual + self.dropout( x = residual + self.dropout(
self.src_attn(x, memory, memory, memory_mask)[0]) self.src_attn(x, memory, memory, memory_mask,
paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0])
if not self.normalize_before: if not self.normalize_before:
x = self.norm2(x) x = self.norm2(x)

@ -175,7 +175,9 @@ class BaseEncoder(nn.Layer):
decoding_chunk_size, self.static_chunk_size, decoding_chunk_size, self.static_chunk_size,
num_decoding_left_chunks) num_decoding_left_chunks)
for layer in self.encoders: for layer in self.encoders:
xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad,
paddle.zeros([0, 0, 0, 0]),
paddle.zeros([0, 0, 0, 0]))
if self.normalize_before: if self.normalize_before:
xs = self.after_norm(xs) xs = self.after_norm(xs)
# Here we assume the mask is not changed in encoder layers, so just # Here we assume the mask is not changed in encoder layers, so just
@ -188,9 +190,9 @@ class BaseEncoder(nn.Layer):
xs: paddle.Tensor, xs: paddle.Tensor,
offset: int, offset: int,
required_cache_size: int, required_cache_size: int,
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0])
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]),
att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Forward just one chunk """ Forward just one chunk
Args: Args:
@ -253,6 +255,7 @@ class BaseEncoder(nn.Layer):
xs, xs,
att_mask, att_mask,
pos_emb, pos_emb,
mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool),
att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
cnn_cache=cnn_cache[i:i + 1] cnn_cache=cnn_cache[i:i + 1]
if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
@ -325,7 +328,8 @@ class BaseEncoder(nn.Layer):
chunk_xs = xs[:, cur:end, :] chunk_xs = xs[:, cur:end, :]
(y, att_cache, cnn_cache) = self.forward_chunk( (y, att_cache, cnn_cache) = self.forward_chunk(
chunk_xs, offset, required_cache_size, att_cache, cnn_cache) chunk_xs, offset, required_cache_size, att_cache, cnn_cache,
paddle.ones([0, 0, 0], dtype=paddle.bool))
outputs.append(y) outputs.append(y)
offset += y.shape[1] offset += y.shape[1]

@ -76,9 +76,10 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor, x: paddle.Tensor,
mask: paddle.Tensor, mask: paddle.Tensor,
pos_emb: paddle.Tensor, pos_emb: paddle.Tensor,
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), mask_pad: paddle.
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features. """Compute encoded features.
Args: Args:
@ -105,7 +106,8 @@ class TransformerEncoderLayer(nn.Layer):
if self.normalize_before: if self.normalize_before:
x = self.norm1(x) x = self.norm1(x)
x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) x_att, new_att_cache = self.self_attn(
x, x, x, mask, paddle.empty([0]), cache=att_cache)
if self.concat_after: if self.concat_after:
x_concat = paddle.concat((x, x_att), axis=-1) x_concat = paddle.concat((x, x_att), axis=-1)
@ -193,9 +195,9 @@ class ConformerEncoderLayer(nn.Layer):
x: paddle.Tensor, x: paddle.Tensor,
mask: paddle.Tensor, mask: paddle.Tensor,
pos_emb: paddle.Tensor, pos_emb: paddle.Tensor,
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool)
att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features. """Compute encoded features.
Args: Args:

@ -476,8 +476,12 @@ class PaddleASRConnectionHanddler:
# forward chunk # forward chunk
(y, self.att_cache, (y, self.att_cache,
self.cnn_cache) = self.model.encoder.forward_chunk( self.cnn_cache) = self.model.encoder.forward_chunk(
chunk_xs, self.offset, required_cache_size, self.att_cache, chunk_xs,
self.cnn_cache) self.offset,
required_cache_size,
att_cache=self.att_cache,
cnn_cache=self.cnn_cache,
att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool))
outputs.append(y) outputs.append(y)
# update the global offset, in decoding frame unit # update the global offset, in decoding frame unit

Loading…
Cancel
Save