From 733ec7f2bc82c62be5c2959230bc43092be02435 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Tue, 30 Aug 2022 07:59:55 +0000 Subject: [PATCH 1/9] fix conformer multi-gpu training test=asr --- paddlespeech/s2t/models/u2/u2.py | 4 +-- paddlespeech/s2t/modules/attention.py | 25 ++++++------- .../s2t/modules/conformer_convolution.py | 12 +++---- paddlespeech/s2t/modules/decoder_layer.py | 17 ++++++--- paddlespeech/s2t/modules/encoder.py | 36 ++++++++++--------- paddlespeech/s2t/modules/encoder_layer.py | 15 ++++---- 6 files changed, 58 insertions(+), 51 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e19f411cf..a812abcbd 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index b6d615867..cbcaccc26 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,7 +15,6 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math -from typing import Optional from typing import Tuple import paddle @@ -83,11 +82,11 @@ class MultiHeadedAttention(nn.Layer): return q, k, v - def forward_attention(self, - value: paddle.Tensor, + def forward_attention( + self, + value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), - ) -> paddle.Tensor: + mask: paddle.Tensor, ) -> paddle.Tensor: """Compute attention context vector. Args: value (paddle.Tensor): Transformed value, size @@ -108,7 +107,7 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if paddle.shape(mask)[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) mask = mask[:, :, :, :paddle.shape(scores)[-1]] @@ -131,10 +130,9 @@ class MultiHeadedAttention(nn.Layer): query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), - pos_emb: paddle.Tensor = paddle.empty([0]), - cache: paddle.Tensor = paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). 
@@ -247,10 +245,9 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), - pos_emb: paddle.Tensor = paddle.empty([0]), - cache: paddle.Tensor = paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index c384b9c78..23aecd7f1 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -14,7 +14,6 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """ConvolutionModule definition.""" -from typing import Optional from typing import Tuple import paddle @@ -108,9 +107,8 @@ class ConvolutionModule(nn.Layer): def forward(self, x: paddle.Tensor, - mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool), - cache: paddle.Tensor= paddle.zeros([0,0,0]), - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + mask_pad: paddle.Tensor, + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: x (paddle.Tensor): Input tensor (#batch, time, channels). @@ -127,11 +125,11 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # [B, C, T] # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if paddle.shape(cache)[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +159,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 37b124e84..c8843b723 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -121,11 +121,16 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, + paddle.empty([0]), + paddle.zeros([0, 0, 0, 0]))[0]), + dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, + paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[ + 0]) if not self.normalize_before: x = self.norm1(x) @@ -134,11 +139,15 @@ class DecoderLayer(nn.Layer): x = self.norm2(x) if self.concat_after: x_concat = paddle.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) + (x, self.src_attn(x, memory, memory, memory_mask, + paddle.empty([0]), + paddle.zeros([0, 0, 0, 0]))[0]), + dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) + self.src_attn(x, memory, memory, memory_mask, + paddle.empty([0]), paddle.zeros([0, 0, 0, 
0]))[0]) if not self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index bff2d69bb..6001afd4b 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -14,8 +14,6 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """Encoder definition.""" -from typing import List -from typing import Optional from typing import Tuple import paddle @@ -177,7 +175,9 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, + paddle.zeros([0, 0, 0, 0]), + paddle.zeros([0, 0, 0, 0])) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), - cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), - att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, + att_mask: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: @@ -227,7 +227,7 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers = paddle.shape(att_cache)[0] @@ -252,14 +252,16 @@ class BaseEncoder(nn.Layer): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, - ) + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, + cnn_cache=cnn_cache[i:i + 1] + if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:,:, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) @@ -270,7 +272,6 @@ class BaseEncoder(nn.Layer): r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache - def forward_chunk_by_chunk( self, xs: paddle.Tensor, @@ -315,8 +316,8 @@ class BaseEncoder(nn.Layer): num_frames = xs.shape[1] required_cache_size = decoding_chunk_size * num_decoding_left_chunks - att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) - cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) + cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) outputs = [] offset = 0 @@ -326,7 +327,8 @@ class BaseEncoder(nn.Layer): chunk_xs = xs[:, cur:end, :] (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) + chunk_xs, offset, required_cache_size, att_cache, cnn_cache, + paddle.ones([0, 0, 0], 
dtype=paddle.bool)) outputs.append(y) offset += y.shape[1] diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 5f810dfde..8fd991ec6 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,9 +76,9 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + mask_pad: paddle.Tensor, + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -105,7 +105,8 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) + x_att, new_att_cache = self.self_attn( + x, x, x, mask, paddle.empty([0]), cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) @@ -193,9 +194,9 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + mask_pad: paddle.Tensor, + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: From 4ceea2c78da2e3a7fb47184ed4c47daf85a0462c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Aug 2022 20:29:08 +0800 Subject: [PATCH 2/9] Update README.md --- demos/speaker_verification/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 900b5ae40..55f9a7360 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -19,6 +19,7 @@ The input of this cli demo should be a WAV file(`.wav`), and the sample rate mus Here are sample files for this demo that can be downloaded: ```bash wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav ``` ### 3. Usage From e147b96cf08df04f079105377d2348933dec5f0b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Aug 2022 20:30:15 +0800 Subject: [PATCH 3/9] Update README_cn.md --- demos/speaker_verification/README_cn.md | 1 + 1 file changed, 1 insertion(+) diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index f6afa86ac..85224699c 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -19,6 +19,7 @@ ```bash # 该音频的内容是数字串 85236145389 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) From ed80b0e2c3a01382effb8f0f85a4a135679ca980 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Tue, 30 Aug 2022 12:41:59 +0000 Subject: [PATCH 4/9] fix multigpu training test=asr --- paddlespeech/s2t/models/u2/u2.py | 4 +-- paddlespeech/s2t/modules/attention.py | 35 +++++++++++-------- .../s2t/modules/conformer_convolution.py | 10 +++--- paddlespeech/s2t/modules/encoder.py | 6 ++-- paddlespeech/s2t/modules/encoder_layer.py | 14 ++++---- 5 files changed, 39 insertions(+), 30 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index a812abcbd..813e1e529 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, - cnn_cache: paddle.Tensor, + att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index cbcaccc26..92990048d 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -86,7 +86,8 @@ class MultiHeadedAttention(nn.Layer): self, value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor, ) -> paddle.Tensor: + mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) + ) -> paddle.Tensor: """Compute attention context vector. Args: value (paddle.Tensor): Transformed value, size @@ -126,13 +127,15 @@ class MultiHeadedAttention(nn.Layer): return self.linear_out(x) # (batch, time1, d_model) - def forward(self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, - pos_emb: paddle.Tensor, - cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward( + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + pos_emb: paddle.Tensor, # paddle.empty([0]) + cache: paddle.Tensor # paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -241,13 +244,15 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): return x - def forward(self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, - pos_emb: paddle.Tensor, - cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward( + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + pos_emb: paddle.Tensor, # paddle.empty([0]) + cache: paddle.Tensor # paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). 
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 23aecd7f1..b35fea5b9 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -105,10 +105,12 @@ class ConvolutionModule(nn.Layer): ) self.activation = activation - def forward(self, - x: paddle.Tensor, - mask_pad: paddle.Tensor, - cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward( + self, + x: paddle.Tensor, + mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + cache: paddle.Tensor # paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: x (paddle.Tensor): Input tensor (#batch, time, channels). diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 6001afd4b..abdaf5ea7 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, - cnn_cache: paddle.Tensor, - att_mask: paddle.Tensor, + att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]) + cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]), + att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 8fd991ec6..3972ff90a 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,9 +76,10 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor, - att_cache: paddle.Tensor, - cnn_cache: paddle.Tensor, + mask_pad: paddle. + Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) + att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -194,9 +195,10 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor, - att_cache: paddle.Tensor, - cnn_cache: paddle.Tensor, + mask_pad: paddle. + Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) + att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. 
Args: From 1dfca4ef736493a99e2ac35f4d985b20472aa197 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 31 Aug 2022 02:43:54 +0000 Subject: [PATCH 5/9] fix multigpu training --- .../server/engine/asr/online/python/asr_engine.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 4df38f09d..96d4823e2 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -21,10 +21,10 @@ import paddle from numpy import float32 from yacs.config import CfgNode +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.tensor_utils import add_sos_eos @@ -130,8 +130,8 @@ class PaddleASRConnectionHanddler: ## conformer # cache for conformer online - self.att_cache = paddle.zeros([0,0,0,0]) - self.cnn_cache = paddle.zeros([0,0,0,0]) + self.att_cache = paddle.zeros([0, 0, 0, 0]) + self.cnn_cache = paddle.zeros([0, 0, 0, 0]) self.encoder_out = None # conformer decoding state @@ -474,9 +474,10 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, self.offset, required_cache_size, - self.att_cache, self.cnn_cache) + (y, self.att_cache, + self.cnn_cache) = self.model.encoder.forward_chunk( + chunk_xs, self.offset, required_cache_size, self.att_cache, + self.cnn_cache, paddle.ones([0, 0, 0], dtype=paddle.bool)) outputs.append(y) # update the global offset, in decoding frame unit From e0081b7e504fbe4a9cb82e88e2e3aa9595066b95 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 31 Aug 2022 11:12:45 +0800 Subject: [PATCH 6/9] [vec][spk] add speechbrain ecapa-tdnn result --- examples/voxceleb/sv0/RESULT.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/voxceleb/sv0/RESULT.md b/examples/voxceleb/sv0/RESULT.md index 56ee887c6..aa55584d2 100644 --- a/examples/voxceleb/sv0/RESULT.md +++ b/examples/voxceleb/sv0/RESULT.md @@ -5,3 +5,7 @@ | Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm | | --- | --- | --- | --- | --- | --- | --- | ---- | | ECAPA-TDNN | 85M | 0.2.1 | conf/ecapa_tdnn.yaml | 192 | test | 0.8188 | 0.7815| + +> [SpeechBrain result](https://github.com/speechbrain/speechbrain/tree/develop/recipes/VoxCeleb/SpeakerRec#speaker-verification-using-ecapa-tdnn-embeddings): +> EER = 0.90% (voxceleb1 + voxceleb2) without s-norm +> EER = 0.80% (voxceleb1 + voxceleb2) with s-norm. 
From b4bb785b17087555d588195cb8326b89390e8758 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 31 Aug 2022 11:19:09 +0800
Subject: [PATCH 7/9] Update README.md

---
 demos/audio_searching/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md
index db38d14ed..0fc901432 100644
--- a/demos/audio_searching/README.md
+++ b/demos/audio_searching/README.md
@@ -226,6 +226,12 @@ recall and elapsed time statistics are shown in the following figure:
 
 The retrieval framework based on Milvus takes about 2.9 milliseconds to retrieve on the premise of 90% recall rate, and it takes about 500 milliseconds for feature extraction (testing audio takes about 5 seconds), that is, a single audio test takes about 503 milliseconds in total, which can meet most application scenarios.
 
+* compute embedding takes 500 ms
+* retrieval with cosine takes 2.9 ms
+* total takes 503 ms
+
+> test audio is 5 sec
+
 ### 6.Pretrained Models
 
 Here is a list of pretrained models released by PaddleSpeech :

From ed2819d7afe1784eb0baa3e11111bc51b1a04dde Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Wed, 31 Aug 2022 06:20:24 +0000
Subject: [PATCH 8/9] fix format test=asr

---
 paddlespeech/s2t/modules/encoder_layer.py          | 3 +--
 .../server/engine/asr/online/python/asr_engine.py  | 8 ++++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index 3972ff90a..4555b535f 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -195,8 +195,7 @@ class ConformerEncoderLayer(nn.Layer):
         x: paddle.Tensor,
         mask: paddle.Tensor,
         pos_emb: paddle.Tensor,
-        mask_pad: paddle.
- Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) + mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool) att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 96d4823e2..87d88ee60 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -476,8 +476,12 @@ class PaddleASRConnectionHanddler: # forward chunk (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, self.offset, required_cache_size, self.att_cache, - self.cnn_cache, paddle.ones([0, 0, 0], dtype=paddle.bool)) + chunk_xs, + self.offset, + required_cache_size, + att_cache=self.att_cache, + cnn_cache=self.cnn_cache, + att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool)) outputs.append(y) # update the global offset, in decoding frame unit From cdcb1a531659e46ccae84d02388da3f72057a3c3 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 31 Aug 2022 06:55:49 +0000 Subject: [PATCH 9/9] s2t: fix encoder.py --- paddlespeech/s2t/modules/encoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index abdaf5ea7..cf4e32fa4 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -255,6 +255,7 @@ class BaseEncoder(nn.Layer): xs, att_mask, pos_emb, + mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool), att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, cnn_cache=cnn_cache[i:i + 1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
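
Note on the recurring change in patches 1, 4, 8, and 9: every modified `forward` used to declare tensor-valued defaults such as `att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0])`. Python evaluates a default once, when the function is defined, so that placeholder tensor is created at import time and then shared by every subsequent call; the commit messages only say this breaks multi-GPU training, and a plausible reading is that under distributed data-parallel runs the shared tensor ends up bound to the wrong device for some ranks. The series therefore makes these parameters required and records the expected placeholder in a trailing comment. Below is a minimal sketch of the pattern with a hypothetical `ChunkLayer`; the class name, shapes, and cache logic are illustrative, not taken from the PaddleSpeech modules:

```python
import paddle
from paddle import nn


class ChunkLayer(nn.Layer):
    """Toy layer following the convention from this series: no tensor-valued
    default for the cache; callers pass an explicit empty placeholder."""

    def __init__(self, dim: int=8):
        super().__init__()
        self.linear = nn.Linear(dim, dim)

    def forward(
            self,
            xs: paddle.Tensor,  # (B, T, D)
            cache: paddle.Tensor,  # (B, T_cache, D); paddle.zeros([0, 0, 0]) when empty
    ) -> paddle.Tensor:
        # Same guard style as the patched modules: a zero-sized shape means
        # "no cache yet", so this branch is skipped on the first chunk.
        if paddle.shape(cache)[1] > 0:  # T_cache > 0
            xs = paddle.concat([cache, xs], axis=1)
        return self.linear(xs)


layer = ChunkLayer()
# The placeholder is now created at call time, in the caller's context,
# instead of once at function-definition time:
ys = layer(paddle.rand([1, 4, 8]), paddle.zeros([0, 0, 0]))
```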
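The call-site half of the fix (patches 5, 8, and 9) then threads the caches through the streaming loop explicitly. A sketch of that loop using the same toy layer — the real `BaseEncoder.forward_chunk` and `forward_chunk_by_chunk` also manage offsets, `required_cache_size`, and attention masks, all elided here:

```python
def stream(layer: ChunkLayer, xs: paddle.Tensor, chunk: int=4) -> paddle.Tensor:
    # Start from an explicit empty placeholder, as forward_chunk_by_chunk
    # now does with att_cache/cnn_cache, rather than a signature default.
    cache = paddle.zeros([0, 0, 0])
    outs = []
    for cur in range(0, xs.shape[1], chunk):
        ys = layer(xs[:, cur:cur + chunk, :], cache)
        # Keep the last frames as the next chunk's left context; a stand-in
        # for the att_cache/cnn_cache updates returned by the real encoder.
        cache = ys[:, -chunk:, :]
        outs.append(ys[:, -chunk:, :])
    return paddle.concat(outs, axis=1)


print(stream(ChunkLayer(), paddle.rand([1, 16, 8])).shape)  # [1, 16, 8]
```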