diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index be6056546..09d903eee 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer):
         x = x.transpose([0, 2, 1])  # [B, C, T]
 
         # mask batch padding
-        if paddle.shape(mask_pad)[2] > 0:  # time > 0
+        if mask_pad.shape[2] > 0:  # time > 0
             x = x.masked_fill(mask_pad, 0.0)
 
         if self.lorder > 0:
-            if paddle.shape(cache)[2] == 0:  # cache_t == 0
+            if cache.shape[2] == 0:  # cache_t == 0
                 x = nn.functional.pad(
                     x, [self.lorder, 0], 'constant', 0.0, data_format='NCL')
             else:
@@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer):
         x = self.pointwise_conv2(x)
 
         # mask batch padding
-        if paddle.shape(mask_pad)[2] > 0:  # time > 0
+        if mask_pad.shape[2] > 0:  # time > 0
             x = x.masked_fill(mask_pad, 0.0)
 
         x = x.transpose([0, 2, 1])  # [B, T, C]
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 5e1b4c92b..4ddf057b6 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -243,7 +243,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         ]
 
         # batch decoding
-        ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0)  # (B,L,L)
+        ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0)  # (B,L,L)
         xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
         logp, states = self.forward_one_step(
             xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index db5848847..f23d3f140 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -246,7 +246,7 @@ class BaseEncoder(nn.Layer):
             # tensor zeros([0,0,0,0]) support [i:i+1] slice, will return zeros([0,0,0,0]) tensor
             # raw code as below:
             # att_cache=att_cache[i:i+1] if elayers > 0 else att_cache,
-            # cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache,
+            # cnn_cache=cnn_cache[i:i+1] if cnn_cache.shape[0] > 0 else cnn_cache,
             xs, _, new_att_cache, new_cnn_cache = layer(
                 xs,
                 att_mask,
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py
index 884fb70c1..afd5201aa 100644
--- a/paddlespeech/s2t/modules/loss.py
+++ b/paddlespeech/s2t/modules/loss.py
@@ -85,7 +85,7 @@ class CTCLoss(nn.Layer):
         Returns:
             [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}.
         """
-        B = paddle.shape(logits)[0]
+        B = logits.shape[0]
         # warp-ctc need logits, and do softmax on logits by itself
         # warp-ctc need activation with shape [T, B, V + 1]
         # logits: (B, L, D) -> (L, B, D)
@@ -158,7 +158,7 @@ class LabelSmoothingLoss(nn.Layer):
         Returns:
             loss (paddle.Tensor) : The KL loss, scalar float value
         """
-        B, T, D = paddle.shape(x)
+        B, T, D = x.shape
         assert D == self.size
         x = x.reshape((-1, self.size))
         target = target.reshape([-1])
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py
index 2775988a7..782a437ee 100644
--- a/paddlespeech/s2t/modules/subsampling.py
+++ b/paddlespeech/s2t/modules/subsampling.py
@@ -192,8 +192,8 @@ class Conv2dSubsampling6(Conv2dSubsampling):
         """
         x = x.unsqueeze(1)  # (b, c, t, f)
         x = self.conv(x)
-        b, c, t, f = paddle.shape(x)
-        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        b, c, t, f = x.shape
+        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f]))
         x, pos_emb = self.pos_enc(x, offset)
         return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3]
 
@@ -245,6 +245,7 @@ class Conv2dSubsampling8(Conv2dSubsampling):
         """
         x = x.unsqueeze(1)  # (b, c, t, f)
         x = self.conv(x)
-        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        b, c, t, f = x.shape
+        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f]))
         x, pos_emb = self.pos_enc(x, offset)
         return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]