add docstring

pull/2987/head
TianYuan 3 years ago
parent 6ee353ccaa
commit 14d46fe613

@ -44,7 +44,8 @@ class LinearNorm(nn.Layer):
self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
def forward(self, x: paddle.Tensor):
return self.linear_layer(x)
out = self.linear_layer(x)
return out
class ConvNorm(nn.Layer):
@ -183,13 +184,14 @@ class Attention(nn.Layer):
"""
Args:
query:
decoder output (batch, n_mel_channels * n_frames_per_step)
decoder output (B, n_mel_channels * n_frames_per_step)
processed_memory:
processed encoder outputs (B, T_in, attention_dim)
attention_weights_cat:
cumulative and prev. att weights (B, 2, max_time)
Returns:
Tensor: alignment (batch, max_time)
Tensor:
alignment (B, max_time)
"""
processed_query = self.query_layer(query.unsqueeze(1))
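
For orientation, a minimal sketch of the documented shapes, assuming the usual Tacotron2-style additive attention and omitting the location term built from attention_weights_cat; the layer names and toy sizes below are illustrative only:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

B, T_in, query_dim, attention_dim = 2, 50, 1024, 128

# stand-ins for the module's query_layer and scoring projection
query_layer = nn.Linear(query_dim, attention_dim, bias_attr=False)
score_layer = nn.Linear(attention_dim, 1, bias_attr=False)

query = paddle.randn([B, query_dim])                        # decoder output
processed_memory = paddle.randn([B, T_in, attention_dim])   # processed encoder outputs

processed_query = query_layer(query.unsqueeze(1))           # (B, 1, attention_dim)
energies = score_layer(paddle.tanh(processed_query + processed_memory)).squeeze(-1)
alignment = F.softmax(energies, axis=-1)                    # (B, max_time)
print(alignment.shape)                                      # [2, 50]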
@ -254,7 +256,6 @@ class MFCC(nn.Layer):
# -> (channel, time, n_mfcc).transpose(...)
mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
self.dct_mat).transpose([0, 2, 1])
# unpack batch
if unsqueezed:
mfcc = mfcc.squeeze(0)
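
The matmul above applies the DCT along the mel axis; a standalone sketch with a random stand-in for self.dct_mat (the real matrix is built elsewhere in the module):

import paddle

n_mels, n_mfcc, time = 80, 40, 100
dct_mat = paddle.randn([n_mels, n_mfcc])          # stand-in for self.dct_mat

mel_specgram = paddle.randn([n_mels, time])       # unbatched input
unsqueezed = False
if len(mel_specgram.shape) == 2:                  # pack a batch/channel dim
    mel_specgram = mel_specgram.unsqueeze(0)
    unsqueezed = True

# (channel, n_mels, time) -> (channel, time, n_mfcc) -> (channel, n_mfcc, time)
mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]), dct_mat).transpose([0, 2, 1])

if unsqueezed:                                    # unpack batch
    mfcc = mfcc.squeeze(0)
print(mfcc.shape)                                 # [40, 100]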

@ -194,9 +194,8 @@ class ASRS2S(nn.Layer):
logit_outputs += [logit]
alignments += [attention_weights]
hidden_outputs, logit_outputs, alignments = \
self.parse_decoder_outputs(
hidden_outputs, logit_outputs, alignments)
hidden_outputs, logit_outputs, alignments = self.parse_decoder_outputs(
hidden_outputs, logit_outputs, alignments)
return hidden_outputs, logit_outputs, alignments
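
parse_decoder_outputs is not shown in this hunk; assuming it does the usual repacking of the per-step lists into batch-major tensors, a minimal sketch:

import paddle

B, hidden_dim, n_steps = 2, 256, 7

# per-step outputs accumulated in the decoder loop
hidden_outputs = [paddle.randn([B, hidden_dim]) for _ in range(n_steps)]

# stack the list of (B, hidden_dim) steps into (B, n_steps, hidden_dim)
hidden = paddle.stack(hidden_outputs, axis=1)
print(hidden.shape)                               # [2, 7, 256]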

@ -46,11 +46,11 @@ class JDCNet(nn.Layer):
nn.LeakyReLU(leaky_relu_slope),
# out: (B, out_channels, T, n_mels)
nn.Conv2D(64, 64, 3, padding=1, bias_attr=False), )
# output: (B, out_channels, T, n_mels//2)
# output: (B, out_channels, T, n_mels // 2)
self.res_block1 = ResBlock(in_channels=64, out_channels=128)
# output: (B, out_channels, T, n_mels//4)
# output: (B, out_channels, T, n_mels // 4)
self.res_block2 = ResBlock(in_channels=128, out_channels=192)
# output: (B, out_channels, T, n_mels//8)
# output: (B, out_channels, T, n_mels // 8)
self.res_block3 = ResBlock(in_channels=192, out_channels=256)
# pool block
self.pool_block = nn.Sequential(
@ -59,7 +59,7 @@ class JDCNet(nn.Layer):
# (B, num_features, T, 2)
nn.MaxPool2D(kernel_size=(1, 4)),
nn.Dropout(p=0.5), )
# input: (B, T, input_size) - resized from (B, input_size//2, T, 2)
# input: (B, T, input_size), resized from (B, input_size // 2, T, 2)
# output: (B, T, input_size)
self.bilstm_classifier = nn.LSTM(
input_size=512,
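
Regarding the resize noted above, a shape-only sketch of rearranging the pooled map (B, input_size // 2, T, 2) into the (B, T, input_size) layout the BiLSTM expects; the transpose/reshape below is an illustration, not necessarily how the model performs it:

import paddle

B, T, input_size = 2, 31, 512

# (B, input_size // 2, T, 2), as produced by the pool block
x = paddle.randn([B, input_size // 2, T, 2])

# move time forward, then flatten channels * 2 -> input_size
x = x.transpose([0, 2, 1, 3]).reshape([B, T, input_size])
print(x.shape)                                    # [2, 31, 512]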
@ -81,7 +81,7 @@ class JDCNet(nn.Layer):
Shape (B, num_class, n_mels, T).
Returns:
Tensor:
Shape (B, num_features, n_mels//8, T).
Shape (B, num_features, n_mels // 8, T).
"""
x = x.astype(paddle.float32)
x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else [0, 2, 1])
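
A quick check of the two transpose branches above, which move the mel axis to the end:

import paddle

x4 = paddle.randn([2, 1, 80, 100])                # (B, num_class, n_mels, T)
x3 = paddle.randn([2, 80, 100])                   # (B, n_mels, T)

y4 = x4.transpose([0, 1, 3, 2])                   # -> (B, num_class, T, n_mels)
y3 = x3.transpose([0, 2, 1])                      # -> (B, T, n_mels)
print(y4.shape, y3.shape)                         # [2, 1, 100, 80] [2, 100, 80]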
@ -105,10 +105,9 @@ class JDCNet(nn.Layer):
classifier output consists of predicted pitch classes per frame.
Shape: (B, seq_len, num_class).
Tensor:
GAN_feature. Shape: (B, num_features, n_mels//8, seq_len)
GAN_feature. Shape: (B, num_features, n_mels // 8, seq_len)
Tensor:
poolblock_out. Shape (B, seq_len, 512)
poolblock_out. Shape (B, seq_len, 512)
"""
###############################
# forward pass for classifier #
@ -201,7 +200,7 @@ class ResBlock(nn.Layer):
x(Tensor(float32)): Shape (B, in_channels, T, n_mels).
Returns:
Tensor:
The residual output, Shape (B, out_channels, T, n_mels//2).
The residual output, Shape (B, out_channels, T, n_mels // 2).
"""
x = self.pre_conv(x)
if self.downsample:
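
ResBlock's body is not part of this hunk; the snippet below only illustrates the documented shape contract (channels in -> out, last axis halved), using a strided convolution as a hypothetical stand-in:

import paddle
import paddle.nn as nn

B, in_channels, out_channels, T, n_mels = 2, 64, 128, 31, 80

# stand-in: any op with stride (1, 2) reproduces the documented downsampling
conv = nn.Conv2D(in_channels, out_channels, kernel_size=3, stride=(1, 2), padding=1)

x = paddle.randn([B, in_channels, T, n_mels])
y = conv(x)
print(y.shape)                                    # [2, 128, 31, 40] == (B, out_channels, T, n_mels // 2)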

@ -32,12 +32,23 @@ class DownSample(nn.Layer):
self.layer_type = layer_type
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
Returns:
Tensor:
layer_type == 'none': Shape (B, dim_in, n_mels, T)
layer_type == 'timepreserve': Shape (B, dim_in, n_mels // 2, T)
layer_type == 'half': Shape (B, dim_in, n_mels // 2, T // 2)
"""
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
return F.avg_pool2d(x, (2, 1))
out = F.avg_pool2d(x, (2, 1))
return out
elif self.layer_type == 'half':
return F.avg_pool2d(x, 2)
out = F.avg_pool2d(x, 2)
return out
else:
raise RuntimeError(
'Got unexpected downsample type %s, expected one of [none, timepreserve, half]'
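
A quick check of the three documented cases:

import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 64, 80, 100])                # (B, dim_in, n_mels, T)

print(x.shape)                                    # 'none':         [2, 64, 80, 100]
print(F.avg_pool2d(x, (2, 1)).shape)              # 'timepreserve': [2, 64, 40, 100]
print(F.avg_pool2d(x, 2).shape)                   # 'half':         [2, 64, 40, 50]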
@ -50,12 +61,23 @@ class UpSample(nn.Layer):
self.layer_type = layer_type
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
Returns:
Tensor:
layer_type == 'none': Shape (B, dim_in, n_mels, T)
layer_type == 'timepreserve': Shape (B, dim_in, n_mels * 2, T)
layer_type == 'half': Shape (B, dim_in, n_mels * 2, T * 2)
"""
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
out = F.interpolate(x, scale_factor=(2, 1), mode='nearest')
return out
elif self.layer_type == 'half':
return F.interpolate(x, scale_factor=2, mode='nearest')
out = F.interpolate(x, scale_factor=2, mode='nearest')
return out
else:
raise RuntimeError(
'Got unexpected upsample type %s, expected one of [none, timepreserve, half]'
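
And the mirror-image check for the documented upsampling cases:

import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 64, 40, 50])                                   # (B, dim_in, n_mels, T)

print(F.interpolate(x, scale_factor=(2, 1), mode='nearest').shape)  # 'timepreserve': [2, 64, 80, 50]
print(F.interpolate(x, scale_factor=2, mode='nearest').shape)       # 'half':         [2, 64, 80, 100]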
@ -126,7 +148,9 @@ class ResBlk(nn.Layer):
x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
Returns:
Tensor:
Shape (B, dim_out, T, n_mels//(1 or 2), T//(1 or 2)).
downsample == 'none': Shape (B, dim_out, n_mels, T).
downsample == 'timepreserve': Shape (B, dim_out, n_mels // 2, T).
downsample == 'half': Shape (B, dim_out, n_mels // 2, T // 2).
"""
x = self._shortcut(x) + self._residual(x)
# unit variance
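
The "# unit variance" comment refers to the usual rescaling after summing two roughly unit-variance branches: Var(a + b) = Var(a) + Var(b) = 2 for independent a and b, so dividing the sum by sqrt(2) restores unit variance. A quick numerical check:

import math
import paddle

a = paddle.randn([1000000])
b = paddle.randn([1000000])

print(float(paddle.var(a + b)))                   # ~2.0
print(float(paddle.var((a + b) / math.sqrt(2))))  # ~1.0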
@ -142,12 +166,21 @@ class AdaIN(nn.Layer):
self.fc = nn.Linear(style_dim, num_features * 2)
def forward(self, x: paddle.Tensor, s: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor(float32)): Shape (B, num_features, n_mels, T).
s(Tensor(float32)): Shape (style_dim, ).
Returns:
Tensor:
Shape (B, num_features, n_mels, T).
"""
if len(s.shape) == 1:
s = s[None]
h = self.fc(s)
h = h.reshape((h.shape[0], h.shape[1], 1, 1))
gamma, beta = paddle.split(h, 2, axis=1)
return (1 + gamma) * self.norm(x) + beta
out = (1 + gamma) * self.norm(x) + beta
return out
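
The forward above is simple enough to replay with plain paddle ops; a sketch with toy sizes (the instance-norm settings here are paddle defaults and may differ from the module's configuration):

import paddle
import paddle.nn as nn

B, num_features, style_dim, n_mels, T = 2, 64, 64, 40, 50

norm = nn.InstanceNorm2D(num_features)
fc = nn.Linear(style_dim, num_features * 2)

x = paddle.randn([B, num_features, n_mels, T])
s = paddle.randn([style_dim])                     # a single style vector

if len(s.shape) == 1:
    s = s[None]                                   # (1, style_dim)
h = fc(s)                                         # (1, num_features * 2)
h = h.reshape((h.shape[0], h.shape[1], 1, 1))     # broadcastable over (n_mels, T)
gamma, beta = paddle.split(h, 2, axis=1)          # each (1, num_features, 1, 1)
out = (1 + gamma) * norm(x) + beta
print(out.shape)                                  # [2, 64, 40, 50]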
class AdainResBlk(nn.Layer):
@ -164,6 +197,7 @@ class AdainResBlk(nn.Layer):
self.upsample = UpSample(layer_type=upsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out, style_dim)
self.layer_type = upsample
def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64):
self.conv1 = nn.Conv2D(
@ -209,12 +243,14 @@ class AdainResBlk(nn.Layer):
"""Calculate forward propagation.
Args:
x(Tensor(float32)):
Shape (B, dim_in, n_mels', T').
Shape (B, dim_in, n_mels, T).
s(Tensor(float32)):
Shape (64,).
Returns:
Tensor:
Shape (B, dim_out, n_mels'', T'').
upsample == 'none': Shape (B, dim_out, n_mels, T).
upsample == 'timepreserve': Shape (B, dim_out, n_mels * 2, T).
upsample == 'half': Shape (B, dim_out, n_mels * 2, T * 2).
"""
out = self._residual(x, s)
if self.w_hpf == 0:
@ -333,27 +369,27 @@ class Generator(nn.Layer):
masks:
None.
F0:
Shape (B, num_features(256), n_mels//8, T).
Shape (B, num_features(256), n_mels // 8, T).
Returns:
Tensor:
output of generator. Shape (B, 1, n_mels, T//4*4)
output of generator. Shape (B, 1, n_mels, T // 4 * 4)
"""
x = self.stem(x)
cache = {}
# output: (B, max_conv_dim, n_mels//16, T//4)
# output: (B, max_conv_dim, n_mels // 16, T // 4)
for block in self.encode:
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
cache[x.shape[2]] = x
x = block(x)
if F0 is not None:
# input: (B, num_features(256), n_mels//8, T)
# output: (B, num_features(256)//2, n_mels//16, T//2)
# input: (B, num_features(256), n_mels // 8, T)
# output: (B, num_features(256) // 2, n_mels // 16, T // 2)
F0 = self.F0_conv(F0)
# output: (B, num_features(256)//2, n_mels//16, T//4)
# output: (B, num_features(256) // 2, n_mels // 16, T // 4)
F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
x = paddle.concat([x, F0], axis=1)
# input: (B, max_conv_dim+num_features(256)//2, n_mels//16, T//4*4)
# output: (B, dim_in, n_mels, T//4*4)
# input: (B, max_conv_dim + num_features(256) // 2, n_mels // 16, T // 4 * 4)
# output: (B, dim_in, n_mels, T // 4 * 4)
for block in self.decode:
x = block(x, s)
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
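
As an aside on the F0 branch above: F0 only needs to reach the same spatial size as x before the channel concat. A standalone sketch with illustrative sizes (max_conv_dim and the toy spatial dims are assumptions):

import paddle
import paddle.nn.functional as F

B, max_conv_dim, num_features = 2, 512, 256

x = paddle.randn([B, max_conv_dim, 5, 24])         # encoder output: (B, max_conv_dim, n_mels // 16, T // 4)
F0 = paddle.randn([B, num_features // 2, 5, 48])   # F0 features after F0_conv

# match F0's spatial dims to x, then concatenate along channels
F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
x = paddle.concat([x, F0], axis=1)
print(x.shape)                                     # [2, 640, 5, 24] == (B, max_conv_dim + num_features // 2, ...)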
@ -397,11 +433,10 @@ class MappingNetwork(nn.Layer):
z(Tensor(float32)):
Shape (B, 1, n_mels, T).
y(Tensor(float32)):
speaker label. Shape (B,).
speaker label. Shape (B, ).
Returns:
Tensor:
Shape (style_dim,)
Shape (style_dim, )
"""
h = self.shared(z)
@ -411,7 +446,7 @@ class MappingNetwork(nn.Layer):
# (B, num_domains, style_dim)
out = paddle.stack(out, axis=1)
idx = paddle.arange(y.shape[0])
# (style_dim,)
# (style_dim, )
s = out[idx, y]
return s
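
The indexing at the end picks, for each batch item, the (style_dim, ) row matching its speaker label; stacked over the batch this yields (B, style_dim). A shape sketch:

import paddle

B, num_domains, style_dim = 4, 10, 64

out = paddle.randn([B, num_domains, style_dim])    # one style per domain per sample
y = paddle.to_tensor([3, 0, 7, 1])                 # speaker label, shape (B, )

idx = paddle.arange(B)
s = out[idx, y]                                    # (B, style_dim)
print(s.shape)                                     # [4, 64]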
@ -462,10 +497,10 @@ class StyleEncoder(nn.Layer):
x(Tensor(float32)):
Shape (B, 1, n_mels, T).
y(Tensor(float32)):
speaker label. Shape (B,).
speaker label. Shape (B, ).
Returns:
Tensor:
Shape (style_dim,)
Shape (style_dim, )
"""
h = self.shared(x)
h = h.reshape((h.shape[0], -1))
@ -502,10 +537,12 @@ class Discriminator(nn.Layer):
self.num_domains = num_domains
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
return self.dis(x, y)
out = self.dis(x, y)
return out
def classifier(self, x: paddle.Tensor):
return self.cls.get_feature(x)
out = self.cls.get_feature(x)
return out
class Discriminator2D(nn.Layer):
