From d3d9f835944e113c101a888f88bb44872aabf80f Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Mon, 1 Nov 2021 02:58:15 +0000
Subject: [PATCH] add global init for multi-band MelGAN to avoid large output
 at the beginning

---
 examples/csmsc/voc3/conf/default.yaml  |   4 +-
 examples/csmsc/voc3/conf/use_tanh.yaml | 139 -------------------------
 parakeet/models/melgan/melgan.py       |  19 +++-
 parakeet/modules/residual_stack.py     |   5 +-
 4 files changed, 19 insertions(+), 148 deletions(-)
 delete mode 100644 examples/csmsc/voc3/conf/use_tanh.yaml

diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml
index f6fcfcedd..cc27220fc 100644
--- a/examples/csmsc/voc3/conf/default.yaml
+++ b/examples/csmsc/voc3/conf/default.yaml
@@ -35,7 +35,7 @@ generator_params:
     stacks: 4 # Number of stacks in a single residual stack module.
     use_weight_norm: True # Whether to use weight normalization.
     use_causal_conv: False # Whether to use causal convolution.
-    use_final_nonlinear_activation: False # If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (eg.30)
+    use_final_nonlinear_activation: True


 ###########################################################
@@ -129,7 +129,7 @@ discriminator_scheduler_params:
 ###########################################################
 discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
 train_max_steps: 1000000 # Number of training steps.
-save_interval_steps: 50000 # Interval steps to save checkpoint.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
 eval_interval_steps: 1000 # Interval steps to evaluate the network.

 ###########################################################
diff --git a/examples/csmsc/voc3/conf/use_tanh.yaml b/examples/csmsc/voc3/conf/use_tanh.yaml
deleted file mode 100644
index 820c2a761..000000000
--- a/examples/csmsc/voc3/conf/use_tanh.yaml
+++ /dev/null
@@ -1,139 +0,0 @@
-# This is the hyperparameter configuration file for MelGAN.
-# Please make sure this is adjusted for the CSMSC dataset. If you want to
-# apply to the other dataset, you might need to carefully change some parameters.
-# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
-
-# This configuration is based on full-band MelGAN but the hop size and sampling
-# rate is different from the paper (16kHz vs 24kHz). The number of iteraions
-# is not shown in the paper so currently we train 1M iterations (not sure enough
-# to converge). The optimizer setting is based on @dathudeptrai advice.
-# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
-
-###########################################################
-# FEATURE EXTRACTION SETTING #
-###########################################################
-fs: 24000 # Sampling rate.
-n_fft: 2048 # FFT size. (in samples)
-n_shift: 300 # Hop size. (in samples)
-win_length: 1200 # Window length. (in samples)
-    # If set to null, it will be the same as fft_size.
-window: "hann" # Window function.
-n_mels: 80 # Number of mel basis.
-fmin: 80 # Minimum freq in mel basis calculation. (Hz)
-fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
-
-###########################################################
-# GENERATOR NETWORK ARCHITECTURE SETTING #
-###########################################################
-generator_params:
-    in_channels: 80 # Number of input channels.
-    out_channels: 4 # Number of output channels.
-    kernel_size: 7 # Kernel size of initial and final conv layers.
-    channels: 384 # Initial number of channels for conv layers.
-    upsample_scales: [5, 5, 3] # List of Upsampling scales.
-    stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
-    stacks: 4 # Number of stacks in a single residual stack module.
-    use_weight_norm: True # Whether to use weight normalization.
-    use_causal_conv: False # Whether to use causal convolution.
-    use_final_nonlinear_activation: True # If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (eg.30)
-
-
-###########################################################
-# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
-###########################################################
-discriminator_params:
-    in_channels: 1 # Number of input channels.
-    out_channels: 1 # Number of output channels.
-    scales: 3 # Number of multi-scales.
-    downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling.
-    downsample_pooling_params: # Parameters of the above pooling function.
-        kernel_size: 4
-        stride: 2
-        padding: 1
-        exclusive: True
-    kernel_sizes: [5, 3] # List of kernel size.
-    channels: 16 # Number of channels of the initial conv layer.
-    max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
-    downsample_scales: [4, 4, 4] # List of downsampling scales.
-    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
-    nonlinear_activation_params: # Parameters of nonlinear activation function.
-        negative_slope: 0.2
-    use_weight_norm: True # Whether to use weight norm.
-
-
-###########################################################
-# STFT LOSS SETTING #
-###########################################################
-use_stft_loss: true
-stft_loss_params:
-    fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
-    hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
-    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
-    window: "hann" # Window function for STFT-based loss
-use_subband_stft_loss: true
-subband_stft_loss_params:
-    fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
-    hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
-    win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
-    window: "hann" # Window function for STFT-based loss
-
-###########################################################
-# ADVERSARIAL LOSS SETTING #
-###########################################################
-use_feat_match_loss: false # Whether to use feature matching loss.
-lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
-
-###########################################################
-# DATA LOADER SETTING #
-###########################################################
-batch_size: 64 # Batch size.
-batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
-num_workers: 2 # Number of workers in DataLoader.
-
-###########################################################
-# OPTIMIZER & SCHEDULER SETTING #
-###########################################################
-generator_optimizer_params:
-    epsilon: 1.0e-7 # Generator's epsilon.
-    weight_decay: 0.0 # Generator's weight decay coefficient.
-
-generator_grad_norm: -1 # Generator's gradient norm.
-generator_scheduler_params:
-    learning_rate: 1.0e-3 # Generator's learning rate.
-    gamma: 0.5 # Generator's scheduler gamma.
-    milestones: # At each milestone, lr will be multiplied by gamma.
-        - 100000
-        - 200000
-        - 300000
-        - 400000
-        - 500000
-        - 600000
-discriminator_optimizer_params:
-    epsilon: 1.0e-7 # Discriminator's epsilon.
-    weight_decay: 0.0 # Discriminator's weight decay coefficient.
-
-discriminator_grad_norm: -1 # Discriminator's gradient norm.
-discriminator_scheduler_params:
-    learning_rate: 1.0e-3 # Discriminator's learning rate.
-    gamma: 0.5 # Discriminator's scheduler gamma.
-    milestones: # At each milestone, lr will be multiplied by gamma.
-        - 100000
-        - 200000
-        - 300000
-        - 400000
-        - 500000
-        - 600000
-
-###########################################################
-# INTERVAL SETTING #
-###########################################################
-discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
-train_max_steps: 1000000 # Number of training steps.
-save_interval_steps: 50000 # Interval steps to save checkpoint.
-eval_interval_steps: 1000 # Interval steps to evaluate the network.
-
-###########################################################
-# OTHER SETTING #
-###########################################################
-num_snapshots: 10 # max number of snapshots to keep while training
-seed: 42 # random seed for paddle, random, and np.random
\ No newline at end of file
diff --git a/parakeet/models/melgan/melgan.py b/parakeet/models/melgan/melgan.py
index 0347ff221..3f002b80c 100644
--- a/parakeet/models/melgan/melgan.py
+++ b/parakeet/models/melgan/melgan.py
@@ -22,6 +22,7 @@ from paddle import nn

 from parakeet.modules.causal_conv import CausalConv1D
 from parakeet.modules.causal_conv import CausalConv1DTranspose
+from parakeet.modules.nets_utils import initialize
 from parakeet.modules.pqmf import PQMF
 from parakeet.modules.residual_stack import ResidualStack

@@ -45,7 +46,8 @@ class MelGANGenerator(nn.Layer):
             pad_params: Dict[str, Any]={"mode": "reflect"},
             use_final_nonlinear_activation: bool=True,
             use_weight_norm: bool=True,
-            use_causal_conv: bool=False, ):
+            use_causal_conv: bool=False,
+            init_type: str="xavier_uniform", ):
         """Initialize MelGANGenerator module.
         Parameters
         ----------
@@ -91,7 +93,10 @@ class MelGANGenerator(nn.Layer):
         if not use_causal_conv:
             assert (kernel_size - 1
                     ) % 2 == 0, "Not support even number kernel size."
-        # add initial layer
+
+        # initialize parameters
+        initialize(self, init_type)
+
         layers = []
         if not use_causal_conv:
             layers += [
@@ -178,6 +183,7 @@ class MelGANGenerator(nn.Layer):

         # define the model as a single function
         self.melgan = nn.Sequential(*layers)
+        nn.initializer.set_global_initializer(None)

         # apply weight norm
         if use_weight_norm:
@@ -322,6 +328,7 @@ class MelGANDiscriminator(nn.Layer):
         assert len(kernel_sizes) == 2
         assert kernel_sizes[0] % 2 == 1
         assert kernel_sizes[1] % 2 == 1
+
         # add first layer
         self.layers.append(
             nn.Sequential(
@@ -417,7 +424,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
             nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
             pad: str="Pad1D",
             pad_params: Dict[str, Any]={"mode": "reflect"},
-            use_weight_norm: bool=True, ):
+            use_weight_norm: bool=True,
+            init_type: str="xavier_uniform", ):
         """Initilize MelGAN multi-scale discriminator module.
         Parameters
         ----------
@@ -454,6 +462,9 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
             Whether to use causal convolution.
         """
         super().__init__()
+        # initialize parameters
+        initialize(self, init_type)
+
         self.discriminators = nn.LayerList()

         # add discriminators
@@ -474,6 +485,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
             self.pooling = getattr(nn, downsample_pooling)(
                 **downsample_pooling_params)

+        nn.initializer.set_global_initializer(None)
+
         # apply weight norm
         if use_weight_norm:
             self.apply_weight_norm()
diff --git a/parakeet/modules/residual_stack.py b/parakeet/modules/residual_stack.py
index b798fbb61..135c32e57 100644
--- a/parakeet/modules/residual_stack.py
+++ b/parakeet/modules/residual_stack.py
@@ -106,7 +106,4 @@ class ResidualStack(nn.Layer):
         Tensor
             Output tensor (B, chennels, T).
         """
-        stack_output = self.stack(c)
-        skip_layer_output = self.skip_layer(c)
-        out = stack_output + skip_layer_output
-        return out
+        return self.stack(c) + self.skip_layer(c)