diff --git a/deepspeech/modules/cmvn.py b/deepspeech/modules/cmvn.py
new file mode 100644
index 000000000..961755ab7
--- /dev/null
+++ b/deepspeech/modules/cmvn.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+logger = logging.getLogger(__name__)
+
+__all__ = ['GlobalCMVN']
+
+
+class GlobalCMVN(nn.Layer):
+    def __init__(self,
+                 mean: paddle.Tensor,
+                 istd: paddle.Tensor,
+                 norm_var: bool=True):
+        """
+        Args:
+            mean (paddle.Tensor): mean stats
+            istd (paddle.Tensor): inverse std, i.e. 1.0 / std
+            norm_var (bool): whether to multiply by the inverse std. Defaults to True.
+        """
+        super().__init__()
+        assert mean.shape == istd.shape
+        self.norm_var = norm_var
+        # The buffer can be accessed from this module using self.mean
+        self.register_buffer("mean", mean)
+        self.register_buffer("istd", istd)
+
+    def forward(self, x: paddle.Tensor):
+        """
+        Args:
+            x (paddle.Tensor): input feature (batch, max_len, feat_dim)
+        Returns:
+            (paddle.Tensor): normalized feature
+        """
+        x = x - self.mean
+        if self.norm_var:
+            x = x * self.istd
+        return x
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index ce59ec86f..bf06b6da1 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -21,7 +21,7 @@ from paddle.nn import initializer as I
 
 logger = logging.getLogger(__name__)
 
-__all__ = ['CTCLoss']
+__all__ = ['CTCLoss', 'LabelSmoothingLoss']
 
 
 # TODO(Hui Zhang): remove this hack, when `norm_by_times=True` is added
@@ -80,3 +80,84 @@ class CTCLoss(nn.Layer):
         # Batch-size average
         # loss = loss / paddle.shape(logits)[1]
         return loss
+
+
+class LabelSmoothingLoss(nn.Layer):
+    """Label-smoothing loss.
+    In a standard CE loss, the target label distribution is:
+    [0,1,2] ->
+    [
+        [1.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0],
+        [0.0, 0.0, 1.0],
+    ]
+    In the label-smoothed CE loss, some probability mass
+    is taken from the true label's probability (1.0) and
+    distributed evenly among the other labels.
+    e.g.
+        smoothing=0.1
+        [0,1,2] ->
+        [
+            [0.9, 0.05, 0.05],
+            [0.05, 0.9, 0.05],
+            [0.05, 0.05, 0.9],
+        ]
+    """
+
+    def __init__(self,
+                 size: int,
+                 padding_idx: int,
+                 smoothing: float,
+                 normalize_length: bool=False):
+        """Label-smoothing loss.
+
+        Args:
+            size (int): the number of classes
+            padding_idx (int): padding class id which will be ignored in the loss
+            smoothing (float): smoothing rate (0.0 means the conventional CE)
+            normalize_length (bool): if True, normalize the loss by the total
+                sequence length; if False, by the batch size. Defaults to False.
+        """
+        super().__init__()
+        self.size = size
+        self.padding_idx = padding_idx
+        self.smoothing = smoothing
+        self.confidence = 1.0 - smoothing
+        self.normalize_length = normalize_length
+        self.criterion = nn.KLDivLoss(reduction="none")
+
+    def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:
+        """Compute loss between x and target.
+
+        The model output and the target labels are flattened to
+        (batch*seqlen, class) shape, and a mask is applied to the
+        padding positions, which should not contribute to the loss.
+        Args:
+            x (paddle.Tensor): prediction (batch, seqlen, class)
+            target (paddle.Tensor):
+                target labels masked with self.padding_idx (batch, seqlen)
+        Returns:
+            loss (paddle.Tensor): the KL loss, a scalar float value
+        """
+        B, T, D = paddle.shape(x)
+        assert D == self.size
+        x = x.reshape((-1, self.size))
+        target = target.reshape([-1])
+
+        # build true_dist with full_like instead of filling it under no_grad(),
+        # since no_grad() can not be exported by JIT
+        true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
+        ignore = target == self.padding_idx  # (B*T,)
+        ignore = ignore.cast(target.dtype)
+
+        target = target * (1 - ignore)  # avoid -1 index
+        true_dist += F.one_hot(target, self.size) * self.confidence
+
+        kl = self.criterion(F.log_softmax(x, axis=1), true_dist)
+
+        total = len(target) - int(ignore.sum())
+        denom = total if self.normalize_length else B
+        # zero out the padding rows before summing; ignore is (B*T,), so add
+        # a class axis to broadcast it over kl, which is (B*T, class)
+        numer = (kl * (1 - ignore).unsqueeze(1).astype(kl.dtype)).sum()
+        return numer / denom
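
Below is a minimal usage sketch (not part of the diff above) showing how the two new
modules could be exercised in isolation. The feature dimension, vocabulary size,
smoothing value of 0.1, and the random dummy tensors are assumptions made only for
this example; real CMVN statistics would come from the training corpus.

# Minimal usage sketch; feat_dim, vocab_size, and the dummy tensors are assumed.
import paddle

from deepspeech.modules.cmvn import GlobalCMVN
from deepspeech.modules.loss import LabelSmoothingLoss

feat_dim, vocab_size = 80, 5000

# GlobalCMVN subtracts the precomputed mean and multiplies by the inverse std.
# Identity statistics (zero mean, unit istd) stand in for real corpus stats here.
cmvn = GlobalCMVN(mean=paddle.zeros([feat_dim]), istd=paddle.ones([feat_dim]))
feats = paddle.randn([4, 100, feat_dim])                # (batch, max_len, feat_dim)
normed_feats = cmvn(feats)                              # same shape, normalized

# LabelSmoothingLoss compares decoder logits against smoothed one-hot targets;
# positions equal to padding_idx are excluded from the loss.
criterion = LabelSmoothingLoss(
    size=vocab_size, padding_idx=-1, smoothing=0.1, normalize_length=True)
logits = paddle.randn([4, 20, vocab_size])              # (batch, seqlen, class)
labels = paddle.randint(0, vocab_size, shape=[4, 20])   # (batch, seqlen)
loss = criterion(logits, labels)                        # scalar KL-divergence loss
print(float(loss))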