From 7689ff412b59711e8557a5beb7d08b3de1c0ed17 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sun, 24 Oct 2021 15:03:04 +0000
Subject: [PATCH] wer/cer/bleu Calculator, label smoothing func

---
 deepspeech/utils/asr_utils.py  |  54 ++++++++++++
 deepspeech/utils/bleu_score.py |  72 ++++++++++++++--
 deepspeech/utils/error_rate.py | 151 ++++++++++++++++++++++++++++++++-
 3 files changed, 269 insertions(+), 8 deletions(-)
 create mode 100644 deepspeech/utils/asr_utils.py

diff --git a/deepspeech/utils/asr_utils.py b/deepspeech/utils/asr_utils.py
new file mode 100644
index 000000000..06cf64876
--- /dev/null
+++ b/deepspeech/utils/asr_utils.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import sys
+import numpy as np
+
+__all__ = ["label_smoothing_dist"]
+
+
+# TODO(takaaki-hori): add different smoothing methods
+def label_smoothing_dist(odim, lsm_type, transcript=None, blank=0):
+    """Obtain label distribution for loss smoothing.
+
+    :param int odim: output dimension (label vocabulary size)
+    :param str lsm_type: label smoothing type; currently only "unigram"
+    :param str transcript: path to the transcript json file
+    :param int blank: blank symbol id, excluded from the distribution
+    :return: label distribution of shape (odim,)
+    """
+    if transcript is not None:
+        with open(transcript, "rb") as f:
+            trans_json = json.load(f)["utts"]
+
+    if lsm_type == "unigram":
+        assert transcript is not None, (
+            "transcript is required for %s label smoothing" % lsm_type
+        )
+        labelcount = np.zeros(odim)
+        for k, v in trans_json.items():
+            ids = np.array([int(n) for n in v["output"][0]["tokenid"].split()])
+            # to avoid an error when there is no text in an utterance
+            if len(ids) > 0:
+                labelcount[ids] += 1
+        labelcount[odim - 1] = len(trans_json)  # count <eos>, once per utterance
+        labelcount[labelcount == 0] = 1  # flooring
+        labelcount[blank] = 0  # remove counts for blank
+        labeldist = labelcount.astype(np.float32) / np.sum(labelcount)
+    else:
+        logging.error("Error: unexpected label smoothing type: %s" % lsm_type)
+        sys.exit()
+
+    return labeldist
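
A quick usage sketch for the new helper (the file name "data.json" and the sizes are hypothetical; the transcript json is assumed to follow the ESPnet-style layout read above, i.e. utts -> output[0] -> tokenid):

    import numpy as np
    from deepspeech.utils.asr_utils import label_smoothing_dist

    odim = 5002  # hypothetical vocabulary size; <eos> taken as id odim - 1, blank as 0
    labeldist = label_smoothing_dist(odim, "unigram", transcript="data.json")

    assert labeldist.shape == (odim,)
    assert np.isclose(labeldist.sum(), 1.0)  # normalized unigram prior over labels
    assert labeldist[0] == 0.0               # blank is zeroed before normalization

The resulting labeldist is the smoothing prior that a KL-divergence label-smoothing loss would typically use in place of a uniform distribution.
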
diff --git a/deepspeech/utils/bleu_score.py b/deepspeech/utils/bleu_score.py
index 09646133a..93749dddc 100644
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
@@ -15,16 +15,16 @@
 e.g. wer for word-level, cer for char-level.
 """
 import sacrebleu
+import nltk
+import numpy as np
 
-__all__ = ['bleu', 'char_bleu']
+__all__ = ['bleu', 'char_bleu', "ErrorCalculator"]
 
 
 def bleu(hypothesis, reference):
     """Calculate BLEU.
 
     BLEU compares reference text and hypothesis text in word-level using
     scarebleu.
-
-
     :param reference: The reference sentences.
     :type reference: list[list[str]]
     :param hypothesis: The hypothesis sentence.
@@ -39,8 +39,6 @@ def char_bleu(hypothesis, reference):
     """Calculate BLEU.
 
     BLEU compares reference text and hypothesis text in char-level using
     scarebleu.
-
-
     :param reference: The reference sentences.
     :type reference: list[list[str]]
     :param hypothesis: The hypothesis sentence.
@@ -52,3 +50,67 @@
     for ref in reference]
 
     return sacrebleu.corpus_bleu(hypothesis, reference)
+
+
+class ErrorCalculator():
+    """Calculate BLEU for ST and MT models during training.
+
+    :param y_hats: numpy array with predicted text
+    :param y_pads: numpy array with true (target) text
+    :param char_list: vocabulary list
+    :param sym_space: space symbol
+    :param sym_pad: pad symbol
+    :param report_bleu: report BLEU score if True
+    """
+
+    def __init__(self, char_list, sym_space, sym_pad, report_bleu=False):
+        """Construct an ErrorCalculator object."""
+        super().__init__()
+        self.char_list = char_list
+        self.space = sym_space
+        self.pad = sym_pad
+        self.report_bleu = report_bleu
+        if self.space in self.char_list:
+            self.idx_space = self.char_list.index(self.space)
+        else:
+            self.idx_space = None
+
+    def __call__(self, ys_hat, ys_pad):
+        """Calculate corpus-level BLEU score.
+
+        :param paddle.Tensor ys_hat: prediction (batch, seqlen)
+        :param paddle.Tensor ys_pad: reference (batch, seqlen)
+        :return: corpus-level BLEU score in a mini-batch
+        :rtype: float
+        """
+        bleu = None
+        if not self.report_bleu:
+            return bleu
+
+        bleu = self.calculate_corpus_bleu(ys_hat, ys_pad)
+        return bleu
+
+    def calculate_corpus_bleu(self, ys_hat, ys_pad):
+        """Calculate corpus-level BLEU score in a mini-batch.
+
+        :param paddle.Tensor ys_hat: prediction (batch, seqlen)
+        :param paddle.Tensor ys_pad: reference (batch, seqlen)
+        :return: corpus-level BLEU score
+        :rtype: float
+        """
+        seqs_hat, seqs_true = [], []
+        for i, y_hat in enumerate(ys_hat):
+            y_true = ys_pad[i]
+            eos_true = np.where(y_true == -1)[0]
+            ymax = eos_true[0] if len(eos_true) > 0 else len(y_true)
+            # NOTE: padding index (-1) in y_true is used to pad y_hat
+            # because y_hats is not padded with -1
+            seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]]
+            seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
+            seq_hat_text = "".join(seq_hat).replace(self.space, " ")
+            seq_hat_text = seq_hat_text.replace(self.pad, "")
+            seq_true_text = "".join(seq_true).replace(self.space, " ")
+            seqs_hat.append(seq_hat_text)
+            seqs_true.append(seq_true_text)
+        bleu = nltk.bleu_score.corpus_bleu([[ref] for ref in seqs_true], seqs_hat)
+        return bleu * 100
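
A minimal sketch of how the new BLEU ErrorCalculator is meant to be driven during validation (the tiny vocabulary and the -1 padding value here are illustrative assumptions; real callers pass the model's char_list and padded id arrays):

    import numpy as np
    from deepspeech.utils.bleu_score import ErrorCalculator

    char_list = ["<pad>", "a", "b", "<space>"]  # hypothetical toy vocabulary
    calc = ErrorCalculator(char_list, sym_space="<space>", sym_pad="<pad>",
                           report_bleu=True)

    # batch of one utterance; references are padded with -1
    ys_hat = np.array([[1, 2, 3, 1, 2, 3, 1, 2]])      # decodes to "ab ab ab"
    ys_pad = np.array([[1, 2, 3, 1, 2, 3, 1, 2, -1]])  # same text, -1-padded
    print(calc(ys_hat, ys_pad))  # 100.0 for an exact match; None if report_bleu=False

Note that the hypothesis is truncated at the reference's first -1 (ys_hat itself carries no -1 padding), and the texts are handed to nltk unsplit, so the n-grams are effectively computed over characters.
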
diff --git a/deepspeech/utils/error_rate.py b/deepspeech/utils/error_rate.py
index 81f458b6e..0ad62b6b6 100644
--- a/deepspeech/utils/error_rate.py
+++ b/deepspeech/utils/error_rate.py
@@ -16,10 +16,9 @@
 e.g. wer for word-level, cer for char-level.
 """
 import editdistance
 import numpy as np
+from itertools import groupby
 
-__all__ = ['word_errors', 'char_errors', 'wer', 'cer']
-
-editdistance.eval("a", "b")
+__all__ = ['word_errors', 'char_errors', 'wer', 'cer', "ErrorCalculator"]
 
 
@@ -211,3 +210,149 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     cer = float(edit_distance) / ref_len
 
     return cer
+
+
+class ErrorCalculator():
+    """Calculate CER and WER for E2E_ASR and CTC models during training.
+
+    :param y_hats: numpy array with predicted text
+    :param y_pads: numpy array with true (target) text
+    :param char_list: vocabulary list (List[str])
+    :param sym_space: space symbol
+    :param sym_blank: blank symbol
+    :param report_cer: report CER if True; report_wer likewise for WER
+    """
+
+    def __init__(
+        self, char_list, sym_space, sym_blank, report_cer=False, report_wer=False
+    ):
+        """Construct an ErrorCalculator object."""
+        super().__init__()
+
+        self.report_cer = report_cer
+        self.report_wer = report_wer
+
+        self.char_list = char_list
+        self.space = sym_space
+        self.blank = sym_blank
+        self.idx_blank = self.char_list.index(self.blank)
+        if self.space in self.char_list:
+            self.idx_space = self.char_list.index(self.space)
+        else:
+            self.idx_space = None
+
+    def __call__(self, ys_hat, ys_pad, is_ctc=False):
+        """Calculate sentence-level CER/WER score.
+
+        :param paddle.Tensor ys_hat: prediction (batch, seqlen)
+        :param paddle.Tensor ys_pad: reference (batch, seqlen)
+        :param bool is_ctc: calculate CER for CTC (returns a single float)
+        :return: sentence-level CER score (None unless report_cer is set)
+        :rtype: float
+        :return: sentence-level WER score (None unless report_wer is set)
+        :rtype: float
+        """
+        cer, wer = None, None
+        if is_ctc:
+            return self.calculate_cer_ctc(ys_hat, ys_pad)
+        elif not self.report_cer and not self.report_wer:
+            return cer, wer
+
+        seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad)
+        if self.report_cer:
+            cer = self.calculate_cer(seqs_hat, seqs_true)
+
+        if self.report_wer:
+            wer = self.calculate_wer(seqs_hat, seqs_true)
+        return cer, wer
+
+    def calculate_cer_ctc(self, ys_hat, ys_pad):
+        """Calculate sentence-level CER score for CTC.
+
+        :param paddle.Tensor ys_hat: prediction (batch, seqlen)
+        :param paddle.Tensor ys_pad: reference (batch, seqlen)
+        :return: average sentence-level CER score
+        :rtype: float
+        """
+        cers, char_ref_lens = [], []
+        for i, y in enumerate(ys_hat):
+            y_hat = [x[0] for x in groupby(y)]  # collapse repeated CTC frames
+            y_true = ys_pad[i]
+            seq_hat, seq_true = [], []
+            for idx in y_hat:
+                idx = int(idx)
+                if idx != -1 and idx != self.idx_blank and idx != self.idx_space:
+                    seq_hat.append(self.char_list[idx])
+
+            for idx in y_true:
+                idx = int(idx)
+                if idx != -1 and idx != self.idx_blank and idx != self.idx_space:
+                    seq_true.append(self.char_list[idx])
+
+            hyp_chars = "".join(seq_hat)
+            ref_chars = "".join(seq_true)
+            if len(ref_chars) > 0:
+                cers.append(editdistance.eval(hyp_chars, ref_chars))
+                char_ref_lens.append(len(ref_chars))
+
+        cer_ctc = float(sum(cers)) / sum(char_ref_lens) if cers else None
+        return cer_ctc
+
+    def convert_to_char(self, ys_hat, ys_pad):
+        """Convert id sequences to character sequences.
+
+        :param paddle.Tensor ys_hat: prediction (batch, seqlen)
+        :param paddle.Tensor ys_pad: reference (batch, seqlen)
+        :return: list of prediction texts
+        :rtype: list
+        :return: list of reference texts
+        :rtype: list
+        """
+        seqs_hat, seqs_true = [], []
+        for i, y_hat in enumerate(ys_hat):
+            y_true = ys_pad[i]
+            eos_true = np.where(y_true == -1)[0]
+            ymax = eos_true[0] if len(eos_true) > 0 else len(y_true)
+            # NOTE: padding index (-1) in y_true is used to pad y_hat
+            seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]]
+            seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
+            seq_hat_text = "".join(seq_hat).replace(self.space, " ")
+            seq_hat_text = seq_hat_text.replace(self.blank, "")
+            seq_true_text = "".join(seq_true).replace(self.space, " ")
+            seqs_hat.append(seq_hat_text)
+            seqs_true.append(seq_true_text)
+        return seqs_hat, seqs_true
+
+    def calculate_cer(self, seqs_hat, seqs_true):
+        """Calculate sentence-level CER score.
+
+        :param list seqs_hat: prediction
+        :param list seqs_true: reference
+        :return: average sentence-level CER score
+        :rtype: float
+        """
+        char_eds, char_ref_lens = [], []
+        for i, seq_hat_text in enumerate(seqs_hat):
+            seq_true_text = seqs_true[i]
+            hyp_chars = seq_hat_text.replace(" ", "")
+            ref_chars = seq_true_text.replace(" ", "")
+            char_eds.append(editdistance.eval(hyp_chars, ref_chars))
+            char_ref_lens.append(len(ref_chars))
+        return float(sum(char_eds)) / sum(char_ref_lens)
+
+    def calculate_wer(self, seqs_hat, seqs_true):
+        """Calculate sentence-level WER score.
+
+        :param list seqs_hat: prediction
+        :param list seqs_true: reference
+        :return: average sentence-level WER score
+        :rtype: float
+        """
+        word_eds, word_ref_lens = [], []
+        for i, seq_hat_text in enumerate(seqs_hat):
+            seq_true_text = seqs_true[i]
+            hyp_words = seq_hat_text.split()
+            ref_words = seq_true_text.split()
+            word_eds.append(editdistance.eval(hyp_words, ref_words))
+            word_ref_lens.append(len(ref_words))
+        return float(sum(word_eds)) / sum(word_ref_lens)
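
A matching sketch for the CER/WER calculator (the toy vocabulary and -1 padding are again assumptions; both report flags default to False, in which case __call__ returns (None, None)):

    import numpy as np
    from deepspeech.utils.error_rate import ErrorCalculator

    char_list = ["<blank>", "<space>", "a", "b", "c"]  # hypothetical toy vocabulary
    calc = ErrorCalculator(char_list, sym_space="<space>", sym_blank="<blank>",
                           report_cer=True, report_wer=True)

    ys_hat = np.array([[2, 3, 1, 4, 4]])      # "ab cc"
    ys_pad = np.array([[2, 3, 1, 4, 3, -1]])  # "ab cb", -1-padded reference
    cer, wer = calc(ys_hat, ys_pad)
    # cer == 0.25 (one wrong char out of four, spaces excluded)
    # wer == 0.5  (one wrong word out of two)

    # CTC path: frame-level ids with blanks and repeats, collapsed via groupby;
    # note this branch returns a single float, not a (cer, wer) tuple
    ctc_hat = np.array([[2, 2, 0, 3, 0, 1, 4, 3, 0]])  # -> "abcb" once repeats,
                                                       # blanks, and spaces are dropped
    cer_ctc = calc(ctc_hat, ys_pad, is_ctc=True)       # 0.0 for an exact match

Both averages are edit-distance totals divided by total reference length over the mini-batch, so long utterances weigh proportionally more than short ones.
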