From d64f6e9ea556d15b7dfec7c0b0f247604db0b1c4 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Mon, 1 Nov 2021 05:44:50 +0000
Subject: [PATCH] Add the feature: calculating the perplexity of transformerLM

---
 deepspeech/exps/lm/transformer/__init__.py    |  13 ++
 .../lm/transformer/bin/cacu_perplexity.py     |  82 +++++++++++
 .../exps/lm/transformer/lm_cacu_perplexity.py | 132 ++++++++++++++++++
 .../frontend/featurizer/text_featurizer.py    |   4 +-
 deepspeech/io/collator.py                     |  40 +++++-
 deepspeech/io/dataset.py                      |  19 +++
 deepspeech/models/lm/transformer.py           |   7 +-
 .../librispeech/s2/conf/lm/transformer.yaml   |   8 ++
 .../librispeech/s2/local/cacu_perplexity.sh   |  53 +++++++
 examples/librispeech/s2/run.sh                |   4 +
 10 files changed, 357 insertions(+), 5 deletions(-)
 create mode 100644 deepspeech/exps/lm/transformer/__init__.py
 create mode 100644 deepspeech/exps/lm/transformer/bin/cacu_perplexity.py
 create mode 100644 deepspeech/exps/lm/transformer/lm_cacu_perplexity.py
 create mode 100755 examples/librispeech/s2/local/cacu_perplexity.sh

diff --git a/deepspeech/exps/lm/transformer/__init__.py b/deepspeech/exps/lm/transformer/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/deepspeech/exps/lm/transformer/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/deepspeech/exps/lm/transformer/bin/cacu_perplexity.py b/deepspeech/exps/lm/transformer/bin/cacu_perplexity.py
new file mode 100644
index 00000000..29a880f8
--- /dev/null
+++ b/deepspeech/exps/lm/transformer/bin/cacu_perplexity.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+import configargparse
+
+
+def get_parser():
+    """Build the argument parser of the perplexity command-line tool.
+
+    Config files, if any are used, are parsed as YAML and ``--help`` shows
+    each option's default value.
+
+    Options registered, in this order:
+        --rnnlm, --rnnlm-conf: model file and model config file.
+        --vocab_path, --bpeprefix: vocabulary file and bpeprefix path.
+        --text_path: text file used for testing.
+        --ngpu: number of GPUs, 0 selects the CPU.
+        --dtype: one of float16, float32 (default) or float64.
+        --output_dir: directory receiving the sentence PPL files.
+
+    Returns:
+        configargparse.ArgumentParser: the configured parser.
+    """
+    parser = configargparse.ArgumentParser(
+        description="The parser for caculating the perplexity of transformer language model ",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter, )
+
+    # String-valued options that all default to None, as (flag, help) pairs.
+    optional_strings = (
+        ("--rnnlm", "RNNLM model file to read"),
+        ("--rnnlm-conf", "RNNLM model config file to read"),
+        ("--vocab_path", "vocab path to for token2id"),
+        ("--bpeprefix", "The path of bpeprefix for loading"),
+        ("--text_path", "The path of text file for testing "), )
+    for flag, help_text in optional_strings:
+        parser.add_argument(flag, type=str, default=None, help=help_text)
+
+    # Remaining options carry their own types and defaults.
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=0,
+        help="The number of gpu to use, 0 for using cpu instead")
+
+    # ``choices`` restricts the accepted strings; no ``type`` is applied.
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=("float16", "float32", "float64"),
+        help="Float precision (only available in --api v2)", )
+
+    parser.add_argument(
+        "--output_dir",
+        default=".",
+        type=str,
+        help="The output directory to store the sentence PPL")
+
+    return parser
+
+
+def main(args):
+    """Parse ``args`` and run the perplexity computation on the result."""
+    parsed = get_parser().parse_args(args)
+    from deepspeech.exps.lm.transformer.lm_cacu_perplexity import run_get_perplexity
+    run_get_perplexity(parsed)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/deepspeech/exps/lm/transformer/lm_cacu_perplexity.py b/deepspeech/exps/lm/transformer/lm_cacu_perplexity.py
new file mode 100644
index 00000000..b63bcd08
--- /dev/null
+++ b/deepspeech/exps/lm/transformer/lm_cacu_perplexity.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Calculating the PPL of the LM model
+import os
+
+import numpy as np
+import paddle
+from paddle.io import DataLoader
+from yacs.config import CfgNode
+
+from deepspeech.io.collator import TextCollatorSpm
+from deepspeech.io.dataset import TextDataset
+from deepspeech.models.lm_interface import dynamic_import_lm
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def get_config(config_path):
+    cfg = CfgNode(new_allowed=True)  # new_allowed: keys need no prior declaration
+    cfg.merge_from_file(config_path)
+    return cfg
+
+
+def load_trained_lm(args):
+    """Build the LM named in ``args.rnnlm_conf`` and load ``args.rnnlm`` into it."""
+    lm_config = get_config(args.rnnlm_conf)
+    lm_class = dynamic_import_lm(lm_config.model_module)
+    lm = lm_class(**lm_config.model)
+    # Restore the saved parameters from the model file.
+    lm.set_state_dict(paddle.load(args.rnnlm))
+    return lm, lm_config
+
+
+def write_dict_into_file(ppl_dict, name):
+    """Write one ``"<key> <value>"`` line per entry of ``ppl_dict`` to ``name``."""
+    with open(name, "w", encoding="utf-8") as f:  # do not depend on the locale
+        for key, value in ppl_dict.items():
+            f.write(f"{key} {value}\n")
+
+
+def cacu_perplexity(
+        lm_model,
+        lm_config,
+        args,
+        log_base=None, ):
+    """Compute the perplexity of ``lm_model`` on the text in ``args.text_path``.
+
+    Per-utterance perplexities and token counts are also written to the
+    files ``uttPPL`` and ``uttLEN`` inside ``args.output_dir``.
+
+    Args:
+        lm_model: model whose ``forward(input, target)`` returns five values,
+            the last two being per-utterance NLL sums and token counts.
+        lm_config: config with data.unit_type, decoding.batch_size/num_workers.
+        args: namespace with text_path, vocab_path, bpeprefix, output_dir.
+        log_base: base in which the perplexity is expressed; None means e.
+
+    Returns:
+        tuple: ``(ppl, log_base)`` with the corpus-level perplexity and the
+        base that was used (``np.e`` when ``log_base`` is None).
+    """
+    def to_ppl(nll_sum, ntokens):
+        # Perplexity of ``ntokens`` tokens whose summed NLL is ``nll_sum``.
+        if log_base is None:
+            return np.exp(nll_sum / ntokens)
+        return log_base**(nll_sum / ntokens / np.log(log_base))
+
+    collate_fn_text = TextCollatorSpm(
+        unit_type=lm_config.data.unit_type,
+        vocab_filepath=args.vocab_path,
+        spm_model_prefix=args.bpeprefix)
+    data_loader = DataLoader(
+        TextDataset.from_file(args.text_path),
+        batch_size=lm_config.decoding.batch_size,
+        collate_fn=collate_fn_text,
+        num_workers=lm_config.decoding.num_workers)
+
+    total_nll = 0.0
+    total_ntokens = 0
+    ppl_dict = {}
+    len_dict = {}
+    logger.info("start caculating PPL......")
+    for keys, ys_input_pad, ys_output_pad, _ in data_loader():
+        # Pure evaluation: no_grad keeps paddle from recording gradients.
+        with paddle.no_grad():
+            _, _, _, nll, nll_count = lm_model.forward(
+                paddle.to_tensor(ys_input_pad),
+                paddle.to_tensor(ys_output_pad))
+        nll = nll.numpy()
+        nll_count = nll_count.numpy()
+        # Keep per-utterance values for debugging or analysis.
+        for key, utt_nll, ntoken in zip(keys, nll, nll_count):
+            ppl_dict[key] = str(to_ppl(utt_nll, ntoken))
+            len_dict[key] = str(ntoken)
+        total_nll += nll.sum()
+        total_ntokens += nll_count.sum()
+        logger.info("Current total nll: " + str(total_nll))
+        logger.info("Current total tokens: " + str(total_ntokens))
+    write_dict_into_file(ppl_dict, os.path.join(args.output_dir, "uttPPL"))
+    write_dict_into_file(len_dict, os.path.join(args.output_dir, "uttLEN"))
+    ppl = to_ppl(total_nll, total_ntokens)
+    return ppl, (np.e if log_base is None else log_base)
+
+
+def run_get_perplexity(args):
+    """Load the LM described by ``args``, compute its perplexity and log it."""
+    if args.ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+    # Exactly one GPU selects "gpu:0"; every other remaining value runs on CPU.
+    device = "gpu:0" if args.ngpu == 1 else "cpu"
+    paddle.set_device(device)
+    dtype = getattr(paddle, args.dtype)
+    logger.info(f"Decoding device={device}, dtype={dtype}")
+
+    lm_model, lm_config = load_trained_lm(args)
+    lm_model.to(device=device, dtype=dtype)
+    lm_model.eval()
+    ppl, base = cacu_perplexity(lm_model, lm_config, args, None)
+    logger.info("Final PPL: " + str(ppl))
+    logger.info("The log base is:" + str("%.2f" % base))
diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py
index a6834ebc..c596bd43 100644
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -53,7 +53,7 @@ class TextFeaturizer():
         self.maskctc = maskctc
 
         if vocab_filepath:
-            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
+            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
                 vocab_filepath, maskctc)
             self.vocab_size = len(self.vocab_list)
 
@@ -227,4 +227,4 @@ class TextFeaturizer():
         logger.info(f"SOS id: {sos_id}")
         logger.info(f"SPACE id: {space_id}")
         logger.info(f"MASKCTC id: {maskctc_id}")
-        return token2id, id2token, vocab_list, unk_id, eos_id
+        return token2id, id2token, vocab_list, unk_id, eos_id, blank_id
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index b523dfc8..5391260e 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -19,6 +19,7 @@ from yacs.config import CfgNode
 
 from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
@@ -33,7 +34,7 @@ logger = Log(__name__).getlog()
 
 
 def _tokenids(text, keep_transcription_text):
-    # for training text is token ids 
+    # for training text is token ids
     tokens = text  # token ids
 
     if keep_transcription_text:
@@ -45,6 +46,43 @@ def _tokenids(text, keep_transcription_text):
     return tokens
 
 
+class TextCollatorSpm():
+    """Collate raw ``"<key> <text>"`` lines into padded input/target id arrays."""
+    def __init__(self, unit_type, vocab_filepath, spm_model_prefix):
+        assert (vocab_filepath is not None)
+        self.text_featurizer = TextFeaturizer(
+            unit_type=unit_type,
+            vocab_filepath=vocab_filepath,
+            spm_model_prefix=spm_model_prefix)
+        self.eos_id = self.text_featurizer.eos_id
+        # blank_id is also the value used for padding in __call__.
+        self.blank_id = self.text_featurizer.blank_id
+
+    def __call__(self, batch):
+        """Tokenize a batch of lines and pad them to a common length.
+
+        Args:
+            batch: iterable of strings, each of the form ``"<key> <text>"``.
+
+        Returns:
+            (keys, ys_input_pad, ys_output_pad, y_lens): the list of keys,
+            the eos-prefixed ids [B, T], the eos-suffixed ids [B, T] (both
+            padded with ``blank_id``) and the lengths [B], all int64 arrays.
+        """
+        keys, inputs, targets, lengths = [], [], [], []
+        for line in batch:
+            # Everything before the first space is the key; the rest is text.
+            head, _, text = line.partition(" ")
+            ids = self.text_featurizer.featurize(text)
+            keys.append(head.strip())
+            inputs.append(np.array([self.eos_id] + ids).astype(np.int64))
+            targets.append(np.array(ids + [self.eos_id]).astype(np.int64))
+            lengths.append(len(ids) + 1)
+        ys_input_pad = pad_list(inputs, self.blank_id).astype(np.int64)
+        ys_output_pad = pad_list(targets, self.blank_id).astype(np.int64)
+        return keys, ys_input_pad, ys_output_pad, np.array(lengths).astype(np.int64)
+
+
 class SpeechCollatorBase():
     def __init__(
             self,
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 1945c5f7..7c101002 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -24,6 +24,25 @@ __all__ = ["ManifestDataset", "TransformDataset"]
 logger = Log(__name__).getlog()
 
 
+class TextDataset(Dataset):
+    """Dataset holding the whitespace-stripped lines of one text file."""
+    @classmethod
+    def from_file(cls, file_path):
+        """Alternate constructor: build the dataset from ``file_path``."""
+        return cls(file_path)
+
+    def __init__(self, file_path):
+        with open(file_path) as f:
+            # One item per line of the file, in file order.
+            self._manifest = [line.strip() for line in f]
+
+    def __len__(self):
+        return len(self._manifest)
+
+    def __getitem__(self, idx):
+        return self._manifest[idx]
+
+
 class ManifestDataset(Dataset):
     @classmethod
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
diff --git a/deepspeech/models/lm/transformer.py b/deepspeech/models/lm/transformer.py
index 35ecf678..19e2b758 100644
--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
@@ -111,6 +111,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
 
         """
+        batch_size = x.size(0)
         xm = x != 0
         xlen = xm.sum(axis=1)
         if self.embed_drop is not None:
@@ -121,11 +122,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         y = self.decoder(h)
         loss = F.cross_entropy(
             y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
-        mask = xm.to(dtype=loss.dtype)
+        mask = xm.to(loss.dtype)
         logp = loss * mask.view(-1)
+        nll = logp.view(batch_size, -1).sum(-1)
+        nll_count = mask.sum(-1)
         logp = logp.sum()
         count = mask.sum()
-        return logp / count, logp, count
+        return logp / count, logp, count, nll, nll_count
 
     # beam search API (see ScorerInterface)
     def score(self, y: paddle.Tensor, state: Any,
diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/s2/conf/lm/transformer.yaml
index 4349f795..826f0802 100644
--- a/examples/librispeech/s2/conf/lm/transformer.yaml
+++ b/examples/librispeech/s2/conf/lm/transformer.yaml
@@ -1,4 +1,8 @@
 model_module: transformer
+
+data:
+    unit_type: spm
+
 model:
     n_vocab: 5002
     pos_enc: null
@@ -11,3 +15,7 @@ model:
     emb_dropout_rate: 0.0
     att_dropout_rate: 0.0
     tie_weights: False 
+
+decoding:
+    batch_size: 30
+    num_workers: 2
diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/s2/local/cacu_perplexity.sh
new file mode 100755
index 00000000..a77a6de3
--- /dev/null
+++ b/examples/librispeech/s2/local/cacu_perplexity.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Compute the transformer LM perplexity on the lower-cased test_clean text.
+set -e
+
+stage=-1
+stop_stage=100
+
+expdir=exp
+datadir=data
+
+ngpu=0
+
+# lm params
+rnnlm_config_path=conf/lm/transformer.yaml
+lmexpdir=exp/lm/transformer
+lang_model=transformerLM.pdparams
+
+# data path
+test_set=${datadir}/test_clean/text
+test_set_lower=${datadir}/test_clean/text_lower
+train_set=train_960
+
+# bpemode (unigram or bpe)
+nbpe=5000
+bpemode=unigram
+bpeprefix=${datadir}/lang_char/${train_set}_${bpemode}${nbpe}
+bpemodel=${bpeprefix}.model
+
+vocabfile=${bpeprefix}_units.txt
+vocabfile_lower=${bpeprefix}_units_lower.txt
+
+output_dir=${expdir}/lm/transformer/perplexity
+
+mkdir -p "${output_dir}"
+
+# Convert the vocabulary and the test text from upper case to lower case
+if [ -f "${vocabfile}" ]; then
+    tr A-Z a-z < "${vocabfile}" > "${vocabfile_lower}"
+fi
+
+if [ -f "${test_set}" ]; then
+    tr A-Z a-z < "${test_set}" > "${test_set_lower}"
+fi
+
+python "${LM_BIN_DIR:?LM_BIN_DIR is not set}/cacu_perplexity.py" \
+    --rnnlm "${lmexpdir}/${lang_model}" \
+    --rnnlm-conf "${rnnlm_config_path}" \
+    --vocab_path "${vocabfile_lower}" \
+    --bpeprefix "${bpeprefix}" \
+    --text_path "${test_set_lower}" \
+    --output_dir "${output_dir}" \
+    --ngpu "${ngpu}"
+
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh
index 146f133d..e014c2a9 100755
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@@ -51,3 +51,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    CUDA_VISIBLE_DEVICES= ./local/cacu_perplexity.sh || exit -1  # stage 7: LM perplexity, run with CUDA_VISIBLE_DEVICES empty
+fi