From caaa44e368f1e453b969ed96f7e7bc228cf0b624 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 16 Sep 2021 06:10:27 +0000 Subject: [PATCH 1/2] varbase getitem support np.longlong since paddle 2.2.0RC --- deepspeech/utils/ctc_utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 09543d48d..6201233df 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -86,15 +86,13 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha = paddle.zeros( (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero - # TODO(Hui Zhang): zeros not support paddle.int16 state_path = (paddle.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 + (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1 ) # state path, Tuple((T, 2L+1)) # init start state - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb - log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb for t in range(1, ctc_probs.size(0)): # T for s in range(len(y_insert_blank)): # 2L+1 @@ -110,13 +108,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha[t - 1, s - 2], ]) prev_state = [s, s - 1, s - 2] - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( - y_insert_blank[s])] + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][ + y_insert_blank[s]] state_path[t, s] = prev_state[paddle.argmax(candidates)] - # TODO(Hui Zhang): zeros not support paddle.int16 - state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32) + state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16) candidates = paddle.to_tensor([ log_alpha[-1, len(y_insert_blank) - 1], # Sb From 5a36615724ef35c9146be063a9e1e9e2d20e3e43 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 18 Sep 2021 03:07:52 +0000 Subject: [PATCH 2/2] VarBase.__getitem__ work for np.int64, np.longlong; but __setitem_varbase__ not support paddle.int16/set_value op not support --- deepspeech/exps/u2/model.py | 17 ++++++++--------- deepspeech/exps/u2_kaldi/model.py | 22 ++++++++++------------ deepspeech/exps/u2_st/model.py | 17 ++++++++--------- deepspeech/utils/ctc_utils.py | 8 +++++--- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 1328a1cb7..a7f4f14d9 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -568,26 +568,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. 
gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( @@ -595,7 +594,7 @@ class U2Tester(U2Trainer): text_grid.generate_textgrid( maxtime=second_per_example, intervals=tierformat, - output=textgrid_path) + output=str(textgrid_path)) def run_align(self): self.resume_or_scratch() diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 3d15e0259..1dbdfef85 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -545,9 +545,8 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - stride_ms = self.config.collater.stride_ms - token_dict = self.args.char_list - + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list with open(self.args.result_file, 'w') as fout: # one example in batch for i, batch in enumerate(self.align_loader): @@ -564,26 +563,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. 
/ stride_ms)  # 25ms window, 10ms stride
         second_per_example = (
@@ -591,7 +589,7 @@ class U2Tester(U2Trainer):
         text_grid.generate_textgrid(
             maxtime=second_per_example,
             intervals=tierformat,
-            output=textgrid_path)
+            output=str(textgrid_path))

     def run_align(self):
         self.resume_or_scratch()
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index 91a81503f..364070d23 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -595,26 +595,25 @@ class U2STTester(U2STTrainer):
         ctc_probs = ctc_probs.squeeze(0)
         target = target.squeeze(0)
         alignment = ctc_utils.forced_align(ctc_probs, target)
-        logger.info("align ids", key[0], alignment)
+        logger.info(f"align ids: {key[0]} {alignment}")
         fout.write('{} {}\n'.format(key[0], alignment))

         # 3. gen praat
         # segment alignment
         align_segs = text_grid.segment_alignment(alignment)
-        logger.info("align tokens", key[0], align_segs)
+        logger.info(f"align tokens: {key[0]}, {align_segs}")
         # IntervalTier, List["start end token\n"]
         subsample = utility.get_subsample(self.config)
         tierformat = text_grid.align_to_tierformat(
             align_segs, subsample, token_dict)
         # write tier
-        align_output_path = os.path.join(
-            os.path.dirname(self.args.result_file), "align")
-        tier_path = os.path.join(align_output_path, key[0] + ".tier")
-        with open(tier_path, 'w') as f:
+        align_output_path = Path(self.args.result_file).parent / "align"
+        align_output_path.mkdir(parents=True, exist_ok=True)
+        tier_path = align_output_path / (key[0] + ".tier")
+        with tier_path.open('w') as f:
             f.writelines(tierformat)
         # write textgrid
-        textgrid_path = os.path.join(align_output_path,
-                                     key[0] + ".TextGrid")
+        textgrid_path = align_output_path / (key[0] + ".TextGrid")
         second_per_frame = 1. / (1000. / stride_ms)  # 25ms window, 10ms stride
         second_per_example = (
@@ -622,7 +621,7 @@ class U2STTester(U2STTrainer):
         text_grid.generate_textgrid(
             maxtime=second_per_example,
             intervals=tierformat,
-            output=textgrid_path)
+            output=str(textgrid_path))

     def run_align(self):
         self.resume_or_scratch()
diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py
index 6201233df..9f2271814 100644
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -86,8 +86,10 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     log_alpha = paddle.zeros(
         (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
     log_alpha = log_alpha - float('inf')  # log of zero
+
+    # self.__setitem_varbase__(item, value): assigning a value to a paddle.Tensor does not support paddle.int16, so use int32
     state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1
+        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
     )  # state path, Tuple((T, 2L+1))

     # init start state
@@ -111,8 +113,8 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
         log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
             y_insert_blank[s]]
         state_path[t, s] = prev_state[paddle.argmax(candidates)]
-
-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16)
+    # self.__setitem_varbase__(item, value): assigning a value to a paddle.Tensor does not support paddle.int16, so use int32
+    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)

     candidates = paddle.to_tensor([
         log_alpha[-1, len(y_insert_blank) - 1],  # Sb
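
For readers unfamiliar with the two Paddle limitations these patches work around, below is a minimal standalone sketch, not part of either patch, that illustrates them. It assumes PaddlePaddle >= 2.2.0rc is installed; the variable names mirror the patched code, but the tensor shapes and values are arbitrary examples.

# Minimal illustration (not from the patches) of the two dtype issues above.
# Assumes PaddlePaddle >= 2.2.0rc; shapes and values are arbitrary examples.
import numpy as np
import paddle

T, S = 4, 3  # e.g. number of frames and alignment states

# 1. Since Paddle 2.2, Tensor.__getitem__ accepts numpy integer indices
#    (np.int64 / np.longlong), so the explicit int(...) casts can be dropped.
ctc_probs = paddle.rand((T, 5))
idx = np.int64(2)
prob = ctc_probs[0][idx]  # older releases required ctc_probs[0][int(idx)]

# 2. Assigning into a tensor (__setitem_varbase__ / the set_value op) does not
#    support paddle.int16, so buffers written element-by-element stay int32.
state_path = paddle.zeros((T, S), dtype=paddle.int32) - 1
state_path[1, 2] = 0  # this item assignment would fail on an int16 tensor

print(prob.numpy(), state_path.numpy())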