From caaa44e368f1e453b969ed96f7e7bc228cf0b624 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 16 Sep 2021 06:10:27 +0000 Subject: [PATCH 1/2] varbase getitem support np.longlong since paddle 2.2.0RC --- deepspeech/utils/ctc_utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 09543d48d..6201233df 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -86,15 +86,13 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha = paddle.zeros( (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero - # TODO(Hui Zhang): zeros not support paddle.int16 state_path = (paddle.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 + (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1 ) # state path, Tuple((T, 2L+1)) # init start state - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb - log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb for t in range(1, ctc_probs.size(0)): # T for s in range(len(y_insert_blank)): # 2L+1 @@ -110,13 +108,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha[t - 1, s - 2], ]) prev_state = [s, s - 1, s - 2] - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( - y_insert_blank[s])] + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][ + y_insert_blank[s]] state_path[t, s] = prev_state[paddle.argmax(candidates)] - # TODO(Hui Zhang): zeros not support paddle.int16 - state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32) + state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16) candidates = paddle.to_tensor([ log_alpha[-1, len(y_insert_blank) - 1], # Sb From 5a36615724ef35c9146be063a9e1e9e2d20e3e43 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 18 Sep 2021 03:07:52 +0000 Subject: [PATCH 2/2] VarBase.__getitem__ work for np.int64, np.longlong; but __setitem_varbase__ not support paddle.int16/set_value op not support --- deepspeech/exps/u2/model.py | 17 ++++++++--------- deepspeech/exps/u2_kaldi/model.py | 22 ++++++++++------------ deepspeech/exps/u2_st/model.py | 17 ++++++++--------- deepspeech/utils/ctc_utils.py | 8 +++++--- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 1328a1cb7..a7f4f14d9 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -568,26 +568,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. 
gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( @@ -595,7 +594,7 @@ class U2Tester(U2Trainer): text_grid.generate_textgrid( maxtime=second_per_example, intervals=tierformat, - output=textgrid_path) + output=str(textgrid_path)) def run_align(self): self.resume_or_scratch() diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 3d15e0259..1dbdfef85 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -545,9 +545,8 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - stride_ms = self.config.collater.stride_ms - token_dict = self.args.char_list - + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list with open(self.args.result_file, 'w') as fout: # one example in batch for i, batch in enumerate(self.align_loader): @@ -564,26 +563,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. 
/ stride_ms)  # 25ms window, 10ms stride
         second_per_example = (
@@ -591,7 +589,7 @@ class U2Tester(U2Trainer):
         text_grid.generate_textgrid(
             maxtime=second_per_example,
             intervals=tierformat,
-            output=textgrid_path)
+            output=str(textgrid_path))

     def run_align(self):
         self.resume_or_scratch()
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index 91a81503f..364070d23 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -595,26 +595,25 @@ class U2STTester(U2STTrainer):
         ctc_probs = ctc_probs.squeeze(0)
         target = target.squeeze(0)
         alignment = ctc_utils.forced_align(ctc_probs, target)
-        logger.info("align ids", key[0], alignment)
+        logger.info(f"align ids: {key[0]} {alignment}")
         fout.write('{} {}\n'.format(key[0], alignment))

         # 3. gen praat
         # segment alignment
         align_segs = text_grid.segment_alignment(alignment)
-        logger.info("align tokens", key[0], align_segs)
+        logger.info(f"align tokens: {key[0]}, {align_segs}")
         # IntervalTier, List["start end token\n"]
         subsample = utility.get_subsample(self.config)
         tierformat = text_grid.align_to_tierformat(
             align_segs, subsample, token_dict)
         # write tier
-        align_output_path = os.path.join(
-            os.path.dirname(self.args.result_file), "align")
-        tier_path = os.path.join(align_output_path, key[0] + ".tier")
-        with open(tier_path, 'w') as f:
+        align_output_path = Path(self.args.result_file).parent / "align"
+        align_output_path.mkdir(parents=True, exist_ok=True)
+        tier_path = align_output_path / (key[0] + ".tier")
+        with tier_path.open('w') as f:
             f.writelines(tierformat)
         # write textgrid
-        textgrid_path = os.path.join(align_output_path,
-                                     key[0] + ".TextGrid")
+        textgrid_path = align_output_path / (key[0] + ".TextGrid")
         second_per_frame = 1. / (1000. / stride_ms)  # 25ms window, 10ms stride
         second_per_example = (
@@ -622,7 +621,7 @@ class U2STTester(U2STTrainer):
         text_grid.generate_textgrid(
             maxtime=second_per_example,
             intervals=tierformat,
-            output=textgrid_path)
+            output=str(textgrid_path))

     def run_align(self):
         self.resume_or_scratch()
diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py
index 6201233df..9f2271814 100644
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -86,8 +86,10 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     log_alpha = paddle.zeros(
         (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
     log_alpha = log_alpha - float('inf')  # log of zero
+
+    # self.__setitem_varbase__(item, value): assigning a value to a paddle.Tensor does not support paddle.int16, so use int32
     state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int16) - 1
+        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
     )  # state path, Tuple((T, 2L+1))

     # init start state
@@ -111,8 +113,8 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
         log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
             y_insert_blank[s]]
         state_path[t, s] = prev_state[paddle.argmax(candidates)]
-
-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int16)
+    # self.__setitem_varbase__(item, value): assigning a value to a paddle.Tensor does not support paddle.int16, so use int32
+    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)

     candidates = paddle.to_tensor([
         log_alpha[-1, len(y_insert_blank) - 1],  # Sb
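
For readers unfamiliar with the two Paddle limitations these patches work around, below is a minimal standalone sketch, not part of either patch, that illustrates them. It assumes PaddlePaddle >= 2.2.0rc is installed; the variable names mirror the patched code, but the tensor shapes and values are arbitrary examples.

# Minimal illustration (not from the patches) of the two dtype issues above.
# Assumes PaddlePaddle >= 2.2.0rc; shapes and values are arbitrary examples.
import numpy as np
import paddle

T, S = 4, 3  # e.g. number of frames and alignment states

# 1. Since Paddle 2.2, Tensor.__getitem__ accepts numpy integer indices
#    (np.int64 / np.longlong), so the explicit int(...) casts can be dropped.
ctc_probs = paddle.rand((T, 5))
idx = np.int64(2)
prob = ctc_probs[0][idx]  # older releases required ctc_probs[0][int(idx)]

# 2. Assigning into a tensor (__setitem_varbase__ / the set_value op) does not
#    support paddle.int16, so buffers written element-by-element stay int32.
state_path = paddle.zeros((T, S), dtype=paddle.int32) - 1
state_path[1, 2] = 0  # this item assignment would fail on an int16 tensor

print(prob.numpy(), state_path.numpy())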