diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 7095ed749..ce3d17cc2 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -587,26 +587,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( @@ -614,7 +613,7 @@ class U2Tester(U2Trainer): text_grid.generate_textgrid( maxtime=second_per_example, intervals=tierformat, - output=textgrid_path) + output=str(textgrid_path)) def run_align(self): self.resume_or_scratch() diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index c39bfe31d..116ab2808 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -546,9 +546,8 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - stride_ms = self.config.collater.stride_ms - token_dict = self.args.char_list - + stride_ms = self.align_loader.collate_fn.stride_ms + token_dict = self.align_loader.collate_fn.vocab_list with open(self.args.result_file, 'w') as fout: # one example in batch for i, batch in enumerate(self.align_loader): @@ -565,26 +564,25 @@ class U2Tester(U2Trainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( @@ -592,7 +590,7 @@ class U2Tester(U2Trainer): text_grid.generate_textgrid( maxtime=second_per_example, intervals=tierformat, - output=textgrid_path) + output=str(textgrid_path)) def run_align(self): self.resume_or_scratch() diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 6c6e5243a..eb84d6f11 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -596,26 +596,25 @@ class U2STTester(U2STTrainer): ctc_probs = ctc_probs.squeeze(0) target = target.squeeze(0) alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info("align ids", key[0], alignment) + logger.info(f"align ids: {key[0]} {alignment}") fout.write('{} {}\n'.format(key[0], alignment)) # 3. gen praat # segment alignment align_segs = text_grid.segment_alignment(alignment) - logger.info("align tokens", key[0], align_segs) + logger.info(f"align tokens: {key[0]}, {align_segs}") # IntervalTier, List["start end token\n"] subsample = utility.get_subsample(self.config) tierformat = text_grid.align_to_tierformat( align_segs, subsample, token_dict) # write tier - align_output_path = os.path.join( - os.path.dirname(self.args.result_file), "align") - tier_path = os.path.join(align_output_path, key[0] + ".tier") - with open(tier_path, 'w') as f: + align_output_path = Path(self.args.result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: f.writelines(tierformat) # write textgrid - textgrid_path = os.path.join(align_output_path, - key[0] + ".TextGrid") + textgrid_path = align_output_path / (key[0] + ".TextGrid") second_per_frame = 1. / (1000. / stride_ms) # 25ms window, 10ms stride second_per_example = ( @@ -623,7 +622,7 @@ class U2STTester(U2STTrainer): text_grid.generate_textgrid( maxtime=second_per_example, intervals=tierformat, - output=textgrid_path) + output=str(textgrid_path)) def run_align(self): self.resume_or_scratch() diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 2639f3064..fc43a71f0 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -86,15 +86,15 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha = paddle.zeros( (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero - # TODO(Hui Zhang): zeros not support paddle.int16 + + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 state_path = (paddle.zeros( (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1 ) # state path, Tuple((T, 2L+1)) # init start state - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb - log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb for t in range(1, ctc_probs.shape[0]): # T for s in range(len(y_insert_blank)): # 2L+1 @@ -110,12 +110,10 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha[t - 1, s - 2], ]) prev_state = [s, s - 1, s - 2] - # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 - log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( - y_insert_blank[s])] + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][ + y_insert_blank[s]] state_path[t, s] = prev_state[paddle.argmax(candidates)] - - # TODO(Hui Zhang): zeros not support paddle.int16 + # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32) candidates = paddle.to_tensor([