PaddleSpeech/deepspeech/utils/text_grid.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from typing import List
from typing import Text

import textgrid


def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:
    """Segment ctc alignment ids by continuous blank and repeat label.

    Each segment holds the run of blanks that precedes a token followed by
    the (possibly repeated) token itself. Blanks trailing the final token are
    appended to the last segment, so concatenating all segments always
    reproduces ``alignment``.

    Args:
        alignment (List[int]): ctc alignment id sequence.
            e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[List[int]]: token align, segment alignment id sequence.
            e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]
            An empty alignment yields ``[]``; an alignment made only of
            blanks yields a single all-blank segment.
    """
    align_segs = []
    num_frames = len(alignment)
    # [start, end) is the frame window of the segment currently being built
    start = 0
    end = 0
    while end < num_frames:
        # blanks in front of a token belong to that token's segment
        while end < num_frames and alignment[end] == blank_id:
            end += 1
        if end == num_frames:
            # only blanks are left: no token follows that could own them
            trailing = alignment[start:]
            if align_segs:
                align_segs[-1].extend(trailing)
            else:
                # the whole alignment is blank; keep it as one segment
                # instead of indexing into an empty list
                align_segs.append(trailing)
            break
        # consume the token itself ...
        end += 1
        # ... and every immediate repetition of it
        while end < num_frames and alignment[end] == alignment[end - 1]:
            end += 1
        align_segs.append(alignment[start:end])
        start = end
    return align_segs


def align_to_tierformat(align_segs: List[List[int]],
                        subsample: int,
                        token_dict: Dict[int, Text],
                        blank_id=0) -> List[Text]:
    """Generate textgrid.Interval format from alignment segmentations.

    Every segment becomes one "start end text" line. A segment lasts
    ``len(segment) * subsample`` hops of 10ms, and segments are laid out
    back to back starting at 0. Each line is also printed to stdout.

    Args:
        align_segs (List[List[int]]): segmented ctc alignment ids.
        subsample (int): number of 10ms hops covered by one alignment frame.
        token_dict (Dict[int, Text]): int -> str map.
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[Text]: list of textgrid.Interval text, "start end text\\n" per
            segment, with times in seconds formatted to two decimals.
    """
    hop_length = 10  # ms
    second_ms = 1000  # ms
    frame_per_second = second_ms / hop_length
    second_per_frame = 1.0 / frame_per_second

    last_idx = len(align_segs) - 1
    tierformat = []
    begin = 0

    for idx, tokens in enumerate(align_segs):
        num_frames = len(tokens)
        # the label normally sits at the end of its segment
        token = tokens[-1]
        # time duration in second
        duration = num_frames * subsample * second_per_frame
        if idx == last_idx:
            # the last segment may end with blanks, so take its first
            # non-blank id; fall back to the final id when all are blank
            token = next((tok for tok in tokens if tok != blank_id), token)
        entry = f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}"
        print(entry)
        tierformat.append(entry + "\n")
        begin = begin + duration

    return tierformat


def generate_textgrid(maxtime: float,
                      intervals: List[Text],
                      output: Text,
                      name: Text='ali') -> None:
    """Create alignment textgrid file.

    Builds a TextGrid with a single interval tier holding one interval per
    item of ``intervals`` and writes it to ``output``. Progress messages are
    printed to stdout.

    Args:
        maxtime (float): audio duration.
        intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item.
            Everything after the second field is used as the interval mark,
            so a mark may itself contain whitespace.
        output (Text): textgrid filepath.
        name (Text, optional): tier or layer name. Defaults to 'ali'.

    Raises:
        ValueError: if an item has fewer than three whitespace separated
            fields or its times cannot be parsed as float.
    """
    # Download Praat: https://www.fon.hum.uva.nl/praat/
    avg_interval = maxtime / (len(intervals) + 1)
    print(f"average second/token: {avg_interval}")
    # small offset added to every start time
    # NOTE(review): presumably keeps an interval from starting exactly at the
    # end of the previous one - confirm against textgrid's overlap check
    margin = 0.0001

    tg = textgrid.TextGrid(maxTime=maxtime)
    tier = textgrid.IntervalTier(name=name, maxTime=maxtime)

    for dur in intervals:
        # strip the trailing newline first: with maxsplit the last field
        # would otherwise keep it
        s, e, text = dur.strip().split(maxsplit=2)
        tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text)

    tg.append(tier)

    tg.write(output)
    print("successfully generator textgrid {}.".format(output))
add align code 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`from typing import Dict`
			`from typing import List`
			`from typing import Text`

			`import textgrid`


			`def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:`
			`"""segment ctc alignment ids by continuous blank and repeat label.`

			`Args:`
more comment; fix datapipe of align 3 years ago			`alignment (List[int]): ctc alignment id sequence.`
			`e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]`
add align code 4 years ago			`blank_id (int, optional): blank id. Defaults to 0.`

			`Returns:`
more comment; fix datapipe of align 3 years ago			`List[List[int]]: token align, segment aligment id sequence.`
			`e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]`
add align code 4 years ago			`"""`
			`# convert alignment to a praat format, which is a doing phonetics`
			`# by computer and helps analyzing alignment`
			`align_segs = []`
			`# get frames level duration for each token`
			`start = 0`
			`end = 0`
			`while end < len(alignment):`
			`while end < len(alignment) and alignment[end] == blank_id: # blank`
			`end += 1`
			`if end == len(alignment):`
			`align_segs[-1].extend(alignment[start:])`
			`break`
			`end += 1`
			`while end < len(alignment) and alignment[end - 1] == alignment[`
			`end]: # repeat label`
			`end += 1`
			`align_segs.append(alignment[start:end])`
			`start = end`
			`return align_segs`


			`def align_to_tierformat(align_segs: List[List[int]],`
			`subsample: int,`
			`token_dict: Dict[int, Text],`
			`blank_id=0) -> List[Text]:`
			`"""Generate textgrid.Interval format from alignment segmentations.`

			`Args:`
			`align_segs (List[List[int]]): segmented ctc alignment ids.`
			`subsample (int): 25ms frame_length, 10ms hop_length, 1/subsample`
			`token_dict (Dict[int, Text]): int -> str map.`

			`Returns:`
more comment; fix datapipe of align 3 years ago			`List[Text]: list of textgrid.Interval text, str(start, end, text).`
add align code 4 years ago			`"""`
			`hop_length = 10 # ms`
			`second_ms = 1000 # ms`
			`frame_per_second = second_ms / hop_length # 25ms frame_length, 10ms hop_length`
			`second_per_frame = 1.0 / frame_per_second`

			`begin = 0`
			`duration = 0`
			`tierformat = []`

			`for idx, tokens in enumerate(align_segs):`
			`token_len = len(tokens)`
			`token = tokens[-1]`
			`# time duration in second`
			`duration = token_len * subsample * second_per_frame`
			`if idx < len(align_segs) - 1:`
			`print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")`
			`tierformat.append(`
			`f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")`
			`else:`
			`for i in tokens:`
			`if i != blank_id:`
			`token = i`
			`break`
			`print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")`
			`tierformat.append(`
			`f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")`
			`begin = begin + duration`

			`return tierformat`


			`def generate_textgrid(maxtime: float,`
			`intervals: List[Text],`
			`output: Text,`
			`name: Text='ali') -> None:`
			`"""Create alignment textgrid file.`

			`Args:`
			`maxtime (float): audio duartion.`
			`intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item.`
			`output (Text): textgrid filepath.`
			`name (Text, optional): tier or layer name. Defaults to 'ali'.`
			`"""`
			`# Download Praat: https://www.fon.hum.uva.nl/praat/`
			`avg_interval = maxtime / (len(intervals) + 1)`
fix ctc alignment 3 years ago			`print(f"average second/token: {avg_interval}")`
add align code 4 years ago			`margin = 0.0001`

			`tg = textgrid.TextGrid(maxTime=maxtime)`
			`tier = textgrid.IntervalTier(name=name, maxTime=maxtime)`

			`i = 0`
			`for dur in intervals:`
			`s, e, text = dur.split()`
			`tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text)`

			`tg.append(tier)`

			`tg.write(output)`
			`print("successfully generator textgrid {}.".format(output))`