PaddleSpeech/paddlespeech/t2s/frontend/g2pw/utils.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re


def wordize_and_map(text: str):
    """Split ``text`` into words and build character <-> word index maps.

    Segmentation rules:
      * a run of space characters (``' '``) produces no word;
      * a run of ASCII letters/digits (``[a-zA-Z0-9]+``) is one word;
      * any other single character (including newlines, tabs, CJK
        characters, punctuation) is a word on its own.

    Args:
        text: The string to segment.

    Returns:
        A tuple ``(words, index_map_from_text_to_word,
        index_map_from_word_to_text)`` where

        * ``words`` is the list of word strings in order of appearance;
        * ``index_map_from_text_to_word`` has one entry per character of
          ``text``: the index of the word containing that character, or
          ``None`` for a space;
        * ``index_map_from_word_to_text`` has one ``(start, end)`` half-open
          character span per word.
    """
    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []

    # A single left-to-right scan replaces the repeated ``text = text[n:]``
    # slicing, which copied the remaining text on every step (quadratic).
    # The alternatives consume every character, so match offsets equal the
    # lengths the original accumulated in ``index_map_from_text_to_word``.
    # DOTALL lets the fallback ``.`` match a newline, like ``text[0]`` did.
    pattern = r'(?P<space> +)|(?P<alnum>[a-zA-Z0-9]+)|(?P<other>.)'
    for match in re.finditer(pattern, text, flags=re.DOTALL):
        start, end = match.span()
        length = end - start

        if match.lastgroup == 'space':
            # Spaces belong to no word.
            index_map_from_text_to_word += [None] * length
            continue

        index_map_from_word_to_text.append((start, end))
        index_map_from_text_to_word += [len(words)] * length
        words.append(match.group(0))

    return words, index_map_from_text_to_word, index_map_from_word_to_text


def tokenize_and_map(tokenizer, text: str):
    """Tokenize ``text`` word by word and map characters to tokens.

    The text is first segmented with ``wordize_and_map``; each word is then
    passed to ``tokenizer.tokenize``.  A word that yields no tokens, or
    exactly ``['[UNK]']``, becomes a single ``'[UNK]'`` token spanning the
    whole word.  Otherwise each returned token is assigned a consecutive
    character span inside the word whose width is the token's length with
    one leading ``##`` removed.

    Args:
        tokenizer: Object exposing ``tokenize(word)`` that returns a list of
            token strings (presumably a WordPiece-style tokenizer -- confirm
            against the caller).
        text: The string to tokenize.

    Returns:
        A tuple ``(tokens, index_map_from_text_to_token,
        index_map_from_token_to_text)``: the token strings, a per-character
        list of token indices (entries for spaces stay ``None``), and one
        ``(start, end)`` character span per token.
    """
    words, char_to_token, word_spans = wordize_and_map(text=text)

    tokens = []
    token_spans = []
    for word, (span_start, span_end) in zip(words, word_spans):
        pieces = tokenizer.tokenize(word)

        # Unknown / empty tokenization: one '[UNK]' covering the whole word.
        if len(pieces) == 0 or pieces == ['[UNK]']:
            token_spans.append((span_start, span_end))
            tokens.append('[UNK]')
            continue

        # Lay the pieces out back to back, starting at the word's first char.
        cursor = span_start
        for piece in pieces:
            piece_end = cursor + len(re.sub(r'^##', '', piece))
            token_spans.append((cursor, piece_end))
            tokens.append(piece)
            cursor = piece_end

    # The character->word list is reused in place: every position covered by
    # a token span is overwritten with that token's index.
    for token_index, (first, stop) in enumerate(token_spans):
        for position in range(first, stop):
            char_to_token[position] = token_index

    return tokens, char_to_token, token_spans


def _load_config(config_path: os.PathLike):
    """Execute the Python file at ``config_path`` and return it as a module.

    The file is loaded under the module name ``'__init__'`` and is not
    registered in ``sys.modules`` by this function.

    Args:
        config_path: Path to a Python source file holding the configuration.

    Returns:
        The executed module object; its top-level names are the config values.
    """
    from importlib.util import module_from_spec, spec_from_file_location

    module_spec = spec_from_file_location('__init__', config_path)
    config_module = module_from_spec(module_spec)
    # Running the loader executes the file's top-level code in the module.
    module_spec.loader.exec_module(config_module)
    return config_module


default_config_dict = {
    'manual_seed': 1313,
    'model_source': 'bert-base-chinese',
    'window_size': 32,
    'num_workers': 2,
    'use_mask': True,
    'use_char_phoneme': False,
    'use_conditional': True,
    'param_conditional': {
        'affect_location': 'softmax',
        'bias': True,
        'char-linear': True,
        'pos-linear': False,
        'char+pos-second': True,
        'char+pos-second_lowrank': False,
        'lowrank_size': 0,
        'char+pos-second_fm': False,
        'fm_size': 0,
        'fix_mode': None,
        'count_json': 'train.count.json'
    },
    'lr': 5e-5,
    'val_interval': 200,
    'num_iter': 10000,
    'use_focal': False,
    'param_focal': {
        'alpha': 0.0,
        'gamma': 0.7
    },
    'use_pos': True,
    'param_pos ': {
        'weight': 0.1,
        'pos_joint_training': True,
        'train_pos_path': 'train.pos',
        'valid_pos_path': 'dev.pos',
        'test_pos_path': 'test.pos'
    }
}


def load_config(config_path: os.PathLike, use_default: bool=False):
    """Load a Python config file, optionally filling in default values.

    Args:
        config_path: Path to the Python file holding the configuration.
        use_default: When true, every entry of ``default_config_dict`` that
            the loaded config lacks is added to it.  For dict-valued
            defaults whose attribute already exists on the config, only the
            missing keys are added to the config's own dict.

    Returns:
        The loaded config module, with defaults applied if requested.
    """
    import copy

    config = _load_config(config_path)
    if not use_default:
        return config

    for attr, val in default_config_dict.items():
        if not hasattr(config, attr):
            # Deep-copy so the config never aliases the module-level default
            # dicts; otherwise mutating one loaded config would silently
            # change the defaults seen by every later load.
            setattr(config, attr, copy.deepcopy(val))
        elif isinstance(val, dict):
            existing = getattr(config, attr)
            for dict_k, dict_v in val.items():
                if dict_k not in existing:
                    existing[dict_k] = copy.deepcopy(dict_v)
    return config
format g2pw 2 years ago			`# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add comment 2 years ago			`"""`
			`Credits`
format g2pw 2 years ago			`This code is modified from https://github.com/GitYCC/g2pW`
add comment 2 years ago			`"""`
add typehint for g2pw (#2390) 2 years ago			`import os`
Add g2pW to Chinese frontend 2 years ago			`import re`
format g2pw 2 years ago
Add g2pW to Chinese frontend 2 years ago
add typehint for g2pw (#2390) 2 years ago			`def wordize_and_map(text: str):`
Add g2pW to Chinese frontend 2 years ago			`words = []`
			`index_map_from_text_to_word = []`
			`index_map_from_word_to_text = []`
			`while len(text) > 0:`
			`match_space = re.match(r'^ +', text)`
			`if match_space:`
			`space_str = match_space.group(0)`
			`index_map_from_text_to_word += [None] * len(space_str)`
			`text = text[len(space_str):]`
			`continue`

			`match_en = re.match(r'^[a-zA-Z0-9]+', text)`
			`if match_en:`
			`en_word = match_en.group(0)`

			`word_start_pos = len(index_map_from_text_to_word)`
			`word_end_pos = word_start_pos + len(en_word)`
			`index_map_from_word_to_text.append((word_start_pos, word_end_pos))`

			`index_map_from_text_to_word += [len(words)] * len(en_word)`

			`words.append(en_word)`
			`text = text[len(en_word):]`
			`else:`
			`word_start_pos = len(index_map_from_text_to_word)`
			`word_end_pos = word_start_pos + 1`
			`index_map_from_word_to_text.append((word_start_pos, word_end_pos))`

			`index_map_from_text_to_word += [len(words)]`

			`words.append(text[0])`
			`text = text[1:]`
			`return words, index_map_from_text_to_word, index_map_from_word_to_text`


add typehint for g2pw (#2390) 2 years ago			`def tokenize_and_map(tokenizer, text: str):`
			`words, text2word, word2text = wordize_and_map(text=text)`
Add g2pW to Chinese frontend 2 years ago
			`tokens = []`
			`index_map_from_token_to_text = []`
			`for word, (word_start, word_end) in zip(words, word2text):`
			`word_tokens = tokenizer.tokenize(word)`

			`if len(word_tokens) == 0 or word_tokens == ['[UNK]']:`
			`index_map_from_token_to_text.append((word_start, word_end))`
			`tokens.append('[UNK]')`
			`else:`
			`current_word_start = word_start`
			`for word_token in word_tokens:`
			`word_token_len = len(re.sub(r'^##', '', word_token))`
			`index_map_from_token_to_text.append(`
			`(current_word_start, current_word_start + word_token_len))`
			`current_word_start = current_word_start + word_token_len`
			`tokens.append(word_token)`

			`index_map_from_text_to_token = text2word`
			`for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):`
			`for token_pos in range(token_start, token_end):`
			`index_map_from_text_to_token[token_pos] = i`

			`return tokens, index_map_from_text_to_token, index_map_from_token_to_text`


add typehint for g2pw (#2390) 2 years ago			`def _load_config(config_path: os.PathLike):`
Add g2pW to Chinese frontend 2 years ago			`import importlib.util`
			`spec = importlib.util.spec_from_file_location('__init__', config_path)`
			`config = importlib.util.module_from_spec(spec)`
			`spec.loader.exec_module(config)`
			`return config`


			`default_config_dict = {`
			`'manual_seed': 1313,`
			`'model_source': 'bert-base-chinese',`
			`'window_size': 32,`
			`'num_workers': 2,`
			`'use_mask': True,`
			`'use_char_phoneme': False,`
			`'use_conditional': True,`
			`'param_conditional': {`
			`'affect_location': 'softmax',`
			`'bias': True,`
			`'char-linear': True,`
			`'pos-linear': False,`
			`'char+pos-second': True,`
			`'char+pos-second_lowrank': False,`
			`'lowrank_size': 0,`
			`'char+pos-second_fm': False,`
			`'fm_size': 0,`
			`'fix_mode': None,`
			`'count_json': 'train.count.json'`
			`},`
			`'lr': 5e-5,`
			`'val_interval': 200,`
			`'num_iter': 10000,`
			`'use_focal': False,`
			`'param_focal': {`
			`'alpha': 0.0,`
			`'gamma': 0.7`
			`},`
			`'use_pos': True,`
			`'param_pos ': {`
			`'weight': 0.1,`
			`'pos_joint_training': True,`
			`'train_pos_path': 'train.pos',`
			`'valid_pos_path': 'dev.pos',`
			`'test_pos_path': 'test.pos'`
			`}`
			`}`


add typehint for g2pw (#2390) 2 years ago			`def load_config(config_path: os.PathLike, use_default: bool=False):`
Add g2pW to Chinese frontend 2 years ago			`config = _load_config(config_path)`
			`if use_default:`
			`for attr, val in default_config_dict.items():`
			`if not hasattr(config, attr):`
			`setattr(config, attr, val)`
			`elif isinstance(val, dict):`
			`d = getattr(config, attr)`
			`for dict_k, dict_v in val.items():`
			`if dict_k not in d:`
			`d[dict_k] = dict_v`
format g2pw 2 years ago			`return config`