You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/paddlespeech/t2s/datasets/preprocess_utils.py

187 lines
5.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# line format: utt_id|speaker|phn dur phn dur ...
def get_phn_dur(file_name):
    '''
    Read MFA duration.txt
    Parameters
    ----------
    file_name : str or Path
        path of gen_duration_from_textgrid.py's result
    Returns
    ----------
    Dict
        sentence: {'utt': ([char], [int], str)}
    Set
        speaker_set: all speaker ids seen in the file
    '''
    sentence = {}
    speaker_set = set()
    # "with" guarantees the handle is closed even if a malformed line raises;
    # explicit utf-8 so phone labels decode the same on every platform.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split('|')
            utt = line_list[0]
            speaker = line_list[1]
            p_d = line_list[-1]
            speaker_set.add(speaker)
            # alternating "phn dur phn dur ..." pairs
            phn_dur = p_d.split()
            phn = phn_dur[::2]
            dur = phn_dur[1::2]
            assert len(phn) == len(dur)
            sentence[utt] = (phn, [int(i) for i in dur], speaker)
    return sentence, speaker_set
def merge_silence(sentence):
    '''
    Merge adjacent silence phones in place.
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': (([char], [int]), str)}
    '''
    for utt, (phones, durations, speaker) in list(sentence.items()):
        merged_phn = []
        merged_dur = []
        # Fold a 'sil' whose predecessor (in the ORIGINAL sequence) was
        # 'sil' or 'sp' into the previous merged entry.
        for idx, phone in enumerate(phones):
            follows_pause = idx > 0 and phones[idx - 1] in {"sil", "sp"}
            if phone == 'sil' and follows_pause:
                merged_dur[-1] += durations[idx]
                merged_phn[-1] = 'sil'
            else:
                merged_phn.append(phone)
                merged_dur.append(durations[idx])
        # Relabel short pauses: an 'sp' lasting 14+ frames becomes 'spl'.
        merged_phn = [
            ('spl' if dur >= 14 else 'sp') if phone == 'sp' else phone
            for phone, dur in zip(merged_phn, merged_dur)
        ]
        assert len(merged_phn) == len(merged_dur)
        sentence[utt] = [merged_phn, merged_dur, speaker]
def get_input_token(sentence, output_path, dataset="baker"):
    '''
    get phone set from training data and save it
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': ([char], [int])}
    output_path : str or path
        path to save phone_id_map
    dataset : str
        dataset name; Chinese datasets (baker/aishell3) use full-width
        punctuation tokens, others use ASCII punctuation
    '''
    phn_token = set()
    for utt in sentence:
        phn_token.update(sentence[utt][0])
    phn_token = sorted(phn_token)
    phn_token = ["<pad>", "<unk>"] + phn_token
    # NOTE: the punctuation literals below were lost (empty strings) in a
    # garbled copy; restored to the full-width marks used upstream.
    if dataset in {"baker", "aishell3"}:
        phn_token += ["，", "。", "？", "！"]
    else:
        phn_token += [",", ".", "?", "!"]
    phn_token += ["<eos>"]
    # explicit utf-8 so the full-width punctuation writes portably
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, phn in enumerate(phn_token):
            f.write(phn + ' ' + str(i) + '\n')
def get_phones_tones(sentence,
                     phones_output_path,
                     tones_output_path,
                     dataset="baker"):
    '''
    get phone set and tone set from training data and save it
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': ([char], [int])}
    phones_output_path : str or path
        path to save phone_id_map
    tones_output_path : str or path
        path to save tone_id_map
    dataset : str
        dataset name; Chinese datasets (baker/aishell3) use full-width
        punctuation tokens, others use ASCII punctuation
    '''
    phn_token = set()
    tone_token = set()
    for utt in sentence:
        for label in sentence[utt][0]:
            # split tone from finals, e.g. "ma1" -> phone "ma", tone "1"
            match = re.match(r'^(\w+)([012345])$', label)
            if match:
                phn_token.add(match.group(1))
                tone_token.add(match.group(2))
            else:
                # toneless labels (e.g. "sp", "sil") map to tone "0"
                phn_token.add(label)
                tone_token.add('0')
    phn_token = sorted(phn_token)
    tone_token = sorted(tone_token)
    phn_token = ["<pad>", "<unk>"] + phn_token
    # NOTE: the punctuation literals below were lost (empty strings) in a
    # garbled copy; restored to the full-width marks used upstream.
    if dataset in {"baker", "aishell3"}:
        phn_token += ["，", "。", "？", "！"]
    else:
        phn_token += [",", ".", "?", "!"]
    phn_token += ["<eos>"]
    # explicit utf-8 so the full-width punctuation writes portably
    with open(phones_output_path, 'w', encoding='utf-8') as f:
        for i, phn in enumerate(phn_token):
            f.write(phn + ' ' + str(i) + '\n')
    with open(tones_output_path, 'w', encoding='utf-8') as f:
        for i, tone in enumerate(tone_token):
            f.write(tone + ' ' + str(i) + '\n')
def get_spk_id_map(speaker_set, output_path):
    '''
    Assign ids to speakers in sorted order and write "speaker id" lines.
    Parameters
    ----------
    speaker_set : Set
        speaker ids collected from the corpus
    output_path : str or path
        path to save speaker_id_map
    '''
    with open(output_path, 'w') as f:
        entries = ('{} {}\n'.format(spk, idx)
                   for idx, spk in enumerate(sorted(speaker_set)))
        f.writelines(entries)
def compare_duration_and_mel_length(sentences, utt, mel):
    '''
    check duration error, correct sentences[utt] if possible, else pop sentences[utt]
    Parameters
    ----------
    sentences : Dict
        sentences[utt] = [phones_list ,durations_list]
    utt : str
        utt_id
    mel : np.ndarry
        features (num_frames, n_mels)
    '''
    if utt not in sentences:
        return
    durations = sentences[utt][1]
    len_diff = mel.shape[0] - sum(durations)
    if len_diff == 0:
        return
    if len_diff > 0:
        # mel is longer: pad the final phone's duration
        durations[-1] += len_diff
    elif durations[-1] + len_diff > 0:
        # mel is shorter: shrink the final phone if it stays positive
        durations[-1] += len_diff
    elif durations[0] + len_diff > 0:
        # otherwise shrink the first phone if it stays positive
        durations[0] += len_diff
    else:
        # mismatch too large to absorb — drop the utterance
        print("the len_diff is unable to correct:", len_diff)
        sentences.pop(utt)