You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/paddlespeech/t2s/datasets/preprocess_utils.py

187 lines
5.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# line format: utt_id|speaker|phn dur phn dur ...
def get_phn_dur(file_name):
    '''
    Read MFA duration.txt
    Parameters
    ----------
    file_name : str or Path
        path of gen_duration_from_textgrid.py's result
    Returns
    ----------
    Dict
        sentence: {'utt': ([char], [int], str)}
    Set
        speaker_set: all speaker ids seen in the file
    '''
    sentence = {}
    speaker_set = set()
    # "with" guarantees the handle is closed even if a malformed line raises;
    # explicit utf-8 so phone labels decode the same on every platform.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split('|')
            utt = line_list[0]
            speaker = line_list[1]
            p_d = line_list[-1]
            speaker_set.add(speaker)
            # alternating "phn dur phn dur ..." pairs
            phn_dur = p_d.split()
            phn = phn_dur[::2]
            dur = phn_dur[1::2]
            assert len(phn) == len(dur)
            sentence[utt] = (phn, [int(i) for i in dur], speaker)
    return sentence, speaker_set
def merge_silence(sentence):
    '''
    Merge adjacent silence phones in place.
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': (([char], [int]), str)}
    '''
    for utt, (phones, durations, speaker) in list(sentence.items()):
        merged_phn = []
        merged_dur = []
        # Fold a 'sil' whose predecessor (in the ORIGINAL sequence) was
        # 'sil' or 'sp' into the previous merged entry.
        for idx, phone in enumerate(phones):
            follows_pause = idx > 0 and phones[idx - 1] in {"sil", "sp"}
            if phone == 'sil' and follows_pause:
                merged_dur[-1] += durations[idx]
                merged_phn[-1] = 'sil'
            else:
                merged_phn.append(phone)
                merged_dur.append(durations[idx])
        # Relabel short pauses: an 'sp' lasting 14+ frames becomes 'spl'.
        merged_phn = [
            ('spl' if dur >= 14 else 'sp') if phone == 'sp' else phone
            for phone, dur in zip(merged_phn, merged_dur)
        ]
        assert len(merged_phn) == len(merged_dur)
        sentence[utt] = [merged_phn, merged_dur, speaker]
def get_input_token(sentence, output_path, dataset="baker"):
    '''
    get phone set from training data and save it
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': ([char], [int])}
    output_path : str or path
        path to save phone_id_map
    dataset : str
        dataset name; Chinese datasets (baker/aishell3) use full-width
        punctuation tokens, others use ASCII punctuation
    '''
    phn_token = set()
    for utt in sentence:
        phn_token.update(sentence[utt][0])
    phn_token = sorted(phn_token)
    phn_token = ["<pad>", "<unk>"] + phn_token
    # NOTE: the punctuation literals below were lost (empty strings) in a
    # garbled copy; restored to the full-width marks used upstream.
    if dataset in {"baker", "aishell3"}:
        phn_token += ["，", "。", "？", "！"]
    else:
        phn_token += [",", ".", "?", "!"]
    phn_token += ["<eos>"]
    # explicit utf-8 so the full-width punctuation writes portably
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, phn in enumerate(phn_token):
            f.write(phn + ' ' + str(i) + '\n')
def get_phones_tones(sentence,
                     phones_output_path,
                     tones_output_path,
                     dataset="baker"):
    '''
    get phone set and tone set from training data and save it
    Parameters
    ----------
    sentence : Dict
        sentence: {'utt': ([char], [int])}
    phones_output_path : str or path
        path to save phone_id_map
    tones_output_path : str or path
        path to save tone_id_map
    dataset : str
        dataset name; Chinese datasets (baker/aishell3) use full-width
        punctuation tokens, others use ASCII punctuation
    '''
    phn_token = set()
    tone_token = set()
    for utt in sentence:
        for label in sentence[utt][0]:
            # split tone from finals, e.g. "ma1" -> phone "ma", tone "1"
            match = re.match(r'^(\w+)([012345])$', label)
            if match:
                phn_token.add(match.group(1))
                tone_token.add(match.group(2))
            else:
                # toneless labels (e.g. "sp", "sil") map to tone "0"
                phn_token.add(label)
                tone_token.add('0')
    phn_token = sorted(phn_token)
    tone_token = sorted(tone_token)
    phn_token = ["<pad>", "<unk>"] + phn_token
    # NOTE: the punctuation literals below were lost (empty strings) in a
    # garbled copy; restored to the full-width marks used upstream.
    if dataset in {"baker", "aishell3"}:
        phn_token += ["，", "。", "？", "！"]
    else:
        phn_token += [",", ".", "?", "!"]
    phn_token += ["<eos>"]
    # explicit utf-8 so the full-width punctuation writes portably
    with open(phones_output_path, 'w', encoding='utf-8') as f:
        for i, phn in enumerate(phn_token):
            f.write(phn + ' ' + str(i) + '\n')
    with open(tones_output_path, 'w', encoding='utf-8') as f:
        for i, tone in enumerate(tone_token):
            f.write(tone + ' ' + str(i) + '\n')
def get_spk_id_map(speaker_set, output_path):
    '''
    Assign ids to speakers in sorted order and write "speaker id" lines.
    Parameters
    ----------
    speaker_set : Set
        speaker ids collected from the corpus
    output_path : str or path
        path to save speaker_id_map
    '''
    with open(output_path, 'w') as f:
        entries = ('{} {}\n'.format(spk, idx)
                   for idx, spk in enumerate(sorted(speaker_set)))
        f.writelines(entries)
def compare_duration_and_mel_length(sentences, utt, mel):
    '''
    check duration error, correct sentences[utt] if possible, else pop sentences[utt]
    Parameters
    ----------
    sentences : Dict
        sentences[utt] = [phones_list ,durations_list]
    utt : str
        utt_id
    mel : np.ndarry
        features (num_frames, n_mels)
    '''
    if utt not in sentences:
        return
    durations = sentences[utt][1]
    len_diff = mel.shape[0] - sum(durations)
    if len_diff == 0:
        return
    if len_diff > 0:
        # mel is longer: pad the final phone's duration
        durations[-1] += len_diff
    elif durations[-1] + len_diff > 0:
        # mel is shorter: shrink the final phone if it stays positive
        durations[-1] += len_diff
    elif durations[0] + len_diff > 0:
        # otherwise shrink the first phone if it stays positive
        durations[0] += len_diff
    else:
        # mismatch too large to absorb — drop the utterance
        print("the len_diff is unable to correct:", len_diff)
        sentences.pop(utt)