You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
64 lines
2.4 KiB
64 lines
2.4 KiB
2 years ago
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
import os
|
||
|
from pathlib import Path
|
||
|
from typing import List
|
||
|
from typing import Union
|
||
|
|
||
|
|
||
|
def change_baker_label(baker_label_file: Union[str, Path],
                       out_label_file: Union[str, Path]):
    """Change a baker label file into a regular label file.

    The input is consumed as consecutive two-line records. For each record
    the first whitespace-separated token of the first line is taken as the
    utterance id and the whole second line (stripped) as the transcription;
    the record is written out as ``utt_id|transcription``.

    A dangling last line without a partner, and records whose first line is
    blank, are skipped instead of raising ``IndexError``.

    Args:
        baker_label_file (Union[str, Path]): Original baker label file
        out_label_file (Union[str, Path]): regular label file
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding (assumes the label file is UTF-8 -- TODO confirm for
    # other data sources).
    with open(baker_label_file, encoding="utf-8") as f:
        lines = f.readlines()

    with open(out_label_file, "w", encoding="utf-8") as fw:
        # zip() over even/odd slices pairs line 2k with line 2k+1 and drops
        # an unpaired trailing line (e.g. a final blank line).
        for header, body in zip(lines[0::2], lines[1::2]):
            tokens = header.split()
            if not tokens:
                # Blank header line: there is no utterance id to emit.
                continue
            fw.write(tokens[0] + "|" + body.strip() + "\n")
|
||
|
|
||
|
|
||
|
def get_single_label(label_file: Union[str, Path],
                     oov_files: List[Union[str, Path]],
                     input_dir: Union[str, Path]):
    """Divide the label file into individual files according to label_file.

    For every line of ``label_file`` whose utterance id is not listed in
    ``oov_files``, ``<input_dir>/<utt_id>.wav`` is copied into
    ``<input_dir>/newdir`` and the transcription is written next to it as
    ``<utt_id>.txt``. Lines without a ``|`` separator (e.g. blank lines) are
    skipped.

    Args:
        label_file (str or Path): label file, format: utt_id|phones id
        oov_files (List[Union[str, Path]]): utterance ids to leave out; they
            are compared against the text before the first ``|`` of each line
        input_dir (Path): input dir including audios

    Returns:
        Path: the ``newdir`` directory created inside ``input_dir``.
    """
    # Local imports keep this function self-contained; the module header
    # does not import these names.
    import shutil
    import warnings

    input_dir = Path(input_dir).expanduser()
    new_dir = input_dir / "newdir"
    new_dir.mkdir(parents=True, exist_ok=True)

    # Build the lookup once instead of scanning the list for every line.
    skipped_ids = set(oov_files)

    with open(label_file, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split("|")
            utt_id = fields[0]
            if utt_id in skipped_ids or len(fields) < 2:
                continue
            transcription = fields[1].strip()

            wav_file = input_dir / (utt_id + ".wav")
            if wav_file.is_file():
                # shutil.copy instead of a shell "cp" string: works with
                # spaces/metacharacters in paths and on every platform.
                shutil.copy(str(wav_file), str(new_dir / (utt_id + ".wav")))
            else:
                # The previous shell copy only printed an error here and
                # carried on, so keep going rather than raising.
                warnings.warn("wav file not found, not copied: %s" % wav_file)

            single_file = new_dir / (utt_id + ".txt")
            with open(str(single_file), "w", encoding="utf-8") as fw:
                fw.write(transcription)

    return new_dir
|