Add data preparation for the SpeechBrain format.

pull/2880/head
zxcd 3 years ago
parent 4fae249503
commit ceb391fbfa

@ -25,9 +25,7 @@
# Authors: Yingzhi WANG 2022
# ############################################################################
seed: 2
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref data/<seed>
output_folder: !ref data
cer_file: !ref <output_folder>/cer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
@ -39,7 +37,7 @@ skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min
train_data: !ref <output_folder>/train.csv
valid_data: !ref <output_folder>/dev.csv
test_data: !ref <output_folder>/dev_for_test.csv
test_data: !ref <output_folder>/test.csv
wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large

@ -0,0 +1,129 @@
# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from speechbrain 2023
# (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/aishell_prepare.py)
import argparse
import csv
import glob
import logging
import os
from paddlespeech.s2t.models.wav2vec2.io.dataio import read_audio
logger = logging.getLogger(__name__)

# Default dataset root: the speech dataset cache directory under the user's home.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')


def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off explicitly.

    Args:
        value: The raw argument (or an actual ``bool``).

    Returns:
        The parsed boolean. The empty string maps to ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got %r" % value)


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--data_folder",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--save_folder",
    default="data/",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--skip_prep",
    default=False,
    type=_str2bool,
    help="If True, skip data preparation. (default: %(default)s)")
# Parsed at import time because main() reads the module-level ``args``.
args = parser.parse_args()
# Sample rate (Hz) assumed for every recording when converting a sample count
# into a duration in seconds.
_AISHELL_SAMPLE_RATE = 16000


def _load_transcripts(transcript_path):
    """Return a dict mapping each utterance name to its space-joined transcript."""
    filename2transcript = {}
    # The transcript contains Chinese text, so do not rely on the locale
    # default encoding.
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            filename2transcript[fields[0]] = " ".join(fields[1:])
    return filename2transcript


def prepare_aishell(data_folder, save_folder, skip_prep=False):
    """Write the AISHELL-1 CSV manifests (train/dev/test).

    The dataset must already be extracted under ``data_folder``; this function
    neither downloads nor unpacks anything. For every split a file
    ``<save_folder>/<split>.csv`` with the columns ``ID, duration, wav,
    transcript`` is written. A split whose CSV already exists is left
    untouched. Wav files without an entry in the transcript file are skipped.

    Args:
        data_folder: Path to the AISHELL-1 dataset; it must contain
            ``data_aishell/transcript/aishell_transcript_v0.8.txt`` and
            ``data_aishell/wav/<split>/*/*.wav``.
        save_folder: Directory where the manifest CSV files are stored. It is
            created if it does not exist.
        skip_prep: If True, return immediately without doing anything.

    Raises:
        OSError: If the transcript file cannot be read or a manifest cannot be
            written.
    """
    if skip_prep:
        return

    filename2transcript = _load_transcripts(
        os.path.join(data_folder, "data_aishell", "transcript",
                     "aishell_transcript_v0.8.txt"))

    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    id_start = 0  # offset so that every audio gets a unique ID across splits
    for split in ("train", "dev", "test"):
        new_filename = os.path.join(save_folder, split) + ".csv"
        # Sorted so that row order and IDs are reproducible between runs.
        all_wavs = sorted(
            glob.glob(
                os.path.join(data_folder, "data_aishell", "wav", split, "*",
                             "*.wav")))

        if os.path.exists(new_filename):
            # Still advance the offset, otherwise the IDs of later splits
            # would depend on which manifests happen to exist already.
            id_start += len(all_wavs)
            continue

        logger.info("Preparing %s...", new_filename)

        csv_output = [["ID", "duration", "wav", "transcript"]]
        for i, wav_path in enumerate(all_wavs):
            filename = os.path.splitext(os.path.basename(wav_path))[0]
            transcript = filename2transcript.get(filename)
            if transcript is None:
                continue

            signal = read_audio(wav_path)
            duration = signal.shape[0] / _AISHELL_SAMPLE_RATE
            csv_output.append(
                [id_start + i, str(duration), wav_path, transcript])

        # newline="" is what the csv module requires to control line endings
        # itself; utf-8 keeps the Chinese transcripts intact on any platform.
        with open(
                new_filename, mode="w", encoding="utf-8",
                newline="") as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerows(csv_output)

        logger.info("\t%s successfully created!", new_filename)
        id_start += len(all_wavs)
def main():
    """Generate the AISHELL-1 CSV manifests from the parsed command-line args.

    Reads the module-level ``args`` and forwards ``data_folder``,
    ``save_folder`` and ``skip_prep`` to ``prepare_aishell``.
    """
    # expanduser() leaves paths that do not start with '~' unchanged, so no
    # explicit prefix check is needed.
    args.data_folder = os.path.expanduser(args.data_folder)
    # Honour --skip_prep instead of always forcing preparation.
    prepare_aishell(
        args.data_folder, args.save_folder, skip_prep=args.skip_prep)
    if args.skip_prep:
        print("Data csv prepare skipped.")
    else:
        print("Data csv prepare done!")


if __name__ == '__main__':
    main()

@ -1,7 +1,7 @@
#!/bin/bash
stage=-1
stop_stage=100
stop_stage=-1
dict_dir=data/lang_char
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
@ -17,6 +17,12 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"
#generate csv file for speechbrain dataloader
python3 local/aishell_prepare.py \
--data_folder="${TARGET_DIR}/aishell" \
--save_folder="data/"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1

Loading…
Cancel
Save