commit
f1acfe443d
@ -0,0 +1,127 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Text
|
||||
|
||||
import textgrid
|
||||
|
||||
|
||||
def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:
    """Segment a CTC alignment id sequence by continuous blanks and repeated labels.

    Args:
        alignment (List[int]): ctc alignment id sequence.
            e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[List[int]]: token align, segmented alignment id sequences; a
            trailing run of blanks is merged into the last segment.
            e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]
    """
    # convert alignment to a praat format, which is a doing phonetics
    # by computer and helps analyzing alignment
    align_segs = []
    # get frames level duration for each token
    start = 0
    end = 0
    while end < len(alignment):
        # consume a run of leading blanks
        while end < len(alignment) and alignment[end] == blank_id:
            end += 1
        if end == len(alignment):
            # trailing blanks: fold them into the previous segment.
            # BUG FIX: the original did align_segs[-1].extend(...)
            # unconditionally, which raised IndexError when the whole
            # alignment was blank (align_segs still empty here).
            if align_segs:
                align_segs[-1].extend(alignment[start:])
            else:
                align_segs.append(alignment[start:])
            break
        end += 1
        # consume repeats of the same (non-blank) label
        while end < len(alignment) and alignment[end - 1] == alignment[
                end]:  # repeat label
            end += 1
        align_segs.append(alignment[start:end])
        start = end
    return align_segs
|
||||
|
||||
|
||||
def align_to_tierformat(align_segs: List[List[int]],
                        subsample: int,
                        token_dict: Dict[int, Text],
                        blank_id=0) -> List[Text]:
    """Generate textgrid.Interval text lines from alignment segmentations.

    Args:
        align_segs (List[List[int]]): segmented ctc alignment ids.
        subsample (int): model subsampling rate (25ms frame_length,
            10ms hop_length, 1/subsample).
        token_dict (Dict[int, Text]): int -> str map.
        blank_id (int, optional): blank id. Defaults to 0.

    Returns:
        List[Text]: list of textgrid.Interval text, "start end text\\n" each.
    """
    hop_length = 10  # ms
    second_ms = 1000  # ms
    # 25ms frame_length, 10ms hop_length
    frame_per_second = second_ms / hop_length
    second_per_frame = 1.0 / frame_per_second

    begin = 0
    tierformat = []
    n_segs = len(align_segs)

    for idx, tokens in enumerate(align_segs):
        # time duration of this segment, in seconds
        duration = len(tokens) * subsample * second_per_frame
        if idx < n_segs - 1:
            token = tokens[-1]
        else:
            # last segment: prefer its first non-blank token, falling back
            # to the final token when the segment is all blanks
            token = next((t for t in tokens if t != blank_id), tokens[-1])
        line = f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}"
        print(line)
        tierformat.append(line + "\n")
        begin = begin + duration

    return tierformat
|
||||
|
||||
|
||||
def generate_textgrid(maxtime: float,
                      intervals: List[Text],
                      output: Text,
                      name: Text='ali') -> None:
    """Create alignment textgrid file.

    Args:
        maxtime (float): audio duration in seconds.
        intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item.
        output (Text): textgrid filepath.
        name (Text, optional): tier or layer name. Defaults to 'ali'.
    """
    # Download Praat: https://www.fon.hum.uva.nl/praat/
    avg_interval = maxtime / (len(intervals) + 1)
    print(f"average second/token: {avg_interval}")
    # nudge each interval start slightly so adjacent intervals never overlap
    margin = 0.0001

    tg = textgrid.TextGrid(maxTime=maxtime)
    tier = textgrid.IntervalTier(name=name, maxTime=maxtime)

    for dur in intervals:
        s, e, text = dur.split()
        tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text)

    tg.append(tier)

    tg.write(output)
    # BUG FIX: message read "successfully generator textgrid"
    print("successfully generated textgrid {}.".format(output))
|
@ -0,0 +1,43 @@
|
||||
#! /usr/bin/env bash

# Run CTC alignment for a trained checkpoint and dump per-utterance
# alignments under the checkpoint directory.

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

device=gpu
# BUG FIX: original tested the literal string  [ ngpu == 0 ] , which is never
# true, so the CPU fallback could not trigger.
if [ ${ngpu} == 0 ];then
    device=cpu
fi
config_path=$1
ckpt_prefix=$2

# BUG FIX: original read ${ckpt_prefxi} (typo), so ckpt_name was always empty.
ckpt_name=$(basename ${ckpt_prefix})

mkdir -p exp



batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
# NOTE(review): ${type} is never assigned in this script, so the result file
# is named ".align"; confirm whether ${ckpt_name} was intended here.
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
|
@ -0,0 +1,4 @@
|
||||
*.tgz
|
||||
manifest.*
|
||||
*.meta
|
||||
aidatatang_200zh/
|
@ -0,0 +1,14 @@
|
||||
# [Aidatatang_200zh](http://www.openslr.org/62/)
|
||||
|
||||
Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License.
|
||||
The contents and the corresponding descriptions of the corpus include:
|
||||
|
||||
* The corpus contains 200 hours of acoustic data, which is mostly mobile recorded data.
|
||||
* 600 speakers from different accent areas in China are invited to participate in the recording.
|
||||
* The transcription accuracy for each sentence is larger than 98%.
|
||||
* Recordings are conducted in a quiet indoor environment.
|
||||
* The database is divided into training set, validation set, and testing set in a ratio of 7: 1: 2.
|
||||
* Detail information such as speech data coding and speaker information is preserved in the metadata file.
|
||||
* Segmented transcripts are also provided.
|
||||
|
||||
The corpus aims to support researchers in speech recognition, machine translation, voiceprint recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use.
|
@ -0,0 +1,151 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Prepare aidatatang_200zh mandarin dataset
|
||||
|
||||
Download, unpack and create manifest files.
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
of each audio file in the data set.
|
||||
"""
|
||||
import argparse
|
||||
import codecs
|
||||
import json
|
||||
import os
|
||||
|
||||
import soundfile
|
||||
|
||||
from utils.utility import download
|
||||
from utils.utility import unpack
|
||||
|
||||
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
|
||||
|
||||
URL_ROOT = 'http://www.openslr.org/resources/62'
|
||||
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
|
||||
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
|
||||
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--target_dir",
|
||||
default=DATA_HOME + "/aidatatang_200zh",
|
||||
type=str,
|
||||
help="Directory to save the dataset. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--manifest_prefix",
|
||||
default="manifest",
|
||||
type=str,
|
||||
help="Filepath prefix for output manifests. (default: %(default)s)")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path_prefix):
    """Build per-split manifest files (train/dev/test) plus a .meta summary.

    Each manifest line is a JSON object with keys: utt, feat, feat_shape
    (duration in seconds) and text.

    Args:
        data_dir (str): unpacked aidatatang_200zh directory.
        manifest_path_prefix (str): output path prefix; the split name is
            appended as an extension.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    # audio_id -> character-level text with all whitespace removed
    transcript_dict = {}
    for raw_line in codecs.open(transcript_path, 'r', 'utf-8'):
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        audio_id, text = raw_line.split(' ', 1)
        transcript_dict[audio_id] = ''.join(text.split())

    for dtype in ('train', 'dev', 'test'):
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for json_line in json_lines:
                fout.write(json_line + '\n')

        # summary statistics for the split, written next to the cwd
        with open(dtype + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)
|
||||
|
||||
|
||||
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, subset)
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # the corpus ships per-speaker tar files inside corpus/; unpack
        # every one of them in place
        audio_dir = os.path.join(data_dir, 'corpus')
        for subfolder, dirlist, _ in sorted(os.walk(audio_dir)):
            for sub in dirlist:
                print(f"unpack dir {sub}...")
                for folder, _, tarlist in sorted(
                        os.walk(os.path.join(subfolder, sub))):
                    for ftar in tarlist:
                        unpack(os.path.join(folder, ftar), folder, True)

    create_manifest(data_dir, manifest_path)
|
||||
|
||||
|
||||
def main():
    """Entry point: normalize target_dir, then download and prepare."""
    # expanduser is a no-op when the path does not start with '~'
    args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1 +1,4 @@
|
||||
data_aishell*
|
||||
*.meta
|
||||
manifest.*
|
||||
*.tgz
|
@ -0,0 +1,3 @@
|
||||
# [Aishell1](http://www.openslr.org/33/)
|
||||
|
||||
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, achieved through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; commercial use requires permission.)
|
@ -0,0 +1,3 @@
|
||||
# [Aishell3](http://www.openslr.org/93/)
|
||||
|
||||
AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems. The corpus contains roughly **85 hours** of emotion-neutral recordings spoken by 218 native Chinese mandarin speakers and total 88035 utterances. Their auxiliary attributes such as gender, age group and native accents are explicitly marked and provided in the corpus. Accordingly, transcripts in Chinese character-level and pinyin-level are provided along with the recordings. The word & tone transcription accuracy rate is above 98%, through professional speech annotation and strict quality inspection for tone and prosody. ( This database is free for academic research, not in the commerce, if without permission. )
|
@ -0,0 +1 @@
|
||||
GigaSpeech/
|
@ -0,0 +1,10 @@
|
||||
# [GigaSpeech](https://github.com/SpeechColab/GigaSpeech)
|
||||
|
||||
```
|
||||
git clone https://github.com/SpeechColab/GigaSpeech.git
|
||||
|
||||
cd GigaSpeech
|
||||
utils/gigaspeech_download.sh /disk1/audio_data/gigaspeech
|
||||
toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
|
||||
cd ..
|
||||
```
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash

# Download the GigaSpeech corpus into the current directory using the
# official SpeechColab helper scripts.

set -e

curdir=$PWD

# clone the helper repo only on the first run
test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git


pushd GigaSpeech
# env_vars.sh provides credentials/paths required by the download script
source env_vars.sh
./utils/download_gigaspeech.sh ${curdir}/
# Kaldi-style data prep is intentionally disabled here
#toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
popd
|
@ -0,0 +1,15 @@
|
||||
# [MagicData](http://www.openslr.org/68/)
|
||||
|
||||
MAGICDATA Mandarin Chinese Read Speech Corpus was developed by MAGIC DATA Technology Co., Ltd. and freely published for non-commercial use.
|
||||
The contents and the corresponding descriptions of the corpus include:
|
||||
|
||||
* The corpus contains 755 hours of speech data, which is mostly mobile recorded data.
|
||||
* 1080 speakers from different accent areas in China are invited to participate in the recording.
|
||||
* The sentence transcription accuracy is higher than 98%.
|
||||
* Recordings are conducted in a quiet indoor environment.
|
||||
* The database is divided into training set, validation set, and testing set in a ratio of 51: 1: 2.
|
||||
* Detail information such as speech data coding and speaker information is preserved in the metadata file.
|
||||
* The domain of recording texts is diversified, including interactive Q&A, music search, SNS messages, home command and control, etc.
|
||||
* Segmented transcripts are also provided.
|
||||
|
||||
The corpus aims to support researchers in speech recognition, machine translation, speaker recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use.
|
@ -0,0 +1,11 @@
|
||||
# multi-cn
|
||||
|
||||
This is a Chinese speech recognition recipe that trains on all Chinese corpora on OpenSLR, including:
|
||||
|
||||
* Aidatatang (140 hours)
|
||||
* Aishell (151 hours)
|
||||
* MagicData (712 hours)
|
||||
* Primewords (99 hours)
|
||||
* ST-CMDS (110 hours)
|
||||
* THCHS-30 (26 hours)
|
||||
* optional AISHELL2 (~1000 hours) if available
|
@ -0,0 +1,6 @@
|
||||
# [Primewords](http://www.openslr.org/47/)
|
||||
|
||||
This free Chinese Mandarin speech corpus set is released by Shanghai Primewords Information Technology Co., Ltd.
|
||||
The corpus is recorded by smart mobile phones from 296 native Chinese speakers. The transcription accuracy is larger than 98%, at the confidence level of 95%. It is free for academic use.
|
||||
|
||||
The mapping between the transcript and utterance is given in JSON format.
|
@ -0,0 +1 @@
|
||||
# [FreeST](http://www.openslr.org/38/)
|
@ -0,0 +1,6 @@
|
||||
*.tgz
|
||||
manifest.*
|
||||
data_thchs30
|
||||
resource
|
||||
test-noise
|
||||
*.meta
|
@ -0,0 +1,55 @@
|
||||
# [THCHS30](http://www.openslr.org/18/)
|
||||
|
||||
This is the *data part* of the `THCHS30 2015` acoustic data
|
||||
& scripts dataset.
|
||||
|
||||
The dataset is described in more detail in the paper ``THCHS-30 : A Free
|
||||
Chinese Speech Corpus`` by Dong Wang, Xuewei Zhang.
|
||||
|
||||
A paper (if it can be called a paper) 13 years ago regarding the database:
|
||||
|
||||
Dong Wang, Dalei Wu, Xiaoyan Zhu, ``TCMSD: A new Chinese Continuous Speech Database``,
|
||||
International Conference on Chinese Computing (ICCC'01), 2001, Singapore.
|
||||
|
||||
The layout of this data pack is the following:
|
||||
|
||||
``data``
|
||||
``*.wav``
|
||||
audio data
|
||||
|
||||
``*.wav.trn``
|
||||
transcriptions
|
||||
|
||||
``{train,dev,test}``
|
||||
contain symlinks into the ``data`` directory for both audio and
|
||||
transcription files. Contents of these directories define the
|
||||
train/dev/test split of the data.
|
||||
|
||||
``{lm_word}``
|
||||
``word.3gram.lm``
|
||||
trigram LM based on word
|
||||
``lexicon.txt``
|
||||
lexicon based on word
|
||||
|
||||
``{lm_phone}``
|
||||
``phone.3gram.lm``
|
||||
trigram LM based on phone
|
||||
``lexicon.txt``
|
||||
lexicon based on phone
|
||||
|
||||
``README.TXT``
|
||||
this file
|
||||
|
||||
|
||||
Data statistics
|
||||
===============
|
||||
|
||||
Statistics for the data are as follows:
|
||||
|
||||
=========== ========== ========== ===========
|
||||
**dataset** **audio** **#sents** **#words**
|
||||
=========== ========== ========== ===========
|
||||
train 25 10,000 198,252
|
||||
dev 2:14 893 17,743
|
||||
test 6:15 2,495 49,085
|
||||
=========== ========== ========== ===========
|
@ -0,0 +1,184 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Prepare THCHS-30 mandarin dataset
|
||||
|
||||
Download, unpack and create manifest files.
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
of each audio file in the data set.
|
||||
"""
|
||||
import argparse
|
||||
import codecs
|
||||
import json
|
||||
import os
|
||||
from multiprocessing.pool import Pool
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile
|
||||
|
||||
from utils.utility import download
|
||||
from utils.utility import unpack
|
||||
|
||||
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
|
||||
|
||||
URL_ROOT = 'http://www.openslr.org/resources/18'
|
||||
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/18'
|
||||
DATA_URL = URL_ROOT + '/data_thchs30.tgz'
|
||||
TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz'
|
||||
RESOURCE_URL = URL_ROOT + '/resource.tgz'
|
||||
MD5_DATA = '2d2252bde5c8429929e1841d4cb95e90'
|
||||
MD5_TEST_NOISE = '7e8a985fb965b84141b68c68556c2030'
|
||||
MD5_RESOURCE = 'c0b2a565b4970a0c4fe89fefbf2d97e1'
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--target_dir",
|
||||
default=DATA_HOME + "/THCHS30",
|
||||
type=str,
|
||||
help="Directory to save the dataset. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--manifest_prefix",
|
||||
default="manifest",
|
||||
type=str,
|
||||
help="Filepath prefix for output manifests. (default: %(default)s)")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def read_trn(filepath):
    """Read a THCHS-30 .trn transcription file.

    The file holds exactly three lines:
    word text first, syllable text second, phoneme text third.

    Args:
        filepath (str): trn path.

    Returns:
        list(str): (word, syllable, phone); all whitespace is removed from
            the word line to give character-level text.
    """
    with open(filepath, 'r') as f:
        lines = f.read().strip().split('\n')
    assert len(lines) == 3, lines
    word_line, syllable_line, phone_line = lines
    # character text: drop whitespace from the word line
    return [''.join(word_line.split()), syllable_line, phone_line]
|
||||
|
||||
|
||||
def resolve_symlink(filepath):
    """Resolve a pseudo-symlink: a regular file whose content is the
    relative path of the real file.

    Args:
        filepath (str): path of the pseudo-symlink file.

    Returns:
        Path: absolute, resolved path of the file it points to.
    """
    link = Path(filepath)
    target = link.parent / link.read_text().strip()
    return target.resolve()
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path_prefix):
    """Build per-split (train/dev/test) manifest files and .meta summaries.

    Each manifest line is a JSON object with keys: utt, feat, feat_shape
    (duration in seconds), text (character), syllable and phone.

    Args:
        data_dir (str): unpacked data_thchs30 directory.
        manifest_path_prefix (str): output path prefix; split name appended.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        # reuse the list across splits; clear it in place
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                file_path = os.path.join(subfolder, fname)
                if file_path.endswith('.wav'):
                    audio_path = os.path.abspath(file_path)
                    # .trn entries are text files containing the relative
                    # path of the real transcription (pseudo-symlinks)
                    text_path = resolve_symlink(audio_path + '.trn')
                else:
                    continue

                assert os.path.exists(audio_path) and os.path.exists(text_path)

                audio_id = os.path.basename(audio_path)[:-4]
                word_text, syllable_text, phone_text = read_trn(text_path)
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)

                # alignment info is intentionally not dumped
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': word_text,  # character
                            'syllable': syllable_text,
                            'phone': phone_text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(word_text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        # split summary statistics, written to the current working directory
        with open(dtype + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)
|
||||
|
||||
|
||||
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file."""
    datadir = os.path.join(target_dir, subset)
    if os.path.exists(datadir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)

    # only the speech archive carries audio/transcripts to index
    if subset == 'data_thchs30':
        create_manifest(datadir, manifest_path)
|
||||
|
||||
|
||||
def main():
    """Download the three THCHS-30 archives in parallel and build manifests."""
    # expanduser is a no-op when the path does not start with '~'
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    subsets = (
        (DATA_URL, MD5_DATA, "data_thchs30"),
        (TEST_NOISE_URL, MD5_TEST_NOISE, "test-noise"),
        (RESOURCE_URL, MD5_RESOURCE, "resource"),
    )
    tasks = [(url, md5, args.target_dir, args.manifest_prefix, subset)
             for url, md5, subset in subsets]
    # pool size only caps parallelism; one worker per task suffices
    with Pool(7) as pool:
        pool.starmap(prepare_dataset, tasks)

    print("Data download and manifest prepare done!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,4 @@
|
||||
TIMIT.*
|
||||
TIMIT
|
||||
manifest.*
|
||||
*.meta
|
@ -0,0 +1,239 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Prepare Librispeech ASR datasets.
|
||||
|
||||
Download, unpack and create manifest files.
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
of each audio file in the data set.
|
||||
"""
|
||||
import argparse
|
||||
import codecs
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import string
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile
|
||||
|
||||
from utils.utility import unzip
|
||||
|
||||
URL_ROOT = ""
|
||||
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--target_dir",
|
||||
default='~/.cache/paddle/dataset/speech/timit',
|
||||
type=str,
|
||||
help="Directory to save the dataset. (default: %(default)s)")
|
||||
parser.add_argument(
|
||||
"--manifest_prefix",
|
||||
default="manifest",
|
||||
type=str,
|
||||
help="Filepath prefix for output manifests. (default: %(default)s)")
|
||||
args = parser.parse_args()
|
||||
|
||||
#: A string containing Chinese punctuation marks (non-stops).
|
||||
non_stops = (
|
||||
# Fullwidth ASCII variants
|
||||
'\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
|
||||
'\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
|
||||
'\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'
|
||||
|
||||
# Halfwidth CJK punctuation
|
||||
'\uFF62\uFF63\uFF64'
|
||||
|
||||
# CJK symbols and punctuation
|
||||
'\u3000\u3001\u3003'
|
||||
|
||||
# CJK angle and corner brackets
|
||||
'\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'
|
||||
|
||||
# CJK brackets and symbols/punctuation
|
||||
'\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'
|
||||
|
||||
# Other CJK symbols
|
||||
'\u3030'
|
||||
|
||||
# Special CJK indicators
|
||||
'\u303E\u303F'
|
||||
|
||||
# Dashes
|
||||
'\u2013\u2014'
|
||||
|
||||
# Quotation marks and apostrophe
|
||||
'\u2018\u2019\u201B\u201C\u201D\u201E\u201F'
|
||||
|
||||
# General punctuation
|
||||
'\u2026\u2027'
|
||||
|
||||
# Overscores and underscores
|
||||
'\uFE4F'
|
||||
|
||||
# Small form variants
|
||||
'\uFE51\uFE54'
|
||||
|
||||
# Latin punctuation
|
||||
'\u00B7')
|
||||
|
||||
#: A string of Chinese stops.
|
||||
stops = (
|
||||
'\uFF01' # Fullwidth exclamation mark
|
||||
'\uFF1F' # Fullwidth question mark
|
||||
'\uFF61' # Halfwidth ideographic full stop
|
||||
'\u3002' # Ideographic full stop
|
||||
)
|
||||
|
||||
#: A string containing all Chinese punctuation.
|
||||
punctuation = non_stops + stops
|
||||
|
||||
|
||||
def tn(text):
    """Text normalization: lowercase and strip Chinese + ASCII punctuation."""
    # one pass removes both the module-level Chinese `punctuation` set and
    # ASCII string.punctuation
    return re.sub(f'[{punctuation}{string.punctuation}]', "", text.lower())
|
||||
|
||||
|
||||
def read_txt(filepath: str) -> str:
    """Read a TIMIT .TXT file and return its normalized transcript.

    The file holds one line: "<start-sample> <end-sample> <transcript>";
    only the transcript part is kept, then normalized via tn().
    """
    with open(filepath, 'r') as f:
        transcript = f.read().strip().split(maxsplit=2)[2]
    return tn(transcript)
|
||||
|
||||
|
||||
def read_algin(filepath: str) -> str:
    """Read a TIMIT word or phone alignment file.

    Each line has the shape: <start-sample> <end-sample> <token>

    Args:
        filepath (str): alignment (.WRD / .PHN) file path.

    Returns:
        str: tokens joined by <space>, with silence markers dropped.
    """
    tokens = []
    with open(filepath, 'r') as f:
        for raw in f:
            fields = raw.strip().split()
            # beginning and ending silence regions are marked with h#
            if fields[2].strip() == 'h#':
                continue
            tokens.append(fields[2])
    return ' '.join(tokens)
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path_prefix):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.

    Also writes a small <dtype>.meta summary file per split.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    # de-dup set shared across splits so an utterance is emitted only once
    utts = set()

    data_types = ['TRAIN', 'TEST']
    for dtype in data_types:
        json_lines = []
        total_sec = 0.0
        total_text = 0  # word count is an integer
        total_num = 0

        audio_dir = Path(os.path.join(data_dir, dtype))
        for fname in sorted(audio_dir.rglob('*.WAV')):
            audio_path = fname.resolve()  # .WAV
            audio_id = audio_path.stem
            # speaker dir name, e.g. 'FCJF0': gender char followed by spk id
            gender_spk = str(audio_path.parent.stem)
            spk = gender_spk[1:]
            gender = gender_spk[0]
            utt_id = '_'.join([spk, gender, audio_id])
            # de-dup on the full utt id: the bare file stem (e.g. 'SA1')
            # repeats across speakers in TIMIT and would wrongly skip
            # utterances if used as the key
            if utt_id in utts:
                continue
            utts.add(utt_id)

            text_path = audio_path.with_suffix('.TXT')
            phone_path = audio_path.with_suffix('.PHN')

            audio_data, samplerate = soundfile.read(
                str(audio_path), dtype='int16')
            duration = float(len(audio_data) / samplerate)
            word_text = read_txt(text_path)
            phone_text = read_algin(phone_path)

            # not dump alignment infos
            json_lines.append(
                json.dumps(
                    {
                        'utt': utt_id,
                        'feat': str(audio_path),
                        'feat_shape': (duration, ),  # second
                        'text': word_text,  # word
                        'phone': phone_text,
                        'spk': spk,
                        'gender': gender,
                    },
                    ensure_ascii=False))

            total_sec += duration
            total_text += len(word_text.split())
            total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype.lower()
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        with open(dtype.lower() + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            # guard against an empty split to avoid ZeroDivisionError
            if total_sec > 0:
                print(f"{total_text / total_sec} text/sec", file=f)
            if total_num > 0:
                print(f"{total_sec / total_num} sec/utt", file=f)
|
||||
|
||||
|
||||
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Unpack TIMIT.zip (must be downloaded manually) and create the
    summary manifest files.

    Args:
        url: kept for interface compatibility; TIMIT is licensed and
            cannot be downloaded automatically.
        md5sum: expected md5 checksum of TIMIT.zip.
        target_dir: directory containing TIMIT.zip / the unpacked TIMIT dir.
        manifest_path: prefix for the generated manifest files.

    Raises:
        FileNotFoundError: if TIMIT.zip is not present in target_dir.
        RuntimeError: if the archive's md5 checksum does not match.
    """
    filepath = os.path.join(target_dir, "TIMIT.zip")
    if not os.path.exists(filepath):
        raise FileNotFoundError(
            f"Please download TIMIT.zip into {target_dir}.")

    if not os.path.exists(os.path.join(target_dir, "TIMIT")):
        # verify before unpacking; raise instead of assert so the check
        # is not stripped when running under `python -O`
        if not check_md5sum(filepath, md5sum):
            raise RuntimeError(f"md5 checksum mismatch for {filepath}.")
        # unpack
        unzip(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(os.path.join(target_dir, "TIMIT"), manifest_path)
|
||||
|
||||
|
||||
def main():
    """Entry point: expand the target dir and build the TIMIT manifests."""
    # expanduser is a no-op for paths that do not start with '~'
    args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(URL_ROOT, MD5_DATA, args.target_dir, args.manifest_prefix)
    print("Data download and manifest prepare done!")
|
||||
|
||||
|
||||
# script entry point
if __name__ == '__main__':
    main()
|
@ -0,0 +1,43 @@
|
||||
#! /usr/bin/env bash
# Run CTC alignment for a trained checkpoint.
# Usage: align.sh <config_path> <ckpt_path_prefix>

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

# number of visible GPUs (0 when CUDA_VISIBLE_DEVICES is empty)
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

device=gpu
# fix: compare the variable's value, not the literal string 'ngpu'
if [ ${ngpu} == 0 ];then
    device=cpu
fi
config_path=$1
ckpt_prefix=$2

# fix: typo 'ckpt_prefxi' left ckpt_name empty
ckpt_name=$(basename ${ckpt_prefix})

mkdir -p exp

batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
# fix: '${type}' was never set; name the result after the checkpoint
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${output_dir}/${ckpt_name}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
|
@ -1,4 +1,4 @@
|
||||
export MAIN_ROOT=${PWD}/../../
# NOTE(review): the export above is immediately overridden by the next line
# (this looks like diff residue — the old and new value of the same edit);
# only the three-level path below takes effect. Confirm which is intended.
export MAIN_ROOT=${PWD}/../../../

# expose repo root and utils/ scripts on PATH
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
# force byte-order collation so sort/join behave deterministically
export LC_ALL=C
|
@ -0,0 +1,43 @@
|
||||
#! /usr/bin/env bash
# Run CTC alignment for a trained checkpoint.
# Usage: align.sh <config_path> <ckpt_path_prefix>

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

# number of visible GPUs (0 when CUDA_VISIBLE_DEVICES is empty)
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

device=gpu
# fix: compare the variable's value, not the literal string 'ngpu'
if [ ${ngpu} == 0 ];then
    device=cpu
fi
config_path=$1
ckpt_prefix=$2

# fix: typo 'ckpt_prefxi' left ckpt_name empty
ckpt_name=$(basename ${ckpt_prefix})

mkdir -p exp

batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
# fix: '${type}' was never set; name the result after the checkpoint
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${output_dir}/${ckpt_name}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
|
@ -0,0 +1,77 @@
|
||||
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

project(deepspeech VERSION 0.1)

set(CMAKE_VERBOSE_MAKEFILE on)
# set std-14
set(CMAKE_CXX_STANDARD 14)

# include file (FetchContent requires CMake >= 3.14 for MakeAvailable)
include(FetchContent)
include(ExternalProject)
# fc_patch dir: download/patch third-party sources under the source tree
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})


###############################################################################
# Option Configurations
###############################################################################
# option configurations
option(TEST_DEBUG "option for debug" OFF)


###############################################################################
# Include third party
###############################################################################
# #example for include third party
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()

# ABSEIL-CPP
FetchContent_Declare(
  absl
  GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
  GIT_TAG "20210324.1"
)
FetchContent_MakeAvailable(absl)

# libsndfile
FetchContent_Declare(
  libsndfile
  GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
  GIT_TAG "1.0.31"
)
FetchContent_MakeAvailable(libsndfile)


###############################################################################
# Add local library
###############################################################################
# NOTE: the commands below are placeholders; called with no (or dummy)
# arguments they abort `cmake` at configure time, so they are kept
# commented out until real targets exist.
# system lib
# find_package()
# if dir have CmakeLists.txt
# add_subdirectory()
# if dir do not have CmakeLists.txt
# add_library(lib_name STATIC file.cc)
# target_link_libraries(lib_name item0 item1)
# add_dependencies(lib_name depend-target)


###############################################################################
# Library installation
###############################################################################
# install()


###############################################################################
# Build binary file
###############################################################################
# add_executable()
# target_link_libraries()
|
@ -0,0 +1,2 @@
|
||||
# Collect every source file in this directory and build the static
# `decoder` library from them.
# NOTE(review): aux_source_directory does not notice newly added files at
# build time; the CMake docs recommend listing sources explicitly.
aux_source_directory(. DIR_LIB_SRCS)
add_library(decoder STATIC ${DIR_LIB_SRCS})
|
Loading…
Reference in new issue