# PaddleSpeech/dataset/mini_librispeech/mini_librispeech.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Librispeech ASR datasets.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import io
import json
import os
from multiprocessing.pool import Pool

import soundfile

from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

# Download locations of the train/dev archives, all under one root URL.
URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = f"{URL_ROOT}/train-clean-5.tar.gz"
URL_DEV_CLEAN = f"{URL_ROOT}/dev-clean-2.tar.gz"

# MD5 checksums passed to download() alongside the matching archive URL.
MD5_TRAIN_CLEAN = "5df7d4e78065366204ca6845bb08f490"
MD5_DEV_CLEAN = "6d7ab67ac6a1d2c993d050e16d61080d"

# Command-line interface. NOTE: the arguments are parsed right here, at
# module import time, and kept in the module-level ``args`` namespace.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default="~/.cache/paddle/dataset/speech/libri",
    help="Directory to save the dataset. (default: %(default)s)",
)
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)",
)
args = parser.parse_args()


def create_manifest(data_dir, manifest_path):
    """Create a manifest file and a ``.meta`` summary for one data set.

    Walks ``data_dir`` and, for every folder that holds a ``*trans.txt``
    transcript file, emits one JSON line per utterance listed in it with the
    keys ``utt``, ``utt2spk`` (first two ``-``-separated fields of ``utt``),
    ``feat`` (audio filepath), ``feat_shape`` (duration in seconds) and
    ``text`` (lower-cased transcription).

    Args:
        data_dir: Root directory searched for transcripts; each utterance's
            audio is expected next to its transcript as ``<utt>.flac``.
        manifest_path: Output path of the manifest. The text after its last
            ``.`` is used as the subset name in the summary.

    Side effects:
        Writes ``manifest_path`` and ``<manifest dir>/<data_dir name>.meta``.
        A data set without any utterance yields an empty manifest and a
        summary whose averages are reported as ``0.0``.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    total_sec = 0.0
    total_text = 0.0
    total_num = 0

    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        text_filelist = [
            filename for filename in filelist if filename.endswith('trans.txt')
        ]
        if not text_filelist:
            continue

        text_filepath = os.path.join(subfolder, text_filelist[0])
        # Use a context manager so the transcript handle is closed as soon as
        # the folder is processed instead of being left to the GC.
        with io.open(text_filepath, encoding="utf8") as text_file:
            for line in text_file:
                segments = line.strip().split()
                if not segments:
                    # Blank line: there is no utterance id to look up.
                    continue
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate

                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
                utt2spk = '-'.join(utt.split('-')[:2])
                json_lines.append(
                    json.dumps({
                        'utt': utt,
                        'utt2spk': utt2spk,
                        'feat': audio_filepath,
                        'feat_shape': (duration, ),  # second
                        'text': text,
                    }))

                total_sec += duration
                total_text += len(text)
                total_num += 1

    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')

    subset = os.path.splitext(manifest_path)[1][1:]
    manifest_dir = os.path.dirname(manifest_path)
    data_dir_name = os.path.split(data_dir)[-1]
    meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta'

    # Guard the averages: an empty data set must not abort with
    # ZeroDivisionError after the manifest has already been written.
    text_per_sec = total_text / total_sec if total_sec else 0.0
    sec_per_utt = total_sec / total_num if total_num else 0.0

    with open(meta_path, 'w') as f:
        print(f"{subset}:", file=f)
        print(f"{total_num} utts", file=f)
        print(f"{total_sec / (60*60)} h", file=f)
        print(f"{total_text} text", file=f)
        print(f"{text_per_sec} text/sec", file=f)
        print(f"{sec_per_utt} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Fetch one archive if needed, then build its manifest.

    The download/unpack step is skipped when ``target_dir`` already contains
    a ``LibriSpeech`` entry; the manifest is (re)created in either case.
    """
    already_unpacked = os.path.exists(os.path.join(target_dir, "LibriSpeech"))
    if already_unpacked:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)

    # The manifest is always rebuilt from whatever is on disk.
    create_manifest(target_dir, manifest_path)

def main():
    """Prepare the train-clean and dev-clean subsets in two worker processes."""
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    # (subset name, archive url, archive md5) for every subset to prepare.
    subsets = (
        ("train-clean", URL_TRAIN_CLEAN, MD5_TRAIN_CLEAN),
        ("dev-clean", URL_DEV_CLEAN, MD5_DEV_CLEAN),
    )
    # Each subset gets its own directory and its own manifest suffix.
    jobs = [(archive_url, archive_md5,
             os.path.join(args.target_dir, subset_name),
             args.manifest_prefix + "." + subset_name)
            for subset_name, archive_url, archive_md5 in subsets]

    with Pool(processes=2) as workers:
        workers.starmap(prepare_dataset, jobs)

    print("Data download and manifest prepare done!")


# Start the download/manifest preparation only when run as a script.
if __name__ == '__main__':
    main()
add copyright 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
refactor tiny egs 4 years ago			`"""Prepare Librispeech ASR datasets.`

			`Download, unpack and create manifest files.`
			`Manifest file is a json-format file with each line containing the`
			`meta data (i.e. audio filepath, transcript and audio duration)`
			`of each audio file in the data set.`
			`"""`
			`import argparse`
			`import codecs`
			`import io`
E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code 4 years ago			`import json`
			`import os`
			`from multiprocessing.pool import Pool`

			`import soundfile`
format and fix pre-commit (#1120) 3 years ago
[s2t] mv dataset into paddlespeech.dataset (#3183) * mv dataset into paddlespeech.dataset * add aidatatang * fix import 2 years ago			`from paddlespeech.dataset.download import download`
			`from paddlespeech.dataset.download import unpack`
refactor tiny egs 4 years ago
fix url in librispeech.py 2 years ago			`URL_ROOT = "http://openslr.elda.org/resources/31"`
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"`
			`URL_DEV_CLEAN = URL_ROOT + "/dev-clean-2.tar.gz"`
refactor tiny egs 4 years ago
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`MD5_TRAIN_CLEAN = "5df7d4e78065366204ca6845bb08f490"`
			`MD5_DEV_CLEAN = "6d7ab67ac6a1d2c993d050e16d61080d"`
refactor tiny egs 4 years ago
			`parser = argparse.ArgumentParser(description=__doc__)`
			`parser.add_argument(`
			`"--target_dir",`
			`default='~/.cache/paddle/dataset/speech/libri',`
			`type=str,`
			`help="Directory to save the dataset. (default: %(default)s)")`
			`parser.add_argument(`
			`"--manifest_prefix",`
			`default="manifest",`
			`type=str,`
			`help="Filepath prefix for output manifests. (default: %(default)s)")`
			`args = parser.parse_args()`


			`def create_manifest(data_dir, manifest_path):`
			`"""Create a manifest json file summarizing the data set, with each line`
			`containing the meta data (i.e. audio filepath, transcription text, audio`
			`duration) of each audio file within the data set.`
			`"""`
			`print("Creating manifest %s ..." % manifest_path)`
			`json_lines = []`
add thchs30, aidatatang; 3 years ago			`total_sec = 0.0`
			`total_text = 0.0`
			`total_num = 0`

refactor tiny egs 4 years ago			`for subfolder, _, filelist in sorted(os.walk(data_dir)):`
			`text_filelist = [`
			`filename for filename in filelist if filename.endswith('trans.txt')`
			`]`
			`if len(text_filelist) > 0:`
			`text_filepath = os.path.join(subfolder, text_filelist[0])`
			`for line in io.open(text_filepath, encoding="utf8"):`
			`segments = line.strip().split()`
			`text = ' '.join(segments[1:]).lower()`
			`audio_filepath = os.path.join(subfolder, segments[0] + '.flac')`
			`audio_data, samplerate = soundfile.read(audio_filepath)`
			`duration = float(len(audio_data)) / samplerate`
add utt2spk for all dataset 3 years ago
			`utt = os.path.splitext(os.path.basename(audio_filepath))[0]`
			`utt2spk = '-'.join(utt.split('-')[:2])`
refactor tiny egs 4 years ago			`json_lines.append(`
			`json.dumps({`
add utt2spk for all dataset 3 years ago			`'utt': utt,`
			`'utt2spk': utt2spk,`
			`'feat': audio_filepath,`
E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code 4 years ago			`'feat_shape': (duration, ), #second`
add utt2spk for all dataset 3 years ago			`'text': text,`
refactor tiny egs 4 years ago			`}))`
add thchs30, aidatatang; 3 years ago
			`total_sec += duration`
			`total_text += len(text)`
			`total_num += 1`

refactor tiny egs 4 years ago			`with codecs.open(manifest_path, 'w', 'utf-8') as out_file:`
			`for line in json_lines:`
			`out_file.write(line + '\n')`

fix dataset meta path 3 years ago			`subset = os.path.splitext(manifest_path)[1][1:]`
			`manifest_dir = os.path.dirname(manifest_path)`
修复了librispeech和mini_libirspeech 3 years ago			`data_dir_name = os.path.split(data_dir)[-1]`
			`meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta'`
fix dataset meta path 3 years ago			`with open(meta_path, 'w') as f:`
add thchs30, aidatatang; 3 years ago			`print(f"{subset}:", file=f)`
			`print(f"{total_num} utts", file=f)`
			`print(f"{total_sec / (60*60)} h", file=f)`
			`print(f"{total_text} text", file=f)`
			`print(f"{total_text / total_sec} text/sec", file=f)`
			`print(f"{total_sec / total_num} sec/utt", file=f)`

refactor tiny egs 4 years ago
			`def prepare_dataset(url, md5sum, target_dir, manifest_path):`
			`"""Download, unpack and create summmary manifest file.`
			`"""`
			`if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):`
			`# download`
			`filepath = download(url, md5sum, target_dir)`
			`# unpack`
			`unpack(filepath, target_dir)`
			`else:`
			`print("Skip downloading and unpacking. Data already exists in %s." %`
			`target_dir)`
			`# create manifest json file`
			`create_manifest(target_dir, manifest_path)`


			`def main():`
			`if args.target_dir.startswith('~'):`
			`args.target_dir = os.path.expanduser(args.target_dir)`

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code 4 years ago			`tasks = [`
			`(URL_TRAIN_CLEAN, MD5_TRAIN_CLEAN,`
			`os.path.join(args.target_dir, "train-clean"),`
			`args.manifest_prefix + ".train-clean"),`
			`(URL_DEV_CLEAN, MD5_DEV_CLEAN, os.path.join(`
			`args.target_dir, "dev-clean"), args.manifest_prefix + ".dev-clean"),`
			`]`

			`with Pool(2) as pool:`
			`pool.starmap(prepare_dataset, tasks)`

			`print("Data download and manifest prepare done!")`
refactor tiny egs 4 years ago

			`if __name__ == '__main__':`
			`main()`