# PaddleSpeech/examples/dataset/chime3_background/chime3_background.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare CHiME3 background data.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""

import argparse
import distutils.util
import io
import json
import os
import tarfile
import zipfile

import soundfile
import wget
from paddle.v2.dataset.common import md5file

# Base directory for the dataset: the current working directory.
# (An earlier default was '~/.cache/paddle/dataset/speech'.)
DATA_HOME = os.path.expanduser('.')

# Location of the outer archive and the MD5 digest used to verify it.
URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

# Command-line interface; the module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/chime3_background",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_filepath",
    type=str,
    default="manifest.chime3.background",
    help="Filepath for output manifests. (default: %(default)s)")
# Parsed at import time; main() reads the result from this module-level name.
args = parser.parse_args()


def download(url, md5sum, target_dir, filename=None):
    """Download a file from `url` into `target_dir` and verify its MD5 sum.

    The download is skipped when a file with the expected name and checksum
    is already present in `target_dir`.

    Args:
        url: Address of the file to fetch.
        md5sum: Expected MD5 hex digest of the file.
        target_dir: Directory to store the file in; created if missing.
        filename: Name to save the file under. Defaults to the last
            "/"-separated component of `url`.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        RuntimeError: If the freshly downloaded file does not match `md5sum`.
    """
    if filename is None:
        filename = url.split("/")[-1]
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, filename)

    if os.path.exists(filepath):
        if md5file(filepath) == md5sum:
            print("File exists, skip downloading. (%s)" % filepath)
            return filepath
        # A stale or partial file would make wget save the new download under
        # a "name (1)" sibling, so the checksum below would keep failing.
        os.remove(filepath)

    print("Downloading %s ..." % url)
    # Save to the exact path that is checksummed below, instead of letting
    # wget pick a file name inside target_dir on its own.
    wget.download(url, out=filepath)
    print("\nMD5 Chesksum %s ..." % filepath)
    if md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath


def unpack(filepath, target_dir):
    """Extract an archive into `target_dir`.

    The archive type is chosen from the file name suffix: `.zip` files are
    handled by `zipfile`, `.tar` and `.tar.gz` files by `tarfile`.

    Args:
        filepath: Path of the archive to extract.
        target_dir: Directory the archive members are extracted into.

    Raises:
        ValueError: If `filepath` has none of the supported suffixes.
    """
    print("Unpacking %s ..." % filepath)
    if filepath.endswith('.zip'):
        # Context managers close the archive even if extraction fails.
        with zipfile.ZipFile(filepath, 'r') as archive:
            archive.extractall(target_dir)
    elif filepath.endswith(('.tar', '.tar.gz')):
        # tarfile.open() detects the compression from the file itself.
        with tarfile.open(filepath) as archive:
            archive.extractall(target_dir)
    else:
        raise ValueError("File format is not supported for unpacking.")


def create_manifest(data_dir, manifest_path):
    """Write a manifest listing every `.wav` file found under `data_dir`.

    The manifest holds one JSON object per line with the keys
    `audio_filepath`, `duration` (number of frames divided by the sample
    rate) and `text` (always an empty string here).

    Args:
        data_dir: Root directory that is walked recursively for `.wav` files.
        manifest_path: Path of the manifest file to write (overwritten).
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        # Sort the names so the manifest order does not depend on the
        # directory listing order of the file system.
        for filename in sorted(filelist):
            if not filename.endswith('.wav'):
                continue
            # os.walk() already yields `subfolder` prefixed with `data_dir`;
            # joining `data_dir` again would duplicate a relative prefix.
            filepath = os.path.join(subfolder, filename)
            # Only the header is needed for the duration, so avoid decoding
            # the whole recording into memory.
            info = soundfile.info(filepath)
            duration = float(info.frames) / info.samplerate
            json_lines.append(
                json.dumps({
                    'audio_filepath': filepath,
                    'duration': duration,
                    'text': ''
                }))
    with io.open(manifest_path, mode='w', encoding='utf8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_chime3(url, md5sum, target_dir, manifest_path):
    """Download and unpack the archives if needed, then write the manifest.

    Downloading and unpacking are skipped when `target_dir` already contains
    a "CHiME3" entry. The manifest is (re)created in either case.

    Args:
        url: Address of the outer archive.
        md5sum: Expected MD5 hex digest of the outer archive.
        target_dir: Directory holding the archives and the extracted data.
        manifest_path: Path of the manifest file to write.
    """
    inner_archives = (
        'CHiME3_background_bus.zip',
        'CHiME3_background_caf.zip',
        'CHiME3_background_ped.zip',
        'CHiME3_background_str.zip',
    )
    if os.path.exists(os.path.join(target_dir, "CHiME3")):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        # Fetch the outer archive under a fixed file name.
        outer_archive = download(url, md5sum, target_dir,
                                 "myairbridge-AG0Y3DNBE5IWRRTV.zip")
        # Unpack the outer archive first, then each archive it contains.
        unpack(outer_archive, target_dir)
        for archive_name in inner_archives:
            unpack(os.path.join(target_dir, archive_name), target_dir)
    # The manifest is written regardless of which branch ran.
    create_manifest(target_dir, manifest_path)


def main():
    """Prepare the data using the module constants and parsed CLI options."""
    prepare_chime3(URL, MD5, args.target_dir, args.manifest_filepath)


if __name__ == '__main__':
    main()
add copyright 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Add NoisePerturbAugmentor and CHiME3 data preparation. 7 years ago			`"""Prepare CHiME3 background data.`

			`Download, unpack and create manifest files.`
			`Manifest file is a json-format file with each line containing the`
			`meta data (i.e. audio filepath, transcript and audio duration)`
			`of each audio file in the data set.`
			`"""`

			`import distutils.util`
			`import os`
			`import wget`
			`import zipfile`
			`import argparse`
			`import soundfile`
			`import json`
update deepspeech to fluid api 5 years ago			`import io`
Add NoisePerturbAugmentor and CHiME3 data preparation. 7 years ago			`from paddle.v2.dataset.common import md5file`

Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`#DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')`
			`DATA_HOME = os.path.expanduser('.')`
Add NoisePerturbAugmentor and CHiME3 data preparation. 7 years ago
			`URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"`
			`MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"`

			`parser = argparse.ArgumentParser(description=__doc__)`
			`parser.add_argument(`
			`"--target_dir",`
			`default=DATA_HOME + "/chime3_background",`
			`type=str,`
			`help="Directory to save the dataset. (default: %(default)s)")`
			`parser.add_argument(`
			`"--manifest_filepath",`
			`default="manifest.chime3.background",`
			`type=str,`
			`help="Filepath for output manifests. (default: %(default)s)")`
			`args = parser.parse_args()`


			`def download(url, md5sum, target_dir, filename=None):`
			`"""Download file from url to target_dir, and check md5sum."""`
			`if filename == None:`
			`filename = url.split("/")[-1]`
			`if not os.path.exists(target_dir): os.makedirs(target_dir)`
			`filepath = os.path.join(target_dir, filename)`
			`if not (os.path.exists(filepath) and md5file(filepath) == md5sum):`
			`print("Downloading %s ..." % url)`
			`wget.download(url, target_dir)`
			`print("\nMD5 Chesksum %s ..." % filepath)`
			`if not md5file(filepath) == md5sum:`
			`raise RuntimeError("MD5 checksum failed.")`
			`else:`
			`print("File exists, skip downloading. (%s)" % filepath)`
			`return filepath`


			`def unpack(filepath, target_dir):`
			`"""Unpack the file to the target_dir."""`
			`print("Unpacking %s ..." % filepath)`
			`if filepath.endswith('.zip'):`
			`zip = zipfile.ZipFile(filepath, 'r')`
			`zip.extractall(target_dir)`
			`zip.close()`
			`elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):`
			`tar = zipfile.open(filepath)`
			`tar.extractall(target_dir)`
			`tar.close()`
			`else:`
			`raise ValueError("File format is not supported for unpacking.")`


			`def create_manifest(data_dir, manifest_path):`
			`"""Create a manifest json file summarizing the data set, with each line`
			`containing the meta data (i.e. audio filepath, transcription text, audio`
			`duration) of each audio file within the data set.`
			`"""`
			`print("Creating manifest %s ..." % manifest_path)`
			`json_lines = []`
			`for subfolder, _, filelist in sorted(os.walk(data_dir)):`
			`for filename in filelist:`
			`if filename.endswith('.wav'):`
			`filepath = os.path.join(data_dir, subfolder, filename)`
			`audio_data, samplerate = soundfile.read(filepath)`
			`duration = float(len(audio_data)) / samplerate`
			`json_lines.append(`
			`json.dumps({`
			`'audio_filepath': filepath,`
			`'duration': duration,`
			`'text': ''`
			`}))`
update deepspeech to fluid api 5 years ago			`with io.open(manifest_path, mode='w', encoding='utf8') as out_file:`
Add NoisePerturbAugmentor and CHiME3 data preparation. 7 years ago			`for line in json_lines:`
			`out_file.write(line + '\n')`


			`def prepare_chime3(url, md5sum, target_dir, manifest_path):`
			`"""Download, unpack and create summmary manifest file."""`
			`if not os.path.exists(os.path.join(target_dir, "CHiME3")):`
			`# download`
			`filepath = download(url, md5sum, target_dir,`
			`"myairbridge-AG0Y3DNBE5IWRRTV.zip")`
			`# unpack`
			`unpack(filepath, target_dir)`
			`unpack(`
			`os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)`
			`unpack(`
			`os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)`
			`unpack(`
			`os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)`
			`unpack(`
			`os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)`
			`else:`
			`print("Skip downloading and unpacking. Data already exists in %s." %`
			`target_dir)`
			`# create manifest json file`
			`create_manifest(target_dir, manifest_path)`


			`def main():`
			`prepare_chime3(`
			`url=URL,`
			`md5sum=MD5,`
			`target_dir=args.target_dir,`
			`manifest_path=args.manifest_filepath)`


			`if __name__ == '__main__':`
			`main()`