PaddleSpeech/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path

import numpy as np
import tqdm

from paddlespeech.t2s.audio import AudioProcessor
from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude
from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults


def extract_mel(fname: Path,
                input_dir: Path,
                output_dir: Path,
                p: AudioProcessor,
                n: NormalizerBase):
    """Compute the normalized mel spectrogram of one file and save it.

    The output file mirrors the position of ``fname`` relative to
    ``input_dir`` under ``output_dir``, with its suffix replaced by ``.npy``.

    Args:
        fname: Path of the audio file; must be located under ``input_dir``.
        input_dir: Root folder that ``fname`` is made relative to.
        output_dir: Root folder under which the ``.npy`` file is written.
        p: Processor providing ``read_wav`` and ``mel_spectrogram``.
        n: Normalizer whose ``transform`` is applied to the mel spectrogram.
    """
    # Same sub-path as the input, rooted at output_dir, saved as .npy.
    target = output_dir.joinpath(fname.relative_to(input_dir))
    target = target.with_suffix(".npy")
    # Intermediate folders may not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    features = n.transform(p.mel_spectrogram(p.read_wav(fname)))
    np.save(target, features)


def extract_mel_multispeaker(config,
                             input_dir,
                             output_dir,
                             extension=".wav",
                             num_workers=16):
    """Extract mel spectrograms for every matching file under ``input_dir``.

    Files are discovered recursively and each one is handed to
    ``extract_mel`` in a pool of worker processes. A progress bar is shown
    while the results are consumed.

    Args:
        config: Audio config; the attributes ``sample_rate``, ``n_fft``,
            ``win_length``, ``hop_length``, ``d_mels``, ``fmin`` and ``fmax``
            are read and passed to ``AudioProcessor``.
        input_dir: Root folder to search; ``~`` is expanded.
        output_dir: Root folder for the outputs; ``~`` is expanded and the
            folder is created if it does not exist.
        extension: File-name suffix to match (globbed as ``*{extension}``).
        num_workers: Number of worker processes, passed as-is to
            ``multiprocessing.Pool``. Defaults to 16.
    """
    input_dir = Path(input_dir).expanduser()
    fnames = list(input_dir.rglob(f"*{extension}"))
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
                       config.hop_length, config.d_mels, config.fmin,
                       config.fmax)
    n = LogMagnitude(1e-5)

    # Bind everything except the file name so the pool can map over fnames.
    func = partial(
        extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)

    with mp.Pool(num_workers) as pool:
        # list(...) drains the lazy imap iterator so that all files are
        # processed before the pool is torn down on leaving the with-block.
        list(
            tqdm.tqdm(
                pool.imap(func, fnames), total=len(fnames), unit="utterance"))


if __name__ == "__main__":
    # Script entry point: build the CLI, resolve the config, run extraction.
    cli = argparse.ArgumentParser(
        description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
    )
    # (flag, default, help) for the plain string-valued options.
    string_options = (
        ("--config", None,
         "yaml config file to overwrite the default config"),
        ("--input", "~/datasets/aishell3/train/normalized_wav",
         "path of the processed wav folder"),
        ("--output", "~/datasets/aishell3/train/mel",
         "path of the folder to save mel spectrograms"), )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    # Everything after --opts is collected verbatim as KEY VALUE pairs.
    cli.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )

    # Merge order: built-in defaults, then the --config file, then --opts.
    cfg = get_cfg_defaults()
    args = cli.parse_args()
    if args.config:
        cfg.merge_from_file(args.config)
    if args.opts:
        cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Only the "data" section of the config is handed to the extractor.
    extract_mel_multispeaker(cfg.data, args.input, args.output)
merge parakeet repo into deepspeech 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import argparse`
			`import multiprocessing as mp`
			`from functools import partial`
			`from pathlib import Path`

			`import numpy as np`
			`import tqdm`

merge deepspeech, parakeet and text_processing into paddlespeech 3 years ago			`from paddlespeech.t2s.audio import AudioProcessor`
			`from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude`
			`from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase`
			`from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults`
merge parakeet repo into deepspeech 3 years ago

			`def extract_mel(fname: Path,`
			`input_dir: Path,`
			`output_dir: Path,`
			`p: AudioProcessor,`
			`n: NormalizerBase):`
			`relative_path = fname.relative_to(input_dir)`
			`out_path = (output_dir / relative_path).with_suffix(".npy")`
			`out_path.parent.mkdir(parents=True, exist_ok=True)`
			`wav = p.read_wav(fname)`
			`mel = p.mel_spectrogram(wav)`
			`mel = n.transform(mel)`
			`np.save(out_path, mel)`


			`def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):`
			`input_dir = Path(input_dir).expanduser()`
			`fnames = list(input_dir.rglob(f"*{extension}"))`
			`output_dir = Path(output_dir).expanduser()`
			`output_dir.mkdir(parents=True, exist_ok=True)`

			`p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,`
refactor parakeet examples 3 years ago			`config.hop_length, config.d_mels, config.fmin,`
merge parakeet repo into deepspeech 3 years ago			`config.fmax)`
			`n = LogMagnitude(1e-5)`

			`func = partial(`
			`extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)`

			`with mp.Pool(16) as pool:`
			`list(`
			`tqdm.tqdm(`
			`pool.imap(func, fnames), total=len(fnames), unit="utterance"))`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(`
			`description="Extract mel spectrogram from processed wav in AiShell3 training dataset."`
			`)`
			`parser.add_argument(`
			`"--config",`
			`type=str,`
			`help="yaml config file to overwrite the default config")`
			`parser.add_argument(`
			`"--input",`
			`type=str,`
			`default="~/datasets/aishell3/train/normalized_wav",`
			`help="path of the processed wav folder")`
			`parser.add_argument(`
			`"--output",`
			`type=str,`
			`default="~/datasets/aishell3/train/mel",`
			`help="path of the folder to save mel spectrograms")`
			`parser.add_argument(`
			`"--opts",`
			`nargs=argparse.REMAINDER,`
			`help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"`
			`)`
			`default_config = get_cfg_defaults()`

			`args = parser.parse_args()`
			`if args.config:`
			`default_config.merge_from_file(args.config)`
			`if args.opts:`
			`default_config.merge_from_list(args.opts)`
			`default_config.freeze()`
			`audio_config = default_config.data`

			`extract_mel_multispeaker(audio_config, args.input, args.output)`