{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## TTS with Tacotron2 + WaveFlow" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import paddle\n", "from matplotlib import pyplot as plt\n", "from IPython import display as ipd\n", "%matplotlib inline\n", "\n", "from parakeet.utils import display\n", "from parakeet.utils import layer_tools\n", "paddle.set_device(\"gpu:0\")\n", "\n", "import sys\n", "sys.path.append(\"../..\")\n", "import examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tacotron2: synthesizer model\n", "\n", "Tacotron2 is used here as a character-to-spectrogram model (the input text is encoded with the `EnglishCharacter` frontend). Here we will use an alternative config. In this config, the Tacotron2 model does not have a binary classifier to predict whether the generation should stop.\n", "\n", "Instead, the peak position of the attention is used as the criterion. When the peak position of the attention reaches the end of the encoder outputs, it implies that the content is exhausted. So we stop the generation after 10 more frames." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from parakeet.models.tacotron2 import Tacotron2\n", "from parakeet.frontend import EnglishCharacter" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data:\n", " batch_size: 32\n", " fmax: 8000\n", " fmin: 0\n", " hop_length: 256\n", " n_fft: 1024\n", " n_mels: 80\n", " padding_idx: 0\n", " sample_rate: 22050\n", " valid_size: 64\n", " win_length: 1024\n", "model:\n", " attention_filters: 32\n", " attention_kernel_size: 31\n", " d_attention: 128\n", " d_attention_rnn: 1024\n", " d_decoder_rnn: 1024\n", " d_encoder: 512\n", " d_global_condition: None\n", " d_postnet: 512\n", " d_prenet: 256\n", " encoder_conv_layers: 3\n", " encoder_kernel_size: 5\n", " guided_attention_loss_sigma: 0.2\n", " n_tones: None\n", " p_attention_dropout: 0.1\n", " p_decoder_dropout: 0.1\n", " p_encoder_dropout: 0.5\n", " p_postnet_dropout: 0.5\n", " p_prenet_dropout: 0.5\n", " postnet_conv_layers: 5\n", " postnet_kernel_size: 5\n", " reduction_factor: 1\n", " use_guided_attention_loss: True\n", " use_stop_token: False\n", " vocab_size: 37\n", "training:\n", " grad_clip_thresh: 1.0\n", " lr: 0.001\n", " max_iteration: 500000\n", " plot_interval: 1000\n", " save_interval: 1000\n", " valid_interval: 1000\n", " weight_decay: 1e-06\n" ] } ], "source": [ "from examples.tacotron2 import config as tacotron2_config\n", "synthesizer_config = tacotron2_config.get_cfg_defaults()\n", "synthesizer_config.merge_from_file(\"configs/alternative.yaml\")\n", "print(synthesizer_config)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[checkpoint] Rank 0: loaded model from ../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000.pdparams\n" ] } ], "source": [ "frontend = EnglishCharacter()\n", "model = 
Tacotron2.from_pretrained(\n", " synthesizer_config, \"../../pretrained/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative/step-50000\")\n", "model.eval()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 36%|███▋ | 365/1000 [00:01<00:02, 256.89it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "content exhausted!\n" ] } ], "source": [ "sentence = \"Life was like a box of chocolates, you never know what you're gonna get.\"\n", "# Keep the raw text and its encoded tensor under separate names so that\n", "# `sentence` always refers to the string, even after this cell has run.\n", "encoded_sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)\n", "\n", "# Inference only: no gradients are needed.\n", "with paddle.no_grad():\n", "    outputs = model.infer(encoded_sentence)\n", "mel_output = outputs[\"mel_outputs_postnet\"][0].numpy().T\n", "alignment = outputs[\"alignments\"][0].numpy().T" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig = display.plot_alignment(alignment)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## WaveFlow: vocoder model\n", "Generated spectrogram is converted to raw audio using a pretrained waveflow model." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from parakeet.models.waveflow import ConditionalWaveFlow" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data:\n", " batch_size: 8\n", " clip_frames: 65\n", " fmax: 8000\n", " fmin: 0\n", " hop_length: 256\n", " n_fft: 1024\n", " n_mels: 80\n", " sample_rate: 22050\n", " valid_size: 16\n", " win_length: 1024\n", "model:\n", " channels: 128\n", " kernel_size: [3, 3]\n", " n_flows: 8\n", " n_group: 16\n", " n_layers: 8\n", " sigma: 1.0\n", " upsample_factors: [16, 16]\n", "training:\n", " lr: 0.0002\n", " max_iteration: 3000000\n", " save_interval: 10000\n", " valid_interval: 1000\n" ] } ], "source": [ "from examples.waveflow import config as waveflow_config\n", "vocoder_config = waveflow_config.get_cfg_defaults()\n", "print(vocoder_config)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[checkpoint] Rank 0: loaded model from ../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\n" ] } ], "source": [ "vocoder = ConditionalWaveFlow.from_pretrained(\n", " vocoder_config, \n", " \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000\")\n", "layer_tools.recursively_remove_weight_norm(vocoder)\n", "vocoder.eval()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "time: 9.412613868713379s\n" ] } ], "source": [ "audio = 
vocoder.infer(paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1]))\n", "wav = audio[0].numpy()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Play back at the sample rate the vocoder is configured with instead of a\n", "# hard-coded 22050, so this cell stays correct if the config changes.\n", "ipd.Audio(wav, rate=vocoder_config.data.sample_rate)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }