diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh index 7c58980cd..9322cfd69 100755 --- a/examples/csmsc/tts3/local/inference.sh +++ b/examples/csmsc/tts3/local/inference.sh @@ -48,4 +48,15 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/pd_infer_out \ --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=wavernn_csmsc \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt fi \ No newline at end of file diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index d4744486c..d1fadf77d 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -89,3 +89,25 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_test/default.yaml \ + --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \ + --voc_stat=wavernn_test/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/csmsc/voc6/conf/default.yaml b/examples/csmsc/voc6/conf/default.yaml new file mode 100644 index 000000000..e7696cf40 --- /dev/null +++ b/examples/csmsc/voc6/conf/default.yaml @@ -0,0 +1,67 @@ + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) +mu_law: True # Recommended to suppress noise if using raw bitsexit() + + +########################################################### +# MODEL SETTING # +########################################################### +model: + rnn_dims: 512 # Hidden dims of RNN Layers. + fc_dims: 512 + bits: 9 # Bit depth of signal + aux_context_window: 2 # Context window size for auxiliary feature. + # If set to 2, previous 2 and future 2 frames will be considered. + aux_channels: 80 # Number of channels for auxiliary feature conv. + # Must be the same as num_mels. + upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size, same with pwgan here + compute_dims: 128 # Dims of Conv1D in MelResNet. + res_out_dims: 128 # Dims of output in MelResNet. + res_blocks: 10 # Number of residual blocks. 
+ mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics) +inference: + gen_batched: True # whether to genenate sample in batch mode + target: 12000 # target number of samples to be generated in each batch entry + overlap: 600 # number of samples for crossfading between batches + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 64 # Batch size. +batch_max_steps: 4500 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER SETTING # +########################################################### +grad_clip: 4.0 +learning_rate: 1.0e-4 + + +########################################################### +# INTERVAL SETTING # +########################################################### + +train_max_steps: 400000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. +gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples +generate_num: 5 # number of samples to generate at each checkpoint + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh new file mode 100755 index 000000000..2dcc39ac7 --- /dev/null +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \ + --rootdir=~/datasets/BZNSYP/ \ + --dataset=baker \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." 
+ + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/csmsc/voc6/local/synthesize.sh b/examples/csmsc/voc6/local/synthesize.sh new file mode 100755 index 000000000..7f0cbe48c --- /dev/null +++ b/examples/csmsc/voc6/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/csmsc/voc6/path.sh b/examples/csmsc/voc6/path.sh new file mode 100755 index 000000000..b0c98584d --- /dev/null +++ b/examples/csmsc/voc6/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=wavernn +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/csmsc/voc6/run.sh b/examples/csmsc/voc6/run.sh new file mode 100755 index 000000000..5f754fff3 --- /dev/null +++ b/examples/csmsc/voc6/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +test_input=dump/dump_gta_test +ckpt_name=snapshot_iter_100000.pdz + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # prepare data + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/paddlespeech/t2s/audio/__init__.py b/paddlespeech/t2s/audio/__init__.py index 7747b7945..0deefc8bc 100644 --- a/paddlespeech/t2s/audio/__init__.py +++ b/paddlespeech/t2s/audio/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from .audio import AudioProcessor +from .codec import * from .spec_normalizer import LogMagnitude from .spec_normalizer import NormalizerBase diff --git a/paddlespeech/t2s/audio/codec.py b/paddlespeech/t2s/audio/codec.py new file mode 100644 index 000000000..2a759ce4c --- /dev/null +++ b/paddlespeech/t2s/audio/codec.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +import paddle + + +# x: [0: 2**bit-1], return: [-1, 1] +def label_2_float(x, bits): + return 2 * x / (2**bits - 1.) - 1. + + +#x: [-1, 1], return: [0, 2**bits-1] +def float_2_label(x, bits): + assert abs(x).max() <= 1.0 + x = (x + 1.) * (2**bits - 1) / 2 + return x.clip(0, 2**bits - 1) + + +# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] +# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm +# be careful the input `mu` here, which is +1 than that of the link above +def encode_mu_law(x, mu): + mu = mu - 1 + fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) + return np.floor((fx + 1) / 2 * mu + 0.5) + + +# from_labels = True: +# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1] +# from_labels = False: +# y: [-1, 1], return: [-1, 1] +def decode_mu_law(y, mu, from_labels=True): + # TODO: get rid of log2 - makes no sense + if from_labels: + y = label_2_float(y, math.log2(mu)) + mu = mu - 1 + x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) + return x diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index 2e4f740fb..d969a1d35 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -14,6 +14,10 @@ import numpy as np import paddle +from paddlespeech.t2s.audio.codec import encode_mu_law +from paddlespeech.t2s.audio.codec import float_2_label +from paddlespeech.t2s.audio.codec import label_2_float + class Clip(object): """Collate functor for training vocoders. @@ -49,7 +53,7 @@ class Clip(object): self.end_offset = -(self.batch_max_frames + aux_context_window) self.mel_threshold = self.batch_max_frames + 2 * aux_context_window - def __call__(self, examples): + def __call__(self, batch): """Convert into batch tensors. 
Parameters @@ -67,11 +71,11 @@ class Clip(object): """ # check length - examples = [ - self._adjust_length(b['wave'], b['feats']) for b in examples + batch = [ + self._adjust_length(b['wave'], b['feats']) for b in batch if b['feats'].shape[0] > self.mel_threshold ] - xs, cs = [b[0] for b in examples], [b[1] for b in examples] + xs, cs = [b[0] for b in batch], [b[1] for b in batch] # make batch with random cut c_lengths = [c.shape[0] for c in cs] @@ -89,7 +93,7 @@ class Clip(object): c_batch = np.stack( [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)]) - # convert each batch to tensor, asuume that each item in batch has the same length + # convert each batch to tensor, assume that each item in batch has the same length y_batch = paddle.to_tensor( y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T) c_batch = paddle.to_tensor( @@ -120,3 +124,113 @@ class Clip(object): 0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})" return x, c + + +class WaveRNNClip(Clip): + def __init__(self, + mode: str='RAW', + batch_max_steps: int=4500, + hop_size: int=300, + aux_context_window: int=2, + bits: int=9, + mu_law: bool=True): + self.mode = mode + self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window + self.batch_max_steps = batch_max_steps + self.hop_size = hop_size + self.aux_context_window = aux_context_window + self.mu_law = mu_law + self.batch_max_frames = batch_max_steps // hop_size + self.mel_threshold = self.batch_max_frames + 2 * aux_context_window + if self.mode == 'MOL': + self.bits = 16 + else: + self.bits = bits + + def to_quant(self, wav): + if self.mode == 'RAW': + if self.mu_law: + quant = encode_mu_law(wav, mu=2**self.bits) + else: + quant = float_2_label(wav, bits=self.bits) + elif self.mode == 'MOL': + quant = float_2_label(wav, bits=16) + quant = quant.astype(np.int64) + return quant + + def __call__(self, batch): + # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length + # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 + """Convert into batch tensors. + + Parameters + ---------- + batch : list + list of tuple of the pair of audio and features. + Audio shape (T, ), features shape(T', C). + + Returns + ---------- + Tensor + Input signal batch (B, 1, T). + Tensor + Target signal batch (B, 1, T). + Tensor + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. 
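+        Note
+        ----------
+            Unlike the GAN-vocoder Clip above, the three tensors actually
+            returned here are x (B, T) floats in [-1, 1], y (B, T) int64
+            labels (floats in [-1, 1] in MOL mode) and mels (B, C_aux, T_mel),
+            with T = batch_max_steps and
+            T_mel = batch_max_steps // hop_size + 2 * aux_context_window.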
+
+        """
+        # check length
+        batch = [
+            self._adjust_length(b['wave'], b['feats']) for b in batch
+            if b['feats'].shape[0] > self.mel_threshold
+        ]
+        wav, mel = [b[0] for b in batch], [b[1] for b in batch]
+        # transpose mel here: (T', C_aux) -> (C_aux, T')
+        mel = [x.T for x in mel]
+        max_offsets = [
+            x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
+            for x in mel
+        ]
+        # randomly choose the start frame of each mel slice
+        mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
+        # the matching wav slice starts aux_context_window(=pad) frames later
+        sig_offsets = [(offset + self.aux_context_window) * self.hop_size
+                       for offset in mel_offsets]
+        # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad
+        mels = [
+            x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
+            for i, x in enumerate(mel)
+        ]
+        # label.shape[1] = voc_seq_len + 1
+        wav = [self.to_quant(x) for x in wav]
+
+        labels = [
+            x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
+            for i, x in enumerate(wav)
+        ]
+
+        mels = np.stack(mels).astype(np.float32)
+        labels = np.stack(labels).astype(np.int64)
+
+        mels = paddle.to_tensor(mels)
+        labels = paddle.to_tensor(labels, dtype='int64')
+        # x is input, y is label
+        x = labels[:, :self.batch_max_steps]
+        y = labels[:, 1:]
+        '''
+        mode = RAW:
+            mu_law = True:
+                quant: bits = 9   0, 1, 2, ..., 509, 510, 511  int
+            mu_law = False
+                quant bits = 9    [0, 511]  float
+        mode = MOL:
+            quant: bits = 16   [0, 65535]  float
+        '''
+        # convert the input labels x to floats in [-1, 1]
+        x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
+        # in MOL mode the target y is converted to floats in [-1, 1] as well
+        if self.mode == 'MOL':
+            y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
+
+        return x, y, mels
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index c3510beaa..26d7e2c08 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -54,7 +54,7 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3',
-            'pwgan_vctk'
+            'pwgan_vctk', 'wavernn_csmsc'
         ],
         help='Choose vocoder type of tts task.')
     # other
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index d615f4f5c..0b95a8832 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -59,6 +59,10 @@ model_alias = {
     "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
     "hifigan_inference":
     "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }
 
 
@@ -151,10 +155,16 @@ def evaluate(args):
     voc_name = args.voc[:args.voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
+    if voc_name != 'wavernn':
+        voc = voc_class(**voc_config["generator_params"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+        voc.remove_weight_norm()
+        voc.eval()
+    else:
+        voc = voc_class(**voc_config["model"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+        voc.eval()
+
     voc_mu, voc_std = np.load(args.voc_stat)
     voc_mu = paddle.to_tensor(voc_mu)
     voc_std = paddle.to_tensor(voc_std)
@@ -322,7 +332,8 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc',
'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', + 'wavernn_csmsc' ], help='Choose vocoder type of tts task.') diff --git a/paddlespeech/t2s/exps/wavernn/__init__.py b/paddlespeech/t2s/exps/wavernn/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/paddlespeech/t2s/exps/wavernn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py new file mode 100644 index 000000000..61723e039 --- /dev/null +++ b/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
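+# Copy-synthesis script for WaveRNN: load a trained checkpoint, generate a
+# waveform for every mel spectrogram in the normalized test metadata and
+# report the generation speed (RTF).
+#
+# Example invocation (paths are illustrative, mirroring
+# examples/csmsc/voc6/local/synthesize.sh):
+#   python3 synthesize.py \
+#       --config=conf/default.yaml \
+#       --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz \
+#       --test-metadata=dump/test/norm/metadata.jsonl \
+#       --output-dir=exp/default/test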
+import argparse +import os +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import soundfile as sf +import yaml +from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.wavernn import WaveRNN + + +def main(): + parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.") + + parser.add_argument("--config", type=str, help="GANVocoder config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--test-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + + model.eval() + + with jsonlines.open(args.test_metadata, 'r') as reader: + metadata = list(reader) + test_dataset = DataTable( + metadata, + fields=['utt_id', 'feats'], + converters={ + 'utt_id': None, + 'feats': np.load, + }) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + N = 0 + T = 0 + for example in test_dataset: + utt_id = example['utt_id'] + mel = example['feats'] + mel = paddle.to_tensor(mel) # (T, C) + with timer() as t: + with paddle.no_grad(): + wav = model.generate( + c=mel, + batched=config.inference.gen_batched, + target=config.inference.target, + overlap=config.inference.overlap, + mu_law=config.mu_law, + gen_display=True) + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs) + print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py new file mode 100644 index 000000000..aec745f76 --- /dev/null +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
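+# WaveRNN training script: build train/dev DataTables from the dumped metadata,
+# batch them with WaveRNNClip, and train the model with CrossEntropyLoss (RAW
+# mode) or discretized_mix_logistic_loss (MOL mode) via the Trainer /
+# WaveRNNUpdater / WaveRNNEvaluator machinery.
+#
+# Example invocation (mirroring examples/csmsc/voc6/local/train.sh):
+#   python3 train.py \
+#       --config=conf/default.yaml \
+#       --train-metadata=dump/train/norm/metadata.jsonl \
+#       --dev-metadata=dump/dev/norm/metadata.jsonl \
+#       --output-dir=exp/default \
+#       --ngpu=1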
+import argparse +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.optimizer import Adam +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip +from paddlespeech.t2s.models.wavernn import WaveRNN +from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator +from paddlespeech.t2s.models.wavernn import WaveRNNUpdater +from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + world_size = paddle.distributed.get_world_size() + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + data=dev_metadata, + fields=["wave", "feats"], + converters={ + "wave": np.load, + "feats": np.load, + }, ) + + batch_fn = WaveRNNClip( + mode=config.model.mode, + aux_context_window=config.model.aux_context_window, + hop_size=config.n_shift, + batch_max_steps=config.batch_max_steps, + bits=config.model.bits) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + dev_sampler = DistributedBatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + collate_fn=batch_fn, + batch_sampler=dev_sampler, + num_workers=config.num_workers) + + valid_generate_loader = DataLoader(dev_dataset, batch_size=1) + + print("dataloaders done!") + + model = WaveRNN( + hop_length=config.n_shift, sample_rate=config.fs, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + if config.model.mode == 'RAW': + criterion = paddle.nn.CrossEntropyLoss(axis=1) + elif config.model.mode == 'MOL': + criterion = discretized_mix_logistic_loss + else: + criterion = None + RuntimeError('Unknown model mode value - ', config.model.mode) + print("criterions done!") + clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip) + optimizer = Adam( + parameters=model.parameters(), + learning_rate=config.learning_rate, + 
grad_clip=clip) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = WaveRNNUpdater( + model=model, + optimizer=optimizer, + criterion=criterion, + dataloader=train_dataloader, + output_dir=output_dir, + mode=config.model.mode) + + evaluator = WaveRNNEvaluator( + model=model, + dataloader=dev_dataloader, + criterion=criterion, + output_dir=output_dir, + valid_generate_loader=valid_generate_loader, + config=config) + + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir) + + if dist.get_rank() == 0: + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + + parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") + parser.add_argument( + "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + + args = parser.parse_args() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 738943f8a..3b90a414c 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -19,3 +19,4 @@ from .parallel_wavegan import * from .speedyspeech import * from .transformer_tts import * from .waveflow import * +from .wavernn import * diff --git a/paddlespeech/t2s/models/wavernn/__init__.py b/paddlespeech/t2s/models/wavernn/__init__.py new file mode 100644 index 000000000..80ffd0688 --- /dev/null +++ b/paddlespeech/t2s/models/wavernn/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .wavernn import * +from .wavernn_updater import * diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py new file mode 100644 index 000000000..fcf39a482 --- /dev/null +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -0,0 +1,627 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import time +from typing import List + +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +from paddlespeech.t2s.audio.codec import decode_mu_law +from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.upsample import Stretch2D + + +class ResBlock(nn.Layer): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False) + self.batch_norm1 = nn.BatchNorm1D(dims) + self.batch_norm2 = nn.BatchNorm1D(dims) + + def forward(self, x): + ''' + conv -> bn -> relu -> conv -> bn + residual connection + ''' + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Layer): + def __init__(self, + res_blocks: int=10, + compute_dims: int=128, + res_out_dims: int=128, + aux_channels: int=80, + aux_context_window: int=0): + super().__init__() + k_size = aux_context_window * 2 + 1 + # pay attention here, the dim reduces aux_context_window * 2 + self.conv_in = nn.Conv1D( + aux_channels, compute_dims, kernel_size=k_size, bias_attr=False) + self.batch_norm = nn.BatchNorm1D(compute_dims) + self.layers = nn.LayerList() + for _ in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1) + + def forward(self, x): + ''' + Parameters + ---------- + x : Tensor + Input tensor (B, in_dims, T). + Returns + ---------- + Tensor + Output tensor (B, res_out_dims, T). 
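+            Note: T shrinks by 2 * aux_context_window here, because conv_in
+            (kernel size 2 * aux_context_window + 1) is applied without padding.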
+ ''' + + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class UpsampleNetwork(nn.Layer): + def __init__(self, + aux_channels: int=80, + upsample_scales: List[int]=[4, 5, 3, 5], + compute_dims: int=128, + res_blocks: int=10, + res_out_dims: int=128, + aux_context_window: int=2): + super().__init__() + # total_scale is the total Up sampling multiple + total_scale = np.prod(upsample_scales) + # TODO pad*total_scale is numpy.int64 + self.indent = int(aux_context_window * total_scale) + self.resnet = MelResNet( + res_blocks=res_blocks, + aux_channels=aux_channels, + compute_dims=compute_dims, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.resnet_stretch = Stretch2D(total_scale, 1) + self.up_layers = nn.LayerList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2D(scale, 1) + + conv = nn.Conv2D( + 1, 1, kernel_size=k_size, padding=padding, bias_attr=False) + weight_ = paddle.full_like(conv.weight, 1. / k_size[1]) + conv.weight.set_value(weight_) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + ''' + Parameters + ---------- + c : Tensor + Input tensor (B, C_aux, T). + Returns + ---------- + Tensor + Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). + Tensor + Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). + ''' + # aux: [B, C_aux, T] + # -> [B, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, T - 2 * aux_context_window] + aux = self.resnet(m).unsqueeze(1) + # aux: [B, 1, res_out_dims, T - 2 * aux_context_window] + # -> [B, 1, res_out_dims, (T - 2 * pad) * prob(upsample_scales)] + aux = self.resnet_stretch(aux) + # aux: [B, 1, res_out_dims, T * prob(upsample_scales)] + # -> [B, res_out_dims, T * prob(upsample_scales)] + aux = aux.squeeze(1) + # m: [B, C_aux, T] -> [B, 1, C_aux, T] + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + # m: [B, 1, C_aux, T*prob(upsample_scales)] + # -> [B, C_aux, T * prob(upsample_scales)] + # -> [B, C_aux, (T - 2 * pad) * prob(upsample_scales)] + m = m.squeeze(1)[:, :, self.indent:-self.indent] + # m: [B, (T - 2 * pad) * prob(upsample_scales), C_aux] + # aux: [B, (T - 2 * pad) * prob(upsample_scales), res_out_dims] + return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1]) + + +class WaveRNN(nn.Layer): + def __init__( + self, + rnn_dims: int=512, + fc_dims: int=512, + bits: int=9, + aux_context_window: int=2, + upsample_scales: List[int]=[4, 5, 3, 5], + aux_channels: int=80, + compute_dims: int=128, + res_out_dims: int=128, + res_blocks: int=10, + hop_length: int=300, + sample_rate: int=24000, + mode='RAW', + init_type: str="xavier_uniform", ): + ''' + Parameters + ---------- + rnn_dims : int, optional + Hidden dims of RNN Layers. + fc_dims : int, optional + Dims of FC Layers. + bits : int, optional + bit depth of signal. + aux_context_window : int, optional + The context window size of the first convolution applied to the + auxiliary input, by default 2 + upsample_scales : List[int], optional + Upsample scales of the upsample network. + aux_channels : int, optional + Auxiliary channel of the residual blocks. + compute_dims : int, optional + Dims of Conv1D in MelResNet. + res_out_dims : int, optional + Dims of output in MelResNet. + res_blocks : int, optional + Number of residual blocks. + mode : str, optional + Output mode of the WaveRNN vocoder. 
`MOL` for Mixture of Logistic Distribution, + and `RAW` for quantized bits as the model's output. + init_type : str + How to initialize parameters. + ''' + super().__init__() + self.mode = mode + self.aux_context_window = aux_context_window + if self.mode == 'RAW': + self.n_classes = 2**bits + elif self.mode == 'MOL': + self.n_classes = 10 * 3 + else: + RuntimeError('Unknown model mode value - ', self.mode) + + # List of rnns to call 'flatten_parameters()' on + self._to_flatten = [] + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + # initialize parameters + initialize(self, init_type) + + self.upsample = UpsampleNetwork( + aux_channels=aux_channels, + upsample_scales=upsample_scales, + compute_dims=compute_dims, + res_blocks=res_blocks, + res_out_dims=res_out_dims, + aux_context_window=aux_context_window) + self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims) + + self.rnn1 = nn.GRU(rnn_dims, rnn_dims) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims) + + self._to_flatten += [self.rnn1, self.rnn2] + + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + # Avoid fragmentation of RNN parameters and associated warning + self._flatten_parameters() + + nn.initializer.set_global_initializer(None) + + def forward(self, x, c): + ''' + Parameters + ---------- + x : Tensor + wav sequence, [B, T] + c : Tensor + mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns + ---------- + Tensor + [B, T, n_classes] + ''' + # Although we `_flatten_parameters()` on init, when using DataParallel + # the model gets replicated, making it no longer guaranteed that the + # weights are contiguous in GPU memory. Hence, we must call it again + self._flatten_parameters() + + bsize = paddle.shape(x)[0] + h1 = paddle.zeros([1, bsize, self.rnn_dims]) + h2 = paddle.zeros([1, bsize, self.rnn_dims]) + # c: [B, T, C_aux] + # aux: [B, T, res_out_dims] + c, aux = self.upsample(c) + + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0]:aux_idx[1]] + a2 = aux[:, :, aux_idx[1]:aux_idx[2]] + a3 = aux[:, :, aux_idx[2]:aux_idx[3]] + a4 = aux[:, :, aux_idx[3]:aux_idx[4]] + + x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2) + x = self.I(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = paddle.concat([x, a2], axis=2) + x, _ = self.rnn2(x, h2) + + x = x + res + x = paddle.concat([x, a3], axis=2) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4], axis=2) + x = F.relu(self.fc2(x)) + + return self.fc3(x) + + @paddle.no_grad() + def generate(self, + c, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + """ + Parameters + ---------- + c : Tensor + input mels, (T', C_aux) + batched : bool + generate in batch or not + target : int + target number of samples to be generated in each batch entry + overlap : int + number of samples for crossfading between batches + mu_law : bool + use mu law or not + Returns + ---------- + wav sequence + Output (T' * prod(upsample_scales), out_channels, C_out). 
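+        Note
+        ----------
+            When `batched` is True the upsampled conditioning features are
+            folded by fold_with_overlap() and the generated segments are
+            crossfaded back into a single waveform by xfade_and_unfold().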
+ """ + + self.eval() + + mu_law = mu_law if self.mode == 'RAW' else False + + output = [] + start = time.time() + + # pseudo batch + # (T, C_aux) -> (1, C_aux, T) + c = paddle.transpose(c, [1, 0]).unsqueeze(0) + T = paddle.shape(c)[-1] + wave_len = T * self.hop_length + # TODO remove two transpose op by modifying function pad_tensor + c = self.pad_tensor( + c.transpose([0, 2, 1]), pad=self.aux_context_window, + side='both').transpose([0, 2, 1]) + + c, aux = self.upsample(c) + + if batched: + # (num_folds, target + 2 * overlap, features) + c = self.fold_with_overlap(c, target, overlap) + aux = self.fold_with_overlap(aux, target, overlap) + + # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for + # will not get TensorArray + # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray + # b_size, seq_len, _ = paddle.shape(c) + b_size = paddle.shape(c)[0] + seq_len = paddle.shape(c)[1] + + h1 = paddle.zeros([b_size, self.rnn_dims]) + h2 = paddle.zeros([b_size, self.rnn_dims]) + x = paddle.zeros([b_size, 1]) + + d = self.aux_dims + aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + m_t = c[:, i, :] + # for dygraph to static graph + # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + a1_t = aux_split[0][:, i, :] + a2_t = aux_split[1][:, i, :] + a3_t = aux_split[2][:, i, :] + a4_t = aux_split[3][:, i, :] + x = paddle.concat([x, m_t, a1_t], axis=1) + x = self.I(x) + # use GRUCell here + h1, _ = self.rnn1[0].cell(x, h1) + x = x + h1 + inp = paddle.concat([x, a2_t], axis=1) + # use GRUCell here + h2, _ = self.rnn2[0].cell(inp, h2) + + x = x + h2 + x = paddle.concat([x, a3_t], axis=1) + x = F.relu(self.fc1(x)) + + x = paddle.concat([x, a4_t], axis=1) + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode == 'MOL': + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose([0, 2, 1])) + output.append(sample.reshape([-1])) + x = sample.transpose([1, 0, 2]) + + elif self.mode == 'RAW': + posterior = F.softmax(logits, axis=1) + distrib = paddle.distribution.Categorical(posterior) + # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law + # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1] + # sample: [-1, 1] + sample = 2 * distrib.sample([1])[0].cast('float32') / ( + self.n_classes - 1.) - 1. 
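+                # `sample` is a waveform value in [-1, 1]; collect it and feed
+                # it back as the next timestep's input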
+ output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError('Unknown model mode value - ', self.mode) + + if gen_display: + if i % 1000 == 0: + self.gen_display(i, int(seq_len), int(b_size), start) + + output = paddle.stack(output).transpose([1, 0]) + + if mu_law: + output = decode_mu_law(output, self.n_classes, False) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = paddle.linspace(1, 0, 10 * self.hop_length) + output = output[:wave_len] + output[-10 * self.hop_length:] *= fade_out + + self.train() + + # 增加 C_out 维度 + return output.unsqueeze(-1) + + def _flatten_parameters(self): + [m.flatten_parameters() for m in self._to_flatten] + + def pad_tensor(self, x, pad, side='both'): + ''' + Parameters + ---------- + x : Tensor + mel, [1, n_frames, 80] + pad : int + side : str + 'both', 'before' or 'after' + Returns + ---------- + Tensor + ''' + b, t, _ = paddle.shape(x) + # for dygraph to static graph + c = x.shape[-1] + total = t + 2 * pad if side == 'both' else t + pad + padded = paddle.zeros([b, total, c]) + if side == 'before' or side == 'both': + padded[:, pad:pad + t, :] = x + elif side == 'after': + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + ''' + Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + + Parameters + ---------- + x : Tensor + Upsampled conditioning features. mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target : int + Target timesteps for each index of batch + overlap : int + Timesteps for both xfade and rnn warmup + overlap = hop_length * 2 + + Returns + ---------- + Tensor + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details + ---------- + x = [[h1, h2, ... hn]] + + Where each h is a vector of conditioning features + + Eg: target=2, overlap=1 with x.size(1)=10 + + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + ''' + + _, total_len, features = paddle.shape(x) + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side='after') + + folded = paddle.zeros([num_folds, target + 2 * overlap, features]) + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[0][start:end, :] + return folded + + def xfade_and_unfold(self, y, target: int=12000, overlap: int=600): + ''' Applies a crossfade and unfolds into a 1d array. 
+ + Parameters + ---------- + y : Tensor + Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=paddle.float32 + overlap : int + Timesteps for both xfade and rnn warmup + + Returns + ---------- + Tensor + audio samples in a 1d array + shape=(total_len) + dtype=paddle.float32 + + Details + ---------- + y = [[seq1], + [seq2], + [seq3]] + + Apply a gain envelope at both ends of the sequences + + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + + Stagger and add up the groups of samples: + + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + + ''' + # num_folds = (total_len - overlap) // (target + overlap) + num_folds, length = paddle.shape(y) + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the run warmup + slience_len = overlap // 2 + fade_len = overlap - slience_len + slience = paddle.zeros([slience_len], dtype=paddle.float32) + linear = paddle.ones([fade_len], dtype=paddle.float32) + + # Equal power crossfade + # fade_in increase from 0 to 1, fade_out reduces from 1 to 0 + t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32) + fade_in = paddle.sqrt(0.5 * (1 + t)) + fade_out = paddle.sqrt(0.5 * (1 - t)) + # Concat the silence to the fades + fade_out = paddle.concat([linear, fade_out]) + fade_in = paddle.concat([slience, fade_in]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = paddle.zeros([total_len], dtype=paddle.float32) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + pbar = self.progbar(i, seq_len) + msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | ' + sys.stdout.write(f"\r{msg}") + + def progbar(self, i, n, size=16): + done = int(i * size) // n + bar = '' + for i in range(size): + bar += '█' if i <= done else '░' + return bar + + +class WaveRNNInference(nn.Layer): + def __init__(self, normalizer, wavernn): + super().__init__() + self.normalizer = normalizer + self.wavernn = wavernn + + def forward(self, + logmel, + batched: bool=True, + target: int=12000, + overlap: int=600, + mu_law: bool=True, + gen_display: bool=False): + normalized_mel = self.normalizer(logmel) + + wav = self.wavernn.generate( + normalized_mel, ) + # batched=batched, + # target=target, + # overlap=overlap, + # mu_law=mu_law, + # gen_display=gen_display) + + return wav diff --git a/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/paddlespeech/t2s/models/wavernn/wavernn_updater.py new file mode 100644 index 000000000..b2756d00c --- /dev/null +++ b/paddlespeech/t2s/models/wavernn/wavernn_updater.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from pathlib import Path + +import paddle +import soundfile as sf +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def calculate_grad_norm(parameters, norm_type: str=2): + ''' + calculate grad norm of mdoel's parameters + parameters: + model's parameters + norm_type: str + Returns + ------------ + Tensor + grad_norm + ''' + + grad_list = [ + paddle.to_tensor(p.grad) for p in parameters if p.grad is not None + ] + norm_list = paddle.stack( + [paddle.norm(grad, norm_type) for grad in grad_list]) + total_norm = paddle.norm(norm_list) + return total_norm + + +# for save name in gen_valid_samples() +ITERATION = 0 + + +class WaveRNNUpdater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + criterion: Layer, + dataloader: DataLoader, + init_state=None, + output_dir: Path=None, + mode='RAW'): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.criterion = criterion + # self.scheduler = scheduler + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + self.mode = mode + + def update_core(self, batch): + + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # parse batch + self.model.train() + self.optimizer.clear_grad() + + wav, y, mel = batch + + y_hat = self.model(wav, mel) + if self.mode == 'RAW': + y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + loss.backward() + grad_norm = float( + calculate_grad_norm(self.model.parameters(), norm_type=2)) + + self.optimizer.step() + + report("train/loss", float(loss)) + report("train/grad_norm", float(grad_norm)) + + losses_dict["loss"] = float(loss) + losses_dict["grad_norm"] = float(grad_norm) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + global ITERATION + ITERATION = self.state.iteration + 1 + + +class WaveRNNEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: Optimizer, + output_dir: Path=None, + valid_generate_loader=None, + config=None): + super().__init__(model, dataloader) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + self.criterion = criterion + self.valid_generate_loader = valid_generate_loader + self.config = config + self.mode = config.model.mode + + self.valid_samples_dir = output_dir / "valid_samples" + self.valid_samples_dir.mkdir(parents=True, exist_ok=True) + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # parse batch + wav, y, mel = batch + y_hat = self.model(wav, mel) + + if self.mode == 
'RAW': + y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) + elif self.mode == 'MOL': + y_hat = paddle.cast(y, dtype='float32') + + y = y.unsqueeze(-1) + loss = self.criterion(y_hat, y) + report("eval/loss", float(loss)) + + losses_dict["loss"] = float(loss) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) + + def gen_valid_samples(self): + + for i, item in enumerate(self.valid_generate_loader): + if i >= self.config.generate_num: + break + print( + '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num)) + + mel = item['feats'] + wav = item['wave'] + wav = wav.squeeze(0) + + origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format( + self.iteration, i) + sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs) + + if self.config.inference.gen_batched: + batch_str = 'gen_batched_target{}_overlap{}'.format( + self.config.inference.target, self.config.inference.overlap) + else: + batch_str = 'gen_not_batched' + gen_save_path = str(self.valid_samples_dir / + '{}_steps_{}_{}.wav'.format(self.iteration, i, + batch_str)) + # (1, T, C_aux) -> (T, C_aux) + mel = mel.squeeze(0) + gen_sample = self.model.generate( + mel, self.config.inference.gen_batched, + self.config.inference.target, self.config.inference.overlap, + self.config.mu_law) + sf.write( + gen_save_path, gen_sample.numpy(), samplerate=self.config.fs) + + def __call__(self, trainer=None): + summary = self.evaluate() + for k, v in summary.items(): + report(k, v) + # gen samples at then end of evaluate + self.iteration = ITERATION + if self.iteration % self.config.gen_eval_samples_interval_steps == 0: + self.gen_valid_samples() diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 3cc7a93cb..618f444a1 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -14,6 +14,7 @@ import math import librosa +import numpy as np import paddle from paddle import nn from paddle.fluid.layers import sequence_mask @@ -23,6 +24,145 @@ from scipy import signal from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +# Losses for WaveRNN +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.shape) - 1 + m = paddle.max(x, axis=axis) + m2 = paddle.max(x, axis=axis, keepdim=True) + return m + paddle.log(paddle.sum(paddle.exp(x - m2), axis=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, + y, + num_classes=65536, + log_scale_min=None, + reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + y_hat = y_hat.transpose([0, 2, 1]) + assert y_hat.dim() == 3 + assert y_hat.shape[1] % 3 == 0 + nr_mix = y_hat.shape[1] // 3 + + # (B x T x C) + y_hat = y_hat.transpose([0, 2, 1]) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = paddle.clip( + y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + centered_y = paddle.cast(y, dtype=paddle.get_default_dtype()) - means + inv_stdv = paddle.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = F.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. 
/ (num_classes - 1))
+    cdf_min = F.sigmoid(min_in)
+
+    # log probability for edge case of 0 (before scaling)
+    # equivalent: torch.log(F.sigmoid(plus_in))
+    # softplus: log(1+ e^{-x})
+    log_cdf_plus = plus_in - F.softplus(plus_in)
+
+    # log probability for edge case of 255 (before scaling)
+    # equivalent: (1 - F.sigmoid(min_in)).log()
+    log_one_minus_cdf_min = -F.softplus(min_in)
+
+    # probability for all other cases
+    cdf_delta = cdf_plus - cdf_min
+
+    mid_in = inv_stdv * centered_y
+    # log probability in the center of the bin, to be used in extreme cases
+    # (not actually used in our code)
+    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
+
+    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
+    # for num_classes=65536 case? 1e-7? not sure..
+    inner_inner_cond = cdf_delta > 1e-5
+
+    inner_inner_cond = paddle.cast(
+        inner_inner_cond, dtype=paddle.get_default_dtype())
+
+    # inner_inner_out = inner_inner_cond * \
+    # paddle.log(paddle.clip(cdf_delta, min=1e-12)) + \
+    # (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
+
+    inner_inner_out = inner_inner_cond * paddle.log(
+        paddle.clip(cdf_delta, min=1e-12)) + (1. - inner_inner_cond) * (
+            log_pdf_mid - np.log((num_classes - 1) / 2))
+
+    inner_cond = y > 0.999
+
+    inner_cond = paddle.cast(inner_cond, dtype=paddle.get_default_dtype())
+
+    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond
+                                                      ) * inner_inner_out
+    cond = y < -0.999
+    cond = paddle.cast(cond, dtype=paddle.get_default_dtype())
+
+    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
+    log_probs = log_probs + F.log_softmax(logit_probs, -1)
+
+    if reduce:
+        return -paddle.mean(log_sum_exp(log_probs))
+    else:
+        return -log_sum_exp(log_probs).unsqueeze(-1)
+
+
+def sample_from_discretized_mix_logistic(y, log_scale_min=None):
+    """
+    Sample from discretized mixture of logistic distributions
+    Parameters
+    ----------
+    y : Tensor
+        (B, C, T)
+    log_scale_min : float
+        Log scale minimum value
+    Returns
+    ----------
+    Tensor
+        sample in range of [-1, 1].
+    """
+    if log_scale_min is None:
+        log_scale_min = float(np.log(1e-14))
+
+    assert y.shape[1] % 3 == 0
+    nr_mix = y.shape[1] // 3
+
+    # (B, T, C)
+    y = y.transpose([0, 2, 1])
+    logit_probs = y[:, :, :nr_mix]
+
+    # sample mixture indicator from softmax
+    temp = paddle.uniform(
+        logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5)
+    temp = logit_probs - paddle.log(-paddle.log(temp))
+    argmax = paddle.argmax(temp, axis=-1)
+
+    # (B, T) -> (B, T, nr_mix)
+    one_hot = F.one_hot(argmax, nr_mix)
+    one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype())
+
+    # select logistic parameters
+    means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1)
+    log_scales = paddle.clip(
+        paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1),
+        min=log_scale_min)
+    # sample from logistic & clip to the interval [-1, 1]
+    # we don't actually round to the nearest 8bit value when sampling
+    u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5)
+    x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u))
+    x = paddle.clip(x, min=-1., max=1.)
+
+    return x
+
+
 # Loss for new Tacotron2
 class GuidedAttentionLoss(nn.Layer):
     """Guided attention loss function module.