update wavernn, test=tts

pull/1379/head
TianYuan 2 years ago
parent 001afee644
commit 1b0c034134

@ -12,7 +12,6 @@ n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
mu_law: True # Recommended to suppress noise if using raw bitsexit()
peak_norm: True
###########################################################
@ -22,13 +21,14 @@ model:
rnn_dims: 512 # Hidden dims of RNN Layers.
fc_dims: 512
bits: 9 # Bit depth of signal
aux_context_window: 2
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
aux_channels: 80 # Number of channels for auxiliary feature conv.
# Must be the same as num_mels.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size, same with pwgan here
compute_dims: 128
res_out_dims: 128
res_blocks: 10
compute_dims: 128 # Dims of Conv1D in MelResNet.
res_out_dims: 128 # Dims of output in MelResNet.
res_blocks: 10 # Number of residual blocks.
mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)
inference:
gen_batched: True # whether to genenate sample in batch mode
@ -42,7 +42,6 @@ inference:
batch_size: 64 # Batch size.
batch_max_steps: 4500 # Length of each audio in batch. Make sure dividable by hop_size.
num_workers: 2 # Number of workers in DataLoader.
valid_size: 50
###########################################################
# OPTIMIZER SETTING #

@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .codec import *
from .spec_normalizer import LogMagnitude
from .spec_normalizer import NormalizerBase

@ -0,0 +1,51 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
# x: [0: 2**bit-1], return: [-1, 1]
def label_2_float(x, bits):
return 2 * x / (2**bits - 1.) - 1.
#x: [-1, 1], return: [0, 2**bits-1]
def float_2_label(x, bits):
assert abs(x).max() <= 1.0
x = (x + 1.) * (2**bits - 1) / 2
return x.clip(0, 2**bits - 1)
# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1]
# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
# be careful the input `mu` here, which is +1 than that of the link above
def encode_mu_law(x, mu):
mu = mu - 1
fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
return np.floor((fx + 1) / 2 * mu + 0.5)
# from_labels = True:
# y: [0: 2**bit-1], mu: 2**bits, return: [-1,1]
# from_labels = False:
# y: [-1, 1], return: [-1, 1]
def decode_mu_law(y, mu, from_labels=True):
# TODO: get rid of log2 - makes no sense
if from_labels:
y = label_2_float(y, math.log2(mu))
mu = mu - 1
x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
return x

@ -11,35 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
def label_2_float(x, bits):
return 2 * x / (2**bits - 1.) - 1.
def float_2_label(x, bits):
assert abs(x).max() <= 1.0
x = (x + 1.) * (2**bits - 1) / 2
return x.clip(0, 2**bits - 1)
def encode_mu_law(x, mu):
mu = mu - 1
fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
return np.floor((fx + 1) / 2 * mu + 0.5)
def decode_mu_law(y, mu, from_labels=True):
# TODO: get rid of log2 - makes no sense
if from_labels:
y = label_2_float(y, math.log2(mu))
mu = mu - 1
x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
return x
from paddlespeech.t2s.audio.codec import encode_mu_law
from paddlespeech.t2s.audio.codec import float_2_label
from paddlespeech.t2s.audio.codec import label_2_float
class Clip(object):
@ -195,10 +172,12 @@ class WaveRNNClip(Clip):
Returns
----------
Tensor
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Input signal batch (B, 1, T).
Tensor
Target signal batch (B, 1, T).
Tensor
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
"""
# check length

@ -20,7 +20,7 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
from paddlespeech.t2s.datasets.vocoder_batch_fn import decode_mu_law
from paddlespeech.t2s.audio.codec import decode_mu_law
from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.upsample import Stretch2D
@ -28,7 +28,7 @@ from paddlespeech.t2s.modules.upsample import Stretch2D
class ResBlock(nn.Layer):
def __init__(self, dims):
super(ResBlock, self).__init__()
super().__init__()
self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
self.batch_norm1 = nn.BatchNorm1D(dims)
@ -205,7 +205,7 @@ class WaveRNN(nn.Layer):
if self.mode == 'RAW':
self.n_classes = 2**bits
elif self.mode == 'MOL':
self.n_classes = 30
self.n_classes = 10 * 3
else:
RuntimeError('Unknown model mode value - ', self.mode)
@ -333,7 +333,7 @@ class WaveRNN(nn.Layer):
# (T, C_aux) -> (1, C_aux, T)
c = paddle.transpose(c, [1, 0]).unsqueeze(0)
T = paddle.shape(c)[-1]
wave_len = (T - 1) * self.hop_length
wave_len = T * self.hop_length
# TODO remove two transpose op by modifying function pad_tensor
c = self.pad_tensor(
c.transpose([0, 2, 1]), pad=self.aux_context_window,
@ -396,6 +396,8 @@ class WaveRNN(nn.Layer):
posterior = F.softmax(logits, axis=1)
distrib = paddle.distribution.Categorical(posterior)
# corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law
# distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
# sample: [-1, 1]
sample = 2 * distrib.sample([1])[0].cast('float32') / (
self.n_classes - 1.) - 1.
output.append(sample)
@ -418,9 +420,9 @@ class WaveRNN(nn.Layer):
output = output[0]
# Fade-out at the end to avoid signal cutting out suddenly
fade_out = paddle.linspace(1, 0, 20 * self.hop_length)
fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
output = output[:wave_len]
output[-20 * self.hop_length:] *= fade_out
output[-10 * self.hop_length:] *= fade_out
self.train()

Loading…
Cancel
Save