From caa391f4614c3766f5476ab11222ce9ee57d3591 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 15:26:49 +0800 Subject: [PATCH] fix speedyspeech inference, test=tts (#1322) --- .../t2s/models/speedyspeech/speedyspeech.py | 35 +++++-------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index acddd976..3e64e670 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle import nn @@ -23,18 +22,16 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: encodings: (B, T, C) durations: (B, T) """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) + batch_size, t_enc = paddle.shape(durations) + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) + M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] M[i, k:k + d, j] = 1 k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) encodings = paddle.matmul(M, encodings) return encodings @@ -234,28 +231,14 @@ class SpeedySpeech(nn.Layer): encodings = self.encoder(text, tones, spk_id) - if type(durations) == type(None): - pred_durations = self.duration_predictor(encodings) # (1, T) + if durations is None: + # (1, T) + pred_durations = self.duration_predictor(encodings) durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) - - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + durations_to_expand = durations_to_expand.astype(paddle.int64) else: durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2]