diff --git a/.mergify.yml b/.mergify.yml index 3347c6dc..f012c2f8 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -130,7 +130,7 @@ pull_request_rules: add: ["Docker"] - name: "auto add label=Deployment" conditions: - - files~=^speechnn/ + - files~=^speechx/ actions: label: add: ["Deployment"] diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 8d32f287..5dfc1974 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.1.0' +__version__ = '0.1.1' diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 405ad957..6bb651a0 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -627,7 +627,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, d_outs, alpha) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) else: d_outs = self.duration_predictor(hs, d_masks) # use groundtruth in training @@ -638,7 +638,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, ds) + hs = self.length_regulator(hs, ds, is_inference=False) # forward decoder if olens is not None and not is_inference: diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index cc9e2066..42e8f743 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -14,28 +14,9 @@ import paddle from paddle import nn +from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding - - -def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: - """ - encodings: (B, 
T, C) - durations: (B, T) - """ - batch_size, t_enc = paddle.shape(durations) - slens = paddle.sum(durations, -1) - t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - encodings = paddle.matmul(M, encodings) - return encodings +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator class ResidualBlock(nn.Layer): @@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__(self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, - spk_num=None): + def __init__( + self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None, + init_type: str="xavier_uniform", ): super().__init__() + + # initialize parameters + initialize(self, init_type) + encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, encoder_dilations, spk_num) @@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer): self.encoder = encoder self.duration_predictor = duration_predictor self.decoder = decoder + # define length regulator + self.length_regulator = LengthRegulator() + + nn.initializer.set_global_initializer(None) def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 @@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer): # expand encodings durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator(encodings, durations_to_expand) 
# decode # remove positional encoding here @@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer): durations_to_expand = durations_to_expand.astype(paddle.int64) else: durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator( + encodings, durations_to_expand, is_inference=True) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index f1ecfb7c..9510dd88 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -13,6 +13,7 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Length regulator related modules.""" +import numpy as np import paddle from paddle import nn @@ -43,6 +44,28 @@ class LengthRegulator(nn.Layer): super().__init__() self.pad_value = pad_value + # expand_numpy is faster than expand + def expand_numpy(self, encodings: paddle.Tensor, + durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + def expand(self, encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: """ @@ -50,20 +73,21 @@ class LengthRegulator(nn.Layer): durations: (B, T) """ batch_size, t_enc = paddle.shape(durations) - slens = durations.sum(-1) - t_dec = slens.max() + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in 
range(t_enc): d = durations[i, j] + # If the d == 0, slice action is meaningless and not supported in paddle if d >= 1: M[i, k:k + d, j] = 1 k += d encodings = paddle.matmul(M, encodings) return encodings - def forward(self, xs, ds, alpha=1.0): + def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. Parameters @@ -85,4 +109,7 @@ class LengthRegulator(nn.Layer): assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) - return self.expand(xs, ds) + if is_inference: + return self.expand(xs, ds) + else: + return self.expand_numpy(xs, ds) diff --git a/setup.py b/setup.py index a6b18f97..8f68923d 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ import io import os import subprocess as sp import sys +import paddlespeech from pathlib import Path from setuptools import Command @@ -172,7 +173,7 @@ class UploadCommand(Command): setup_info = dict( # Metadata name='paddlespeech', - version='0.1.1', + version=paddlespeech.__version__, author='PaddlePaddle Speech and Language Team', author_email='paddlesl@baidu.com', url='https://github.com/PaddlePaddle/PaddleSpeech', diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt new file mode 100644 index 00000000..878374ba --- /dev/null +++ b/speechx/CMakeLists.txt @@ -0,0 +1,77 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(deepspeech VERSION 0.1) + +set(CMAKE_VERBOSE_MAKEFILE on) +# set std-14 +set(CMAKE_CXX_STANDARD 14) + +# include file +include(FetchContent) +include(ExternalProject) +# fc_patch dir +set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + + +############################################################################### +# Option Configurations +############################################################################### +# option configurations +option(TEST_DEBUG "option for debug" OFF) + + 
+###############################################################################
+# Include third party
+###############################################################################
+# #example for include third party
+# FetchContent_Declare()
+# # FetchContent_MakeAvailable was not added until CMake 3.14
+# FetchContent_MakeAvailable()
+# include_directories()
+
+# ABSEIL-CPP
+include(FetchContent)
+FetchContent_Declare(
+  absl
+  GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
+  GIT_TAG "20210324.1"
+)
+FetchContent_MakeAvailable(absl)
+
+# libsndfile
+include(FetchContent)
+FetchContent_Declare(
+  libsndfile
+  GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
+  GIT_TAG "1.0.31"
+)
+FetchContent_MakeAvailable(libsndfile)
+
+
+###############################################################################
+# Add local library
+###############################################################################
+# system lib (template, uncomment and fill in the package name)
+# find_package(<PackageName> REQUIRED)
+# if the directory has its own CMakeLists.txt (template)
+# add_subdirectory(<source_dir>)
+# if the directory does not have a CMakeLists.txt (template)
+# add_library(lib_name STATIC file.cc)
+# target_link_libraries(lib_name item0 item1)
+# add_dependencies(lib_name depend-target)
+
+
+###############################################################################
+# Library installation
+###############################################################################
+# install(TARGETS <target> DESTINATION <dir>)
+
+
+###############################################################################
+# Build binary file
+###############################################################################
+# add_executable(<name> <source>...)
+# target_link_libraries(<name> <lib>...)
+
diff --git a/speechx/docker/.gitkeep b/speechx/docker/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/examples/.gitkeep b/speechx/examples/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
new file mode 100644
index 00000000..259261bd
--- /dev/null
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -0,0 +1,2 @@
+aux_source_directory(. DIR_LIB_SRCS)
+add_library(decoder STATIC ${DIR_LIB_SRCS})
diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/kaldi/.gitkeep b/speechx/speechx/kaldi/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/third_party/CMakeLists.txt b/speechx/speechx/third_party/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt
new file mode 100644
index 00000000..e69de29b