Merge branch 'develop' into patch-12

pull/3970/head
liyulingyue 8 months ago
commit 0cb86f60af

@ -734,8 +734,8 @@ def default_collate(batch,
if not all(len(elem) == elem_size for elem in it):
raise RuntimeError(
"each element in list of batch should be of equal size")
transposed = list(zip(
*batch)) # It may be accessed twice, so we use a list.
transposed = list(
zip(*batch)) # It may be accessed twice, so we use a list.
if isinstance(elem, tuple):
return [

@ -202,9 +202,9 @@ class AudioDataset:
Examples
--------
>>> from audio.audiotools.data.datasets import AudioLoader
>>> from audio.audiotools.data.datasets import AudioDataset
>>> from audio.audiotools import transforms as tfm
>>> from paddlespeech.audiotools.data.datasets import AudioLoader
>>> from paddlespeech.audiotools.data.datasets import AudioDataset
>>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np
>>>
>>> loaders = [
@ -237,9 +237,9 @@ class AudioDataset:
Below is an example of how one could load MUSDB multitrack data:
>>> from audio import audiotools as at
>>> from paddlespeech import audiotools as at
>>> from pathlib import Path
>>> from audio.audiotools import transforms as tfm
>>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np
>>> import torch
>>>
@ -296,9 +296,9 @@ class AudioDataset:
Similarly, here's example code for loading Slakh data:
>>> from audio import audiotools as at
>>> from paddlespeech import audiotools as at
>>> from pathlib import Path
>>> from audio.audiotools import transforms as tfm
>>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np
>>> import torch
>>> import glob

@ -37,7 +37,7 @@ def create_csv(audio_files: list,
You can produce a CSV file from a directory of audio files via:
>>> from audio import audiotools
>>> from paddlespeech import audiotools
>>> directory = ...
>>> audio_files = audiotools.util.find_audio(directory)
>>> output_path = "train.csv"

@ -6,7 +6,7 @@ import typing
import paddle
from audio.audiotools.core import AudioSignal
from paddlespeech.audiotools.core import AudioSignal
def audio_table(

@ -2,5 +2,4 @@ ffmpeg-python
ffmpy
flatten_dict
pyloudnorm
pytest
rich

@ -19,7 +19,7 @@ from typing import Tuple
import paddle
from paddle import nn
from paddle.nn import initializer as I
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.s2t.modules.align import BatchNorm1D
from paddlespeech.s2t.modules.align import Conv1D
@ -34,6 +34,7 @@ __all__ = ['ConvolutionModule']
class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model."""
@typechecked
def __init__(self,
channels: int,
kernel_size: int=15,
@ -52,7 +53,6 @@ class ConvolutionModule(nn.Layer):
causal (bool): Whether use causal convolution or not
bias (bool): Whether Conv with bias or not
"""
assert check_argument_types()
super().__init__()
self.bias = bias
self.channels = channels

@ -17,7 +17,7 @@ from typing import Union
import paddle
from paddle import nn
from paddle.nn import functional as F
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.loss import CTCLoss
@ -48,6 +48,7 @@ __all__ = ['CTCDecoder']
class CTCDecoderBase(nn.Layer):
@typechecked
def __init__(self,
odim,
enc_n_units,
@ -66,7 +67,6 @@ class CTCDecoderBase(nn.Layer):
batch_average (bool): do batch dim wise average.
grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None.
"""
assert check_argument_types()
super().__init__()
self.blank_id = blank_id

@ -21,7 +21,7 @@ from typing import Tuple
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface
from paddlespeech.s2t.modules.align import Embedding
@ -61,6 +61,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
False: x -> x + att(x)
"""
@typechecked
def __init__(self,
vocab_size: int,
encoder_output_size: int,
@ -77,8 +78,6 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after: bool=False,
max_len: int=5000):
assert check_argument_types()
nn.Layer.__init__(self)
self.selfattention_layer_type = 'selfattn'
attention_dim = encoder_output_size
@ -276,6 +275,7 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
False: x -> x + att(x)
"""
@typechecked
def __init__(self,
vocab_size: int,
encoder_output_size: int,
@ -293,8 +293,6 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after: bool=False,
max_len: int=5000):
assert check_argument_types()
nn.Layer.__init__(self)
self.left_decoder = TransformerDecoder(
vocab_size, encoder_output_size, attention_heads, linear_units,

@ -21,7 +21,7 @@ from typing import Union
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.s2t.modules.activation import get_activation
from paddlespeech.s2t.modules.align import LayerNorm
@ -58,6 +58,7 @@ __all__ = [
class BaseEncoder(nn.Layer):
@typechecked
def __init__(self,
input_size: int,
output_size: int=256,
@ -73,7 +74,7 @@ class BaseEncoder(nn.Layer):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: paddle.nn.Layer=None,
global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False,
max_len: int=5000):
"""
@ -108,7 +109,6 @@ class BaseEncoder(nn.Layer):
use_dynamic_left_chunk (bool): whether use dynamic left chunk in
dynamic chunk training
"""
assert check_argument_types()
super().__init__()
self._output_size = output_size
@ -349,6 +349,7 @@ class BaseEncoder(nn.Layer):
class TransformerEncoder(BaseEncoder):
"""Transformer encoder module."""
@typechecked
def __init__(
self,
input_size: int,
@ -365,12 +366,11 @@ class TransformerEncoder(BaseEncoder):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: nn.Layer=None,
global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False, ):
""" Construct TransformerEncoder
See Encoder for the meaning of each parameter.
"""
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer,
@ -424,6 +424,7 @@ class TransformerEncoder(BaseEncoder):
class ConformerEncoder(BaseEncoder):
"""Conformer encoder module."""
@typechecked
def __init__(self,
input_size: int,
output_size: int=256,
@ -439,7 +440,7 @@ class ConformerEncoder(BaseEncoder):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: nn.Layer=None,
global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False,
positionwise_conv_kernel_size: int=1,
macaron_style: bool=True,
@ -466,8 +467,6 @@ class ConformerEncoder(BaseEncoder):
causal (bool): whether to use causal convolution or not.
cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
"""
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer,
@ -519,6 +518,7 @@ class ConformerEncoder(BaseEncoder):
class SqueezeformerEncoder(nn.Layer):
@typechecked
def __init__(self,
input_size: int,
encoder_dim: int=256,
@ -541,7 +541,7 @@ class SqueezeformerEncoder(nn.Layer):
adaptive_scale: bool=True,
activation_type: str="swish",
init_weights: bool=True,
global_cmvn: paddle.nn.Layer=None,
global_cmvn: Optional[nn.Layer]=None,
normalize_before: bool=False,
use_dynamic_chunk: bool=False,
concat_after: bool=False,
@ -572,7 +572,6 @@ class SqueezeformerEncoder(nn.Layer):
init_weights (bool): Whether to initialize weights.
causal (bool): whether to use causal convolution or not.
"""
assert check_argument_types()
super().__init__()
self.global_cmvn = global_cmvn
self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \

@ -19,7 +19,7 @@ from typing import Union
import paddle
from paddle.optimizer.lr import LRScheduler
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.dynamic_import import instance_class
@ -57,13 +57,13 @@ class WarmupLR(LRScheduler):
Note that the maximum lr equals to optimizer.lr in this scheduler.
"""
@typechecked
def __init__(self,
warmup_steps: Union[int, float]=25000,
learning_rate=1.0,
last_epoch=-1,
verbose=False,
**kwargs):
assert check_argument_types()
self.warmup_steps = warmup_steps
super().__init__(learning_rate, last_epoch, verbose)

@ -20,7 +20,7 @@ from typing import Tuple
import numpy as np
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI
from paddlespeech.t2s.modules.diffnet import DiffNet
@ -40,6 +40,7 @@ class DiffSinger(nn.Layer):
"""
@typechecked
def __init__(
self,
# min and max spec for stretching before diffusion
@ -157,7 +158,6 @@ class DiffSinger(nn.Layer):
denoiser_params (Dict[str, Any]): Parameter dict for denoiser module.
diffusion_params (Dict[str, Any]): Parameter dict for diffusion module.
"""
assert check_argument_types()
super().__init__()
self.fs2 = FastSpeech2MIDI(
idim=idim,
@ -336,6 +336,7 @@ class DiffSingerInference(nn.Layer):
class DiffusionLoss(nn.Layer):
"""Loss function module for Diffusion module on DiffSinger."""
@typechecked
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
@ -345,7 +346,6 @@ class DiffusionLoss(nn.Layer):
use_weighted_masking (bool):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking

@ -19,7 +19,7 @@ from typing import Tuple
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
@ -33,6 +33,7 @@ class FastSpeech2MIDI(FastSpeech2):
"""The Fastspeech2 module of DiffSinger.
"""
@typechecked
def __init__(
self,
# fastspeech2 network structure related
@ -57,7 +58,6 @@ class FastSpeech2MIDI(FastSpeech2):
is_slur_ids will be provided as the input
"""
assert check_argument_types()
super().__init__(idim=idim, odim=odim, **fastspeech2_params)
self.use_energy_pred = use_energy_pred
self.use_postnet = use_postnet
@ -495,6 +495,7 @@ class FastSpeech2MIDI(FastSpeech2):
class FastSpeech2MIDILoss(FastSpeech2Loss):
"""Loss function module for DiffSinger."""
@typechecked
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
@ -504,7 +505,6 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
use_weighted_masking (bool):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__(use_masking, use_weighted_masking)
def forward(

@ -15,6 +15,7 @@
"""Fastspeech2 related modules for paddle"""
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
@ -23,7 +24,7 @@ import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer
from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier
@ -60,6 +61,7 @@ class FastSpeech2(nn.Layer):
"""
@typechecked
def __init__(
self,
# network structure related
@ -131,12 +133,12 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout: float=0.5,
stop_gradient_from_pitch_predictor: bool=False,
# spk emb
spk_num: int=None,
spk_embed_dim: int=None,
spk_num: Optional[int]=None,
spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="add",
# tone emb
tone_num: int=None,
tone_embed_dim: int=None,
tone_num: Optional[int]=None,
tone_embed_dim: Optional[int]=None,
tone_embed_integration_type: str="add",
# training related
init_type: str="xavier_uniform",
@ -282,7 +284,6 @@ class FastSpeech2(nn.Layer):
The hidden layer dim of speaker classifier
"""
assert check_argument_types()
super().__init__()
# store hyperparameters
@ -1070,6 +1071,7 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2."""
@typechecked
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
@ -1079,7 +1081,6 @@ class FastSpeech2Loss(nn.Layer):
use_weighted_masking (bool):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking

@ -28,7 +28,6 @@ from typing import Tuple
import numpy as np
import paddle
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.models.hifigan import HiFiGANGenerator
from paddlespeech.t2s.models.jets.alignments import AlignmentModule

@ -24,7 +24,7 @@ from typing import Optional
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -64,6 +64,7 @@ class JETS(nn.Layer):
Text-to-Speech`: https://arxiv.org/abs/2203.16852v1
"""
@typechecked
def __init__(
self,
# generator related
@ -225,7 +226,6 @@ class JETS(nn.Layer):
cache_generator_outputs (bool):
Whether to cache generator outputs.
"""
assert check_argument_types()
super().__init__()
# define modules
@ -279,8 +279,7 @@ class JETS(nn.Layer):
lids: Optional[paddle.Tensor]=None,
forward_generator: bool=True,
use_alignment_module: bool=False,
**kwargs,
) -> Dict[str, Any]:
**kwargs, ) -> Dict[str, Any]:
"""Perform generator forward.
Args:
text (Tensor):

@ -21,7 +21,7 @@ from typing import Tuple
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@ -44,6 +44,7 @@ class Tacotron2(nn.Layer):
"""
@typechecked
def __init__(
self,
# network structure related
@ -67,7 +68,7 @@ class Tacotron2(nn.Layer):
postnet_layers: int=5,
postnet_chans: int=512,
postnet_filts: int=5,
output_activation: str=None,
output_activation: Optional[str]=None,
use_batch_norm: bool=True,
use_concate: bool=True,
use_residual: bool=False,
@ -145,7 +146,6 @@ class Tacotron2(nn.Layer):
zoneout_rate (float):
Zoneout rate.
"""
assert check_argument_types()
super().__init__()
# store hyperparameters

@ -13,7 +13,9 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Fastspeech2 related modules for paddle"""
from optparse import Option
from typing import Dict
from typing import Optional
from typing import Sequence
from typing import Tuple
@ -21,7 +23,7 @@ import numpy
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
@ -169,6 +171,7 @@ class TransformerTTS(nn.Layer):
Number of layers to apply guided attention loss.
"""
@typechecked
def __init__(
self,
# network structure related
@ -198,7 +201,7 @@ class TransformerTTS(nn.Layer):
encoder_concat_after: bool=False,
decoder_concat_after: bool=False,
reduction_factor: int=1,
spk_embed_dim: int=None,
spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="add",
use_gst: bool=False,
gst_tokens: int=10,
@ -227,7 +230,7 @@ class TransformerTTS(nn.Layer):
num_heads_applied_guided_attn: int=2,
num_layers_applied_guided_attn: int=2, ):
"""Initialize Transformer module."""
assert check_argument_types()
super().__init__()
# store hyperparameters

@ -20,7 +20,7 @@ from typing import Optional
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -60,6 +60,7 @@ class VITS(nn.Layer):
Text-to-Speech`: https://arxiv.org/abs/2006.04558
"""
@typechecked
def __init__(
self,
# generator related
@ -181,7 +182,6 @@ class VITS(nn.Layer):
cache_generator_outputs (bool):
Whether to cache generator outputs.
"""
assert check_argument_types()
super().__init__()
# define modules
@ -504,8 +504,9 @@ class VITS(nn.Layer):
def reset_parameters(self):
def _reset_parameters(module):
if isinstance(module,
(nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
if isinstance(
module,
(nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
kaiming_uniform_(module.weight, a=math.sqrt(5))
if module.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
@ -513,8 +514,9 @@ class VITS(nn.Layer):
bound = 1 / math.sqrt(fan_in)
uniform_(module.bias, -bound, bound)
if isinstance(module,
(nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
if isinstance(
module,
(nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
ones_(module.weight)
zeros_(module.bias)
@ -533,13 +535,13 @@ class VITS(nn.Layer):
self.apply(_reset_parameters)
class VITSInference(nn.Layer):
def __init__(self, model):
super().__init__()
self.acoustic_model = model
def forward(self, text, sids=None):
out = self.acoustic_model.inference(
text, sids=sids)
out = self.acoustic_model.inference(text, sids=sids)
wav = out['wav']
return wav

@ -14,16 +14,16 @@
# Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning)
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
class SpeakerClassifier(nn.Layer):
@typechecked
def __init__(
self,
idim: int,
hidden_sc_dim: int,
spk_num: int, ):
assert check_argument_types()
super().__init__()
# store hyperparameters
self.idim = idim

@ -21,7 +21,7 @@ from paddle import nn
from paddle.nn import functional as F
from scipy import signal
from scipy.stats import betabinom
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import (
@ -1137,6 +1137,7 @@ class MLMLoss(nn.Layer):
class VarianceLoss(nn.Layer):
@typechecked
def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False):
"""Initialize JETS variance loss module.
@ -1147,7 +1148,6 @@ class VarianceLoss(nn.Layer):
calculation.
"""
assert check_argument_types()
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking

@ -18,7 +18,7 @@ from typing import Tuple
import numpy as np
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
from paddlespeech.utils.initialize import kaiming_uniform_
@ -301,6 +301,7 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
return paddle.logical_not(make_pad_mask(lengths, xs, length_dim))
@typechecked
def initialize(model: nn.Layer, init: str):
"""Initialize weights of a neural network module.
@ -314,8 +315,6 @@ def initialize(model: nn.Layer, init: str):
init (str):
Method of initialization.
"""
assert check_argument_types()
if init == "xavier_uniform":
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
nn.initializer.Constant())

@ -15,7 +15,7 @@
"""Variance predictor related modules."""
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.masked_fill import masked_fill
@ -32,6 +32,7 @@ class VariancePredictor(nn.Layer):
"""
@typechecked
def __init__(
self,
idim: int,
@ -54,7 +55,6 @@ class VariancePredictor(nn.Layer):
dropout_rate (float, optional):
Dropout rate.
"""
assert check_argument_types()
super().__init__()
self.conv = nn.LayerList()
for idx in range(n_layers):

@ -17,7 +17,7 @@ from typing import Sequence
import paddle
from paddle import nn
from typeguard import check_argument_types
from typeguard import typechecked
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
@ -58,6 +58,7 @@ class StyleEncoder(nn.Layer):
"""
@typechecked
def __init__(
self,
idim: int=80,
@ -71,7 +72,6 @@ class StyleEncoder(nn.Layer):
gru_layers: int=1,
gru_units: int=128, ):
"""Initilize global style encoder module."""
assert check_argument_types()
super().__init__()
self.ref_enc = ReferenceEncoder(
@ -132,6 +132,7 @@ class ReferenceEncoder(nn.Layer):
"""
@typechecked
def __init__(
self,
idim=80,
@ -142,7 +143,6 @@ class ReferenceEncoder(nn.Layer):
gru_layers: int=1,
gru_units: int=128, ):
"""Initilize reference encoder module."""
assert check_argument_types()
super().__init__()
# check hyperparameters are valid
@ -232,6 +232,7 @@ class StyleTokenLayer(nn.Layer):
"""
@typechecked
def __init__(
self,
ref_embed_dim: int=128,
@ -240,7 +241,6 @@ class StyleTokenLayer(nn.Layer):
gst_heads: int=4,
dropout_rate: float=0.0, ):
"""Initilize style token layer module."""
assert check_argument_types()
super().__init__()
gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads])

@ -31,6 +31,26 @@ HERE = Path(os.path.abspath(os.path.dirname(__file__)))
VERSION = '0.0.0'
COMMITID = 'none'
def _parse_gcc_major(output):
    """Return the GCC major version found in ``gcc --version`` output.

    Args:
        output (str): Text printed by ``gcc --version``.

    Returns:
        Optional[int]: Major version of the first line mentioning ``gcc``
            that carries a dotted version number, or ``None`` when no such
            line exists.
    """
    import re

    for line in output.splitlines():
        if "gcc" not in line:
            continue
        # Drop parenthesised distributor tags, e.g. "(Ubuntu 9.4.0-1ubuntu1)"
        # or "(Red Hat 4.8.5-44)", so their numbers are not mistaken for the
        # compiler version.
        stripped = re.sub(r"\([^)]*\)", " ", line)
        # Take the first dotted number; a trailing build date such as
        # "20190827" has no dot and is therefore ignored.
        match = re.search(r"(?<![\w.])(\d+)\.\d+", stripped)
        if match:
            return int(match.group(1))
    return None


def determine_opencc_version():
    """Choose the ``opencc`` requirement string based on the local GCC.

    Runs ``gcc --version`` and inspects the reported major version.

    Returns:
        str: ``"opencc==1.1.6"`` when GCC's major version is 9 or lower,
            otherwise ``"opencc"``. The unpinned default is also returned
            when GCC is missing or its version cannot be determined.
    """
    try:
        output = sp.check_output(
            ['gcc', '--version'], stderr=sp.STDOUT, text=True)
        gcc_major = _parse_gcc_major(output)
    except Exception:
        # Best effort: any failure to run or parse gcc (not installed,
        # non-zero exit, undecodable output) falls back to the default
        # requirement instead of aborting the installation.
        gcc_major = None

    if gcc_major is not None and gcc_major <= 9:
        return "opencc==1.1.6"  # GCC <= 9 needs opencc==1.1.6
    return "opencc"  # default
base = [
"braceexpand",
"editdistance",
@ -48,7 +68,7 @@ base = [
"matplotlib",
"nara_wpe",
"onnxruntime>=1.11.0",
"opencc==1.1.6",
determine_opencc_version(), # opencc or opencc==1.1.6
"opencc-python-reimplemented",
"pandas",
"paddleaudio>=1.1.0",
@ -69,8 +89,8 @@ base = [
"soundfile",
"textgrid",
"timer",
"ToJyutping==0.2.1",
"typeguard==2.13.3",
"ToJyutping",
"typeguard",
"webrtcvad",
"yacs~=0.1.8",
"zhon",
@ -318,9 +338,9 @@ setup_info = dict(
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
],
entry_points={
'console_scripts': [

@ -12,9 +12,9 @@ import paddle
import pytest
import rich
from audio import audiotools
from audio.audiotools import AudioSignal
from audio.audiotools import util
from paddlespeech import audiotools
from paddlespeech.audiotools import AudioSignal
from paddlespeech.audiotools import util
def test_io():

@ -8,9 +8,9 @@ import unittest
import paddle
from audio.audiotools.core import pure_tone
from audio.audiotools.core import split_bands
from audio.audiotools.core import SplitBands
from paddlespeech.audiotools.core import pure_tone
from paddlespeech.audiotools.core import split_bands
from paddlespeech.audiotools.core import SplitBands
def delta(a, b, ref, fraction=0.9):

@ -8,7 +8,7 @@ from pathlib import Path
import numpy as np
from visualdl import LogWriter
from audio.audiotools import AudioSignal
from paddlespeech.audiotools import AudioSignal
def test_specshow():

@ -8,8 +8,8 @@ import numpy as np
import paddle
import pytest
from audio.audiotools import AudioSignal
from audio.audiotools.core.util import sample_from_dist
from paddlespeech.audiotools import AudioSignal
from paddlespeech.audiotools.core.util import sample_from_dist
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])

@ -8,7 +8,7 @@ import numpy as np
import paddle
import pytest
from audio.audiotools import AudioSignal
from paddlespeech.audiotools import AudioSignal
def test_normalize():

@ -9,8 +9,8 @@ import unittest
import paddle
import paddle.nn.functional as F
from audio.audiotools.core import fft_conv1d
from audio.audiotools.core import FFTConv1D
from paddlespeech.audiotools.core import fft_conv1d
from paddlespeech.audiotools.core import FFTConv1D
TOLERANCE = 1e-4 # as relative delta in percentage

@ -9,7 +9,7 @@ import numpy as np
import paddle
import pytest
from audio.audiotools import AudioSignal
from paddlespeech.audiotools import AudioSignal
def test_audio_grad():

@ -9,8 +9,8 @@ import unittest
import paddle
from audio.audiotools.core import highpass_filter
from audio.audiotools.core import highpass_filters
from paddlespeech.audiotools.core import highpass_filter
from paddlespeech.audiotools.core import highpass_filters
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):

@ -8,10 +8,10 @@ import numpy as np
import pyloudnorm
import soundfile as sf
from audio.audiotools import AudioSignal
from audio.audiotools import datasets
from audio.audiotools import Meter
from audio.audiotools import transforms
from paddlespeech.audiotools import AudioSignal
from paddlespeech.audiotools import datasets
from paddlespeech.audiotools import Meter
from paddlespeech.audiotools import transforms
ATOL = 1e-1

@ -10,10 +10,10 @@ import unittest
import numpy as np
import paddle
from audio.audiotools.core import lowpass_filter
from audio.audiotools.core import LowPassFilter
from audio.audiotools.core import LowPassFilters
from audio.audiotools.core import resample_frac
from paddlespeech.audiotools.core import lowpass_filter
from paddlespeech.audiotools.core import LowPassFilter
from paddlespeech.audiotools.core import LowPassFilters
from paddlespeech.audiotools.core import resample_frac
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):

@ -11,8 +11,8 @@ import numpy as np
import paddle
import pytest
from audio.audiotools import util
from audio.audiotools.core.audio_signal import AudioSignal
from paddlespeech.audiotools import util
from paddlespeech.audiotools.core.audio_signal import AudioSignal
from paddlespeech.vector.training.seeding import seed_everything

@ -10,8 +10,8 @@ import numpy as np
import paddle
import pytest
from audio import audiotools
from audio.audiotools.data import transforms as tfm
from paddlespeech import audiotools
from paddlespeech.audiotools.data import transforms as tfm
def test_align_lists():

@ -8,9 +8,9 @@ from pathlib import Path
import paddle
from audio.audiotools.core.util import find_audio
from audio.audiotools.core.util import read_sources
from audio.audiotools.data import preprocess
from paddlespeech.audiotools.core.util import find_audio
from paddlespeech.audiotools.core.util import read_sources
from paddlespeech.audiotools.data import preprocess
def test_create_csv():

@ -11,11 +11,11 @@ import numpy as np
import paddle
import pytest
from audio import audiotools
from audio.audiotools import AudioSignal
from audio.audiotools import util
from audio.audiotools.data import transforms as tfm
from audio.audiotools.data.datasets import AudioDataset
from paddlespeech import audiotools
from paddlespeech.audiotools import AudioSignal
from paddlespeech.audiotools import util
from paddlespeech.audiotools.data import transforms as tfm
from paddlespeech.audiotools.data.datasets import AudioDataset
from paddlespeech.vector.training.seeding import seed_everything
non_deterministic_transforms = ["TimeNoise", "FrequencyNoise"]

@ -8,10 +8,10 @@ import time
import paddle
from visualdl import LogWriter
from audio.audiotools import util
from audio.audiotools.ml.decorators import timer
from audio.audiotools.ml.decorators import Tracker
from audio.audiotools.ml.decorators import when
from paddlespeech.audiotools import util
from paddlespeech.audiotools.ml.decorators import timer
from paddlespeech.audiotools.ml.decorators import Tracker
from paddlespeech.audiotools.ml.decorators import when
def test_all_decorators():

@ -8,10 +8,10 @@ import tempfile
import paddle
from paddle import nn
from audio.audiotools import ml
from audio.audiotools import util
from paddlespeech.audiotools import ml
from paddlespeech.audiotools import util
from paddlespeech.vector.training.seeding import seed_everything
SEED = 0
SEED = 1024
def seed_and_run(model, *args, **kwargs):

@ -1,5 +1,4 @@
python -m pip install -r ../../audiotools/requirements.txt
export PYTHONPATH=$PYTHONPATH:$(realpath ../../..) # this is root path of `PaddleSpeech`
python -m pip install -r ../../../paddlespeech/audiotools/requirements.txt
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/audio.tar.gz
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/regression.tar.gz
tar -zxvf audio.tar.gz

@ -5,9 +5,9 @@
import sys
from pathlib import Path
from audio.audiotools import AudioSignal
from audio.audiotools import post
from audio.audiotools import transforms
from paddlespeech.audiotools import AudioSignal
from paddlespeech.audiotools import post
from paddlespeech.audiotools import transforms
def test_audio_table():

@ -34,7 +34,7 @@ function main(){
echo "End server"
echo "Start testing audiotools"
cd ${speech_ci_path}/../../audio/tests/audiotools
cd ${speech_ci_path}/audiotools
bash test_audiotools.sh
echo "End testing audiotools"

@ -13,10 +13,10 @@
# limitations under the License.
import paddle
from paddlespeech.t2s.modules import expansion
# from paddlespeech.t2s.modules import expansion
def test_expand():
def _test_expand():
x = paddle.randn([2, 4, 3]) # (B, T, C)
lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
y = expansion.expand(x, lengths)

Loading…
Cancel
Save