fix code style

pull/4137/head
supotato6 4 months ago
parent a7b9a9c9b0
commit fe2db3292f

@ -1,11 +0,0 @@
"""Debug script: compare a reference torch Linear layer against its Paddle
port by running both on the same hidden-states input and asserting the
outputs are numerically close."""
import numpy
import paddle
import torch

# Reference torch layer and the converted Paddle weights for the same layer.
torch_linear = torch.load("q.pt").cpu()
paddle_linear_state = paddle.load("q.pdparams")
paddle_linear = paddle.nn.Linear(896, 896, bias_attr=True)
paddle_linear.set_state_dict(paddle_linear_state)

# Shared input; round-trip through numpy so both frameworks see identical data.
hidden_states = paddle.load("hidden_states.pdparams")

torch_forward_res = torch_linear(torch.tensor(hidden_states.numpy()))
paddle_forward_res = paddle_linear(hidden_states)
print("torch_forward_res:", torch_forward_res)
print("paddle_forward_res:", paddle_forward_res)
# assert_allclose returns None and raises on mismatch, so printing its return
# value (as the original did) always showed "None". Report success explicitly,
# and convert the Paddle tensor to numpy before comparing.
numpy.testing.assert_allclose(
    torch_forward_res.detach().numpy(),
    paddle_forward_res.numpy(),
)
print("allclose_res: outputs match")

@ -1,34 +0,0 @@
# Debug script: instantiate the Paddle Qwen2LM wrapper around a pretrained
# Qwen2-0.5B backbone, load the converted CosyVoice2 LLM weights, and load a
# saved torch batch ("data.pt") to drive a (currently commented-out) inference
# smoke test.
from paddlespeech.t2s.models.CosyVoice.cosyvoice import CosyVoice2
import sys
from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
import paddle
import torch
from paddlespeech.t2s.models.CosyVoice.llm import Qwen2LM,ras_sampling,Qwen2Encoder
# cosyvoice_model = CosyVoice2("../CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle")
# HuggingFace-format backbone wrapped by the project's Qwen2Encoder adapter.
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B')
llm = Qwen2Encoder(model)
# Qwen2LM(llm_input_size, llm_output_size, speech_token_size, encoder, sampler)
# -- sizes presumably match the CosyVoice2-0.5B checkpoint; TODO confirm.
qwen_lm = Qwen2LM(896,896,6561,llm,ras_sampling)
# NOTE(review): machine-specific absolute path -- will only run on this host.
state_dict = paddle.load("/root/paddlejob/workspace/zhangjinghong/CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle/llm.pdparams")
qwen_lm.set_state_dict(state_dict)
# Saved torch batch captured from the original CosyVoice pipeline; keys below
# mirror Qwen2LM.inference's keyword arguments.
new_dict = torch.load("data.pt")
text = new_dict['text']
text_len = new_dict['text_len']
prompt_text = new_dict['prompt_text']
prompt_text_len = new_dict['prompt_text_len']
prompt_speech_token = new_dict['prompt_speech_token']
prompt_speech_token_len = new_dict['prompt_speech_token_len']
embedding = new_dict['embedding']
uuid = new_dict['uuid']
print("text:",text)
# for i in qwen_lm.inference(text=paddle.to_tensor(text),
#                             text_len=text_len,
#                             prompt_text=paddle.to_tensor(prompt_text),
#                             prompt_text_len=prompt_text_len,
#                             prompt_speech_token=paddle.to_tensor(prompt_speech_token),
#                             prompt_speech_token_len=prompt_speech_token_len,
#                             embedding=paddle.to_tensor(embedding,dtype = 'float32'),
#                             uuid=uuid):
#     print(text)
#     print(i)

@ -1,4 +1,4 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,9 +1,17 @@
import paddle
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# from cosyvoice.cli.model import CosyVoice2Model, CosyVoiceModel
# from cosyvoice.flow.flow import CausalMaskedDiffWithXvec, MaskedDiffWithXvec
# from cosyvoice.hifigan.generator import HiFTGenerator
# from cosyvoice.llm.llm import Qwen2LM, TransformerLM
from paddlespeech.t2s.modules.transformer.activation import Swish
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import EspnetRelPositionalEncoding
@ -23,18 +31,3 @@ COSYVOICE_ATTENTION_CLASSES = {
"rel_selfattn": RelPositionMultiHeadedAttention,
}
# def get_model_type(configs):
# if (
# isinstance(configs["llm"], TransformerLM)
# and isinstance(configs["flow"], MaskedDiffWithXvec)
# and isinstance(configs["hift"], HiFTGenerator)
# ):
# return CosyVoiceModel
# if (
# isinstance(configs["llm"], Qwen2LM)
# and isinstance(configs["flow"], CausalMaskedDiffWithXvec)
# and isinstance(configs["hift"], HiFTGenerator)
# ):
# return CosyVoice2Model
# raise TypeError("No valid model type found!")

@ -1,14 +1,24 @@
import paddle
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unility functions for Transformer."""
import paddle
import queue
import random
from typing import List
import numpy as np
############################## 相关utils函数如下 ##############################
def device2str(type=None, index=None, *, device=None):
type = device if device else type
if isinstance(type, int):
@ -26,7 +36,6 @@ def device2str(type=None, index=None, *, device=None):
type = f'gpu:{type.get_device_id()}'
return type
############################## 相关utils函数如上 ##############################
IGNORE_ID = -1
@ -128,7 +137,6 @@ def ras_sampling(
.sum()
.item()
)
print("top_ids:",top_ids)
if rep_num >= win_size * tau_r:
top_ids = random_sampling(weighted_scores, decoded_tokens, sampling)[0]
return top_ids
@ -150,7 +158,6 @@ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
break
prob = paddle.to_tensor(prob).cuda()
indices = paddle.to_tensor(indices, dtype=paddle.long).to(weighted_scores.place)
print("indices:",indices)
# top_ids = indices[prob.multinomial(num_samples=1, replacement=True)]
top_ids = indices[0]
return top_ids
@ -160,7 +167,6 @@ def random_sampling(weighted_scores, decoded_tokens, sampling):
top_ids = weighted_scores.softmax(axis=0).multinomial(
num_samples=1, replacement=True
)
print("random_sampling:",top_ids)
return top_ids

@ -1,3 +1,17 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from typing import Generator

@ -1,3 +1,17 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Any
from typing import Dict

@ -1,3 +1,16 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
@ -320,8 +333,6 @@ class CosyVoiceFrontEnd:
def frontend_sft(self, tts_text, spk_id):
tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
print("1" * 30)
print(self.spk2info.keys())
embedding = self.spk2info[spk_id]["embedding"]
model_input = {
"text": tts_text_token,

@ -1,3 +1,17 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import queue
import random
import threading
@ -7,10 +21,6 @@ import logging
import paddle.nn.functional as F
import paddle
IGNORE_ID = -1
# from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
# from cosyvoice.utils.common import IGNORE_ID, th_accuracy
# from cosyvoice.utils.file_utils import logging
# from cosyvoice.utils.mask import make_pad_mask
import torch
LabelSmoothingLoss = None
def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
@ -368,23 +378,6 @@ class Qwen2LM(TransformerLM):
self.llm_decoder = paddle.nn.Linear(
in_features=llm_output_size, out_features=speech_token_size + 3
)
# self.llm_decoder.weight = paddle.create_parameter(
# shape=self.llm_decoder.weight.shape,
# dtype='bfloat16',
# default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.weight.astype('bfloat16'))
# )
# if self.llm_decoder.bias is not None:
# self.llm_decoder.bias = paddle.create_parameter(
# shape=self.llm_decoder.bias.shape,
# dtype='bfloat16',
# default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.bias.astype('bfloat16'))
# )
# self.criterion_ce = LabelSmoothingLoss(
# size=speech_token_size + 3,
# padding_idx=IGNORE_ID,
# smoothing=lsm_weight,
# normalize_length=length_normalized_loss,
# )
self.speech_embedding = paddle.nn.Embedding(
speech_token_size + 3, llm_input_size
)
@ -393,104 +386,104 @@ class Qwen2LM(TransformerLM):
self.stop_token_ids = [(speech_token_size + i) for i in range(3)]
self.vllm_output_queue = {}
# def prepare_lm_input_target(
# self,
# text_token,
# text_token_emb,
# text_token_len,
# speech_token,
# speech_token_emb,
# speech_token_len,
# ):
# lm_target, lm_input = [], []
# text_token = torch.nn.utils.rnn.unpad_sequence(
# text_token, text_token_len.cpu(), batch_first=True
# )
# speech_token = torch.nn.utils.rnn.unpad_sequence(
# speech_token, speech_token_len.cpu(), batch_first=True
# )
# text_token_emb = torch.nn.utils.rnn.unpad_sequence(
# text_token_emb, text_token_len.cpu(), batch_first=True
# )
# speech_token_emb = torch.nn.utils.rnn.unpad_sequence(
# speech_token_emb, speech_token_len.cpu(), batch_first=True
# )
# for i in range(len(text_token)):
# if (
# random.random() < 0.5
# and speech_token_len[i] / text_token_len[i]
# > self.mix_ratio[1] / self.mix_ratio[0]
# ):
# this_lm_target, this_lm_input = [], []
# this_lm_target.append(IGNORE_ID)
# this_lm_input.append(
# self.llm_embedding.weight[self.sos_eos].reshape(1, -1)
# )
# for j in range(
# ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()
# ):
# this_text_token = text_token[i][
# j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
# ].tolist()
# this_speech_token = speech_token[i][
# j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
# ].tolist()
# if len(this_text_token) == self.mix_ratio[0]:
# assert len(this_speech_token) == self.mix_ratio[1]
# this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
# this_lm_target += this_speech_token
# this_lm_target.append(self.speech_token_size + 2)
# this_lm_input.append(
# text_token_emb[i][
# j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
# ]
# )
# this_lm_input.append(
# speech_token_emb[i][
# j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
# ]
# )
# else:
# this_lm_target += [-1] * len(this_text_token)
# this_lm_target += speech_token[i][
# j * self.mix_ratio[1] :
# ].tolist()
# this_lm_target.append(self.speech_token_size)
# this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :])
# this_lm_input.append(
# self.llm_embedding.weight[self.task_id].reshape(1, -1)
# )
# this_lm_input.append(
# speech_token_emb[i][j * self.mix_ratio[1] :]
# )
# this_lm_target, this_lm_input = paddle.tensor(
# this_lm_target
# ), paddle.cat(this_lm_input, dim=0)
# else:
# this_lm_target = paddle.tensor(
# [IGNORE_ID] * (1 + text_token_len[i])
# + speech_token[i].tolist()
# + [self.speech_token_size]
# )
# this_lm_input = paddle.cat(
# [
# self.llm_embedding.weight[self.sos_eos].reshape(1, -1),
# text_token_emb[i],
# self.llm_embedding.weight[self.task_id].reshape(1, -1),
# speech_token_emb[i],
# ],
# dim=0,
# )
# lm_target.append(this_lm_target)
# lm_input.append(this_lm_input)
# lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32)
# lm_input = torch.nn.utils.rnn.pad_sequence(
# lm_input, batch_first=True, padding_value=IGNORE_ID
# )
# lm_target = torch.nn.utils.rnn.pad_sequence(
# lm_target, batch_first=True, padding_value=IGNORE_ID
# )
# return lm_target, lm_input, lm_input_len
def prepare_lm_input_target(
    self,
    text_token,
    text_token_emb,
    text_token_len,
    speech_token,
    speech_token_emb,
    speech_token_len,
):
    """Build per-sample LM input embeddings and target token sequences.

    For each sample, with probability 0.5 (when the speech/text length ratio
    exceeds ``mix_ratio[1]/mix_ratio[0]``) text and speech tokens are
    interleaved in ``mix_ratio``-sized chunks; otherwise the plain
    ``<sos> text <task> speech`` layout is used. Targets ignore text positions
    (``IGNORE_ID``) and end with a stop token.

    Returns:
        (lm_target, lm_input, lm_input_len): padded target ids, padded input
        embeddings, and per-sample input lengths (int32).
    """
    lm_target, lm_input = [], []
    # NOTE(review): torch.nn.utils.rnn.unpad_sequence/pad_sequence are used on
    # what appear to be Paddle tensors elsewhere in this half-ported file --
    # confirm these arguments are torch tensors or port the (un)padding to
    # Paddle slicing.
    text_token = torch.nn.utils.rnn.unpad_sequence(
        text_token, text_token_len.cpu(), batch_first=True
    )
    speech_token = torch.nn.utils.rnn.unpad_sequence(
        speech_token, speech_token_len.cpu(), batch_first=True
    )
    text_token_emb = torch.nn.utils.rnn.unpad_sequence(
        text_token_emb, text_token_len.cpu(), batch_first=True
    )
    speech_token_emb = torch.nn.utils.rnn.unpad_sequence(
        speech_token_emb, speech_token_len.cpu(), batch_first=True
    )
    for i in range(len(text_token)):
        if (
            random.random() < 0.5
            and speech_token_len[i] / text_token_len[i]
            > self.mix_ratio[1] / self.mix_ratio[0]
        ):
            # Interleaved layout: alternate mix_ratio[0] text tokens with
            # mix_ratio[1] speech tokens per chunk.
            this_lm_target, this_lm_input = [], []
            this_lm_target.append(IGNORE_ID)
            this_lm_input.append(
                self.llm_embedding.weight[self.sos_eos].reshape(1, -1)
            )
            for j in range(
                ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()
            ):
                this_text_token = text_token[i][
                    j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
                ].tolist()
                this_speech_token = speech_token[i][
                    j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
                ].tolist()
                if len(this_text_token) == self.mix_ratio[0]:
                    # Full text chunk must pair with a full speech chunk.
                    assert len(this_speech_token) == self.mix_ratio[1]
                    this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
                    this_lm_target += this_speech_token
                    this_lm_target.append(self.speech_token_size + 2)
                    this_lm_input.append(
                        text_token_emb[i][
                            j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
                        ]
                    )
                    this_lm_input.append(
                        speech_token_emb[i][
                            j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
                        ]
                    )
                else:
                    # Final partial chunk: remaining text, then all remaining
                    # speech tokens, then the end-of-speech stop token.
                    this_lm_target += [-1] * len(this_text_token)
                    this_lm_target += speech_token[i][
                        j * self.mix_ratio[1] :
                    ].tolist()
                    this_lm_target.append(self.speech_token_size)
                    this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :])
                    this_lm_input.append(
                        self.llm_embedding.weight[self.task_id].reshape(1, -1)
                    )
                    this_lm_input.append(
                        speech_token_emb[i][j * self.mix_ratio[1] :]
                    )
            # FIX: paddle.tensor / paddle.cat do not exist in Paddle's API;
            # use paddle.to_tensor and paddle.concat (axis=, not dim=).
            this_lm_target, this_lm_input = paddle.to_tensor(
                this_lm_target
            ), paddle.concat(this_lm_input, axis=0)
        else:
            # Sequential layout: <sos> text <task> speech, with text positions
            # masked out of the target via IGNORE_ID.
            this_lm_target = paddle.to_tensor(
                [IGNORE_ID] * (1 + text_token_len[i])
                + speech_token[i].tolist()
                + [self.speech_token_size]
            )
            this_lm_input = paddle.concat(
                [
                    self.llm_embedding.weight[self.sos_eos].reshape(1, -1),
                    text_token_emb[i],
                    self.llm_embedding.weight[self.task_id].reshape(1, -1),
                    speech_token_emb[i],
                ],
                axis=0,
            )
        lm_target.append(this_lm_target)
        lm_input.append(this_lm_input)
    # FIX: Paddle Tensor.size is a property (element count), not callable;
    # use shape[0] for the sequence length of each concatenated input.
    lm_input_len = paddle.to_tensor(
        [i.shape[0] for i in lm_input], dtype=paddle.int32
    )
    # NOTE(review): torch pad_sequence over Paddle tensors -- confirm this
    # works in the project's torch/paddle interop setup or port to a manual
    # Paddle pad.
    lm_input = torch.nn.utils.rnn.pad_sequence(
        lm_input, batch_first=True, padding_value=IGNORE_ID
    )
    lm_target = torch.nn.utils.rnn.pad_sequence(
        lm_target, batch_first=True, padding_value=IGNORE_ID
    )
    return lm_target, lm_input, lm_input_len
@paddle.no_grad()
def inference(
@ -598,7 +591,6 @@ class Qwen2LM(TransformerLM):
yield top_ids
out_tokens.append(top_ids)
lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1])
print(len(out_tokens))
@paddle.no_grad()
def inference_bistream(
self,

@ -1,189 +1,18 @@
import paddle
# ############################## 相关utils函数如下 ##############################
# def device2str(type=None, index=None, *, device=None):
# type = device if device else type
# if isinstance(type, int):
# type = f'gpu:{type}'
# elif isinstance(type, str):
# if 'cuda' in type:
# type = type.replace('cuda', 'gpu')
# if 'cpu' in type:
# type = 'cpu'
# elif index is not None:
# type = f'{type}:{index}'
# elif isinstance(type, paddle.CPUPlace) or (type is None):
# type = 'cpu'
# elif isinstance(type, paddle.CUDAPlace):
# type = f'gpu:{type.get_device_id()}'
# return type
# def _Tensor_max(self, *args, **kwargs):
# if "other" in kwargs:
# kwargs["y"] = kwargs.pop("other")
# ret = paddle.maximum(self, *args, **kwargs)
# elif len(args) == 1 and isinstance(args[0], paddle.Tensor):
# ret = paddle.maximum(self, *args, **kwargs)
# else:
# if "dim" in kwargs:
# kwargs["axis"] = kwargs.pop("dim")
# if "axis" in kwargs or len(args) >= 1:
# ret = paddle.max(self, *args, **kwargs), paddle.argmax(self, *args, **kwargs)
# else:
# ret = paddle.max(self, *args, **kwargs)
# return ret
# setattr(paddle.Tensor, "_max", _Tensor_max)
# ############################## 相关utils函数如上 ##############################
# """
# def subsequent_mask(
# size: int,
# device: torch.device = torch.device("cpu"),
# ) -> torch.Tensor:
# ""\"Create mask for subsequent steps (size, size).
# This mask is used only in decoder which works in an auto-regressive mode.
# This means the current step could only do attention with its left steps.
# In encoder, fully attention is used when streaming is not necessary and
# the sequence is not long. In this case, no attention mask is needed.
# When streaming is need, chunk-based attention is used in encoder. See
# subsequent_chunk_mask for the chunk-based attention mask.
# Args:
# size (int): size of mask
# str device (str): "cpu" or "cuda" or torch.Tensor.device
# dtype (torch.device): result dtype
# Returns:
# torch.Tensor: mask
# Examples:
# >>> subsequent_mask(3)
# [[1, 0, 0],
# [1, 1, 0],
# [1, 1, 1]]
# ""\"
# ret = torch.ones(size, size, device=device, dtype=torch.bool)
# return torch.tril(ret)
# """
# def subsequent_mask(
# size: int, device: paddle.device
# ) -> paddle.Tensor:
# """Create mask for subsequent steps (size, size).
# This mask is used only in decoder which works in an auto-regressive mode.
# This means the current step could only do attention with its left steps.
# In encoder, fully attention is used when streaming is not necessary and
# the sequence is not long. In this case, no attention mask is needed.
# When streaming is need, chunk-based attention is used in encoder. See
# subsequent_chunk_mask for the chunk-based attention mask.
# Args:
# size (int): size of mask
# str device (str): "cpu" or "cuda" or torch.Tensor.device
# dtype (torch.device): result dtype
# Returns:
# torch.Tensor: mask
# Examples:
# >>> subsequent_mask(3)
# [[1, 0, 0],
# [1, 1, 0],
# [1, 1, 1]]
# """
# arange = paddle.arange(size, device=device)
# mask = arange.expand(size, size)
# arange = arange.unsqueeze(-1)
# mask = mask <= arange
# return mask
# def subsequent_chunk_mask_deprecated(
# size: int,
# chunk_size: int,
# num_left_chunks: int = -1,
# >>>>>> device: torch.device = device2str("cpu"),
# ) -> paddle.Tensor:
# """Create mask for subsequent steps (size, size) with chunk size,
# this is for streaming encoder
# Args:
# size (int): size of mask
# chunk_size (int): size of chunk
# num_left_chunks (int): number of left chunks
# <0: use full chunk
# >=0: use num_left_chunks
# device (torch.device): "cpu" or "cuda" or torch.Tensor.device
# Returns:
# torch.Tensor: mask
# Examples:
# >>> subsequent_chunk_mask(4, 2)
# [[1, 1, 0, 0],
# [1, 1, 0, 0],
# [1, 1, 1, 1],
# [1, 1, 1, 1]]
# """
# ret = paddle.zeros(size, size, device=device, dtype=paddle.bool)
# for i in range(size):
# if num_left_chunks < 0:
# start = 0
# else:
# start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
# ending = min((i // chunk_size + 1) * chunk_size, size)
# ret[i, start:ending] = True
# return ret
# def subsequent_chunk_mask(
# size: int,
# chunk_size: int,
# num_left_chunks: int = -1,
# >>>>>> device: torch.device = device2str("cpu"),
# ) -> paddle.Tensor:
# """Create mask for subsequent steps (size, size) with chunk size,
# this is for streaming encoder
# Args:
# size (int): size of mask
# chunk_size (int): size of chunk
# num_left_chunks (int): number of left chunks
# <0: use full chunk
# >=0: use num_left_chunks
# device (torch.device): "cpu" or "cuda" or torch.Tensor.device
# Returns:
# torch.Tensor: mask
# Examples:
# >>> subsequent_chunk_mask(4, 2)
# [[1, 1, 0, 0],
# [1, 1, 0, 0],
# [1, 1, 1, 1],
# [1, 1, 1, 1]]
# """
# pos_idx = paddle.arange(size, device=device)
# block_value = (
# paddle.div(pos_idx, chunk_size, rounding_mode="trunc") + 1
# ) * chunk_size
# ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
# return ret
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
def add_optional_chunk_mask(
xs: paddle.Tensor,
masks: paddle.Tensor,

@ -1,3 +1,17 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import threading
import time
@ -8,10 +22,6 @@ from typing import Generator
import numpy as np
import paddle
# from cosyvoice.utils.common import TrtContextWrapper, fade_in_out
# from cosyvoice.utils.file_utils import *
# from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
class CosyVoiceModel:
def __init__(

@ -1,2 +0,0 @@
import torchaudio
import

@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,3 +1,18 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
"""Swish() activation function for Conformer."""

@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -14,11 +14,9 @@
# Modified from espnet(https://github.com/espnet/espnet)
"""Multi-Head Attention layer definition."""
import math
import numpy
import paddle
from paddle import nn
from paddlespeech.t2s.modules.masked_fill import masked_fill

@ -1,3 +1,17 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
"""ConvolutionModule definition."""

Loading…
Cancel
Save