fix code style

4 months ago · fe2db3292f
parent a7b9a9c9b0
commit fe2db3292f
18 changed files with 242 additions and 393 deletions
--- a/Linear_test.py
+++ b/Linear_test.py
@@ -1,11 +0,0 @@
-import paddle,torch,numpy
-torch_linear = torch.load("q.pt").cpu()
-paddle_linear_state = paddle.load("q.pdparams")
-paddle_linear = paddle.nn.Linear(896,896,bias_attr=True)
-hidden_states = paddle.load("hidden_states.pdparams")
-paddle_linear.set_state_dict(paddle_linear_state)
-torch_forward_res = torch_linear(torch.tensor(hidden_states.numpy()))
-paddle_forward_res = paddle_linear(hidden_states)
-print("torch_forward_res:",torch_forward_res)
-print("paddle_forward_res:",paddle_forward_res)
-print('allclose_res:',numpy.testing.assert_allclose(torch_forward_res.detach().numpy(),paddle_forward_res))
--- a/paddlespeech/cli/tts/cosyvoice.py
+++ b/paddlespeech/cli/tts/cosyvoice.py
@@ -1,34 +0,0 @@
-from paddlespeech.t2s.models.CosyVoice.cosyvoice import CosyVoice2
-import sys
-from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM
-from pathlib import Path
-import paddle
-import torch
-from paddlespeech.t2s.models.CosyVoice.llm import Qwen2LM,ras_sampling,Qwen2Encoder
-# cosyvoice_model = CosyVoice2("../CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle")
-model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2-0.5B')
-llm = Qwen2Encoder(model)
-qwen_lm = Qwen2LM(896,896,6561,llm,ras_sampling)
-state_dict = paddle.load("/root/paddlejob/workspace/zhangjinghong/CosyVoice/pretrained_models/CosyVoice2-0.5B_paddle/llm.pdparams")
-qwen_lm.set_state_dict(state_dict)
-new_dict = torch.load("data.pt")
-text = new_dict['text'] 
-text_len = new_dict['text_len']
-prompt_text = new_dict['prompt_text']
-prompt_text_len = new_dict['prompt_text_len']
-prompt_speech_token = new_dict['prompt_speech_token']
-prompt_speech_token_len = new_dict['prompt_speech_token_len']
-embedding = new_dict['embedding']
-uuid = new_dict['uuid']
-print("text:",text)
-# for i in qwen_lm.inference(text=paddle.to_tensor(text),
-#     text_len=text_len,
-#     prompt_text=paddle.to_tensor(prompt_text),
-#     prompt_text_len=prompt_text_len,
-#     prompt_speech_token=paddle.to_tensor(prompt_speech_token),
-#     prompt_speech_token_len=prompt_speech_token_len,
-#     embedding=paddle.to_tensor(embedding,dtype = 'float32'),
-#     uuid=uuid):
-#     print(text)
-#     print(i)
-
--- a/paddlespeech/t2s/models/CosyVoice/__init__.py
+++ b/paddlespeech/t2s/models/CosyVoice/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/paddlespeech/t2s/models/CosyVoice/class_utils.py
+++ b/paddlespeech/t2s/models/CosyVoice/class_utils.py
@@ -1,9 +1,17 @@
-import paddle
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-# from cosyvoice.cli.model import CosyVoice2Model, CosyVoiceModel
-# from cosyvoice.flow.flow import CausalMaskedDiffWithXvec, MaskedDiffWithXvec
-# from cosyvoice.hifigan.generator import HiFTGenerator
-# from cosyvoice.llm.llm import Qwen2LM, TransformerLM
 from paddlespeech.t2s.modules.transformer.activation import Swish
 from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
 from paddlespeech.t2s.modules.transformer.embedding import EspnetRelPositionalEncoding
@@ -23,18 +31,3 @@ COSYVOICE_ATTENTION_CLASSES = {
    "rel_selfattn": RelPositionMultiHeadedAttention,
 }

-
-# def get_model_type(configs):
-#     if (
-#         isinstance(configs["llm"], TransformerLM)
-#         and isinstance(configs["flow"], MaskedDiffWithXvec)
-#         and isinstance(configs["hift"], HiFTGenerator)
-#     ):
-#         return CosyVoiceModel
-#     if (
-#         isinstance(configs["llm"], Qwen2LM)
-#         and isinstance(configs["flow"], CausalMaskedDiffWithXvec)
-#         and isinstance(configs["hift"], HiFTGenerator)
-#     ):
-#         return CosyVoice2Model
-#     raise TypeError("No valid model type found!")
--- a/paddlespeech/t2s/models/CosyVoice/common.py
+++ b/paddlespeech/t2s/models/CosyVoice/common.py
@@ -1,14 +1,24 @@
-import paddle
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-"""Unility functions for Transformer."""
+import paddle
 import queue
 import random
 from typing import List

 import numpy as np

-############################## 相关utils函数，如下 ##############################
-
 def device2str(type=None, index=None, *, device=None):
    type = device if device else type
    if isinstance(type, int):
@@ -26,7 +36,6 @@ def device2str(type=None, index=None, *, device=None):
        type = f'gpu:{type.get_device_id()}'

    return type
-############################## 相关utils函数，如上 ##############################


 IGNORE_ID = -1
@@ -128,7 +137,6 @@ def ras_sampling(
        .sum()
        .item()
    )
-    print("top_ids:",top_ids)
    if rep_num >= win_size * tau_r:
        top_ids = random_sampling(weighted_scores, decoded_tokens, sampling)[0]
    return top_ids
@@ -150,7 +158,6 @@ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
            break
    prob = paddle.to_tensor(prob).cuda()
    indices = paddle.to_tensor(indices, dtype=paddle.long).to(weighted_scores.place)
-    print("indices:",indices)
    # top_ids = indices[prob.multinomial(num_samples=1, replacement=True)]
    top_ids = indices[0]
    return top_ids
@@ -160,7 +167,6 @@ def random_sampling(weighted_scores, decoded_tokens, sampling):
    top_ids = weighted_scores.softmax(axis=0).multinomial(
        num_samples=1, replacement=True
    )
-    print("random_sampling:",top_ids)
    return top_ids


--- a/paddlespeech/t2s/models/CosyVoice/cosyvoice.py
+++ b/paddlespeech/t2s/models/CosyVoice/cosyvoice.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import time
 from typing import Generator
--- a/paddlespeech/t2s/models/CosyVoice/flow.py
+++ b/paddlespeech/t2s/models/CosyVoice/flow.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from typing import Any
 from typing import Dict
--- a/paddlespeech/t2s/models/CosyVoice/frontend.py
+++ b/paddlespeech/t2s/models/CosyVoice/frontend.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import json
 import os
 import re
@@ -320,8 +333,6 @@ class CosyVoiceFrontEnd:

    def frontend_sft(self, tts_text, spk_id):
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
-        print("1" * 30)
-        print(self.spk2info.keys())
        embedding = self.spk2info[spk_id]["embedding"]
        model_input = {
            "text": tts_text_token,
--- a/paddlespeech/t2s/models/CosyVoice/llm.py
+++ b/paddlespeech/t2s/models/CosyVoice/llm.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import queue
 import random
 import threading
@@ -7,10 +21,6 @@ import logging
 import paddle.nn.functional as F
 import paddle
 IGNORE_ID = -1
-# from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
-# from cosyvoice.utils.common import IGNORE_ID, th_accuracy
-# from cosyvoice.utils.file_utils import logging
-# from cosyvoice.utils.mask import make_pad_mask
 import torch
 LabelSmoothingLoss = None
 def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
@@ -368,23 +378,6 @@ class Qwen2LM(TransformerLM):
        self.llm_decoder = paddle.nn.Linear(
            in_features=llm_output_size, out_features=speech_token_size + 3
        )
-        # self.llm_decoder.weight = paddle.create_parameter(
-        #     shape=self.llm_decoder.weight.shape,
-        #     dtype='bfloat16',
-        #     default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.weight.astype('bfloat16'))
-        # )
-        # if self.llm_decoder.bias is not None:
-        #     self.llm_decoder.bias = paddle.create_parameter(
-        #         shape=self.llm_decoder.bias.shape,
-        #         dtype='bfloat16',
-        #         default_initializer=paddle.nn.initializer.Assign(self.llm_decoder.bias.astype('bfloat16'))
-        #     )
-        # self.criterion_ce = LabelSmoothingLoss(
-        #     size=speech_token_size + 3,
-        #     padding_idx=IGNORE_ID,
-        #     smoothing=lsm_weight,
-        #     normalize_length=length_normalized_loss,
-        # )
        self.speech_embedding = paddle.nn.Embedding(
            speech_token_size + 3, llm_input_size
        )
@@ -393,104 +386,104 @@ class Qwen2LM(TransformerLM):
        self.stop_token_ids = [(speech_token_size + i) for i in range(3)]
        self.vllm_output_queue = {}

-    # def prepare_lm_input_target(
-    #     self,
-    #     text_token,
-    #     text_token_emb,
-    #     text_token_len,
-    #     speech_token,
-    #     speech_token_emb,
-    #     speech_token_len,
-    # ):
-    #     lm_target, lm_input = [], []
-    #     text_token = torch.nn.utils.rnn.unpad_sequence(
-    #         text_token, text_token_len.cpu(), batch_first=True
-    #     )
-    #     speech_token = torch.nn.utils.rnn.unpad_sequence(
-    #         speech_token, speech_token_len.cpu(), batch_first=True
-    #     )
-    #     text_token_emb = torch.nn.utils.rnn.unpad_sequence(
-    #         text_token_emb, text_token_len.cpu(), batch_first=True
-    #     )
-    #     speech_token_emb = torch.nn.utils.rnn.unpad_sequence(
-    #         speech_token_emb, speech_token_len.cpu(), batch_first=True
-    #     )
-    #     for i in range(len(text_token)):
-    #         if (
-    #             random.random() < 0.5
-    #             and speech_token_len[i] / text_token_len[i]
-    #             > self.mix_ratio[1] / self.mix_ratio[0]
-    #         ):
-    #             this_lm_target, this_lm_input = [], []
-    #             this_lm_target.append(IGNORE_ID)
-    #             this_lm_input.append(
-    #                 self.llm_embedding.weight[self.sos_eos].reshape(1, -1)
-    #             )
-    #             for j in range(
-    #                 ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()
-    #             ):
-    #                 this_text_token = text_token[i][
-    #                     j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
-    #                 ].tolist()
-    #                 this_speech_token = speech_token[i][
-    #                     j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
-    #                 ].tolist()
-    #                 if len(this_text_token) == self.mix_ratio[0]:
-    #                     assert len(this_speech_token) == self.mix_ratio[1]
-    #                     this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
-    #                     this_lm_target += this_speech_token
-    #                     this_lm_target.append(self.speech_token_size + 2)
-    #                     this_lm_input.append(
-    #                         text_token_emb[i][
-    #                             j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
-    #                         ]
-    #                     )
-    #                     this_lm_input.append(
-    #                         speech_token_emb[i][
-    #                             j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
-    #                         ]
-    #                     )
-    #                 else:
-    #                     this_lm_target += [-1] * len(this_text_token)
-    #                     this_lm_target += speech_token[i][
-    #                         j * self.mix_ratio[1] :
-    #                     ].tolist()
-    #                     this_lm_target.append(self.speech_token_size)
-    #                     this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :])
-    #                     this_lm_input.append(
-    #                         self.llm_embedding.weight[self.task_id].reshape(1, -1)
-    #                     )
-    #                     this_lm_input.append(
-    #                         speech_token_emb[i][j * self.mix_ratio[1] :]
-    #                     )
-    #             this_lm_target, this_lm_input = paddle.tensor(
-    #                 this_lm_target
-    #             ), paddle.cat(this_lm_input, dim=0)
-    #         else:
-    #             this_lm_target = paddle.tensor(
-    #                 [IGNORE_ID] * (1 + text_token_len[i])
-    #                 + speech_token[i].tolist()
-    #                 + [self.speech_token_size]
-    #             )
-    #             this_lm_input = paddle.cat(
-    #                 [
-    #                     self.llm_embedding.weight[self.sos_eos].reshape(1, -1),
-    #                     text_token_emb[i],
-    #                     self.llm_embedding.weight[self.task_id].reshape(1, -1),
-    #                     speech_token_emb[i],
-    #                 ],
-    #                 dim=0,
-    #             )
-    #         lm_target.append(this_lm_target)
-    #         lm_input.append(this_lm_input)
-    #     lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32)
-    #     lm_input = torch.nn.utils.rnn.pad_sequence(
-    #         lm_input, batch_first=True, padding_value=IGNORE_ID
-    #     )
-    #     lm_target = torch.nn.utils.rnn.pad_sequence(
-    #         lm_target, batch_first=True, padding_value=IGNORE_ID
-    #     )
-    #     return lm_target, lm_input, lm_input_len
+    def prepare_lm_input_target(  # Build padded LM targets and input embeddings for a batch.
+        self,
+        text_token,  # padded text token ids; presumably (batch, time) -- TODO confirm
+        text_token_emb,  # presumably the embeddings of text_token; sliced with the same indices below
+        text_token_len,  # per-sample valid length of text_token
+        speech_token,  # padded speech token ids
+        speech_token_emb,  # presumably the embeddings of speech_token; sliced with the same indices below
+        speech_token_len,  # per-sample valid length of speech_token
+    ):
+        lm_target, lm_input = [], []  # per-sample results, padded into batches after the loop
+        text_token = torch.nn.utils.rnn.unpad_sequence(  # NOTE(review): torch helper inside a Paddle model -- confirm the tensor types passed in
+            text_token, text_token_len.cpu(), batch_first=True
+        )
+        speech_token = torch.nn.utils.rnn.unpad_sequence(
+            speech_token, speech_token_len.cpu(), batch_first=True
+        )
+        text_token_emb = torch.nn.utils.rnn.unpad_sequence(
+            text_token_emb, text_token_len.cpu(), batch_first=True
+        )
+        speech_token_emb = torch.nn.utils.rnn.unpad_sequence(
+            speech_token_emb, speech_token_len.cpu(), batch_first=True
+        )
+        for i in range(len(text_token)):  # one iteration per sample
+            if (
+                random.random() < 0.5  # interleaved layout is chosen at random, half of the time
+                and speech_token_len[i] / text_token_len[i]
+                > self.mix_ratio[1] / self.mix_ratio[0]  # and only when speech-per-text exceeds the mix ratio
+            ):
+                this_lm_target, this_lm_input = [], []  # interleaved layout: text and speech alternate in chunks
+                this_lm_target.append(IGNORE_ID)  # no target for the leading sos_eos position
+                this_lm_input.append(
+                    self.llm_embedding.weight[self.sos_eos].reshape(1, -1)
+                )
+                for j in range(
+                    ((text_token_len[i] + 1) / self.mix_ratio[0]).ceil().int().item()  # the +1 makes the last chunk always short (possibly empty)
+                ):
+                    this_text_token = text_token[i][
+                        j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
+                    ].tolist()
+                    this_speech_token = speech_token[i][
+                        j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
+                    ].tolist()
+                    if len(this_text_token) == self.mix_ratio[0]:  # full text chunk, paired with a full speech chunk
+                        assert len(this_speech_token) == self.mix_ratio[1]
+                        this_lm_target += [IGNORE_ID] * (self.mix_ratio[0] - 1)
+                        this_lm_target += this_speech_token
+                        this_lm_target.append(self.speech_token_size + 2)  # id speech_token_size + 2 closes every full chunk
+                        this_lm_input.append(
+                            text_token_emb[i][
+                                j * self.mix_ratio[0] : (j + 1) * self.mix_ratio[0]
+                            ]
+                        )
+                        this_lm_input.append(
+                            speech_token_emb[i][
+                                j * self.mix_ratio[1] : (j + 1) * self.mix_ratio[1]
+                            ]
+                        )
+                    else:  # short text chunk (the last one): emit all remaining speech
+                        this_lm_target += [-1] * len(this_text_token)  # -1 is the value of IGNORE_ID
+                        this_lm_target += speech_token[i][
+                            j * self.mix_ratio[1] :
+                        ].tolist()
+                        this_lm_target.append(self.speech_token_size)  # id speech_token_size is the final target
+                        this_lm_input.append(text_token_emb[i][j * self.mix_ratio[0] :])
+                        this_lm_input.append(
+                            self.llm_embedding.weight[self.task_id].reshape(1, -1)
+                        )
+                        this_lm_input.append(
+                            speech_token_emb[i][j * self.mix_ratio[1] :]
+                        )
+                this_lm_target, this_lm_input = paddle.tensor(  # NOTE(review): torch-style paddle.tensor / paddle.cat(dim=) -- confirm the installed Paddle provides them
+                    this_lm_target
+                ), paddle.cat(this_lm_input, dim=0)
+            else:  # plain layout: sos_eos, all text, task_id, all speech
+                this_lm_target = paddle.tensor(
+                    [IGNORE_ID] * (1 + text_token_len[i])  # ignore the sos_eos and text positions
+                    + speech_token[i].tolist()
+                    + [self.speech_token_size]  # id speech_token_size is the final target
+                )
+                this_lm_input = paddle.cat(
+                    [
+                        self.llm_embedding.weight[self.sos_eos].reshape(1, -1),
+                        text_token_emb[i],
+                        self.llm_embedding.weight[self.task_id].reshape(1, -1),
+                        speech_token_emb[i],
+                    ],
+                    dim=0,
+                )
+            lm_target.append(this_lm_target)
+            lm_input.append(this_lm_input)
+        lm_input_len = paddle.tensor([i.size(0) for i in lm_input], dtype=paddle.int32)  # first-dimension size of each input before padding
+        lm_input = torch.nn.utils.rnn.pad_sequence(  # pad every sample to the longest one, filling with IGNORE_ID
+            lm_input, batch_first=True, padding_value=IGNORE_ID
+        )
+        lm_target = torch.nn.utils.rnn.pad_sequence(
+            lm_target, batch_first=True, padding_value=IGNORE_ID
+        )
+        return lm_target, lm_input, lm_input_len  # padded targets, padded inputs, per-sample input lengths

    @paddle.no_grad()
    def inference(
@@ -598,7 +591,6 @@ class Qwen2LM(TransformerLM):
                yield top_ids
                out_tokens.append(top_ids)
                lm_input = self.speech_embedding.weight[top_ids].reshape([1, 1, -1])
-            print(len(out_tokens))
    @paddle.no_grad()
    def inference_bistream(
        self,
--- a/paddlespeech/t2s/models/CosyVoice/mask.py
+++ b/paddlespeech/t2s/models/CosyVoice/mask.py
@@ -1,189 +1,18 @@
-import paddle
-
-# ############################## 相关utils函数，如下 ##############################
-
-# def device2str(type=None, index=None, *, device=None):
-#     type = device if device else type
-#     if isinstance(type, int):
-#         type = f'gpu:{type}'
-#     elif isinstance(type, str):
-#         if 'cuda' in type:
-#             type = type.replace('cuda', 'gpu')
-#         if 'cpu' in type:
-#             type = 'cpu'
-#         elif index is not None:
-#             type = f'{type}:{index}'
-#     elif isinstance(type, paddle.CPUPlace) or (type is None):
-#         type = 'cpu'
-#     elif isinstance(type, paddle.CUDAPlace):
-#         type = f'gpu:{type.get_device_id()}'
-
-#     return type
-
-# def _Tensor_max(self, *args, **kwargs):
-#     if "other" in kwargs:
-#         kwargs["y"] = kwargs.pop("other")
-#         ret = paddle.maximum(self, *args, **kwargs)
-#     elif len(args) == 1 and isinstance(args[0], paddle.Tensor):
-#         ret = paddle.maximum(self, *args, **kwargs)
-#     else:
-#         if "dim" in kwargs:
-#             kwargs["axis"] = kwargs.pop("dim")
-
-#         if "axis" in kwargs or len(args) >= 1:
-#             ret = paddle.max(self, *args, **kwargs), paddle.argmax(self, *args, **kwargs)
-#         else:
-#             ret = paddle.max(self, *args, **kwargs)
-
-#     return ret
-
-# setattr(paddle.Tensor, "_max", _Tensor_max)
-# ############################## 相关utils函数，如上 ##############################
-
-
-# """
-# def subsequent_mask(
-#         size: int,
-#         device: torch.device = torch.device("cpu"),
-# ) -> torch.Tensor:
-#     ""\"Create mask for subsequent steps (size, size).
-
-#     This mask is used only in decoder which works in an auto-regressive mode.
-#     This means the current step could only do attention with its left steps.
-
-#     In encoder, fully attention is used when streaming is not necessary and
-#     the sequence is not long. In this  case, no attention mask is needed.
-
-#     When streaming is need, chunk-based attention is used in encoder. See
-#     subsequent_chunk_mask for the chunk-based attention mask.
-
-#     Args:
-#         size (int): size of mask
-#         str device (str): "cpu" or "cuda" or torch.Tensor.device
-#         dtype (torch.device): result dtype
-
-#     Returns:
-#         torch.Tensor: mask
-
-#     Examples:
-#         >>> subsequent_mask(3)
-#         [[1, 0, 0],
-#          [1, 1, 0],
-#          [1, 1, 1]]
-#     ""\"
-#     ret = torch.ones(size, size, device=device, dtype=torch.bool)
-#     return torch.tril(ret)
-# """
-
-
-# def subsequent_mask(
-#     size: int, device: paddle.device
-# ) -> paddle.Tensor:
-#     """Create mask for subsequent steps (size, size).
-
-#     This mask is used only in decoder which works in an auto-regressive mode.
-#     This means the current step could only do attention with its left steps.
-
-#     In encoder, fully attention is used when streaming is not necessary and
-#     the sequence is not long. In this  case, no attention mask is needed.
-
-#     When streaming is need, chunk-based attention is used in encoder. See
-#     subsequent_chunk_mask for the chunk-based attention mask.
-
-#     Args:
-#         size (int): size of mask
-#         str device (str): "cpu" or "cuda" or torch.Tensor.device
-#         dtype (torch.device): result dtype
-
-#     Returns:
-#         torch.Tensor: mask
-
-#     Examples:
-#         >>> subsequent_mask(3)
-#         [[1, 0, 0],
-#          [1, 1, 0],
-#          [1, 1, 1]]
-#     """
-#     arange = paddle.arange(size, device=device)
-#     mask = arange.expand(size, size)
-#     arange = arange.unsqueeze(-1)
-#     mask = mask <= arange
-#     return mask
-
-
-# def subsequent_chunk_mask_deprecated(
-#     size: int,
-#     chunk_size: int,
-#     num_left_chunks: int = -1,
-# >>>>>>    device: torch.device = device2str("cpu"),
-# ) -> paddle.Tensor:
-#     """Create mask for subsequent steps (size, size) with chunk size,
-#        this is for streaming encoder
-
-#     Args:
-#         size (int): size of mask
-#         chunk_size (int): size of chunk
-#         num_left_chunks (int): number of left chunks
-#             <0: use full chunk
-#             >=0: use num_left_chunks
-#         device (torch.device): "cpu" or "cuda" or torch.Tensor.device
-
-#     Returns:
-#         torch.Tensor: mask
-
-#     Examples:
-#         >>> subsequent_chunk_mask(4, 2)
-#         [[1, 1, 0, 0],
-#          [1, 1, 0, 0],
-#          [1, 1, 1, 1],
-#          [1, 1, 1, 1]]
-#     """
-#     ret = paddle.zeros(size, size, device=device, dtype=paddle.bool)
-#     for i in range(size):
-#         if num_left_chunks < 0:
-#             start = 0
-#         else:
-#             start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
-#         ending = min((i // chunk_size + 1) * chunk_size, size)
-#         ret[i, start:ending] = True
-#     return ret
-
-
-# def subsequent_chunk_mask(
-#     size: int,
-#     chunk_size: int,
-#     num_left_chunks: int = -1,
-# >>>>>>    device: torch.device = device2str("cpu"),
-# ) -> paddle.Tensor:
-#     """Create mask for subsequent steps (size, size) with chunk size,
-#        this is for streaming encoder
-
-#     Args:
-#         size (int): size of mask
-#         chunk_size (int): size of chunk
-#         num_left_chunks (int): number of left chunks
-#             <0: use full chunk
-#             >=0: use num_left_chunks
-#         device (torch.device): "cpu" or "cuda" or torch.Tensor.device
-
-#     Returns:
-#         torch.Tensor: mask
-
-#     Examples:
-#         >>> subsequent_chunk_mask(4, 2)
-#         [[1, 1, 0, 0],
-#          [1, 1, 0, 0],
-#          [1, 1, 1, 1],
-#          [1, 1, 1, 1]]
-#     """
-#     pos_idx = paddle.arange(size, device=device)
-#     block_value = (
-#         paddle.div(pos_idx, chunk_size, rounding_mode="trunc") + 1
-#     ) * chunk_size
-#     ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
-#     return ret
-
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+import paddle
 def add_optional_chunk_mask(
    xs: paddle.Tensor,
    masks: paddle.Tensor,
--- a/paddlespeech/t2s/models/CosyVoice/model.py
+++ b/paddlespeech/t2s/models/CosyVoice/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import threading
 import time
@@ -8,10 +22,6 @@ from typing import Generator
 import numpy as np
 import paddle

-# from cosyvoice.utils.common import TrtContextWrapper, fade_in_out
-# from cosyvoice.utils.file_utils import *
-# from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm
-

 class CosyVoiceModel:
    def __init__(
--- a/paddlespeech/t2s/models/CosyVoice/test.py
+++ b/paddlespeech/t2s/models/CosyVoice/test.py
@@ -1,2 +0,0 @@
-import torchaudio
-import 
--- a/paddlespeech/t2s/modules/activation.py
+++ b/paddlespeech/t2s/modules/activation.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/paddlespeech/t2s/modules/conv.py
+++ b/paddlespeech/t2s/modules/conv.py
@@ -1,4 +1,4 @@
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/paddlespeech/t2s/modules/transformer/__init__.py
+++ b/paddlespeech/t2s/modules/transformer/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/paddlespeech/t2s/modules/transformer/activation.py
+++ b/paddlespeech/t2s/modules/transformer/activation.py
@@ -1,3 +1,18 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 import paddle

 """Swish() activation function for Conformer."""
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,11 +14,9 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 """Multi-Head Attention layer definition."""
 import math
-
 import numpy
 import paddle
 from paddle import nn
-
 from paddlespeech.t2s.modules.masked_fill import masked_fill


--- a/paddlespeech/t2s/modules/transformer/convolution.py
+++ b/paddlespeech/t2s/modules/transformer/convolution.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle

 """ConvolutionModule definition."""