# MIT License, Copyright (c) 2023-Present, Descript.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from audiotools (https://github.com/descriptinc/audiotools/blob/master/audiotools/ml/accelerator.py)
import os
import typing

import paddle
import paddle.distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.io import SequenceSampler


class ResumableDistributedSampler(DistributedBatchSampler):
    """Distributed sampler that can be resumed from a given start index."""

    def __init__(self, dataset, start_idx: int=None, **kwargs):
        super().__init__(dataset, **kwargs)
        # Start index; allows resuming an experiment from the index where it stopped.
        self.start_idx = start_idx // self.num_replicas if start_idx is not None else 0

    def __iter__(self):
        for i, idx in enumerate(super().__iter__()):
            if i >= self.start_idx:
                yield idx
        self.start_idx = 0  # set the index back to 0 for the next epoch


class ResumableSequentialSampler(SequenceSampler):
    """Sequential sampler that can be resumed from a given start index."""

    def __init__(self, dataset, start_idx: int=None, **kwargs):
        super().__init__(dataset, **kwargs)
        # Start index; allows resuming an experiment from the index where it stopped.
        self.start_idx = start_idx if start_idx is not None else 0

    def __iter__(self):
        for i, idx in enumerate(super().__iter__()):
            if i >= self.start_idx:
                yield idx
        self.start_idx = 0  # set the index back to 0 for the next epoch


class Accelerator:
    """Prepares models and dataloaders for use with DDP or DP.

    Use ``prepare_model`` and ``prepare_dataloader`` to prepare the respective
    objects. Models are moved to the appropriate GPU; for dataloaders, a
    sampler is created and the dataloader is initialized with that sampler.

    If the world size is 1, ``prepare_model`` and ``prepare_dataloader`` are
    no-ops. If the environment variable ``PADDLE_TRAINER_ID`` is not set, the
    script was launched without ``paddle.distributed.launch``, so
    ``DataParallel`` is used instead of ``DistributedDataParallel`` (not
    recommended) when the world size (number of GPUs) is greater than 1.

    Parameters
    ----------
    amp : bool, optional
        Whether or not to enable automatic mixed precision via ``paddle.amp``,
        by default False.
    """

    def __init__(self, amp: bool=False):
        trainer_id = os.getenv("PADDLE_TRAINER_ID", None)
        self.world_size = paddle.distributed.get_world_size()

        self.use_ddp = self.world_size > 1 and trainer_id is not None
        self.use_dp = self.world_size > 1 and trainer_id is None
        # Fall back to CPU when no GPU is visible to the process.
        self.device = "cpu" if paddle.device.cuda.device_count() == 0 else "gpu"

        if self.use_ddp:
            trainer_id = int(trainer_id)
            dist.init_parallel_env()

        self.local_rank = 0 if trainer_id is None else int(trainer_id)
        self.amp = amp

        class DummyScaler:
            """No-op stand-in for ``paddle.amp.GradScaler`` when AMP is disabled."""

            def __init__(self):
                pass

            def step(self, optimizer):
                optimizer.step()

            def scale(self, loss):
                return loss

            def unscale_(self, optimizer):
                return optimizer

            def update(self):
                pass

        self.scaler = paddle.amp.GradScaler() if self.amp else DummyScaler()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass

    def prepare_model(self, model: paddle.nn.Layer, **kwargs):
        """Prepares model for DDP or DP. The model is moved
        to the device of the correct rank.

        Parameters
        ----------
        model : paddle.nn.Layer
            Model that is converted for DDP or DP.

        Returns
        -------
        paddle.nn.Layer
            Wrapped model, or original model if DDP and DP are turned off.
""" if self.use_ddp: model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) model = paddle.DataParallel(model, **kwargs) elif self.use_dp: model = paddle.DataParallel(model, **kwargs) return model def autocast(self, *args, **kwargs): return paddle.amp.auto_cast(self.amp, *args, **kwargs) def backward(self, loss: paddle.Tensor): """Backwards pass. Parameters ---------- loss : paddle.Tensor Loss value. """ scaled = self.scaler.scale(loss) # scale the loss scaled.backward() def step(self, optimizer: paddle.optimizer.Optimizer): """Steps the optimizer. Parameters ---------- optimizer : paddle.optimizer.Optimizer Optimizer to step forward. """ self.scaler.step(optimizer) def update(self): # https://www.paddlepaddle.org.cn/documentation/docs/zh/2.6/api/paddle/amp/GradScaler_cn.html#step-optimizer self.scaler.update() def prepare_dataloader(self, dataset: typing.Iterable, start_idx: int=None, **kwargs): """Wraps a dataset with a DataLoader, using the correct sampler if DDP is enabled. Parameters ---------- dataset : typing.Iterable Dataset to build Dataloader around. start_idx : int, optional Start index of sampler, useful if resuming from some epoch, by default None Returns ------- DataLoader Wrapped DataLoader. """ if self.use_ddp: sampler = ResumableDistributedSampler( dataset, start_idx, batch_size=kwargs.get("batch_size", 1), shuffle=kwargs.get("shuffle", True), drop_last=kwargs.get("drop_last", False), num_replicas=self.world_size, rank=self.local_rank, ) if "num_workers" in kwargs: kwargs["num_workers"] = max(kwargs["num_workers"] // self.world_size, 1) else: sampler = ResumableSequentialSampler(dataset, start_idx) dataloader = DataLoader( dataset, batch_sampler=sampler if self.use_ddp else None, sampler=sampler if not self.use_ddp else None, **kwargs, ) return dataloader @staticmethod def unwrap(model): return model