@@ -0,0 +1,37 @@
# Configuration for DAC model

# Model Configuration
model:
  sample_rate: 44100
  encoder_dims: 512
  decoder_dims: 512
  num_residual_layers: 10
  n_fft: 1024
  hop_length: 256

# Training Configuration
training:
  batch_size: 16
  lr: 0.0001
  weight_decay: 0.0001
  gradient_clip_val: 1.0
  max_epochs: 200
  warmup_steps: 1000

  # Loss weights
  recon_loss_weight: 1.0
  mel_loss_weight: 10.0
  adversarial_loss_weight: 0.1

# Data Configuration
data:
  train_manifest: "data/train/manifest.json"
  dev_manifest: "data/dev/manifest.json"
  test_manifest: "data/test/manifest.json"
  max_duration: 10.0  # Maximum audio length in seconds
  min_duration: 1.0  # Minimum audio length in seconds

# Distributed Training Configuration
distributed:
  world_size: 1
  find_unused_parameters: true
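The `training` section above defines three loss weights that no script in this PR consumes yet. A hypothetical sketch of how they might be combined into a total loss; the config path and the individual loss tensors are placeholders, not part of this PR:

```python
import paddle
import yaml

# Hypothetical: combine the configured loss weights into a total loss.
# The config path and the individual loss values below are placeholders.
with open("examples/audio/codec/dac/conf/default.yaml") as f:
    training_cfg = yaml.safe_load(f)["training"]

recon_loss = paddle.to_tensor(0.05)  # placeholder reconstruction loss
mel_loss = paddle.to_tensor(0.02)    # placeholder mel-spectrogram loss
adv_loss = paddle.to_tensor(0.10)    # placeholder adversarial loss

total_loss = (training_cfg["recon_loss_weight"] * recon_loss +
              training_cfg["mel_loss_weight"] * mel_loss +
              training_cfg["adversarial_loss_weight"] * adv_loss)
print(float(total_loss))
```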
@@ -0,0 +1,217 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation script for the DAC model.

This script evaluates the DAC model using multiple quality metrics.
"""

import argparse
import os
import json
import numpy as np
import soundfile as sf
from tqdm import tqdm
import yaml

import paddle
from paddle.io import DataLoader

from paddlespeech.audio.codec.dac.model import DACModel
from paddlespeech.audio.codec.dac.evaluator import DACEvaluator, DACAudioMetrics
from paddlespeech.audio.codec.dac.inferencer import DACInferencer
# TODO: Import dataset classes once implemented


def main(args):
    """Run DAC model evaluation on the test dataset.

    Args:
        args: Command line arguments
    """
    # Load configuration
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)

    # Initialize model
    model = DACModel(**config['model'])

    # Load model checkpoint
    state_dict = paddle.load(args.checkpoint)
    model.set_state_dict(state_dict)
    model.eval()

    # Setup distributed evaluation if requested
    if args.ngpus > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # TODO: Setup dataset and dataloader
    # This is a placeholder for the dataset setup
    # test_dataset = ...
    # test_dataloader = DataLoader(...)

    # Initialize evaluator
    evaluator = DACEvaluator(
        model=model,
        dataloader=None,  # TODO: Replace with actual test_dataloader
        sample_rate=config['model'].get('sample_rate', 44100))

    # Run evaluation
    results = {}
    if args.mode == 'dataset':
        # Evaluate on the full dataset
        results = evaluator.evaluate()
    elif args.mode == 'directory':
        # Evaluate on a directory of audio files
        results = evaluate_directory(args.input_dir, args.reference_dir, model,
                                     config)

    # Save results
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    # Print summary
    print("\nEvaluation Results:")
    print(json.dumps(results, indent=2))


def evaluate_directory(input_dir, reference_dir, model, config):
    """Evaluate the model on a directory of audio files.

    Args:
        input_dir: Directory containing input audio files
        reference_dir: Directory containing reference audio files
        model: DAC model instance
        config: Configuration dictionary

    Returns:
        dict: Dictionary of evaluation metrics
    """
    inferencer = DACInferencer(
        checkpoint_path=None,  # The model has already been loaded
        model_config=config['model'])
    inferencer.model = model

    metrics_calculator = DACAudioMetrics(
        sample_rate=config['model'].get('sample_rate', 44100))

    all_metrics = {}
    file_metrics = []

    # Get the list of files
    files = [
        f for f in os.listdir(input_dir)
        if f.endswith(('.wav', '.mp3', '.flac'))
    ]

    # Process each file
    for filename in tqdm(files):
        input_path = os.path.join(input_dir, filename)
        reference_path = os.path.join(reference_dir, filename)

        if not os.path.exists(reference_path):
            print(f"Warning: Reference file not found: {reference_path}")
            continue

        # Load audio files
        input_audio, sr_in = sf.read(input_path)
        reference_audio, sr_ref = sf.read(reference_path)

        # Downmix to mono if stereo
        if input_audio.ndim > 1:
            input_audio = input_audio.mean(axis=1)
        if reference_audio.ndim > 1:
            reference_audio = reference_audio.mean(axis=1)

        # Ensure the same length
        min_len = min(len(input_audio), len(reference_audio))
        input_audio = input_audio[:min_len]
        reference_audio = reference_audio[:min_len]

        # Process through the model
        reconstructed_audio = inferencer.reconstruct(input_audio)

        # Calculate metrics
        metrics = metrics_calculator.compute_metrics(reference_audio,
                                                     reconstructed_audio)

        # Store per-file results
        file_result = {
            'filename': filename,
            'metrics': metrics
        }
        file_metrics.append(file_result)

        # Accumulate metrics
        for key, value in metrics.items():
            if key not in all_metrics:
                all_metrics[key] = []
            all_metrics[key].append(value)

    # Calculate averages
    avg_metrics = {key: np.mean(values) for key, values in all_metrics.items()}

    # Prepare results
    results = {
        'average_metrics': avg_metrics,
        'per_file_metrics': file_metrics
    }

    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate DAC model")

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to configuration file")

    parser.add_argument(
        "--checkpoint",
        type=str,
        required=True,
        help="Path to model checkpoint")

    parser.add_argument(
        "--mode",
        type=str,
        default="dataset",
        choices=["dataset", "directory"],
        help="Evaluation mode: use the test dataset or a directory of files")

    parser.add_argument(
        "--input-dir",
        type=str,
        default=None,
        help="Directory containing input audio files (for directory mode)")

    parser.add_argument(
        "--reference-dir",
        type=str,
        default=None,
        help="Directory containing reference audio files (for directory mode)")

    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Path to save evaluation results (JSON format)")

    parser.add_argument(
        "--ngpus",
        type=int,
        default=1,
        help="Number of GPUs for distributed evaluation")

    args = parser.parse_args()
    main(args)
@@ -0,0 +1,149 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference script for the DAC model.

This script demonstrates how to use the DAC model for audio encoding/decoding.
"""

import argparse
import os
import numpy as np
import soundfile as sf
import yaml

import paddle
from paddlespeech.audio.codec.dac.inferencer import DACInferencer


def main(args):
    """Run DAC inference on audio file(s).

    Args:
        args: Command line arguments
    """
    # Load configuration if specified
    model_config = {}
    if args.config:
        with open(args.config, 'r') as f:
            config = yaml.safe_load(f)
        model_config = config.get('model', {})

    # Initialize inferencer
    inferencer = DACInferencer(
        checkpoint_path=args.checkpoint,
        model_config=model_config,
        device=args.device)

    # Process input file(s)
    if os.path.isfile(args.input):
        process_file(inferencer, args.input, args.output, args.mode)
    elif os.path.isdir(args.input):
        os.makedirs(args.output, exist_ok=True)
        for filename in os.listdir(args.input):
            if filename.endswith(('.wav', '.mp3', '.flac')):
                input_path = os.path.join(args.input, filename)
                output_path = os.path.join(args.output, filename)
                process_file(inferencer, input_path, output_path, args.mode)


def process_file(inferencer, input_path, output_path, mode):
    """Process a single audio file.

    Args:
        inferencer: DAC inferencer instance
        input_path: Path to input audio file
        output_path: Path to output audio file
        mode: Processing mode (encode, decode, or reconstruct)
    """
    print(f"Processing: {input_path}")

    # Load audio (decode mode reads a saved latent instead of an audio file)
    if mode != 'decode':
        audio, sample_rate = sf.read(input_path)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # Convert to mono if stereo

        # Resample if needed
        if sample_rate != inferencer.model.sample_rate:
            # TODO: Implement resampling
            pass

    # Process based on mode
    if mode == 'encode':
        # Encode to latent representation
        latent = inferencer.encode(audio)
        # Save latent representation
        np.save(output_path.replace('.wav', '.npy'), latent.numpy())

    elif mode == 'decode':
        # Load latent representation
        if input_path.endswith('.npy'):
            latent = paddle.to_tensor(np.load(input_path))
            # Decode from latent representation
            audio_out = inferencer.decode(latent)
            # Save audio
            sf.write(output_path, audio_out, inferencer.model.sample_rate)
        else:
            print(f"Decode mode expects a .npy file, got {input_path}")

    elif mode == 'reconstruct':
        # Reconstruct audio through the encoder-decoder
        audio_out = inferencer.reconstruct(audio)
        # Save reconstructed audio
        sf.write(output_path, audio_out, inferencer.model.sample_rate)

    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run inference with DAC model")

    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input audio file or directory of audio files")

    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output file or directory")

    parser.add_argument(
        "--checkpoint",
        type=str,
        required=True,
        help="Path to model checkpoint")

    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to model configuration file")

    parser.add_argument(
        "--mode",
        type=str,
        default="reconstruct",
        choices=["encode", "decode", "reconstruct"],
        help="Mode: encode (audio to latent), decode (latent to audio), reconstruct (audio to audio)")

    parser.add_argument(
        "--device",
        type=str,
        default=paddle.get_device(),
        help="Device for inference")

    args = parser.parse_args()
    main(args)
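The resampling step in `process_file` above is left as a TODO. One possible sketch using librosa (which this PR already imports elsewhere); the helper name is hypothetical and not part of the PR:

```python
import librosa
import numpy as np

def resample_sketch(audio, orig_sr, target_sr):
    """Possible resampling helper: convert audio to the model's sample rate.
    Illustrative only; the PR leaves the actual implementation as a TODO."""
    if orig_sr == target_sr:
        return audio
    return librosa.resample(
        np.asarray(audio, dtype=np.float32), orig_sr=orig_sr, target_sr=target_sr)
```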
@@ -0,0 +1,117 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Training script for the DAC model.

This script demonstrates how to train the DAC model with distributed training support.
"""

import argparse
import os
from pathlib import Path

import paddle
import paddle.distributed as dist
from paddle.io import DataLoader, BatchSampler, DistributedBatchSampler
import yaml

from paddlespeech.audio.codec.dac.model import DACModel
from paddlespeech.audio.codec.dac.trainer import DACTrainer
from paddlespeech.audio.codec.dac.processor import DACProcessor
# TODO: Import dataset classes once implemented


def main(args):
    """Main training function.

    Args:
        args: Command line arguments
    """
    # Setup the distributed training environment
    if args.ngpus > 1:
        dist.init_parallel_env()

    # Load configuration
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Setup model
    model = DACModel(**config['model'])

    if args.ngpus > 1:
        model = paddle.DataParallel(model)

    # Setup optimizer
    optimizer = paddle.optimizer.Adam(
        parameters=model.parameters(),
        learning_rate=config['training']['lr'],
        weight_decay=config['training']['weight_decay'])

    # TODO: Setup dataset and dataloader
    # This is a placeholder for the dataset setup
    # train_dataset = ...
    # valid_dataset = ...

    # batch_sampler = BatchSampler(...)
    # if args.ngpus > 1:
    #     batch_sampler = DistributedBatchSampler(...)

    # train_dataloader = DataLoader(...)
    # valid_dataloader = DataLoader(...)

    # Setup trainer
    trainer = DACTrainer(
        model=model,
        optimizer=optimizer,
        dataloader=None,  # TODO: Replace with actual train_dataloader
        output_dir=output_dir,
        config=config,
        max_epoch=args.max_epoch)

    # Run training
    trainer.run()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train DAC model")

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to configuration file")

    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to save model checkpoints and logs")

    parser.add_argument(
        "--ngpus",
        type=int,
        default=1,
        help="Number of GPUs for distributed training")

    parser.add_argument(
        "--max-epoch",
        type=int,
        default=200,
        help="Maximum number of training epochs")

    args = parser.parse_args()
    main(args)
@@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Audio codec module."""
@@ -0,0 +1,44 @@
# Descript Audio Codec (DAC) Implementation in PaddleSpeech

This is an implementation of the Descript Audio Codec (DAC) in PaddleSpeech, based on the paper ["High-Fidelity Audio Compression with Improved RVQGAN"](https://arxiv.org/abs/2306.06546).

## Overview

DAC is a neural audio codec that provides high-quality audio compression at various bit rates while maintaining excellent perceptual quality. This implementation includes:

- DAC model architecture
- Distributed training pipeline
- Inference API
- Evaluation metrics

## Features

- High-quality audio compression and reconstruction
- Variable bitrate support
- Support for different audio domains
- Compatible with PaddleSpeech's distributed training infrastructure

## Usage

### Training

See the example training script at `examples/audio/codec/dac/train.py`.

### Inference

See the example inference script at `examples/audio/codec/dac/infer.py`.
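A minimal Python sketch of how the inference API might be used; the checkpoint path and audio file names below are placeholders:

```python
import soundfile as sf

from paddlespeech.audio.codec.dac.inferencer import DACInferencer

# Placeholder paths; point these at your own checkpoint and audio files.
inferencer = DACInferencer(
    checkpoint_path="exp/dac/checkpoints/best.pdparams",
    model_config={"sample_rate": 44100})

audio, sr = sf.read("input.wav")
reconstructed = inferencer.reconstruct(audio)
sf.write("reconstructed.wav", reconstructed, inferencer.model.sample_rate)
```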
### Evaluation

See the example evaluation script at `examples/audio/codec/dac/evaluate.py`.

## Citation

```bibtex
@article{kumar2023dac,
  title={High-Fidelity Audio Compression with Improved RVQGAN},
  author={Kumar, Rithesh and Seetharaman, Prem and Luebs, Alejandro and Kumar, Ishaan and Kumar, Kundan},
  journal={arXiv preprint arXiv:2306.06546},
  year={2023}
}
```
@@ -0,0 +1,22 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Descript Audio Codec (DAC) module.

Reference:
    High-Fidelity Audio Compression with Improved RVQGAN. https://arxiv.org/abs/2306.06546
    Original implementation: https://github.com/descriptinc/descript-audio-codec
"""

from paddlespeech.audio.codec.dac.model import *
from paddlespeech.audio.codec.dac.processor import *
@@ -0,0 +1,132 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DAC model evaluation metrics and evaluation implementation.

This module contains the evaluation metrics and evaluation pipeline for the DAC model.
"""

import numpy as np
import paddle
import paddle.nn.functional as F
from scipy.signal import correlate
import librosa

from paddlespeech.s2t.training.extensions.evaluator import StandardEvaluator


def compute_pesq(ref, deg, sample_rate=44100):
    """Compute the PESQ (Perceptual Evaluation of Speech Quality) metric.

    Args:
        ref (numpy.ndarray): Reference audio
        deg (numpy.ndarray): Degraded audio
        sample_rate (int, optional): Sample rate. Defaults to 44100.

    Returns:
        float: PESQ score
    """
    # TODO: Implement PESQ calculation
    return 0.0


def compute_sisdr(reference, estimation):
    """Compute the Scale-Invariant Signal-to-Distortion Ratio (SI-SDR).

    Args:
        reference (numpy.ndarray): Reference signal
        estimation (numpy.ndarray): Estimated signal

    Returns:
        float: SI-SDR value in dB
    """
    # TODO: Implement SI-SDR calculation
    return 0.0


class DACAudioMetrics:
    """Audio quality metrics for DAC evaluation."""

    def __init__(self, sample_rate=44100):
        """Initialize the audio metrics calculator.

        Args:
            sample_rate (int, optional): Audio sample rate. Defaults to 44100.
        """
        self.sample_rate = sample_rate

    def compute_metrics(self, reference, estimation):
        """Compute all audio quality metrics.

        Args:
            reference (numpy.ndarray): Reference audio
            estimation (numpy.ndarray): Estimated audio

        Returns:
            dict: Dictionary of metric names and values
        """
        metrics = {}

        # SI-SDR (scale-invariant signal-to-distortion ratio)
        metrics['si_sdr'] = compute_sisdr(reference, estimation)

        # PESQ (Perceptual Evaluation of Speech Quality)
        metrics['pesq'] = compute_pesq(reference, estimation, self.sample_rate)

        # TODO: Add more metrics as described in the DAC paper

        return metrics


class DACEvaluator(StandardEvaluator):
    """Evaluator for the DAC model.

    Extends the StandardEvaluator with DAC-specific metrics calculation.
    """

    def __init__(self,
                 model,
                 dataloader,
                 sample_rate=44100,
                 **kwargs):
        """Initialize the DAC evaluator.

        Args:
            model (nn.Layer): DAC model instance
            dataloader (DataLoader): Evaluation dataloader
            sample_rate (int, optional): Audio sample rate. Defaults to 44100.
        """
        super().__init__(model, dataloader, **kwargs)
        self.sample_rate = sample_rate
        self.metrics_calculator = DACAudioMetrics(sample_rate=sample_rate)

    def evaluate_batch(self, batch):
        """Evaluate one batch of data.

        Args:
            batch (dict): Batch of data

        Returns:
            dict: Dictionary of evaluation metrics
        """
        # TODO: Implement batch evaluation logic
        return {}

    def evaluate(self):
        """Run evaluation on the entire dataset.

        Returns:
            dict: Overall evaluation metrics
        """
        # TODO: Implement full evaluation logic with distributed support
        return {}
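The `compute_sisdr` stub above is left as a TODO. A minimal sketch of the standard SI-SDR formula (project the estimate onto the reference, then compare target and residual energy); the helper name is hypothetical and this is not necessarily the implementation intended for this PR:

```python
import numpy as np

def sisdr_sketch(reference, estimation, eps=1e-8):
    """Standard SI-SDR in dB. Illustrative sketch only."""
    reference = np.asarray(reference, dtype=np.float64)
    estimation = np.asarray(estimation, dtype=np.float64)
    # Optimal scaling factor: alpha = <est, ref> / ||ref||^2
    alpha = np.dot(estimation, reference) / (np.dot(reference, reference) + eps)
    target = alpha * reference
    residual = estimation - target
    return float(10 * np.log10(
        (np.dot(target, target) + eps) / (np.dot(residual, residual) + eps)))
```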
@@ -0,0 +1,107 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DAC model inference implementation.

This module contains the inference implementation for the DAC model.
"""

import os
import logging
import numpy as np
from typing import Dict, List, Union

import paddle

from paddlespeech.audio.codec.dac.model import DACModel, EncoderModel, DecoderModel
from paddlespeech.audio.codec.dac.processor import DACProcessor


class DACInferencer:
    """Inference class for the DAC model."""

    def __init__(self,
                 checkpoint_path,
                 model_config=None,
                 device=paddle.get_device()):
        """Initialize the DAC inferencer.

        Args:
            checkpoint_path (str): Path to model checkpoint
            model_config (dict, optional): Model configuration. Defaults to None.
            device (str, optional): Device to run inference on. Defaults to paddle.get_device().
        """
        paddle.set_device(device)

        self.checkpoint_path = checkpoint_path
        self.model_config = model_config or {}

        # Initialize model and processor
        self._init_model()
        self.processor = DACProcessor(sample_rate=self.model.sample_rate)

    def _init_model(self):
        """Initialize the DAC model from a checkpoint."""
        # TODO: Implement model loading from checkpoint
        self.model = DACModel(**self.model_config)

        # Load model parameters. A checkpoint_path of None is allowed so that
        # callers can attach an already-loaded model afterwards.
        if self.checkpoint_path is None:
            return
        if os.path.isfile(self.checkpoint_path):
            state_dict = paddle.load(self.checkpoint_path)
            self.model.set_state_dict(state_dict)
            self.model.eval()
        else:
            raise FileNotFoundError(
                f"Model checkpoint not found: {self.checkpoint_path}")

    def encode(self, audio, **kwargs):
        """Encode audio to a latent representation.

        Args:
            audio (numpy.ndarray): Input audio array

        Returns:
            paddle.Tensor: Encoded latent representation
        """
        # TODO: Implement encoding logic
        pass

    def decode(self, latent, **kwargs):
        """Decode a latent representation to audio.

        Args:
            latent (paddle.Tensor): Encoded latent representation

        Returns:
            numpy.ndarray: Decoded audio
        """
        # TODO: Implement decoding logic
        pass

    def reconstruct(self, audio, **kwargs):
        """Reconstruct audio by encoding and decoding.

        Args:
            audio (numpy.ndarray): Input audio array

        Returns:
            numpy.ndarray: Reconstructed audio
        """
        # Preprocess audio
        audio_tensor = self.processor.preprocess(audio)

        # Run inference
        with paddle.no_grad():
            output, _ = self.model(audio_tensor.unsqueeze(0))

        # Postprocess output
        return self.processor.postprocess(output.squeeze(0))
@@ -0,0 +1,106 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DAC model implementation.

This module contains the implementation of the Descript Audio Codec (DAC) model.
Reference: https://arxiv.org/abs/2306.06546
"""

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class EncoderBlock(nn.Layer):
    """Encoder block for the DAC model."""

    def __init__(self):
        super().__init__()
        # TODO: Implement encoder block according to the DAC paper


class DecoderBlock(nn.Layer):
    """Decoder block for the DAC model."""

    def __init__(self):
        super().__init__()
        # TODO: Implement decoder block according to the DAC paper


class DACModel(nn.Layer):
    """Descript Audio Codec (DAC) model.

    A neural audio codec that provides high-quality audio compression.
    """

    def __init__(self,
                 sample_rate=44100,
                 encoder_dims=512,
                 decoder_dims=512,
                 num_residual_layers=10,
                 **kwargs):
        """Initialize the DAC model.

        Args:
            sample_rate (int, optional): Audio sample rate. Defaults to 44100.
            encoder_dims (int, optional): Encoder dimension. Defaults to 512.
            decoder_dims (int, optional): Decoder dimension. Defaults to 512.
            num_residual_layers (int, optional): Number of residual layers. Defaults to 10.
        """
        super().__init__()
        self.sample_rate = sample_rate
        self.encoder_dims = encoder_dims
        self.decoder_dims = decoder_dims

        # TODO: Implement model components according to the DAC paper

    def forward(self, x):
        """Forward pass.

        Args:
            x (Tensor): Input audio tensor [B, T]

        Returns:
            tuple: Tuple containing:
                - output (Tensor): Reconstructed audio
                - auxiliary outputs (dict): Extra model outputs
        """
        # TODO: Implement forward pass
        return x, {}


class EncoderModel(nn.Layer):
    """Encoder part of the DAC model for inference."""

    def __init__(self, dac_model):
        """Initialize the encoder model.

        Args:
            dac_model (DACModel): Trained DAC model
        """
        super().__init__()
        # TODO: Extract encoder from the DAC model


class DecoderModel(nn.Layer):
    """Decoder part of the DAC model for inference."""

    def __init__(self, dac_model):
        """Initialize the decoder model.

        Args:
            dac_model (DACModel): Trained DAC model
        """
        super().__init__()
        # TODO: Extract decoder from the DAC model
@@ -0,0 +1,61 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Audio data preprocessing for the DAC model.

This module contains preprocessing functions for DAC model inputs.
"""

import numpy as np
import paddle


class DACProcessor:
    """Audio processor for the DAC model."""

    def __init__(self, sample_rate=44100, n_fft=1024, hop_length=256):
        """Initialize the DAC processor.

        Args:
            sample_rate (int, optional): Audio sample rate. Defaults to 44100.
            n_fft (int, optional): FFT size. Defaults to 1024.
            hop_length (int, optional): Hop length for STFT. Defaults to 256.
        """
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length

    def preprocess(self, audio, normalize=True):
        """Preprocess audio for DAC model input.

        Args:
            audio (numpy.ndarray): Input audio waveform
            normalize (bool, optional): Whether to normalize the audio. Defaults to True.

        Returns:
            paddle.Tensor: Preprocessed audio tensor
        """
        # TODO: Implement preprocessing according to the DAC paper
        return paddle.to_tensor(audio)

    def postprocess(self, tensor):
        """Convert model output to an audio waveform.

        Args:
            tensor (paddle.Tensor): Model output tensor

        Returns:
            numpy.ndarray: Audio waveform
        """
        # TODO: Implement postprocessing according to the DAC paper
        return tensor.numpy()
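The `preprocess` stub above currently ignores its `normalize` flag. One possible sketch (float32 cast plus peak normalization, which is an assumption rather than the DAC paper's exact pipeline); the helper name is hypothetical:

```python
import numpy as np
import paddle

def preprocess_sketch(audio, normalize=True, eps=1e-9):
    """Possible preprocessing: cast to float32 and peak-normalize.
    Illustrative sketch, not the DAC paper's prescribed pipeline."""
    audio = np.asarray(audio, dtype=np.float32)
    if normalize:
        peak = np.abs(audio).max()
        if peak > eps:
            audio = audio / peak
    return paddle.to_tensor(audio)
```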
@@ -0,0 +1,74 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DAC model distributed training implementation.

This module contains the distributed training implementation for the DAC model.
"""

import os
import time
import logging
from pathlib import Path

import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from visualdl import LogWriter

from paddlespeech.audio.codec.dac.model import DACModel
from paddlespeech.s2t.training.extensions.evaluator import StandardEvaluator
from paddlespeech.s2t.training.trainer import Trainer


class DACTrainer(Trainer):
    """Trainer for the DAC model implementing distributed training.

    Extends paddlespeech.s2t.training.trainer.Trainer with DAC-specific functionality.
    """

    def __init__(self,
                 model,
                 optimizer,
                 dataloader,
                 output_dir,
                 config=None,
                 max_epoch=100,
                 **kwargs):
        """Initialize the DAC trainer.

        Args:
            model (nn.Layer): DAC model instance
            optimizer (Optimizer): Optimizer instance
            dataloader (DataLoader): Training data loader
            output_dir (str): Output directory for saving models and logs
            config (CfgNode, optional): Training config. Defaults to None.
            max_epoch (int, optional): Maximum number of training epochs. Defaults to 100.
        """
        super().__init__(model, optimizer, dataloader, output_dir, **kwargs)
        self.config = config
        self.max_epoch = max_epoch

        # Setup distributed training
        # TODO: Implement distributed training setup

    def train_batch(self):
        """Train on one mini-batch of data."""
        # TODO: Implement batch training logic with distributed support
        pass

    def run(self):
        """Run training with distributed optimization."""
        # TODO: Implement distributed training loop
        pass
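As a rough illustration of what `train_batch` might eventually do, a generic Paddle training step is sketched below. The batch key and the plain L1 reconstruction loss are assumptions for illustration; the real DAC objective combines several weighted loss terms (see the loss weights in the example config):

```python
import paddle

def train_batch_sketch(model, optimizer, batch):
    """Generic training step sketch: forward, placeholder loss, backward,
    optimizer step. Not the DAC loss from the paper."""
    audio = batch['audio']  # assumed key in the batch dict
    output, _ = model(audio)
    loss = paddle.nn.functional.l1_loss(output, audio)
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    return float(loss)
```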