You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.3 KiB
75 lines
2.3 KiB
# MIT License, Copyright (c) 2023-Present, Descript.
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Modified from audiotools(https://github.com/descriptinc/audiotools/blob/master/audiotools/metrics/quality.py)
|
|
import os
|
|
|
|
import numpy as np
|
|
import paddle
|
|
|
|
from ..core import AudioSignal
|
|
|
|
|
|
def visqol(
        estimates: AudioSignal,
        references: AudioSignal,
        mode: str="audio", ):
    """Compute the ViSQOL score of degraded signals against references.

    Both signals are cloned, downmixed to mono and resampled to the sample
    rate selected by ``mode`` (48000 Hz for 'audio', 16000 Hz for 'speech')
    before each batch item is scored.

    Parameters
    ----------
    estimates : AudioSignal
        Degraded AudioSignal
    references : AudioSignal
        Reference AudioSignal
    mode : str, optional
        'audio' or 'speech', by default 'audio'

    Returns
    -------
    Tensor[float]
        ViSQOL score (MOS-LQO), one value per batch item of ``estimates``

    Raises
    ------
    ImportError
        If neither ``pyvisqol`` nor ``visqol`` can be imported.
    ValueError
        If ``mode`` is neither 'audio' nor 'speech'.
    """
    # Prefer the `pyvisqol` distribution and fall back to `visqol`.
    # `similarity_result_pb2` is not referenced below; the import is kept
    # as-is because it may have import-time side effects -- TODO confirm.
    try:
        from pyvisqol import visqol_lib_py
        from pyvisqol.pb2 import visqol_config_pb2
        from pyvisqol.pb2 import similarity_result_pb2  # noqa: F401
    except ImportError:
        from visqol import visqol_lib_py
        from visqol.pb2 import visqol_config_pb2
        from visqol.pb2 import similarity_result_pb2  # noqa: F401

    config = visqol_config_pb2.VisqolConfig()

    # Reject unknown modes up front; everything below is one of two cases.
    if mode not in ("audio", "speech"):
        raise ValueError(f"Unrecognized mode: {mode}")
    is_audio = mode == "audio"

    target_sr = 48000 if is_audio else 16000
    model_file = (
        "libsvm_nu_svr_model.txt" if is_audio else
        "lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite"
    )

    config.options.use_speech_scoring = not is_audio
    config.audio.sample_rate = target_sr
    # The model files are looked up in a `model` directory located next to
    # the imported visqol_lib_py module.
    model_dir = os.path.join(os.path.dirname(visqol_lib_py.__file__), "model")
    config.options.svr_model_path = os.path.join(model_dir, model_file)

    api = visqol_lib_py.VisqolApi()
    api.Create(config)

    # Work on copies so the caller's signals are left as they were.
    degraded = estimates.clone().to_mono().resample(target_sr)
    clean = references.clone().to_mono().resample(target_sr)

    def _waveform(signal, index):
        # First channel of one batch item, converted to a float numpy array.
        return signal.audio_data[index, 0].detach().cpu().numpy().astype(float)

    # Measure() is called with the reference first and the degraded second.
    scores = [
        api.Measure(_waveform(clean, idx), _waveform(degraded, idx)).moslqo
        for idx in range(degraded.batch_size)
    ]
    return paddle.to_tensor(np.array(scores))
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: score a random signal against itself and print the result.
    sample_rate = 44100
    noise = AudioSignal(paddle.randn([sample_rate]), sample_rate)
    print(visqol(noise, noise))
|