parent
b9c7835eb9
commit
d250ab0f95
@ -0,0 +1,3 @@
|
||||
# from . import datasets
|
||||
from . import preprocess
|
||||
# from . import transforms
|
@ -0,0 +1,83 @@
|
||||
import csv
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from audio_signal import AudioSignal
|
||||
from tqdm import tqdm
|
||||
# from ..core import AudioSignal
|
||||
|
||||
|
||||
def create_csv(audio_files: list,
               output_csv: Path,
               loudness: bool=False,
               data_path: str=None):
    """Converts a list of audio files to a CSV file. If ``loudness = True``,
    the output of this function will create a CSV file that looks something
    like:

    .. csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    .. note::
        The paths above are written relative to the ``data_path`` argument
        if one is given; when ``data_path`` is None (the default), paths are
        written exactly as passed in.

    You can produce a CSV file from a directory of audio files via:

    >>> import audiotools
    >>> directory = ...
    >>> audio_files = audiotools.util.find_audio(directory)
    >>> output_path = "train.csv"
    >>> audiotools.data.preprocess.create_csv(
    >>>     audio_files, output_csv, loudness=True
    >>> )

    Note that you can create empty rows in the CSV file by passing an empty
    string or None in the ``audio_files`` list. This is useful if you want to
    sync multiple CSV files in a multitrack setting. The loudness of these
    empty rows will be set to -inf.

    Parameters
    ----------
    audio_files : list
        List of audio files.
    output_csv : Path
        Output CSV, with each row containing the relative path of every file
        to ``data_path``, if specified (defaults to None).
    loudness : bool
        Compute loudness of entire file and store alongside path.
    data_path : str, optional
        Base path that each row's path is made relative to; None leaves the
        paths untouched.
    """

    info = []
    pbar = tqdm(audio_files)
    for af in pbar:
        # None entries are documented to produce empty rows, but Path(None)
        # raises TypeError — normalize None to "" first.
        af = Path(af if af is not None else "")
        pbar.set_description(f"Processing {af.name}")
        _info = {}
        if af.name == "":
            # Empty row: blank path, and -inf loudness so downstream
            # consumers can detect/skip it.
            _info["path"] = ""
            if loudness:
                _info["loudness"] = -float("inf")
        else:
            _info["path"] = af.relative_to(
                data_path) if data_path is not None else af
            if loudness:
                _info["loudness"] = AudioSignal(af).ffmpeg_loudness().item()

        info.append(_info)

    if not info:
        # No rows at all: nothing to infer fieldnames from, so write nothing
        # rather than raising IndexError on info[0] below.
        return

    # newline="" is required by the csv module to avoid blank lines between
    # rows on Windows.
    with open(output_csv, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(info[0].keys()))
        writer.writeheader()

        for item in info:
            writer.writerow(item)
|
@ -0,0 +1,69 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from audio_signal import AudioSignal
|
||||
|
||||
|
||||
def visqol(
        estimates: AudioSignal,
        references: AudioSignal,
        mode: str="audio", ):  # pragma: no cover
    """Compute the ViSQOL score (MOS-LQO) between estimates and references.

    Parameters
    ----------
    estimates : AudioSignal
        Degraded AudioSignal
    references : AudioSignal
        Reference AudioSignal
    mode : str, optional
        'audio' or 'speech', by default 'audio'

    Returns
    -------
    Tensor[float]
        ViSQOL score (MOS-LQO)
    """
    # Imported lazily so the package works without visqol installed; prefer
    # the pyvisqol distribution and fall back to the visqol package.
    try:
        from pyvisqol import visqol_lib_py
        from pyvisqol.pb2 import visqol_config_pb2
        from pyvisqol.pb2 import similarity_result_pb2
    except ImportError:
        from visqol import visqol_lib_py
        from visqol.pb2 import visqol_config_pb2
        from visqol.pb2 import similarity_result_pb2

    # Per-mode settings: (sample rate, speech-scoring flag, SVR model file).
    mode_settings = {
        "audio": (48000, False, "libsvm_nu_svr_model.txt"),
        "speech": (
            16000, True,
            "lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite"
        ),
    }
    if mode not in mode_settings:
        raise ValueError(f"Unrecognized mode: {mode}")
    target_sr, use_speech_scoring, model_file = mode_settings[mode]

    config = visqol_config_pb2.VisqolConfig()
    config.audio.sample_rate = target_sr
    config.options.use_speech_scoring = use_speech_scoring
    # Model files ship inside the visqol package next to visqol_lib_py.
    config.options.svr_model_path = os.path.join(
        os.path.dirname(visqol_lib_py.__file__), "model", model_file)

    api = visqol_lib_py.VisqolApi()
    api.Create(config)

    # Work on mono copies at the mode's expected sample rate; the inputs
    # themselves are left untouched.
    est = estimates.clone().to_mono().resample(target_sr)
    ref = references.clone().to_mono().resample(target_sr)

    scores = []
    for idx in range(est.batch_size):
        ref_np = ref.audio_data[idx, 0].detach().cpu().numpy().astype(float)
        est_np = est.audio_data[idx, 0].detach().cpu().numpy().astype(float)
        scores.append(api.Measure(ref_np, est_np).moslqo)
    return paddle.to_tensor(np.array(scores))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: score a random 1-second, 44.1 kHz signal against itself
    # and print the resulting MOS-LQO tensor.
    signal = AudioSignal(paddle.randn([44100]), 44100)
    print(visqol(signal, signal))
|
Loading…
Reference in new issue