parent
b9c7835eb9
commit
d250ab0f95
@ -0,0 +1,3 @@
|
|||||||
|
# from . import datasets
|
||||||
|
from . import preprocess
|
||||||
|
# from . import transforms
|
@ -0,0 +1,83 @@
|
|||||||
|
import csv
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from audio_signal import AudioSignal
|
||||||
|
from tqdm import tqdm
|
||||||
|
# from ..core import AudioSignal
|
||||||
|
|
||||||
|
|
||||||
|
def create_csv(audio_files: list,
               output_csv: Path,
               loudness: bool=False,
               data_path: str=None):
    """Writes a list of audio files to a CSV file. If ``loudness = True``,
    the output of this function will create a CSV file that looks something
    like:

    .. csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    .. note::
        Paths are written relative to the ``data_path`` argument when it is
        given. When ``data_path`` is None, every path is written exactly as
        it was passed in.

    You can produce a CSV file from a directory of audio files via:

    >>> import audiotools
    >>> directory = ...
    >>> audio_files = audiotools.util.find_audio(directory)
    >>> output_csv = "train.csv"
    >>> audiotools.data.preprocess.create_csv(
    >>>     audio_files, output_csv, loudness=True
    >>> )

    Note that you can create empty rows in the CSV file by passing an empty
    string or None in the ``audio_files`` list. This is useful if you want to
    sync multiple CSV files in a multitrack setting. The loudness of these
    empty rows will be set to -inf.

    Parameters
    ----------
    audio_files : list
        List of audio files. Entries that are None or an empty string
        produce an empty row. An empty list produces a header-only CSV.
    output_csv : Path
        Output CSV, with one row per entry of ``audio_files``.
    loudness : bool
        Compute loudness of entire file and store alongside path.
    data_path : str, optional
        If given, each path is written relative to this directory
        (defaults to None).

    Raises
    ------
    ValueError
        If ``data_path`` is given and a file is not located under it
        (raised by ``Path.relative_to``).
    """
    # Fix the columns up front instead of reading them off the first row, so
    # that an empty ``audio_files`` list still yields a valid CSV header.
    fieldnames = ["path", "loudness"] if loudness else ["path"]

    rows = []
    pbar = tqdm(audio_files)
    for af in pbar:
        # None is a documented placeholder, but Path(None) raises TypeError;
        # map it onto the empty string, which Path turns into a nameless path.
        af = Path("" if af is None else af)
        pbar.set_description(f"Processing {af.name}")

        row = {}
        if af.name == "":
            # Placeholder entry: keep the row so CSVs stay line-aligned.
            row["path"] = ""
            if loudness:
                row["loudness"] = -float("inf")
        else:
            row["path"] = (af.relative_to(data_path)
                           if data_path is not None else af)
            if loudness:
                row["loudness"] = AudioSignal(af).ffmpeg_loudness().item()

        rows.append(row)

    # newline="" lets the csv module control line endings; without it an
    # extra blank line is emitted after every row on Windows.
    with open(output_csv, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
|
@ -0,0 +1,69 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from audio_signal import AudioSignal
|
||||||
|
|
||||||
|
|
||||||
|
def visqol(
        estimates: AudioSignal,
        references: AudioSignal,
        mode: str="audio", ):  # pragma: no cover
    """Score degraded signals against references with ViSQOL.

    Both inputs are passed through ``clone().to_mono().resample(...)`` at
    the sample rate selected by ``mode`` (48000 for 'audio', 16000 for
    'speech'), and every batch item is then measured individually.

    Parameters
    ----------
    estimates : AudioSignal
        Degraded AudioSignal
    references : AudioSignal
        Reference AudioSignal
    mode : str, optional
        'audio' or 'speech', by default 'audio'

    Returns
    -------
    Tensor[float]
        ViSQOL score (MOS-LQO), one value per batch item

    Raises
    ------
    ValueError
        If ``mode`` is neither 'audio' nor 'speech'.
    """
    # The bindings may be installed under either package name.
    try:
        from pyvisqol import visqol_lib_py
        from pyvisqol.pb2 import visqol_config_pb2, similarity_result_pb2
    except ImportError:
        from visqol import visqol_lib_py
        from visqol.pb2 import visqol_config_pb2, similarity_result_pb2

    config = visqol_config_pb2.VisqolConfig()

    # Each mode fixes the sample rate, the scoring flag and the model file.
    if mode == "audio":
        target_sr, speech_scoring = 48000, False
        model_file = "libsvm_nu_svr_model.txt"
    elif mode == "speech":
        target_sr, speech_scoring = 16000, True
        model_file = "lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite"
    else:
        raise ValueError(f"Unrecognized mode: {mode}")

    config.options.use_speech_scoring = speech_scoring
    config.audio.sample_rate = target_sr

    # Model files are looked up in the "model" folder next to the bindings.
    model_dir = os.path.join(os.path.dirname(visqol_lib_py.__file__), "model")
    config.options.svr_model_path = os.path.join(model_dir, model_file)

    api = visqol_lib_py.VisqolApi()
    api.Create(config)

    degraded = estimates.clone().to_mono().resample(target_sr)
    clean = references.clone().to_mono().resample(target_sr)

    def _channel_as_float(signal, index):
        # First channel of one batch item as a float NumPy array.
        return signal.audio_data[index, 0].detach().cpu().numpy().astype(
            float)

    # The reference is passed first, then the degraded signal.
    scores = [
        api.Measure(
            _channel_as_float(clean, idx),
            _channel_as_float(degraded, idx), ).moslqo
        for idx in range(degraded.batch_size)
    ]
    return paddle.to_tensor(np.array(scores))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Smoke test: score a random signal against itself.
    sample_rate = 44100
    noise = AudioSignal(paddle.randn([sample_rate]), sample_rate)
    print(visqol(noise, noise))
|
Loading…
Reference in new issue