You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/paddlespeech/audiotools/core/ffmpeg.py

120 lines
3.5 KiB

# MIT License, Copyright (c) 2023-Present, Descript.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from audiotools(https://github.com/descriptinc/audiotools/blob/master/audiotools/core/ffmpeg.py)
import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple
import ffmpy
import numpy as np
import paddle
def r128stats(filepath: str, quiet: bool):
    """Compute loudness stats for an audio file with the ffmpeg ebur128 filter.

    Runs ``ffmpeg`` on ``filepath`` with the ``ebur128`` filter, discards the
    decoded audio (``-f null -``) and parses the last ``Summary:`` section
    found in ffmpeg's stderr.

    Parameters
    ----------
    filepath : str
        Path to compute loudness stats on.
    quiet : bool
        If True, ``-hide_banner`` is passed to ffmpeg. ffmpeg's stderr is
        captured for parsing in either case.

    Returns
    -------
    dict
        Dictionary containing loudness stats as floats, under the keys
        ``"I"``, ``"I Threshold"``, ``"LRA"``, ``"LRA Threshold"``,
        ``"LRA Low"`` and ``"LRA High"``.

    Raises
    ------
    ValueError
        If ffmpeg's output has no ``Summary:`` section (for example because
        the input could not be opened), or if the summary does not contain
        the expected fields.
    """
    ffargs = [
        "ffmpeg",
        "-nostats",
        "-i",
        filepath,
        "-filter_complex",
        "ebur128",
        "-f",
        "null",
        "-",
    ]
    if quiet:
        ffargs += ["-hide_banner"]

    # errors="replace" keeps undecodable bytes in ffmpeg's log (e.g. in
    # echoed file names or metadata) from aborting the call.
    proc = subprocess.run(
        ffargs,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        errors="replace",
    )
    stats = proc.stderr

    # The filter logs running measurements too; only the final summary counts.
    summary_index = stats.rfind("Summary:")
    if summary_index < 0:
        # Without this check the slice below would start at index -1 and the
        # failure would surface as an opaque "'I:' is not in list".
        tail = stats.strip()[-500:]
        raise ValueError(
            f"ffmpeg (exit code {proc.returncode}) printed no ebur128 "
            f"'Summary:' section for {filepath!r}: {tail}")

    summary_list = stats[summary_index:].split()
    i_pos = summary_list.index("I:")
    lra_pos = summary_list.index("LRA:")

    # After splitting on whitespace each block reads
    # "<label> <value> <unit> Threshold: <value> <unit>", so the threshold
    # value sits four tokens after its label.
    stats_dict = {
        "I": float(summary_list[i_pos + 1]),
        "I Threshold": float(summary_list[i_pos + 4]),
        "LRA": float(summary_list[lra_pos + 1]),
        "LRA Threshold": float(summary_list[lra_pos + 4]),
        "LRA Low": float(summary_list[summary_list.index("low:") + 1]),
        "LRA High": float(summary_list[summary_list.index("high:") + 1]),
    }
    return stats_dict
def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
    """Probe a file with ffprobe and report on its first audio stream.

    Parameters
    ----------
    path : str
        Path of the file handed to ffprobe.

    Returns
    -------
    Tuple[float, str]
        ``(offset, codec)`` for the first stream whose ``codec_type`` is
        ``"audio"``: its ``start_time`` converted to float (0.0 when the
        stream reports none) and its ``codec_name`` (None when absent).
        When no audio stream is found the result is ``(0.0, None)``.
    """
    probe_options = "-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet"
    probe = ffmpy.FFprobe(inputs={path: None}, global_options=probe_options)
    raw_json = probe.run(stdout=subprocess.PIPE)[0]
    probed_streams = json.loads(raw_json)["streams"]

    # Only the first audio stream matters; later streams are never inspected.
    first_audio = next(
        (entry for entry in probed_streams if entry["codec_type"] == "audio"),
        None,
    )
    if first_audio is None:
        return 0.0, None

    offset = float(first_audio.get("start_time", 0.0))
    return offset, first_audio.get("codec_name")
class FFMPEGMixin:
    """Mixin that measures loudness by running batch items through ffmpeg.

    NOTE(review): relies on ``batch_size``, integer indexing that yields
    objects with a ``write(path)`` method, and ``loudness()`` -- none of
    which are defined here; presumably supplied by the host class.
    """

    # Set by ffmpeg_loudness(); stays None until that method has run.
    _loudness = None

    def ffmpeg_loudness(self, quiet: bool=True):
        """Measure the loudness of each batch item via ffmpeg.

        Every item is written to one temporary ``.wav`` path, analysed
        with ``r128stats``, and its ``"I"`` value collected. The values
        are stored on ``self._loudness`` as a float32 tensor.

        Parameters
        ----------
        quiet : bool, optional
            Forwarded to ``r128stats``, by default True

        Returns
        -------
        paddle.Tensor
            Whatever ``self.loudness()`` returns after ``self._loudness``
            has been updated (expected: loudness of every item in the
            batch, computed via FFMPEG).
        """
        with tempfile.NamedTemporaryFile(suffix=".wav") as scratch:
            wav_path = scratch.name

            def integrated_loudness(index):
                # Each item is written to the same scratch path in turn.
                self[index].write(wav_path)
                return r128stats(wav_path, quiet=quiet)["I"]

            per_item = [
                integrated_loudness(index)
                for index in range(self.batch_size)
            ]

        values = np.array(per_item)
        self._loudness = paddle.to_tensor(values).astype("float32")
        return self.loudness()