You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
RedditVideoMakerBot/utils/caption_renderer.py

351 lines
12 KiB

"""
caption_renderer.py
───────────────────
All caption rendering logic. Three display modes:
    multi   → full sentence on one image (1 RenderJob per sentence)
    single  → sentence split into word chunks (N RenderJobs per sentence)
    aligned → word-level timestamps from WhisperX (perfect sync, any TTS)

RenderJob is the contract between this module and final_video.py.

Two types of timing:
    FRACTION-based (multi, single):
        audio_idx + time_fraction → final_video computes absolute time
        time_fraction = fraction of audio_clips_durations[audio_idx+1]
    ABSOLUTE-based (aligned):
        clip_start + clip_end → final_video uses directly
        These are absolute seconds in the video timeline (after title card)

final_video.py checks job["timing_type"] to know which to use.
"""
import os
from dataclasses import dataclass, field
from typing import List, Optional
from PIL import Image, ImageDraw, ImageFont
from utils.fonts import getsize
# ─────────────────────────────────────────────────────────────────────────────
# RenderJob — the contract
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RenderJob:
    """
    One caption image to produce (img{idx}.png) plus when to show it.

    Which timing fields are meaningful depends on ``timing_type``:

    "fraction"
        ``audio_idx`` and ``time_fraction`` are used by final_video to work
        out the display time. A ``time_fraction`` of 1.0 means the image is
        shown for the whole audio file; 0.25 means a quarter of it.
    "absolute"
        ``clip_start`` and ``clip_end`` are absolute seconds in the video
        timeline and are used by final_video as-is.
    """

    # Position of this image in the output sequence (img{idx}.png).
    idx: int
    # Text lines drawn on the image, top to bottom.
    lines: List[str]
    # Either "fraction" or "absolute"; selects which fields below apply.
    timing_type: str

    # --- used when timing_type == "fraction" ---
    audio_idx: int = 0
    time_fraction: float = 1.0

    # --- used when timing_type == "absolute" ---
    clip_start: float = 0.0
    clip_end: float = 0.0
# ─────────────────────────────────────────────────────────────────────────────
# Display modes
# ─────────────────────────────────────────────────────────────────────────────
# Values accepted for style["display_mode"].
DISPLAY_MODES = {"multi", "single", "aligned"}
def render_multi_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Put the whole sentence on a single image, wrapped onto several lines.

    The sentence is split on whitespace and regrouped into lines of
    ``style["words_per_chunk"]`` words. A sentence that contains no words is
    kept verbatim as the only line.

    Returns a one-element list holding a fraction-timed RenderJob
    (time_fraction = 1.0) numbered ``start_idx``.
    Best for: funny, sad, wholesome, happy.
    """
    tokens = sentence.split()
    per_line = style["words_per_chunk"]

    wrapped: List[str] = []
    for offset in range(0, len(tokens), per_line):
        wrapped.append(" ".join(tokens[offset:offset + per_line]))

    # Nothing to wrap (empty / whitespace-only input): keep the raw sentence.
    if not wrapped:
        wrapped = [sentence]

    job = RenderJob(
        idx=start_idx,
        lines=wrapped,
        timing_type="fraction",
        audio_idx=audio_idx,
        time_fraction=1.0,
    )
    return [job]
def render_single_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Break the sentence into word chunks and emit one image per chunk.

    Chunks hold ``style["words_per_chunk"]`` words each. If chunking yields
    nothing, the raw sentence becomes the only chunk. With N chunks, every
    job gets time_fraction = 1/N of the audio file, and image indices run
    from ``start_idx`` upwards.
    Best for: scary, dramatic, angry, mysterious.
    """
    chunk_size = style["words_per_chunk"]
    tokens = sentence.split()

    pieces: List[str] = []
    for offset in range(0, len(tokens), chunk_size):
        piece = " ".join(tokens[offset:offset + chunk_size])
        if piece.strip():
            pieces.append(piece)
    if not pieces:
        pieces = [sentence]

    # Every chunk gets an equal share of the audio clip.
    share = 1.0 / len(pieces)

    jobs: List[RenderJob] = []
    for position, piece in enumerate(pieces):
        jobs.append(
            RenderJob(
                idx=start_idx + position,
                lines=[piece],
                timing_type="fraction",
                audio_idx=audio_idx,
                time_fraction=share,
            )
        )
    return jobs
def render_aligned_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
    word_timestamps: List[dict],
    audio_start_time: float,
    audio_duration: float,
) -> List[RenderJob]:
    """
    Word-level aligned mode driven by per-word timestamps.

    Consecutive entries of ``word_timestamps`` are grouped into chunks of
    ``style["words_per_chunk"]`` words; each chunk becomes one RenderJob with
    timing_type "absolute".

    Timing of a chunk:
      clip_start = audio_start_time + "start" of the chunk's first word.
      clip_end   = the next chunk's first-word "start" when there is a next
                   chunk; otherwise the last word's "end" (or its "start"
                   + 0.3 s when "end" is absent).
      clip_end is capped at the end of the audio file, then raised to at
      least clip_start + 0.1 s. The minimum is applied last, so it wins over
      the cap.

    Parameters
    ----------
    sentence          : caption text, used only by the single-mode fallback
    style             : style dict; reads "words_per_chunk"
    audio_idx         : index of the audio file, used only by the fallback
    start_idx         : image index of the first job produced
    word_timestamps   : dicts with "word", "start" and optionally "end"
    audio_start_time  : absolute time in the video when this audio starts
    audio_duration    : duration of this audio file (caps clip_end)

    Falls back to render_single_mode when the timestamps are empty, yield no
    jobs, or are malformed (an entry lacks a needed key or holds a
    non-numeric time).
    """
    wpc = style["words_per_chunk"]
    if not word_timestamps:
        return render_single_mode(sentence, style, audio_idx, start_idx)

    fallback_word_seconds = 0.3   # assumed length of a word with no "end"
    min_visible_seconds = 0.1     # minimum 100ms visibility
    audio_end = audio_start_time + audio_duration
    n = len(word_timestamps)

    jobs: List[RenderJob] = []
    try:
        for chunk_start in range(0, n, wpc):
            chunk_words = word_timestamps[chunk_start:chunk_start + wpc]
            text = " ".join(w["word"] for w in chunk_words)
            clip_start = audio_start_time + chunk_words[0]["start"]

            next_start = chunk_start + wpc
            if next_start < n:
                # Hold the caption until the next chunk begins (no gaps).
                clip_end = audio_start_time + word_timestamps[next_start]["start"]
            else:
                last_word = chunk_words[-1]
                # Only touch "start" when "end" is really missing.
                if "end" in last_word:
                    last_end = last_word["end"]
                else:
                    last_end = last_word["start"] + fallback_word_seconds
                clip_end = audio_start_time + last_end

            # Safety: never exceed audio boundary
            clip_end = min(clip_end, audio_end)
            clip_end = max(clip_end, clip_start + min_visible_seconds)

            jobs.append(RenderJob(
                idx=start_idx + len(jobs),
                lines=[text],
                timing_type="absolute",
                clip_start=round(clip_start, 3),
                clip_end=round(clip_end, 3),
            ))
    except (KeyError, TypeError):
        # Missing "word"/"start" keys or non-numeric times: the partial
        # result is discarded so image numbering restarts at start_idx.
        print(f"[caption_renderer] Malformed timestamps for sentence {audio_idx}, using single mode")
        return render_single_mode(sentence, style, audio_idx, start_idx)

    return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx)
# ─────────────────────────────────────────────────────────────────────────────
# Router
# ─────────────────────────────────────────────────────────────────────────────
def get_render_jobs(
    sentences: List[str],
    style: dict,
    mp3_dir: Optional[str] = None,
    audio_start_times: Optional[List[float]] = None,
    audio_durations: Optional[List[float]] = None,
) -> List[RenderJob]:
    """
    Route each sentence to the renderer selected by style["display_mode"].

    Returns a flat, ordered list of all RenderJobs; image indices are
    consecutive across sentences. A missing display_mode defaults to
    "multi"; an unknown one is reported and treated as "multi".

    For "aligned" mode, word timestamps are loaded per sentence via
    utils.whisper_aligner.load_word_timestamps, which is handed the path
    {mp3_dir}/postaudio-{i}.mp3 (presumably it reads the sibling
    postaudio-{i}_words.json written by engine_wrapper — confirm there).
    "aligned" falls back to "single" mode:
      * for every sentence, when mp3_dir, audio_start_times or
        audio_durations is missing or empty;
      * per sentence, when no timestamps could be loaded.

    Parameters
    ----------
    sentences         : one per postaudio-{i}.mp3
    style             : STYLE_MAP entry for current sentiment
    mp3_dir           : path to mp3 folder (needed for aligned mode)
    audio_start_times : absolute start time of each audio in video (needed for aligned)
    audio_durations   : duration of each audio file (needed for aligned)
    """
    mode = style.get("display_mode", "multi")
    if mode not in DISPLAY_MODES:
        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
        mode = "multi"

    load_word_timestamps = None
    if mode == "aligned":
        if mp3_dir and audio_start_times and audio_durations:
            # Imported lazily, and only once, because only aligned mode
            # needs the aligner.
            from utils.whisper_aligner import load_word_timestamps
        else:
            # Aligned captions are word chunks, so "single" is the matching
            # degraded mode (not "multi").
            print("[caption_renderer] aligned mode needs mp3_dir, audio_start_times "
                  "and audio_durations, using single mode")
            mode = "single"

    all_jobs: List[RenderJob] = []
    img_counter: int = 0
    for audio_idx, sentence in enumerate(sentences):
        if mode == "aligned":
            # Try to load word timestamps for this sentence
            audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
            word_ts = load_word_timestamps(audio_path)
            if word_ts:
                jobs = render_aligned_mode(
                    sentence=sentence,
                    style=style,
                    audio_idx=audio_idx,
                    start_idx=img_counter,
                    word_timestamps=word_ts,
                    audio_start_time=audio_start_times[audio_idx],
                    audio_duration=audio_durations[audio_idx],
                )
            else:
                # No usable timestamps for this sentence — fall back to single mode
                print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
                jobs = render_single_mode(sentence, style, audio_idx, img_counter)
        elif mode == "single":
            jobs = render_single_mode(sentence, style, audio_idx, img_counter)
        else:
            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)
        all_jobs.extend(jobs)
        # Keep image numbering contiguous across sentences.
        img_counter += len(jobs)
    return all_jobs
# ─────────────────────────────────────────────────────────────────────────────
# Drawing primitives
# ─────────────────────────────────────────────────────────────────────────────
def measure_text_block(
    draw: ImageDraw.ImageDraw,
    lines: List[str],
    font: ImageFont.FreeTypeFont,
    line_spacing: int,
) -> tuple:
    """
    Measure a stack of text lines rendered with ``font``.

    Returns ``(width, height)``: width is the widest line as reported by
    getsize, height is the sum of the line heights plus ``line_spacing``
    between neighbouring lines. An empty ``lines`` gives ``(0, 0)``.
    ``draw`` is accepted but not used by the measurement.
    """
    block_w = 0
    block_h = 0
    for row, text in enumerate(lines):
        line_w, line_h = getsize(font, text)
        block_w = max(block_w, line_w)
        # A gap goes before every line except the first one.
        if row:
            block_h += line_spacing
        block_h += line_h
    return block_w, block_h
def draw_stroked_text(
    draw: ImageDraw.ImageDraw,
    x: int,
    y: int,
    line: str,
    font: ImageFont.FreeTypeFont,
    fill_color: tuple,
    stroke_color: tuple,
    stroke_width: int,
) -> None:
    """
    Draw ``line`` at (x, y) with an outline made of shifted copies.

    The text is first drawn 16 times in ``stroke_color`` at offsets around
    (x, y), then once in ``fill_color`` at (x, y) itself. The offsets use
    ``stroke_width`` and half of it (at least 1).
    """
    reach = stroke_width
    mid = max(1, reach // 2)

    # Straight left / right / up / down.
    axis_shifts = [(-reach, 0), (reach, 0), (0, -reach), (0, reach)]
    # The four diagonal corners.
    corner_shifts = [(dx, dy) for dy in (-reach, reach) for dx in (-reach, reach)]
    # In-between points on the left/right edges ...
    side_shifts = [(dx, dy) for dy in (-mid, mid) for dx in (-reach, reach)]
    # ... and on the top/bottom edges.
    cap_shifts = [(dx, dy) for dy in (-reach, reach) for dx in (-mid, mid)]

    for dx, dy in axis_shifts + corner_shifts + side_shifts + cap_shifts:
        draw.text((x + dx, y + dy), line, font=font, fill=stroke_color)

    # Fill goes last so it sits on top of the outline copies.
    draw.text((x, y), line, font=font, fill=fill_color)
def fit_font(
    style: dict,
    lines: List[str],
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
    max_width_ratio: float = 0.88,
    max_height_ratio: float = 0.45,
) -> ImageFont.FreeTypeFont:
    """
    Pick the largest font size at which ``lines`` fit on the canvas.

    Starts at ``style["font_size"]`` and shrinks in steps of 4 until the text
    block is no wider than ``canvas_w * max_width_ratio`` and no taller than
    ``canvas_h * max_height_ratio``. Shrinking stops at size 30, which is
    returned without a fit check. If the configured size is already below
    30, that size is returned as-is, so the result is never larger than the
    configured font size.

    The font file is ``fonts/<style["font_file"]>``; when that path does not
    exist, ``fonts/Roboto-Bold.ttf`` is used instead.
    """
    requested_size = style["font_size"]
    font_path = os.path.join("fonts", style["font_file"])
    if not os.path.exists(font_path):
        font_path = os.path.join("fonts", "Roboto-Bold.ttf")

    max_w = int(canvas_w * max_width_ratio)
    max_h = int(canvas_h * max_height_ratio)

    # measure_text_block wants a drawing context; a 1x1 scratch image made
    # once is enough and avoids a full-canvas allocation per size tried.
    probe = ImageDraw.Draw(Image.new("RGBA", (1, 1), (0, 0, 0, 0)))

    # Never go below 30, but never above what the style asked for either.
    floor_size = min(30, requested_size)

    font_size = requested_size
    while font_size > floor_size:
        font = ImageFont.truetype(font_path, font_size)
        bw, bh = measure_text_block(probe, lines, font, line_spacing)
        if bw <= max_w and bh <= max_h:
            return font
        font_size -= 4
    return ImageFont.truetype(font_path, floor_size)
def render_job_to_image(
    job: RenderJob,
    style: dict,
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
) -> Image.Image:
    """
    Draw a RenderJob's lines onto a new transparent RGBA canvas.

    The font comes from fit_font. The text block is centred vertically on
    ``canvas_h * style["y_position"]``, and every line is centred
    horizontally. Lines are drawn with draw_stroked_text using the style's
    "fill_color", "stroke_color" and "stroke_width".

    Returns the finished image.
    """
    font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)

    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    # Only the block height is needed to place the first line.
    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
    top = int(canvas_h * style["y_position"]) - (block_h // 2)

    for text in job.lines:
        text_w, text_h = getsize(font, text)
        left = (canvas_w - text_w) // 2
        draw_stroked_text(
            pen,
            left,
            top,
            text,
            font,
            style["fill_color"],
            style["stroke_color"],
            style["stroke_width"],
        )
        top += text_h + line_spacing

    return canvas