""" caption_renderer.py ─────────────────── All caption rendering logic. Three display modes: multi → full sentence on one image (1 RenderJob per sentence) single → sentence split into word chunks (N RenderJobs per sentence) aligned → word-level timestamps from WhisperX (perfect sync, any TTS) RenderJob is the contract between this module and final_video.py. Two types of timing: FRACTION-based (multi, single): audio_idx + time_fraction → final_video computes absolute time time_fraction = fraction of audio_clips_durations[audio_idx+1] ABSOLUTE-based (aligned): clip_start + clip_end → final_video uses directly These are absolute seconds in the video timeline (after title card) final_video.py checks job["timing_type"] to know which to use. """ import os from dataclasses import dataclass, field from typing import List, Optional from PIL import Image, ImageDraw, ImageFont from utils.fonts import getsize # ───────────────────────────────────────────────────────────────────────────── # RenderJob — the contract # ───────────────────────────────────────────────────────────────────────────── @dataclass class RenderJob: """ Describes exactly one output image (img{idx}.png). timing_type = "fraction": audio_idx + time_fraction used by final_video to compute display time. time_fraction = 1.0 means shown for full audio file duration. time_fraction = 0.25 means shown for 25% of audio file duration. timing_type = "absolute": clip_start + clip_end are absolute seconds in the video timeline. final_video uses these directly — no calculation needed. """ idx: int lines: List[str] timing_type: str # "fraction" or "absolute" # fraction-based fields audio_idx: int = 0 time_fraction: float = 1.0 # absolute-based fields clip_start: float = 0.0 clip_end: float = 0.0 # ───────────────────────────────────────────────────────────────────────────── # Display modes # ───────────────────────────────────────────────────────────────────────────── DISPLAY_MODES = {"single", "multi", "aligned"} def render_multi_mode( sentence: str, style: dict, audio_idx: int, start_idx: int, ) -> List[RenderJob]: """ Full sentence on one image, wrapped into lines. One RenderJob, time_fraction = 1.0. Best for: funny, sad, wholesome, happy. """ words = sentence.split() wpl = style["words_per_chunk"] lines = [" ".join(words[i:i + wpl]) for i in range(0, len(words), wpl)] if not lines: lines = [sentence] return [RenderJob( idx=start_idx, lines=lines, timing_type="fraction", audio_idx=audio_idx, time_fraction=1.0, )] def render_single_mode( sentence: str, style: dict, audio_idx: int, start_idx: int, ) -> List[RenderJob]: """ Sentence split into word chunks, one per image. Each shown for (1/N) of the audio duration. Best for: scary, dramatic, angry, mysterious. """ wpc = style["words_per_chunk"] words = sentence.split() raw = [" ".join(words[i:i + wpc]) for i in range(0, len(words), wpc)] raw = [c for c in raw if c.strip()] or [sentence] n = len(raw) fraction = 1.0 / n return [ RenderJob( idx=start_idx + i, lines=[chunk], timing_type="fraction", audio_idx=audio_idx, time_fraction=fraction, ) for i, chunk in enumerate(raw) ] def render_aligned_mode( sentence: str, style: dict, audio_idx: int, start_idx: int, word_timestamps: List[dict], audio_start_time: float, audio_duration: float, ) -> List[RenderJob]: """ Word-level aligned mode using WhisperX timestamps. Groups consecutive words into chunks of words_per_chunk words. Each chunk's clip_start = timestamp of first word in chunk. Each chunk's clip_end = timestamp of last word in chunk + its duration. audio_start_time: absolute time in video when this audio file starts. audio_duration: duration of this audio file (used as fallback end time). Falls back to single mode if timestamps are empty or malformed. """ wpc = style["words_per_chunk"] if not word_timestamps: return render_single_mode(sentence, style, audio_idx, start_idx) # Group word timestamps into chunks of wpc words jobs = [] n = len(word_timestamps) for chunk_start in range(0, n, wpc): chunk_words = word_timestamps[chunk_start:chunk_start + wpc] if not chunk_words: continue text = " ".join(w["word"] for w in chunk_words) clip_start = audio_start_time + chunk_words[0]["start"] # clip_end = end of last word in chunk, # or start of next chunk if available, capped at audio end if chunk_start + wpc < n: clip_end = audio_start_time + word_timestamps[chunk_start + wpc]["start"] else: last_end = chunk_words[-1].get("end", chunk_words[-1]["start"] + 0.3) clip_end = audio_start_time + last_end # Safety: never exceed audio boundary audio_end = audio_start_time + audio_duration clip_end = min(clip_end, audio_end) clip_end = max(clip_end, clip_start + 0.1) # minimum 100ms visibility jobs.append(RenderJob( idx=start_idx + len(jobs), lines=[text], timing_type="absolute", clip_start=round(clip_start, 3), clip_end=round(clip_end, 3), )) return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx) # ───────────────────────────────────────────────────────────────────────────── # Router # ───────────────────────────────────────────────────────────────────────────── def get_render_jobs( sentences: List[str], style: dict, mp3_dir: Optional[str] = None, audio_start_times: Optional[List[float]] = None, audio_durations: Optional[List[float]] = None, ) -> List[RenderJob]: """ Route each sentence to the correct renderer. Returns flat ordered list of all RenderJobs. For "aligned" mode, loads word timestamps from {mp3_dir}/postaudio-{i}_words.json written by engine_wrapper. Falls back to "single" mode per sentence if timestamps missing. Parameters ---------- sentences : one per postaudio-{i}.mp3 style : STYLE_MAP entry for current sentiment mp3_dir : path to mp3 folder (needed for aligned mode) audio_start_times : absolute start time of each audio in video (needed for aligned) audio_durations : duration of each audio file (needed for aligned) """ mode = style.get("display_mode", "multi") if mode not in DISPLAY_MODES: print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'") mode = "multi" all_jobs: List[RenderJob] = [] img_counter: int = 0 for audio_idx, sentence in enumerate(sentences): if mode == "aligned" and mp3_dir and audio_start_times and audio_durations: # Try to load word timestamps for this sentence from utils.whisper_aligner import load_word_timestamps audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3") word_ts = load_word_timestamps(audio_path) if word_ts: jobs = render_aligned_mode( sentence=sentence, style=style, audio_idx=audio_idx, start_idx=img_counter, word_timestamps=word_ts, audio_start_time=audio_start_times[audio_idx], audio_duration=audio_durations[audio_idx], ) else: # WhisperX not available or failed — fall back to single mode print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode") jobs = render_single_mode(sentence, style, audio_idx, img_counter) elif mode == "single": jobs = render_single_mode(sentence, style, audio_idx, img_counter) else: jobs = render_multi_mode(sentence, style, audio_idx, img_counter) all_jobs.extend(jobs) img_counter += len(jobs) return all_jobs # ───────────────────────────────────────────────────────────────────────────── # Drawing primitives # ───────────────────────────────────────────────────────────────────────────── def measure_text_block( draw: ImageDraw.ImageDraw, lines: List[str], font: ImageFont.FreeTypeFont, line_spacing: int, ) -> tuple: max_w = 0 total_h = 0 for i, line in enumerate(lines): w, h = getsize(font, line) if w > max_w: max_w = w total_h += h if i < len(lines) - 1: total_h += line_spacing return max_w, total_h def draw_stroked_text( draw: ImageDraw.ImageDraw, x: int, y: int, line: str, font: ImageFont.FreeTypeFont, fill_color: tuple, stroke_color: tuple, stroke_width: int, ) -> None: sw = stroke_width half = max(1, sw // 2) offsets = [ (-sw, 0), (sw, 0), (0, -sw), (0, sw), (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw), (-sw, -half), (sw, -half), (-sw, half), (sw, half), (-half, -sw), (half, -sw), (-half, sw), (half, sw), ] for ox, oy in offsets: draw.text((x + ox, y + oy), line, font=font, fill=stroke_color) draw.text((x, y), line, font=font, fill=fill_color) def fit_font( style: dict, lines: List[str], canvas_w: int, canvas_h: int, line_spacing: int, max_width_ratio: float = 0.88, max_height_ratio: float = 0.45, ) -> ImageFont.FreeTypeFont: font_size = style["font_size"] font_path = os.path.join("fonts", style["font_file"]) if not os.path.exists(font_path): font_path = os.path.join("fonts", "Roboto-Bold.ttf") max_w = int(canvas_w * max_width_ratio) max_h = int(canvas_h * max_height_ratio) while font_size > 30: font = ImageFont.truetype(font_path, font_size) dummy = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0)) dummy_d = ImageDraw.Draw(dummy) bw, bh = measure_text_block(dummy_d, lines, font, line_spacing) if bw <= max_w and bh <= max_h: return font font_size -= 4 return ImageFont.truetype(font_path, 30) def render_job_to_image( job: RenderJob, style: dict, canvas_w: int, canvas_h: int, line_spacing: int, ) -> Image.Image: font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing) image = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0)) draw = ImageDraw.Draw(image) bw, bh = measure_text_block(draw, job.lines, font, line_spacing) anchor_y = int(canvas_h * style["y_position"]) - (bh // 2) cy = anchor_y for line in job.lines: w, h = getsize(font, line) x = (canvas_w - w) // 2 draw_stroked_text(draw, x, cy, line, font, style["fill_color"], style["stroke_color"], style["stroke_width"]) cy += h + line_spacing return image