Core changes: - utils/caption_renderer.py: new single-responsibility rendering engine - Three display modes: aligned, single, multi - 8-direction stroke technique for clean text outlines - Transparent PNG overlays (no more solid box) - utils/whisper_aligner.py: WhisperX forced alignment module - Word-level timestamps from any TTS audio - Graceful fallback to single mode if unavailable - utils/imagenarator.py: refactored as thin orchestrator - Delegates to caption_renderer - Saves timing_map.json for final_video sync - utils/sentiment_map.py: added STYLE_MAP with display_mode per sentiment - utils/sentiment.py: stores sentiment in settings for downstream use - TTS/engine_wrapper.py: runs WhisperX after each TTS save - video_creation/final_video.py: reads timing_map, handles absolute + fraction timing - video_creation/screenshot_downloader.py: clean imagemaker call Assets: - fonts/: added Montserrat, Nunito, Oswald, Raleway, Lato, Anton font families Dependencies: - requirements.txt: updated with all current dependencies pull/2557/head
parent
af0940045c
commit
076b65f04c
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,21 +1,198 @@
|
||||
aiohappyeyeballs==2.6.2
|
||||
aiohttp==3.13.5
|
||||
aiosignal==1.4.0
|
||||
alembic==1.18.4
|
||||
annotated-doc==0.0.4
|
||||
annotated-types==0.7.0
|
||||
antlr4-python3-runtime==4.9.3
|
||||
anyio==4.13.0
|
||||
asteroid-filterbanks==0.4.0
|
||||
attrs==26.1.0
|
||||
av==17.0.1
|
||||
blinker==1.9.0
|
||||
blis==1.3.3
|
||||
boto3==1.36.8
|
||||
botocore==1.36.8
|
||||
catalogue==2.0.10
|
||||
certifi==2026.5.20
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.7
|
||||
clean-text==0.6.0
|
||||
click==8.1.8
|
||||
cloudpathlib==0.24.0
|
||||
colorlog==6.10.1
|
||||
confection==0.1.5
|
||||
contourpy==1.3.3
|
||||
cryptography==48.0.0
|
||||
ctranslate2==4.7.2
|
||||
cycler==0.12.1
|
||||
cymem==2.0.13
|
||||
decorator==5.3.1
|
||||
dill==0.4.1
|
||||
distro==1.9.0
|
||||
einops==0.8.2
|
||||
elevenlabs==1.57.0
|
||||
emoji==1.7.0
|
||||
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
|
||||
exejs==0.0.7
|
||||
faster-whisper==1.2.1
|
||||
ffmpeg-python==0.2.0
|
||||
filelock==3.29.0
|
||||
Flask==3.1.1
|
||||
flatbuffers==25.12.19
|
||||
fonttools==4.63.0
|
||||
frozenlist==1.8.0
|
||||
fsspec==2026.4.0
|
||||
ftfy==6.3.1
|
||||
future==1.0.0
|
||||
googleapis-common-protos==1.75.0
|
||||
greenlet==3.1.1
|
||||
grpcio==1.80.0
|
||||
gTTS==2.5.4
|
||||
h11==0.16.0
|
||||
hf-xet==1.5.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
huggingface_hub==0.36.2
|
||||
idna==3.17
|
||||
ImageIO==2.37.3
|
||||
imageio-ffmpeg==0.6.0
|
||||
itsdangerous==2.2.0
|
||||
jh2==5.0.11
|
||||
Jinja2==3.1.6
|
||||
jiter==0.15.0
|
||||
jmespath==1.1.0
|
||||
joblib==1.5.3
|
||||
julius==0.2.7
|
||||
kiwisolver==1.5.0
|
||||
langcodes==3.5.1
|
||||
lightning==2.6.5
|
||||
lightning-utilities==0.15.3
|
||||
lxml==6.1.1
|
||||
Mako==1.3.12
|
||||
markdown-it-py==4.2.0
|
||||
MarkupSafe==3.0.3
|
||||
matplotlib==3.10.9
|
||||
mdurl==0.1.2
|
||||
moviepy==2.2.1
|
||||
mpmath==1.3.0
|
||||
multidict==6.7.1
|
||||
multiprocess==0.70.19
|
||||
murmurhash==1.0.15
|
||||
networkx==3.6.1
|
||||
niquests==3.18.8
|
||||
nltk==3.9.4
|
||||
numpy==2.4.6
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
nvidia-nccl-cu12==2.27.3
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
omegaconf==2.3.0
|
||||
onnxruntime==1.26.0
|
||||
openai==2.38.0
|
||||
opentelemetry-api==1.42.1
|
||||
opentelemetry-exporter-otlp==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-common==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-http==1.42.1
|
||||
opentelemetry-proto==1.42.1
|
||||
opentelemetry-sdk==1.42.1
|
||||
opentelemetry-semantic-conventions==0.63b1
|
||||
optuna==4.8.0
|
||||
packaging==26.2
|
||||
pandas==3.0.3
|
||||
pathos==0.3.5
|
||||
pillow==11.3.0
|
||||
playwright==1.49.1
|
||||
pox==0.3.7
|
||||
ppft==1.7.8
|
||||
praw==7.8.1
|
||||
prawcore==2.4.0
|
||||
preshed==3.0.13
|
||||
primePy==1.3
|
||||
proglog==0.1.12
|
||||
propcache==0.5.2
|
||||
protobuf==6.33.6
|
||||
pyannote-audio==4.0.4
|
||||
pyannote-core==6.0.1
|
||||
pyannote-database==6.1.1
|
||||
pyannote-metrics==4.1
|
||||
pyannote-pipeline==4.0.0
|
||||
pyannoteai-sdk==0.4.0
|
||||
pycparser==3.0
|
||||
pydantic==2.13.4
|
||||
pydantic_core==2.46.4
|
||||
pyee==12.0.0
|
||||
Pygments==2.20.0
|
||||
pyparsing==3.3.2
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.2.2
|
||||
pytorch-lightning==2.6.5
|
||||
pytorch-metric-learning==2.9.0
|
||||
pyttsx3==2.98
|
||||
PyYAML==6.0.3
|
||||
qh3==1.8.1
|
||||
regex==2026.5.9
|
||||
requests==2.32.3
|
||||
rich==13.9.4
|
||||
s3transfer==0.11.3
|
||||
safetensors==0.7.0
|
||||
scikit-learn==1.8.0
|
||||
scipy==1.17.1
|
||||
setuptools==82.0.1
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
smart_open==7.6.1
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
spacy==3.8.7
|
||||
spacy-legacy==3.0.12
|
||||
spacy-loggers==1.0.5
|
||||
SQLAlchemy==2.0.50
|
||||
srsly==2.5.3
|
||||
sympy==1.14.0
|
||||
thinc==8.3.11
|
||||
threadpoolctl==3.6.0
|
||||
tokenizers==0.21.4
|
||||
toml==0.10.2
|
||||
translators==5.9.9
|
||||
pyttsx3==2.98
|
||||
tomlkit==0.13.2
|
||||
Flask==3.1.1
|
||||
clean-text==0.6.0
|
||||
unidecode==1.4.0
|
||||
spacy==3.8.7
|
||||
torch==2.7.0
|
||||
torch==2.8.0
|
||||
torch-audiomentations==0.12.0
|
||||
torch_pitch_shift==1.2.5
|
||||
torchaudio==2.8.0
|
||||
torchcodec==0.7.0
|
||||
torchmetrics==1.9.0
|
||||
torchvision==0.23.0
|
||||
tqdm==4.67.3
|
||||
transformers==4.52.4
|
||||
ffmpeg-python==0.2.0
|
||||
elevenlabs==1.57.0
|
||||
yt-dlp==2025.10.22
|
||||
translators==5.9.9
|
||||
triton==3.4.0
|
||||
typer==0.26.2
|
||||
typer-slim==0.24.0
|
||||
typing-inspection==0.4.2
|
||||
typing_extensions==4.15.0
|
||||
Unidecode==1.4.0
|
||||
update-checker==0.18.0
|
||||
urllib3==2.7.0
|
||||
urllib3-future==2.20.907
|
||||
wasabi==1.1.3
|
||||
wassima==2.1.0
|
||||
wcwidth==0.7.0
|
||||
weasel==0.4.3
|
||||
websocket-client==1.9.0
|
||||
websockets==16.0
|
||||
Werkzeug==3.1.8
|
||||
whisperx==3.8.6
|
||||
wrapt==2.2.1
|
||||
yarl==1.24.2
|
||||
yt-dlp==2026.3.17
|
||||
|
||||
@ -0,0 +1,351 @@
|
||||
"""
|
||||
caption_renderer.py
|
||||
───────────────────
|
||||
All caption rendering logic. Three display modes:
|
||||
|
||||
multi → full sentence on one image (1 RenderJob per sentence)
|
||||
single → sentence split into word chunks (N RenderJobs per sentence)
|
||||
aligned → word-level timestamps from WhisperX (perfect sync, any TTS)
|
||||
|
||||
RenderJob is the contract between this module and final_video.py.
|
||||
Two types of timing:
|
||||
|
||||
FRACTION-based (multi, single):
|
||||
audio_idx + time_fraction → final_video computes absolute time
|
||||
time_fraction = fraction of audio_clips_durations[audio_idx+1]
|
||||
|
||||
ABSOLUTE-based (aligned):
|
||||
clip_start + clip_end → final_video uses directly
|
||||
These are absolute seconds in the video timeline (after title card)
|
||||
|
||||
final_video.py checks job["timing_type"] to know which to use.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from utils.fonts import getsize
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# RenderJob — the contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class RenderJob:
    """
    Specification for a single caption image (img{idx}.png) and its timing.

    The meaning of the timing fields depends on ``timing_type``:

    * ``"fraction"`` - ``audio_idx`` names the audio file the caption belongs
      to and ``time_fraction`` is the share of that file's duration the image
      is shown for (1.0 = the whole file, 0.25 = a quarter of it).
      final_video turns these into a display time.
    * ``"absolute"`` - ``clip_start`` and ``clip_end`` are absolute seconds on
      the video timeline and are meant to be used by final_video as-is.

    Fields that do not belong to the active ``timing_type`` simply keep their
    defaults.
    """

    # Image number; also the N in the output file name img{N}.png.
    idx: int
    # Text lines drawn on the image, top to bottom.
    lines: List[str]
    # Either "fraction" or "absolute".
    timing_type: str

    # --- used when timing_type == "fraction" ---
    audio_idx: int = 0
    time_fraction: float = 1.0

    # --- used when timing_type == "absolute" ---
    clip_start: float = 0.0
    clip_end: float = 0.0
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Display modes
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Every display_mode value the router understands; anything else is treated as "multi".
DISPLAY_MODES = {"aligned", "multi", "single"}
|
||||
|
||||
|
||||
def render_multi_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Put the whole sentence on one image, wrapped into lines.

    The sentence is split on whitespace and regrouped into lines of at most
    ``style["words_per_chunk"]`` words. Exactly one fraction-timed RenderJob is
    returned, with ``time_fraction=1.0``.
    Original author's note: best for funny, sad, wholesome, happy.

    Parameters
    ----------
    sentence  : text to render
    style     : style dict; must contain "words_per_chunk"
    audio_idx : index of the audio file this sentence belongs to
    start_idx : image index assigned to the returned job

    Returns
    -------
    A one-element list holding the RenderJob.
    """
    # range() raises ValueError on a zero step, so a misconfigured
    # words_per_chunk of 0 (or less) is clamped to one word per line.
    words_per_line = max(1, style["words_per_chunk"])

    words = sentence.split()
    lines = [
        " ".join(words[pos:pos + words_per_line])
        for pos in range(0, len(words), words_per_line)
    ]
    if not lines:
        # Empty or whitespace-only sentence: keep the raw text so that one
        # job is still produced for this audio file.
        lines = [sentence]

    return [
        RenderJob(
            idx=start_idx,
            lines=lines,
            timing_type="fraction",
            audio_idx=audio_idx,
            time_fraction=1.0,
        )
    ]
|
||||
|
||||
|
||||
def render_single_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Split the sentence into word chunks and emit one image per chunk.

    Chunks hold at most ``style["words_per_chunk"]`` words. With N chunks,
    every job gets ``time_fraction = 1 / N`` of the audio file's duration.
    Original author's note: best for scary, dramatic, angry, mysterious.

    Parameters
    ----------
    sentence  : text to render
    style     : style dict; must contain "words_per_chunk"
    audio_idx : index of the audio file this sentence belongs to
    start_idx : image index of the first chunk; later chunks count up from it

    Returns
    -------
    One fraction-timed RenderJob per chunk, in reading order.
    """
    # range() raises ValueError on a zero step, so a misconfigured
    # words_per_chunk of 0 (or less) is clamped to one word per chunk.
    chunk_size = max(1, style["words_per_chunk"])

    words = sentence.split()
    # str.split() never yields empty tokens, so every chunk is non-blank; the
    # only degenerate case is "no words at all", handled by the fallback.
    chunks = [
        " ".join(words[pos:pos + chunk_size])
        for pos in range(0, len(words), chunk_size)
    ] or [sentence]

    share = 1.0 / len(chunks)

    return [
        RenderJob(
            idx=start_idx + offset,
            lines=[chunk],
            timing_type="fraction",
            audio_idx=audio_idx,
            time_fraction=share,
        )
        for offset, chunk in enumerate(chunks)
    ]
|
||||
|
||||
|
||||
def render_aligned_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
    word_timestamps: List[dict],
    audio_start_time: float,
    audio_duration: float,
) -> List[RenderJob]:
    """
    Build absolute-timed jobs from word-level timestamps.

    Consecutive words are grouped into chunks of ``style["words_per_chunk"]``.
    For each chunk:

    * ``clip_start`` = ``audio_start_time`` + start of the chunk's first word.
    * ``clip_end``   = start of the next chunk's first word, so captions do
      not leave gaps; for the final chunk, the end of its last word
      (start + 0.3 s when the entry carries no "end").
    * ``clip_end`` is capped at the end of the audio file when the duration
      is known, then raised to at least ``clip_start + 0.1`` so a caption is
      never shorter than 100 ms.

    Parameters
    ----------
    sentence         : original text, used only for the fallback
    style            : style dict; must contain "words_per_chunk"
    audio_idx        : index of the audio file, used only for the fallback
    start_idx        : image index of the first job
    word_timestamps  : list of {"word", "start", "end"} dicts, times in
                       seconds relative to the start of the audio file
    audio_start_time : absolute time in the video at which this audio starts
    audio_duration   : duration of the audio file; a value <= 0 is treated
                       as "unknown" and disables the end-of-audio cap

    Returns
    -------
    One RenderJob per chunk. Falls back to ``render_single_mode`` when the
    timestamps are empty or malformed.
    """
    if not word_timestamps:
        return render_single_mode(sentence, style, audio_idx, start_idx)

    # Normalise every entry up front so that a single malformed record
    # triggers the documented fallback instead of raising half-way through.
    try:
        timed_words = []
        for entry in word_timestamps:
            start = float(entry["start"])
            end = entry.get("end")
            end = start + 0.3 if end is None else float(end)
            timed_words.append((str(entry["word"]), start, end))
    except (AttributeError, KeyError, TypeError, ValueError):
        return render_single_mode(sentence, style, audio_idx, start_idx)

    # A zero step would make range() raise ValueError.
    chunk_size = max(1, style["words_per_chunk"])

    # A duration of 0.0 (or less) is not a usable boundary: capping against it
    # would collapse every caption to the 100 ms minimum.
    audio_end = audio_start_time + audio_duration if audio_duration > 0 else None

    jobs: List[RenderJob] = []
    total = len(timed_words)

    for first in range(0, total, chunk_size):
        chunk = timed_words[first:first + chunk_size]
        text = " ".join(word for word, _, _ in chunk)
        clip_start = audio_start_time + chunk[0][1]

        following = first + chunk_size
        if following < total:
            # Hold the caption until the next chunk appears.
            clip_end = audio_start_time + timed_words[following][1]
        else:
            clip_end = audio_start_time + chunk[-1][2]

        if audio_end is not None:
            clip_end = min(clip_end, audio_end)
        clip_end = max(clip_end, clip_start + 0.1)  # minimum 100 ms visibility

        jobs.append(
            RenderJob(
                idx=start_idx + len(jobs),
                lines=[text],
                timing_type="absolute",
                clip_start=round(clip_start, 3),
                clip_end=round(clip_end, 3),
            )
        )

    return jobs
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Router
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_render_jobs(
    sentences: List[str],
    style: dict,
    mp3_dir: Optional[str] = None,
    audio_start_times: Optional[List[float]] = None,
    audio_durations: Optional[List[float]] = None,
) -> List[RenderJob]:
    """
    Route every sentence to the renderer selected by ``style["display_mode"]``.

    An unknown or missing mode is treated as "multi". In "aligned" mode the
    word timestamps for sentence ``i`` are loaded through
    ``utils.whisper_aligner.load_word_timestamps`` for
    ``{mp3_dir}/postaudio-{i}.mp3``. A sentence falls back to "single" mode
    when no timestamps can be used for it: no mp3_dir / start times /
    durations were supplied, there is no timing entry for that index, or no
    timestamps were loaded.

    Parameters
    ----------
    sentences         : one per postaudio-{i}.mp3
    style             : STYLE_MAP entry for current sentiment
    mp3_dir           : path to mp3 folder (needed for aligned mode)
    audio_start_times : absolute start time of each audio in video (aligned)
    audio_durations   : duration of each audio file (aligned)

    Returns
    -------
    Flat list of all RenderJobs in sentence order, with consecutive ``idx``
    values starting at 0.
    """
    mode = style.get("display_mode", "multi")

    if mode not in DISPLAY_MODES:
        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
        mode = "multi"

    # Import the aligner once, and only when aligned rendering is possible.
    load_word_timestamps = None
    if mode == "aligned" and mp3_dir and audio_start_times and audio_durations:
        from utils.whisper_aligner import load_word_timestamps

    all_jobs: List[RenderJob] = []
    img_counter = 0

    for audio_idx, sentence in enumerate(sentences):
        if mode == "multi":
            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)
        else:
            word_ts = None
            # There may be more sentences than probed audio files; indexing
            # past the end of the timing lists must not raise.
            if (
                load_word_timestamps is not None
                and audio_idx < len(audio_start_times)
                and audio_idx < len(audio_durations)
            ):
                audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
                word_ts = load_word_timestamps(audio_path)

            if word_ts:
                jobs = render_aligned_mode(
                    sentence=sentence,
                    style=style,
                    audio_idx=audio_idx,
                    start_idx=img_counter,
                    word_timestamps=word_ts,
                    audio_start_time=audio_start_times[audio_idx],
                    audio_duration=audio_durations[audio_idx],
                )
            else:
                if mode == "aligned":
                    # Timestamps unavailable - documented fallback is single
                    # mode (not multi).
                    print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
                jobs = render_single_mode(sentence, style, audio_idx, img_counter)

        all_jobs.extend(jobs)
        img_counter += len(jobs)

    return all_jobs
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Drawing primitives
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def measure_text_block(
    draw: ImageDraw.ImageDraw,
    lines: List[str],
    font: ImageFont.FreeTypeFont,
    line_spacing: int,
) -> tuple:
    """
    Measure a block of text lines stacked vertically.

    Each line is measured with ``getsize(font, line)``. The block width is
    the widest line; the block height is the sum of the line heights plus
    ``line_spacing`` between consecutive lines (none after the last one).
    ``draw`` is accepted for interface compatibility and is not used.

    Returns
    -------
    (width, height); (0, 0) for an empty list.
    """
    widest = 0
    block_height = 0
    last_pos = len(lines) - 1

    for pos, text in enumerate(lines):
        width, height = getsize(font, text)
        widest = max(widest, width)
        block_height += height
        if pos != last_pos:
            block_height += line_spacing

    return widest, block_height
|
||||
|
||||
|
||||
def draw_stroked_text(
    draw: ImageDraw.ImageDraw,
    x: int,
    y: int,
    line: str,
    font: ImageFont.FreeTypeFont,
    fill_color: tuple,
    stroke_color: tuple,
    stroke_width: int,
) -> None:
    """
    Draw ``line`` at (x, y) with an outline built from offset copies.

    The outline is painted first by drawing the text in ``stroke_color`` at a
    ring of offsets around (x, y): the four axis directions and four
    diagonals at distance ``stroke_width``, plus eight intermediate points at
    half that distance on one axis. The text is then drawn once in
    ``fill_color`` on top.

    A ``stroke_width`` of 0 (or less) draws no outline, only the fill.

    Parameters
    ----------
    draw         : object exposing ``text(xy, text, font=..., fill=...)``
    x, y         : top-left position of the text
    line         : text to draw
    font         : font passed through to ``draw.text``
    fill_color   : colour of the text itself
    stroke_color : colour of the outline
    stroke_width : outline thickness in pixels
    """
    sw = stroke_width

    if sw > 0:
        half = max(1, sw // 2)
        # dict.fromkeys removes duplicates while keeping the draw order; for
        # small widths (e.g. 1) the intermediate points coincide with the
        # diagonals and would otherwise be painted several times.
        offsets = dict.fromkeys([
            (-sw, 0), (sw, 0), (0, -sw), (0, sw),
            (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw),
            (-sw, -half), (sw, -half), (-sw, half), (sw, half),
            (-half, -sw), (half, -sw), (-half, sw), (half, sw),
        ])
        for ox, oy in offsets:
            draw.text((x + ox, y + oy), line, font=font, fill=stroke_color)

    draw.text((x, y), line, font=font, fill=fill_color)
|
||||
|
||||
|
||||
def fit_font(
    style: dict,
    lines: List[str],
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
    max_width_ratio: float = 0.88,
    max_height_ratio: float = 0.45,
    min_font_size: int = 30,
    step: int = 4,
) -> ImageFont.FreeTypeFont:
    """
    Pick the largest font size at which ``lines`` fit on the canvas.

    Starting from ``style["font_size"]``, the size is reduced by ``step``
    until the measured text block is no wider than
    ``canvas_w * max_width_ratio`` and no taller than
    ``canvas_h * max_height_ratio``. Sizes at or below ``min_font_size`` are
    not tried; if nothing fits, the font is returned at ``min_font_size``
    without a further fit check.

    The font file is ``fonts/<style["font_file"]>``; when that file does not
    exist, ``fonts/Roboto-Bold.ttf`` is used instead. Both paths are relative
    to the current working directory.

    Parameters
    ----------
    style            : style dict; must contain "font_size" and "font_file"
    lines            : text lines that have to fit
    canvas_w/h       : canvas size in pixels
    line_spacing     : extra pixels between consecutive lines
    max_width_ratio  : share of the canvas width the block may occupy
    max_height_ratio : share of the canvas height the block may occupy
    min_font_size    : smallest size returned (default 30)
    step             : size decrement per attempt (default 4)
    """
    font_size = style["font_size"]
    font_path = os.path.join("fonts", style["font_file"])
    if not os.path.exists(font_path):
        font_path = os.path.join("fonts", "Roboto-Bold.ttf")

    max_w = int(canvas_w * max_width_ratio)
    max_h = int(canvas_h * max_height_ratio)
    step = max(1, step)  # a non-positive step would never terminate

    # The scratch surface does not depend on the font size, so it is created
    # once instead of allocating a full-canvas RGBA image on every attempt.
    scratch = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    scratch_draw = ImageDraw.Draw(scratch)

    while font_size > min_font_size:
        font = ImageFont.truetype(font_path, font_size)
        block_w, block_h = measure_text_block(scratch_draw, lines, font, line_spacing)
        if block_w <= max_w and block_h <= max_h:
            return font
        font_size -= step

    return ImageFont.truetype(font_path, min_font_size)
|
||||
|
||||
|
||||
def render_job_to_image(
    job: RenderJob,
    style: dict,
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
) -> Image.Image:
    """
    Render one RenderJob onto a fully transparent RGBA canvas.

    The font comes from ``fit_font``. The text block is centred vertically on
    ``canvas_h * style["y_position"]``, and each line is centred horizontally
    and drawn with ``draw_stroked_text`` using the style's "fill_color",
    "stroke_color" and "stroke_width".

    Returns
    -------
    The new ``canvas_w`` x ``canvas_h`` image.
    """
    font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)

    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    # Only the height is needed to place the block vertically.
    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
    cursor_y = int(canvas_h * style["y_position"]) - (block_h // 2)

    for text in job.lines:
        text_w, text_h = getsize(font, text)
        left = (canvas_w - text_w) // 2
        draw_stroked_text(
            pen,
            left,
            cursor_y,
            text,
            font,
            style["fill_color"],
            style["stroke_color"],
            style["stroke_width"],
        )
        cursor_y += text_h + line_spacing

    return canvas
|
||||
@ -1,74 +1,156 @@
|
||||
"""
|
||||
imagenarator.py
|
||||
───────────────
|
||||
Thin orchestrator. Does exactly:
|
||||
1. Extract sentences from reddit_obj
|
||||
2. Probe audio durations + compute audio start times (needed for aligned mode)
|
||||
3. Call caption_renderer.get_render_jobs()
|
||||
4. Render each job to PNG
|
||||
5. Save timing_map.json for final_video.py
|
||||
"""
|
||||
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import textwrap
|
||||
from typing import List, Optional
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import ffmpeg
|
||||
from rich.progress import track
|
||||
|
||||
from TTS.engine_wrapper import process_text
|
||||
from utils.fonts import getheight, getsize
|
||||
from utils import settings
|
||||
from utils.id import extract_id
|
||||
from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE
|
||||
from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob
|
||||
|
||||
|
||||
LINE_SPACING: int = 20
|
||||
|
||||
def draw_multiple_line_text(
|
||||
image, text, font, text_color, padding, wrap=50, transparent=False
|
||||
) -> None:
|
||||
|
||||
def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]:
|
||||
"""
|
||||
Draw multiline text over given image
|
||||
Extract sentences from thread_post.
|
||||
One sentence per postaudio-{i}.mp3 — order preserved.
|
||||
"""
|
||||
draw = ImageDraw.Draw(image)
|
||||
font_height = getheight(font, text)
|
||||
image_width, image_height = image.size
|
||||
lines = textwrap.wrap(text, width=wrap)
|
||||
y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2)
|
||||
for line in lines:
|
||||
line_width, line_height = getsize(font, line)
|
||||
if transparent:
|
||||
shadowcolor = "black"
|
||||
for i in range(1, 5):
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 - i, y - i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 + i, y - i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 - i, y + i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 + i, y + i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color)
|
||||
y += line_height + padding
|
||||
|
||||
|
||||
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None:
|
||||
raw_texts = reddit_obj["thread_post"]
|
||||
sentences: List[str] = []
|
||||
for item in raw_texts:
|
||||
if isinstance(item, dict):
|
||||
text = item.get("text", "")
|
||||
elif isinstance(item, str):
|
||||
text = item
|
||||
else:
|
||||
text = str(item)
|
||||
text = process_text(text, False).strip()
|
||||
if style.get("uppercase", False):
|
||||
text = text.upper()
|
||||
if text:
|
||||
sentences.append(text)
|
||||
return sentences if sentences else ["..."]
|
||||
|
||||
|
||||
def _get_audio_info(mp3_dir: str) -> tuple:
|
||||
"""
|
||||
Discover postaudio files and compute:
|
||||
- durations list (one per postaudio file)
|
||||
- start times list (absolute seconds in video, after title card)
|
||||
|
||||
Returns (postaudio_files, durations, start_times)
|
||||
"""
|
||||
Render Images for video
|
||||
postaudio_files = sorted(
|
||||
glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")),
|
||||
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
|
||||
)
|
||||
|
||||
title_path = os.path.join(mp3_dir, "title.mp3")
|
||||
try:
|
||||
title_duration = float(ffmpeg.probe(title_path)["format"]["duration"])
|
||||
except Exception:
|
||||
title_duration = 0.0
|
||||
|
||||
durations = []
|
||||
start_times = []
|
||||
current = title_duration
|
||||
|
||||
for f in postaudio_files:
|
||||
try:
|
||||
dur = float(ffmpeg.probe(f)["format"]["duration"])
|
||||
except Exception:
|
||||
dur = 0.0
|
||||
start_times.append(current)
|
||||
durations.append(dur)
|
||||
current += dur
|
||||
|
||||
return postaudio_files, durations, start_times
|
||||
|
||||
|
||||
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int:
|
||||
"""
|
||||
texts = reddit_obj["thread_post"]
|
||||
reddit_id = extract_id(reddit_obj)
|
||||
if transparent:
|
||||
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100)
|
||||
else:
|
||||
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100)
|
||||
|
||||
size = (1920, 1080)
|
||||
|
||||
for idx, text in track(enumerate(texts), "Rendering Image"):
|
||||
image = Image.new("RGBA", size, theme)
|
||||
text = process_text(text, False)
|
||||
draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent)
|
||||
image.save(f"assets/temp/{reddit_id}/png/img{idx}.png")
|
||||
Render caption images for the video.
|
||||
|
||||
Flow:
|
||||
sentences + audio info
|
||||
→ caption_renderer.get_render_jobs()
|
||||
→ List[RenderJob]
|
||||
each RenderJob → transparent PNG (img{idx}.png)
|
||||
timing_map.json → saved for final_video.py
|
||||
|
||||
timing_map.json entry for fraction-based jobs:
|
||||
{"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
|
||||
|
||||
timing_map.json entry for absolute-based jobs (aligned mode):
|
||||
{"timing_type": "absolute", "clip_start": S, "clip_end": E}
|
||||
|
||||
Returns:
|
||||
int: total number of images generated
|
||||
"""
|
||||
# 1. Style
|
||||
sentiment = settings.config["settings"].get("sentiment", "dramatic")
|
||||
style = STYLE_MAP.get(sentiment, DEFAULT_STYLE)
|
||||
CANVAS_W: int = int(settings.config["settings"]["resolution_w"])
|
||||
CANVAS_H: int = int(settings.config["settings"]["resolution_h"])
|
||||
reddit_id = extract_id(reddit_obj)
|
||||
mp3_dir = f"assets/temp/{reddit_id}/mp3"
|
||||
|
||||
# 2. Extract sentences
|
||||
sentences = _extract_sentences(reddit_obj, style)
|
||||
|
||||
# 3. Get audio timing info (needed for aligned mode)
|
||||
_, durations, start_times = _get_audio_info(mp3_dir)
|
||||
|
||||
# 4. Get render jobs
|
||||
jobs: List[RenderJob] = get_render_jobs(
|
||||
sentences=sentences,
|
||||
style=style,
|
||||
mp3_dir=mp3_dir,
|
||||
audio_start_times=start_times if start_times else None,
|
||||
audio_durations=durations if durations else None,
|
||||
)
|
||||
|
||||
# 5. Render each job to a transparent PNG
|
||||
for job in track(jobs, description="Rendering caption images"):
|
||||
image = render_job_to_image(job, style, CANVAS_W, CANVAS_H, LINE_SPACING)
|
||||
image.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png")
|
||||
|
||||
# 6. Save timing map
|
||||
timing_map = []
|
||||
for job in jobs:
|
||||
if job.timing_type == "absolute":
|
||||
timing_map.append({
|
||||
"timing_type": "absolute",
|
||||
"clip_start": job.clip_start,
|
||||
"clip_end": job.clip_end,
|
||||
})
|
||||
else:
|
||||
timing_map.append({
|
||||
"timing_type": "fraction",
|
||||
"audio_idx": job.audio_idx,
|
||||
"time_fraction": job.time_fraction,
|
||||
})
|
||||
|
||||
timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
|
||||
with open(timing_map_path, "w") as f:
|
||||
json.dump(timing_map, f, indent=2)
|
||||
|
||||
return len(jobs)
|
||||
@ -0,0 +1,168 @@
|
||||
"""
|
||||
whisper_aligner.py
|
||||
──────────────────
|
||||
Word-level timestamp extraction using WhisperX.
|
||||
|
||||
This module runs after each TTS audio file is saved.
|
||||
It produces a word-level timestamp JSON for every postaudio-{i}.mp3.
|
||||
|
||||
Output format (postaudio-{i}_words.json):
|
||||
[
|
||||
{"word": "I", "start": 0.00, "end": 0.18},
|
||||
{"word": "told", "start": 0.18, "end": 0.42},
|
||||
...
|
||||
]
|
||||
|
||||
WhisperX is used because:
|
||||
- Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.)
|
||||
- Free, runs locally, no API cost
|
||||
- Word-level accuracy (not sentence-level)
|
||||
- Fast on CPU for short audio clips
|
||||
|
||||
If WhisperX is not installed or fails for any reason,
|
||||
this module returns None and the system falls back to
|
||||
time_fraction-based sync (single/multi mode).
|
||||
No crashes, no interruptions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from utils.console import print_substep
|
||||
|
||||
|
||||
# ── WhisperX model is loaded once and reused across all audio files ───────────
|
||||
# Loading is expensive (~2-3s). We cache it as a module-level singleton.
|
||||
_whisper_model = None
|
||||
_whisper_model_lang = None
|
||||
|
||||
|
||||
def _get_model(language: str = "en"):
|
||||
"""
|
||||
Lazy-load WhisperX model. Loaded once per run, reused for all audio files.
|
||||
Returns None if WhisperX is not installed.
|
||||
"""
|
||||
global _whisper_model, _whisper_model_lang
|
||||
|
||||
if _whisper_model is not None and _whisper_model_lang == language:
|
||||
return _whisper_model
|
||||
|
||||
try:
|
||||
import whisperx
|
||||
print_substep("Loading WhisperX model (first run only)...", style="bold blue")
|
||||
_whisper_model = whisperx.load_model(
|
||||
"base", # small enough for CPU, accurate enough for TTS
|
||||
device="cpu",
|
||||
compute_type="int8",
|
||||
language=language,
|
||||
)
|
||||
_whisper_model_lang = language
|
||||
return _whisper_model
|
||||
except ImportError:
|
||||
return None
|
||||
except Exception as e:
|
||||
print_substep(f"WhisperX model load failed: {e}", style="yellow")
|
||||
return None
|
||||
|
||||
|
||||
# Alignment models keyed by language code. Loading one is expensive, so each
# language is loaded at most once per run and reused for every audio file.
_align_model_cache: dict = {}


def _get_align_model(whisperx, language: str):
    """Return the cached (align_model, metadata) pair for ``language``, loading it on first use."""
    if language not in _align_model_cache:
        _align_model_cache[language] = whisperx.load_align_model(
            language_code=language,
            device="cpu",
        )
    return _align_model_cache[language]


def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]:
    """
    Run WhisperX on a single audio file and return word-level timestamps.

    The file is transcribed with the model from ``_get_model`` and then
    force-aligned; words from the result's "word_segments" that lack text, a
    start or an end are dropped.

    Parameters
    ----------
    audio_path : str
        Path to the .mp3 file to align.
    language : str
        Language code (default: "en"). Matches TTS language.

    Returns
    -------
    Optional[List[dict]]
        List of {"word": str, "start": float, "end": float} dicts with times
        rounded to milliseconds. None if whisperx is not installed, the model
        is unavailable, alignment fails, or no usable words were produced.
    """
    try:
        import whisperx
    except ImportError:
        # Not installed: an expected setup, handled as a quiet fallback (as
        # _get_model does) rather than a warning for every audio file.
        return None

    try:
        model = _get_model(language)
        if model is None:
            return None

        # Transcribe
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=4)

        # Align to get word-level timestamps
        align_model, metadata = _get_align_model(whisperx, language)
        aligned = whisperx.align(
            result["segments"],
            align_model,
            metadata,
            audio,
            device="cpu",
            return_char_alignments=False,
        )

        # Flatten all words across all segments
        words = []
        for segment in aligned.get("word_segments", []):
            word = segment.get("word", "").strip()
            start = segment.get("start")
            end = segment.get("end")
            if word and start is not None and end is not None:
                words.append({
                    "word": word,
                    "start": round(float(start), 3),
                    "end": round(float(end), 3),
                })

        return words if words else None

    except Exception as e:
        # Best effort by design: any failure means "no timestamps", and the
        # caller falls back to fraction-based timing.
        print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow")
        return None
|
||||
|
||||
|
||||
def align_and_save(audio_path: str, language: str = "en") -> Optional[str]:
    """
    Align audio and save word timestamps as a JSON file next to the audio.

    The output path is the audio path with its extension replaced by
    "_words.json", e.g. ".../postaudio-0.mp3" -> ".../postaudio-0_words.json".

    Parameters
    ----------
    audio_path : str
        e.g. "assets/temp/abc123/mp3/postaudio-0.mp3"
    language : str
        Language code.

    Returns
    -------
    Optional[str]
        Path to saved JSON file, or None if alignment failed.
    """
    words = align_audio(audio_path, language)
    if words is None:
        return None

    # Only the final extension is swapped. str.replace(".mp3", ...) would also
    # rewrite ".mp3" inside directory names and, for a path without ".mp3",
    # return the audio path itself - the JSON would then overwrite the audio.
    json_path = os.path.splitext(audio_path)[0] + "_words.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(words, f, indent=2, ensure_ascii=False)

    return json_path
|
||||
|
||||
|
||||
def load_word_timestamps(audio_path: str) -> Optional[List[dict]]:
    """
    Load previously saved word timestamps for an audio file.

    The JSON file is looked up next to the audio file, with the audio
    extension replaced by "_words.json".

    Returns
    -------
    Optional[List[dict]]
        The parsed list, or None if the file does not exist, cannot be read,
        is not valid JSON, or does not contain a list.
    """
    # Swap only the final extension; ".mp3" elsewhere in the path is left alone.
    json_path = os.path.splitext(audio_path)[0] + "_words.json"

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: corrupt JSON or bad
        # encoding. Either way there are no usable timestamps.
        return None

    return data if isinstance(data, list) else None
|
||||
Loading…
Reference in new issue