feat: pro caption system with WhisperX word-level alignment

Core changes:
- utils/caption_renderer.py: new single-responsibility rendering engine
  - Three display modes: aligned, single, multi
  - 8-direction stroke technique for clean text outlines
  - Transparent PNG overlays (no more solid box)
- utils/whisper_aligner.py: WhisperX forced alignment module
  - Word-level timestamps from any TTS audio
  - Graceful fallback to single mode if unavailable
- utils/imagenarator.py: refactored as thin orchestrator
  - Delegates to caption_renderer
  - Saves timing_map.json for final_video sync
- utils/sentiment_map.py: added STYLE_MAP with display_mode per sentiment
- utils/sentiment.py: stores sentiment in settings for downstream use
- TTS/engine_wrapper.py: runs WhisperX after each TTS save
- video_creation/final_video.py: reads timing_map, handles absolute + fraction timing
- video_creation/screenshot_downloader.py: clean imagemaker call

Assets:
- fonts/: added Montserrat, Nunito, Oswald, Raleway, Lato, Anton font families

Dependencies:
- requirements.txt: updated with all current dependencies
pull/2557/head
Abdessamad Haddouche 3 weeks ago
parent af0940045c
commit 076b65f04c

@ -14,23 +14,11 @@ from utils import settings
from utils.console import print_step, print_substep
from utils.voice import sanitize_text
DEFAULT_MAX_LENGTH: int = (
50 # Video length variable, edit this on your own risk. It should work, but it's not supported
)
DEFAULT_MAX_LENGTH: int = 50
class TTSEngine:
"""Calls the given TTS engine to reduce code duplication and allow multiple TTS engines.
Args:
tts_module : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method.
reddit_object : The reddit object that contains the posts to read.
path (Optional) : The unix style path to save the mp3 files to. This must not have leading or trailing slashes.
max_length (Optional) : The maximum length of the mp3 files in total.
Notes:
tts_module must take the arguments text and filepath.
"""
"""Calls the given TTS engine to reduce code duplication and allow multiple TTS engines."""
def __init__(
self,
@ -42,18 +30,14 @@ class TTSEngine:
):
self.tts_module = tts_module()
self.reddit_object = reddit_object
self.redditid = re.sub(r"[^\w\s-]", "", reddit_object["thread_id"])
self.path = path + self.redditid + "/mp3"
self.max_length = max_length
self.length = 0
self.last_clip_length = last_clip_length
def add_periods(
self,
): # adds periods to the end of paragraphs (where people often forget to put them) so tts doesn't blend sentences
def add_periods(self):
for comment in self.reddit_object["comments"]:
# remove links
regex_urls = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
comment["comment_body"] = re.sub(regex_urls, " ", comment["comment_body"])
comment["comment_body"] = comment["comment_body"].replace("\n", ". ")
@ -72,7 +56,6 @@ class TTSEngine:
self.add_periods()
self.call_tts("title", process_text(self.reddit_object["thread_title"]))
# processed_text = ##self.reddit_object["thread_post"] != ""
idx = 0
if settings.config["settings"]["storymode"]:
@ -84,24 +67,41 @@ class TTSEngine:
elif settings.config["settings"]["storymodemethod"] == 1:
for idx, text in track(enumerate(self.reddit_object["thread_post"])):
self.call_tts(f"postaudio-{idx}", process_text(text))
# ── WhisperX alignment ────────────────────────────────────
# Run immediately after each TTS save so word timestamps
# are ready when imagemaker() runs later.
# Fails silently — never blocks video generation.
self._align_audio(f"postaudio-{idx}")
else:
for idx, comment in track(enumerate(self.reddit_object["comments"]), "Saving..."):
# ! Stop creating mp3 files if the length is greater than max length.
if self.length > self.max_length and idx > 1:
self.length -= self.last_clip_length
idx -= 1
break
if (
len(comment["comment_body"]) > self.tts_module.max_chars
): # Split the comment if it is too long
self.split_post(comment["comment_body"], idx) # Split the comment
else: # If the comment is not too long, just call the tts engine
if len(comment["comment_body"]) > self.tts_module.max_chars:
self.split_post(comment["comment_body"], idx)
else:
self.call_tts(f"{idx}", process_text(comment["comment_body"]))
print_substep("Saved Text to MP3 files successfully.", style="bold green")
return self.length, idx
def _align_audio(self, filename: str) -> None:
    """
    Run WhisperX on a saved audio file to produce word-level timestamps.

    Called immediately after each postaudio-{i}.mp3 is saved.
    Best-effort: any failure is reported as a dim substep and then swallowed,
    so an alignment problem never stops video generation.

    Args:
        filename: Audio file name without the ".mp3" extension, located under self.path.
    """
    try:
        # Imported lazily, inside the try, so an unavailable aligner only disables alignment.
        from utils.whisper_aligner import align_and_save

        audio_path = f"{self.path}/{filename}.mp3"
        # An empty/None post_lang is treated as English.
        lang = settings.config["reddit"]["thread"].get("post_lang", "en") or "en"
        result = align_and_save(audio_path, language=lang)
        if result:
            print_substep(f"Word timestamps saved → {result}", style="dim")
    except Exception as err:  # deliberate best-effort boundary: report, never crash
        print_substep(f"Word alignment skipped for {filename}: {err}", style="dim")
def split_post(self, text: str, idx):
split_files = []
split_text = [
@ -114,8 +114,6 @@ class TTSEngine:
for idy, text_cut in enumerate(split_text):
newtext = process_text(text_cut)
# print(f"{idx}-{idy}: {newtext}\n")
if not newtext or newtext.isspace():
print("newtext was blank because sanitized split text resulted in none")
continue
@ -144,7 +142,6 @@ class TTSEngine:
def call_tts(self, filename: str, text: str):
if settings.config["settings"]["tts"]["voice_choice"] == "googletranslate":
# GTTS does not have the argument 'random_voice'
self.tts_module.run(
text,
filepath=f"{self.path}/{filename}.mp3",
@ -155,10 +152,6 @@ class TTSEngine:
filepath=f"{self.path}/{filename}.mp3",
random_voice=settings.config["settings"]["tts"]["random_voice"],
)
# try:
# self.length += MP3(f"{self.path}/{filename}.mp3").info.length
# except (MutagenError, HeaderNotFoundError):
# self.length += sox.file_info.duration(f"{self.path}/{filename}.mp3")
try:
clip = AudioFileClip(f"{self.path}/{filename}.mp3")
self.last_clip_length = clip.duration
@ -185,4 +178,4 @@ def process_text(text: str, clean: bool = True):
print_substep("Translating Text...")
translated_text = translators.translate_text(text, translator="google", to_language=lang)
new_text = sanitize_text(translated_text)
return new_text
return new_text

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -1,21 +1,198 @@
aiohappyeyeballs==2.6.2
aiohttp==3.13.5
aiosignal==1.4.0
alembic==1.18.4
annotated-doc==0.0.4
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.13.0
asteroid-filterbanks==0.4.0
attrs==26.1.0
av==17.0.1
blinker==1.9.0
blis==1.3.3
boto3==1.36.8
botocore==1.36.8
catalogue==2.0.10
certifi==2026.5.20
cffi==2.0.0
charset-normalizer==3.4.7
clean-text==0.6.0
click==8.1.8
cloudpathlib==0.24.0
colorlog==6.10.1
confection==0.1.5
contourpy==1.3.3
cryptography==48.0.0
ctranslate2==4.7.2
cycler==0.12.1
cymem==2.0.13
decorator==5.3.1
dill==0.4.1
distro==1.9.0
einops==0.8.2
elevenlabs==1.57.0
emoji==1.7.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
exejs==0.0.7
faster-whisper==1.2.1
ffmpeg-python==0.2.0
filelock==3.29.0
Flask==3.1.1
flatbuffers==25.12.19
fonttools==4.63.0
frozenlist==1.8.0
fsspec==2026.4.0
ftfy==6.3.1
future==1.0.0
googleapis-common-protos==1.75.0
greenlet==3.1.1
grpcio==1.80.0
gTTS==2.5.4
h11==0.16.0
hf-xet==1.5.0
httpcore==1.0.9
httpx==0.28.1
huggingface_hub==0.36.2
idna==3.17
ImageIO==2.37.3
imageio-ffmpeg==0.6.0
itsdangerous==2.2.0
jh2==5.0.11
Jinja2==3.1.6
jiter==0.15.0
jmespath==1.1.0
joblib==1.5.3
julius==0.2.7
kiwisolver==1.5.0
langcodes==3.5.1
lightning==2.6.5
lightning-utilities==0.15.3
lxml==6.1.1
Mako==1.3.12
markdown-it-py==4.2.0
MarkupSafe==3.0.3
matplotlib==3.10.9
mdurl==0.1.2
moviepy==2.2.1
mpmath==1.3.0
multidict==6.7.1
multiprocess==0.70.19
murmurhash==1.0.15
networkx==3.6.1
niquests==3.18.8
nltk==3.9.4
numpy==2.4.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
onnxruntime==1.26.0
openai==2.38.0
opentelemetry-api==1.42.1
opentelemetry-exporter-otlp==1.42.1
opentelemetry-exporter-otlp-proto-common==1.42.1
opentelemetry-exporter-otlp-proto-grpc==1.42.1
opentelemetry-exporter-otlp-proto-http==1.42.1
opentelemetry-proto==1.42.1
opentelemetry-sdk==1.42.1
opentelemetry-semantic-conventions==0.63b1
optuna==4.8.0
packaging==26.2
pandas==3.0.3
pathos==0.3.5
pillow==11.3.0
playwright==1.49.1
pox==0.3.7
ppft==1.7.8
praw==7.8.1
prawcore==2.4.0
preshed==3.0.13
primePy==1.3
proglog==0.1.12
propcache==0.5.2
protobuf==6.33.6
pyannote-audio==4.0.4
pyannote-core==6.0.1
pyannote-database==6.1.1
pyannote-metrics==4.1
pyannote-pipeline==4.0.0
pyannoteai-sdk==0.4.0
pycparser==3.0
pydantic==2.13.4
pydantic_core==2.46.4
pyee==12.0.0
Pygments==2.20.0
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
pytorch-lightning==2.6.5
pytorch-metric-learning==2.9.0
pyttsx3==2.98
PyYAML==6.0.3
qh3==1.8.1
regex==2026.5.9
requests==2.32.3
rich==13.9.4
s3transfer==0.11.3
safetensors==0.7.0
scikit-learn==1.8.0
scipy==1.17.1
setuptools==82.0.1
shellingham==1.5.4
six==1.17.0
smart_open==7.6.1
sniffio==1.3.1
sortedcontainers==2.4.0
spacy==3.8.7
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==2.0.50
srsly==2.5.3
sympy==1.14.0
thinc==8.3.11
threadpoolctl==3.6.0
tokenizers==0.21.4
toml==0.10.2
translators==5.9.9
pyttsx3==2.98
tomlkit==0.13.2
Flask==3.1.1
clean-text==0.6.0
unidecode==1.4.0
spacy==3.8.7
torch==2.7.0
torch==2.8.0
torch-audiomentations==0.12.0
torch_pitch_shift==1.2.5
torchaudio==2.8.0
torchcodec==0.7.0
torchmetrics==1.9.0
torchvision==0.23.0
tqdm==4.67.3
transformers==4.52.4
ffmpeg-python==0.2.0
elevenlabs==1.57.0
yt-dlp==2025.10.22
translators==5.9.9
triton==3.4.0
typer==0.26.2
typer-slim==0.24.0
typing-inspection==0.4.2
typing_extensions==4.15.0
Unidecode==1.4.0
update-checker==0.18.0
urllib3==2.7.0
urllib3-future==2.20.907
wasabi==1.1.3
wassima==2.1.0
wcwidth==0.7.0
weasel==0.4.3
websocket-client==1.9.0
websockets==16.0
Werkzeug==3.1.8
whisperx==3.8.6
wrapt==2.2.1
yarl==1.24.2
yt-dlp==2026.3.17

@ -0,0 +1,351 @@
"""
caption_renderer.py
All caption rendering logic. Three display modes:
multi   → full sentence on one image (1 RenderJob per sentence)
single  → sentence split into word chunks (N RenderJobs per sentence)
aligned → word-level timestamps from WhisperX (perfect sync, any TTS)
RenderJob is the contract between this module and final_video.py.
Two types of timing:
FRACTION-based (multi, single):
audio_idx + time_fraction → final_video computes absolute time
time_fraction = fraction of audio_clips_durations[audio_idx+1]
ABSOLUTE-based (aligned):
clip_start + clip_end → final_video uses directly
These are absolute seconds in the video timeline (after title card)
final_video.py checks job["timing_type"] to know which to use.
"""
import os
from dataclasses import dataclass, field
from typing import List, Optional
from PIL import Image, ImageDraw, ImageFont
from utils.fonts import getsize
# ─────────────────────────────────────────────────────────────────────────────
# RenderJob — the contract
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RenderJob:
    """
    One output caption image (img{idx}.png) together with its timing data.

    ``timing_type`` selects which group of timing fields is meaningful:

    * ``"fraction"``: ``audio_idx`` + ``time_fraction`` are used by final_video
      to compute the display time. ``time_fraction = 1.0`` means the image is
      shown for the full audio file duration, ``0.25`` for 25% of it.
    * ``"absolute"``: ``clip_start`` + ``clip_end`` are absolute seconds in the
      video timeline; final_video uses them directly, no calculation needed.
    """

    idx: int
    lines: List[str]
    timing_type: str  # either "fraction" or "absolute"

    # Meaningful when timing_type == "fraction".
    audio_idx: int = field(default=0)
    time_fraction: float = field(default=1.0)

    # Meaningful when timing_type == "absolute".
    clip_start: float = field(default=0.0)
    clip_end: float = field(default=0.0)
# ─────────────────────────────────────────────────────────────────────────────
# Display modes
# ─────────────────────────────────────────────────────────────────────────────
# Accepted values for a style's "display_mode"; get_render_jobs() replaces any other value with "multi".
DISPLAY_MODES = {"single", "multi", "aligned"}
def render_multi_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Put the whole sentence on a single image, wrapped into lines.

    The sentence is wrapped every ``style["words_per_chunk"]`` words and a
    single fraction-timed RenderJob with ``time_fraction = 1.0`` is returned.
    A sentence with no words yields one line containing the sentence as-is.
    Best for: funny, sad, wholesome, happy.
    """
    tokens = sentence.split()
    per_line = style["words_per_chunk"]

    wrapped = []
    for offset in range(0, len(tokens), per_line):
        wrapped.append(" ".join(tokens[offset:offset + per_line]))
    if not wrapped:
        wrapped = [sentence]

    only_job = RenderJob(
        idx=start_idx,
        lines=wrapped,
        timing_type="fraction",
        audio_idx=audio_idx,
        time_fraction=1.0,
    )
    return [only_job]
def render_single_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Split the sentence into word chunks and emit one image per chunk.

    Chunks hold ``style["words_per_chunk"]`` words each; every chunk gets an
    equal share (1/N) of the audio duration. A sentence with no words yields
    a single chunk containing the sentence as-is.
    Best for: scary, dramatic, angry, mysterious.
    """
    chunk_size = style["words_per_chunk"]
    tokens = sentence.split()

    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        piece = " ".join(tokens[offset:offset + chunk_size])
        if piece.strip():
            pieces.append(piece)
    if not pieces:
        pieces = [sentence]

    share = 1.0 / len(pieces)
    jobs = []
    for position, piece in enumerate(pieces):
        jobs.append(
            RenderJob(
                idx=start_idx + position,
                lines=[piece],
                timing_type="fraction",
                audio_idx=audio_idx,
                time_fraction=share,
            )
        )
    return jobs
def render_aligned_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
    word_timestamps: List[dict],
    audio_start_time: float,
    audio_duration: float,
) -> List[RenderJob]:
    """
    Word-level aligned mode using WhisperX timestamps.

    Groups consecutive words into chunks of ``style["words_per_chunk"]`` words
    and returns one absolute-timed RenderJob per chunk:

    * clip_start = start of the first word in the chunk.
    * clip_end   = start of the next chunk's first word, or for the final
      chunk the last word's "end" (its "start" + 0.3s when "end" is missing).
      The end is capped at the audio boundary, then raised if needed so every
      chunk stays visible for at least 100ms.

    Args:
        sentence: Text of the sentence; only used by the single-mode fallback.
        style: Style entry; must contain "words_per_chunk".
        audio_idx: Index of the audio file this sentence belongs to.
        start_idx: Image index assigned to the first job produced.
        word_timestamps: Dicts with "word" and "start" keys and optional "end".
        audio_start_time: Absolute time in the video when this audio file starts.
        audio_duration: Duration of this audio file (caps the chunk end time).

    Returns:
        The aligned jobs, or the result of render_single_mode() when the
        timestamps are empty or malformed (missing keys, wrong value types).
    """
    wpc = style["words_per_chunk"]
    if not word_timestamps:
        return render_single_mode(sentence, style, audio_idx, start_idx)

    n = len(word_timestamps)
    audio_end = audio_start_time + audio_duration  # loop-invariant boundary
    jobs: List[RenderJob] = []
    try:
        for chunk_start in range(0, n, wpc):
            chunk_words = word_timestamps[chunk_start:chunk_start + wpc]
            text = " ".join(w["word"] for w in chunk_words)
            clip_start = audio_start_time + chunk_words[0]["start"]

            next_start = chunk_start + wpc
            if next_start < n:
                # Hold this chunk on screen until the next one begins.
                clip_end = audio_start_time + word_timestamps[next_start]["start"]
            else:
                last_word = chunk_words[-1]
                last_end = last_word.get("end", last_word["start"] + 0.3)
                clip_end = audio_start_time + last_end

            # Safety: never exceed audio boundary
            clip_end = min(clip_end, audio_end)
            clip_end = max(clip_end, clip_start + 0.1)  # minimum 100ms visibility

            jobs.append(RenderJob(
                idx=start_idx + len(jobs),
                lines=[text],
                timing_type="absolute",
                clip_start=round(clip_start, 3),
                clip_end=round(clip_end, 3),
            ))
    except (KeyError, TypeError, AttributeError):
        # Malformed entry (e.g. a word without a "start" time): discard the
        # partial result and use the evenly-timed fallback for the sentence.
        return render_single_mode(sentence, style, audio_idx, start_idx)

    return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx)
# ─────────────────────────────────────────────────────────────────────────────
# Router
# ─────────────────────────────────────────────────────────────────────────────
def get_render_jobs(
    sentences: List[str],
    style: dict,
    mp3_dir: Optional[str] = None,
    audio_start_times: Optional[List[float]] = None,
    audio_durations: Optional[List[float]] = None,
) -> List[RenderJob]:
    """
    Route each sentence to the correct renderer.

    Returns a flat, ordered list of all RenderJobs; image indices are assigned
    consecutively across sentences.

    For "aligned" mode, word timestamps are loaded through
    utils.whisper_aligner.load_word_timestamps() using the path
    {mp3_dir}/postaudio-{i}.mp3. A sentence falls back to "single" mode when
    its timestamps are missing, when no audio info was supplied, or when the
    audio lists are shorter than the sentence list.

    Parameters
    ----------
    sentences         : one per postaudio-{i}.mp3
    style             : STYLE_MAP entry for current sentiment
    mp3_dir           : path to mp3 folder (needed for aligned mode)
    audio_start_times : absolute start time of each audio in video (needed for aligned)
    audio_durations   : duration of each audio file (needed for aligned)
    """
    mode = style.get("display_mode", "multi")
    if mode not in DISPLAY_MODES:
        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
        mode = "multi"

    all_jobs: List[RenderJob] = []
    for audio_idx, sentence in enumerate(sentences):
        img_counter = len(all_jobs)  # next free image index

        if mode == "aligned":
            word_ts = None
            has_audio_info = bool(
                mp3_dir
                and audio_start_times
                and audio_durations
                and audio_idx < len(audio_start_times)
                and audio_idx < len(audio_durations)
            )
            if has_audio_info:
                # Imported lazily so the aligner is only needed in aligned mode.
                from utils.whisper_aligner import load_word_timestamps

                audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
                word_ts = load_word_timestamps(audio_path)

            if word_ts:
                jobs = render_aligned_mode(
                    sentence=sentence,
                    style=style,
                    audio_idx=audio_idx,
                    start_idx=img_counter,
                    word_timestamps=word_ts,
                    audio_start_time=audio_start_times[audio_idx],
                    audio_duration=audio_durations[audio_idx],
                )
            else:
                # No usable timing for this sentence — fall back to single mode
                print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
                jobs = render_single_mode(sentence, style, audio_idx, img_counter)
        elif mode == "single":
            jobs = render_single_mode(sentence, style, audio_idx, img_counter)
        else:
            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)

        all_jobs.extend(jobs)

    return all_jobs
# ─────────────────────────────────────────────────────────────────────────────
# Drawing primitives
# ─────────────────────────────────────────────────────────────────────────────
def measure_text_block(
    draw: ImageDraw.ImageDraw,
    lines: List[str],
    font: ImageFont.FreeTypeFont,
    line_spacing: int,
) -> tuple:
    """
    Measure a stack of text lines rendered with ``font``.

    Returns ``(width, height)``: the width of the widest line and the sum of
    all line heights plus ``line_spacing`` between consecutive lines. An empty
    ``lines`` list measures ``(0, 0)``. ``draw`` is accepted but not used.
    """
    widest = 0
    block_h = 0
    for position, text in enumerate(lines):
        text_w, text_h = getsize(font, text)
        widest = text_w if text_w > widest else widest
        if position:
            # Spacing goes between lines only, never after the last one.
            block_h += line_spacing
        block_h += text_h
    return widest, block_h
def draw_stroked_text(
    draw: "ImageDraw.ImageDraw",
    x: int,
    y: int,
    line: str,
    font: "ImageFont.FreeTypeFont",
    fill_color: tuple,
    stroke_color: tuple,
    stroke_width: int,
) -> None:
    """
    Draw ``line`` at ``(x, y)`` with an outline built from offset copies.

    The outline is painted first by drawing the text in ``stroke_color`` at 16
    offsets around the anchor (the 8 compass directions at ``stroke_width``
    plus 8 intermediate positions at half that distance), then the text is
    drawn once in ``fill_color`` on top.

    A ``stroke_width`` of 0 or less draws the fill only; previously a width of
    0 still produced a partial 1px outline because the half offset is
    clamped to at least 1.
    """
    sw = stroke_width
    if sw > 0:
        half = max(1, sw // 2)  # intermediate offsets soften the outline corners
        offsets = [
            (-sw, 0), (sw, 0), (0, -sw), (0, sw),
            (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw),
            (-sw, -half), (sw, -half), (-sw, half), (sw, half),
            (-half, -sw), (half, -sw), (-half, sw), (half, sw),
        ]
        for ox, oy in offsets:
            draw.text((x + ox, y + oy), line, font=font, fill=stroke_color)
    # The fill goes last so it sits on top of the outline copies.
    draw.text((x, y), line, font=font, fill=fill_color)
def fit_font(
    style: dict,
    lines: List[str],
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
    max_width_ratio: float = 0.88,
    max_height_ratio: float = 0.45,
) -> ImageFont.FreeTypeFont:
    """
    Pick the largest font size at which ``lines`` fit on the canvas.

    Starts at ``style["font_size"]`` and shrinks in steps of 4 while the text
    block is wider than ``canvas_w * max_width_ratio`` or taller than
    ``canvas_h * max_height_ratio``. Once the size reaches 30 or below, a
    size-30 font is returned without checking whether the text fits.

    The font file is ``fonts/<style["font_file"]>``; when that file does not
    exist, ``fonts/Roboto-Bold.ttf`` is used instead.
    """
    font_size = style["font_size"]
    font_path = os.path.join("fonts", style["font_file"])
    if not os.path.exists(font_path):
        font_path = os.path.join("fonts", "Roboto-Bold.ttf")

    max_w = int(canvas_w * max_width_ratio)
    max_h = int(canvas_h * max_height_ratio)

    # One tiny scratch surface serves every measurement; the draw handle is
    # only passed through to measure_text_block, so a full-size canvas per
    # loop iteration is unnecessary.
    scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1), (0, 0, 0, 0)))

    while font_size > 30:
        font = ImageFont.truetype(font_path, font_size)
        bw, bh = measure_text_block(scratch, lines, font, line_spacing)
        if bw <= max_w and bh <= max_h:
            return font
        font_size -= 4
    return ImageFont.truetype(font_path, 30)
def render_job_to_image(
    job: RenderJob,
    style: dict,
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
) -> Image.Image:
    """
    Render one RenderJob onto a fully transparent RGBA canvas.

    The font comes from fit_font(). The text block is centred vertically on
    ``canvas_h * style["y_position"]`` and each line is centred horizontally,
    drawn with the style's fill colour, stroke colour and stroke width.
    """
    font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)
    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
    top = int(canvas_h * style["y_position"]) - (block_h // 2)

    for text in job.lines:
        text_w, text_h = getsize(font, text)
        left = (canvas_w - text_w) // 2
        draw_stroked_text(
            pen,
            left,
            top,
            text,
            font,
            style["fill_color"],
            style["stroke_color"],
            style["stroke_width"],
        )
        top += text_h + line_spacing

    return canvas

@ -1,74 +1,156 @@
"""
imagenarator.py
Thin orchestrator. Does exactly:
1. Extract sentences from reddit_obj
2. Probe audio durations + compute audio start times (needed for aligned mode)
3. Call caption_renderer.get_render_jobs()
4. Render each job to PNG
5. Save timing_map.json for final_video.py
"""
import glob
import json
import os
import re
import textwrap
from typing import List, Optional
from PIL import Image, ImageDraw, ImageFont
import ffmpeg
from rich.progress import track
from TTS.engine_wrapper import process_text
from utils.fonts import getheight, getsize
from utils import settings
from utils.id import extract_id
from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE
from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob
LINE_SPACING: int = 20
def draw_multiple_line_text(
image, text, font, text_color, padding, wrap=50, transparent=False
) -> None:
def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]:
"""
Draw multiline text over given image
Extract sentences from thread_post.
One sentence per postaudio-{i}.mp3, order preserved.
"""
draw = ImageDraw.Draw(image)
font_height = getheight(font, text)
image_width, image_height = image.size
lines = textwrap.wrap(text, width=wrap)
y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2)
for line in lines:
line_width, line_height = getsize(font, line)
if transparent:
shadowcolor = "black"
for i in range(1, 5):
draw.text(
((image_width - line_width) / 2 - i, y - i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 + i, y - i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 - i, y + i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 + i, y + i),
line,
font=font,
fill=shadowcolor,
)
draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color)
y += line_height + padding
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None:
raw_texts = reddit_obj["thread_post"]
sentences: List[str] = []
for item in raw_texts:
if isinstance(item, dict):
text = item.get("text", "")
elif isinstance(item, str):
text = item
else:
text = str(item)
text = process_text(text, False).strip()
if style.get("uppercase", False):
text = text.upper()
if text:
sentences.append(text)
return sentences if sentences else ["..."]
def _get_audio_info(mp3_dir: str) -> tuple:
"""
Discover postaudio files and compute:
- durations list (one per postaudio file)
- start times list (absolute seconds in video, after title card)
Returns (postaudio_files, durations, start_times)
"""
Render Images for video
postaudio_files = sorted(
glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
title_path = os.path.join(mp3_dir, "title.mp3")
try:
title_duration = float(ffmpeg.probe(title_path)["format"]["duration"])
except Exception:
title_duration = 0.0
durations = []
start_times = []
current = title_duration
for f in postaudio_files:
try:
dur = float(ffmpeg.probe(f)["format"]["duration"])
except Exception:
dur = 0.0
start_times.append(current)
durations.append(dur)
current += dur
return postaudio_files, durations, start_times
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int:
"""
texts = reddit_obj["thread_post"]
reddit_id = extract_id(reddit_obj)
if transparent:
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100)
else:
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100)
size = (1920, 1080)
for idx, text in track(enumerate(texts), "Rendering Image"):
image = Image.new("RGBA", size, theme)
text = process_text(text, False)
draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent)
image.save(f"assets/temp/{reddit_id}/png/img{idx}.png")
Render caption images for the video.
Flow:
sentences + audio info
→ caption_renderer.get_render_jobs()
→ List[RenderJob]
→ each RenderJob rendered to a transparent PNG (img{idx}.png)
→ timing_map.json saved for final_video.py
timing_map.json entry for fraction-based jobs:
{"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
timing_map.json entry for absolute-based jobs (aligned mode):
{"timing_type": "absolute", "clip_start": S, "clip_end": E}
Returns:
int: total number of images generated
"""
# 1. Style
sentiment = settings.config["settings"].get("sentiment", "dramatic")
style = STYLE_MAP.get(sentiment, DEFAULT_STYLE)
CANVAS_W: int = int(settings.config["settings"]["resolution_w"])
CANVAS_H: int = int(settings.config["settings"]["resolution_h"])
reddit_id = extract_id(reddit_obj)
mp3_dir = f"assets/temp/{reddit_id}/mp3"
# 2. Extract sentences
sentences = _extract_sentences(reddit_obj, style)
# 3. Get audio timing info (needed for aligned mode)
_, durations, start_times = _get_audio_info(mp3_dir)
# 4. Get render jobs
jobs: List[RenderJob] = get_render_jobs(
sentences=sentences,
style=style,
mp3_dir=mp3_dir,
audio_start_times=start_times if start_times else None,
audio_durations=durations if durations else None,
)
# 5. Render each job to a transparent PNG
for job in track(jobs, description="Rendering caption images"):
image = render_job_to_image(job, style, CANVAS_W, CANVAS_H, LINE_SPACING)
image.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png")
# 6. Save timing map
timing_map = []
for job in jobs:
if job.timing_type == "absolute":
timing_map.append({
"timing_type": "absolute",
"clip_start": job.clip_start,
"clip_end": job.clip_end,
})
else:
timing_map.append({
"timing_type": "fraction",
"audio_idx": job.audio_idx,
"time_fraction": job.time_fraction,
})
timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
with open(timing_map_path, "w") as f:
json.dump(timing_map, f, indent=2)
return len(jobs)

@ -196,12 +196,16 @@ def apply_sentiment_config(reddit_object: dict) -> None:
sentiment = detect_sentiment(reddit_object)
# ── Background ───────────────────────────────────────────
# ── Sentiment label — stored in memory so imagenarator.py can read it ────
# This is the key that STYLE_MAP lookups depend on at render time.
settings.config["settings"]["sentiment"] = sentiment
# ── Background ───────────────────────────────────────────────────────────
bg_video, bg_audio = BACKGROUND_MAP[sentiment]
settings.config["settings"]["background"]["background_video"] = bg_video
settings.config["settings"]["background"]["background_audio"] = bg_audio
# ── Voice ────────────────────────────────────────────────
# ── Voice ────────────────────────────────────────────────────────────────
voice_choice = settings.config["settings"]["tts"]["voice_choice"].lower()
if voice_choice == "elevenlabs":
@ -213,12 +217,12 @@ def apply_sentiment_config(reddit_object: dict) -> None:
else:
voice = f"(voice override not supported for {voice_choice})"
# ── Metadata ─────────────────────────────────────────────
# ── Metadata ─────────────────────────────────────────────────────────────
print_substep("Generating titles, captions and hashtags... ✍️", style="bold blue")
metadata = generate_metadata(reddit_object, sentiment)
save_metadata(metadata, reddit_object)
# ── Log ──────────────────────────────────────────────────
# ── Log ──────────────────────────────────────────────────────────────────
print_substep(f"Sentiment detected : {sentiment} 🎯", style="bold green")
print_substep(f"Background video : {bg_video}", style="bold blue")
print_substep(f"Background audio : {bg_audio if bg_audio else 'none'}", style="bold blue")

@ -1,16 +1,14 @@
# Maps sentiment → (background_video, background_audio)
BACKGROUND_MAP = {
"sad": ("minecraft", "lofi"), # slow, melancholic
"happy": ("fall-guys", "chill-summer"),# upbeat, fun
"angry": ("gta", "lofi"), # lofi keeps intensity without distraction
"mysterious": ("csgo-surf", "lofi-2"), # lofi-2 is more atmospheric
"funny": ("cluster-truck", "chill-summer"),# light and playful
"dramatic": ("rocket-league", "lofi"), # lofi under dramatic = tension
"wholesome": ("steep", "chill-summer"),# warm and positive
"scary": ("minecraft-2", "lofi-2"), # lofi-2 is darker/moodier
"sad": ("minecraft", "lofi"),
"happy": ("fall-guys", "chill-summer"),
"angry": ("gta", "lofi"),
"mysterious": ("csgo-surf", "lofi-2"),
"funny": ("cluster-truck", "chill-summer"),
"dramatic": ("rocket-league", "lofi"),
"wholesome": ("steep", "chill-summer"),
"scary": ("minecraft-2", "lofi-2"),
}
# Maps sentiment → OpenAI voice name
OPENAI_VOICE_MAP = {
"sad": "nova",
"happy": "shimmer",
@ -22,7 +20,6 @@ OPENAI_VOICE_MAP = {
"scary": "onyx",
}
# Maps sentiment → ElevenLabs voice name
ELEVENLABS_VOICE_MAP = {
"sad": "Brian - Deep, Resonant and Comforting",
"happy": "Jessica - Playful, Bright, Warm",
@ -34,8 +31,128 @@ ELEVENLABS_VOICE_MAP = {
"scary": "Harry - Fierce Warrior",
}
# All valid sentiment labels
VALID_SENTIMENTS = list(BACKGROUND_MAP.keys())
DEFAULT_SENTIMENT = "dramatic"
# Fallback if detection fails — maps to rocket-league + lofi + alloy
DEFAULT_SENTIMENT = "dramatic"
# ─────────────────────────────────────────────────────────────────────────────
# STYLE_MAP
# ─────────────────────────────────────────────────────────────────────────────
#
# display_mode options:
#
# "aligned" → WhisperX word timestamps — perfect sync with any TTS.
# Falls back to "single" per sentence if timestamps unavailable.
# USE THIS for all sentiments once WhisperX is installed.
#
# "single" → Split sentence into word chunks, equal time per chunk.
# Good fallback when WhisperX is not installed.
#
# "multi" → Full sentence on one image. No splitting.
# Best for slow TTS or wholesome/sad content.
#
# words_per_chunk:
# In "aligned" mode: words grouped per visible chunk (3-5 recommended)
# In "single" mode: words per chunk (higher = fewer chunks = slower pace)
# In "multi" mode: words per line in the wrapped text block
#
# Opaque black, the stroke colour shared by most styles.
_BLACK = (0, 0, 0, 255)


def _caption_style(*, font_file, font_size, fill_color, stroke_color,
                   stroke_width, words_per_chunk, uppercase):
    """Build one STYLE_MAP entry.

    Only the values that differ between sentiments are parameters. Every
    style shares ``y_position`` 0.65 and ``display_mode`` "aligned".
    Colours are 4-tuples (presumably RGBA — confirm against the renderer).
    Keys are emitted in a fixed order so iteration over an entry is stable.
    """
    return {
        "font_file": font_file,
        "font_size": font_size,
        "fill_color": fill_color,
        "stroke_color": stroke_color,
        "stroke_width": stroke_width,
        "words_per_chunk": words_per_chunk,
        "y_position": 0.65,
        "uppercase": uppercase,
        "display_mode": "aligned",
    }


# Caption look per sentiment label.
STYLE_MAP = {
    "dramatic": _caption_style(
        font_file="Montserrat-ExtraBold.ttf",
        font_size=95,
        fill_color=(255, 255, 255, 255),
        stroke_color=_BLACK,
        stroke_width=4,
        words_per_chunk=4,
        uppercase=False,
    ),
    "scary": _caption_style(
        font_file="Oswald-Bold.ttf",
        font_size=95,
        fill_color=(232, 244, 248, 255),
        stroke_color=_BLACK,
        stroke_width=5,
        words_per_chunk=3,
        uppercase=False,
    ),
    "angry": _caption_style(
        font_file="Anton-Regular.ttf",
        font_size=105,
        fill_color=(255, 69, 0, 255),
        stroke_color=_BLACK,
        stroke_width=5,
        words_per_chunk=3,
        uppercase=True,  # the only style rendered in capitals
    ),
    "mysterious": _caption_style(
        font_file="Raleway-Bold.ttf",
        font_size=90,
        fill_color=(184, 212, 232, 255),
        stroke_color=_BLACK,
        stroke_width=4,
        words_per_chunk=3,
        uppercase=False,
    ),
    "funny": _caption_style(
        font_file="Nunito-ExtraBold.ttf",
        font_size=90,
        fill_color=(255, 230, 0, 255),
        stroke_color=_BLACK,
        stroke_width=4,
        words_per_chunk=5,
        uppercase=False,
    ),
    "sad": _caption_style(
        font_file="Lato-Bold.ttf",
        font_size=88,
        fill_color=(220, 225, 255, 255),
        stroke_color=(10, 10, 46, 255),
        stroke_width=3,
        words_per_chunk=5,
        uppercase=False,
    ),
    "wholesome": _caption_style(
        font_file="Nunito-ExtraBold.ttf",
        font_size=88,
        fill_color=(255, 248, 231, 255),
        stroke_color=(26, 10, 0, 255),
        stroke_width=3,
        words_per_chunk=5,
        uppercase=False,
    ),
    "happy": _caption_style(
        font_file="Nunito-ExtraBold.ttf",
        font_size=90,
        fill_color=(255, 230, 0, 255),
        stroke_color=_BLACK,
        stroke_width=4,
        words_per_chunk=5,
        uppercase=False,
    ),
}

# Style used when a sentiment has no entry; the same object as the
# "dramatic" entry, not a copy.
DEFAULT_STYLE = STYLE_MAP["dramatic"]

@ -0,0 +1,168 @@
"""
whisper_aligner.py
Word-level timestamp extraction using WhisperX.
This module runs after each TTS audio file is saved.
It produces a word-level timestamp JSON for every postaudio-{i}.mp3.
Output format (postaudio-{i}_words.json):
[
{"word": "I", "start": 0.00, "end": 0.18},
{"word": "told", "start": 0.18, "end": 0.42},
...
]
WhisperX is used because:
- Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.)
- Free, runs locally, no API cost
- Word-level accuracy (not sentence-level)
- Fast on CPU for short audio clips
If WhisperX is not installed or fails for any reason,
this module returns None and the system falls back to
time_fraction-based sync (single/multi mode).
No crashes, no interruptions.
"""
import json
import os
from typing import List, Optional
from utils.console import print_substep
# ── Module-level cache for the WhisperX transcription model ──────────────────
# Loading the model is slow, so one instance is kept and reused for every
# audio file that asks for the same language.
_whisper_model = None
_whisper_model_lang = None


def _get_model(language: str = "en"):
    """
    Return the cached WhisperX model for ``language``, loading it on demand.

    The cached model is returned when one exists for the same language.
    Otherwise the "base" model is loaded on CPU with int8 compute, stored
    in the module-level cache together with its language, and returned.

    Returns None when ``whisperx`` cannot be imported (silently) or when
    loading raises any other exception (after printing a warning). A failed
    load leaves the previously cached model, if any, untouched.
    """
    global _whisper_model, _whisper_model_lang

    needs_load = _whisper_model is None or _whisper_model_lang != language
    if not needs_load:
        return _whisper_model

    try:
        import whisperx

        print_substep("Loading WhisperX model (first run only)...", style="bold blue")
        loaded = whisperx.load_model(
            "base",  # small enough for CPU, accurate enough for TTS
            device="cpu",
            compute_type="int8",
            language=language,
        )
    except ImportError:
        # WhisperX is optional: its absence is not reported here.
        return None
    except Exception as e:
        print_substep(f"WhisperX model load failed: {e}", style="yellow")
        return None

    # Update the cache only after a successful load.
    _whisper_model = loaded
    _whisper_model_lang = language
    return loaded
# Alignment models keyed by language code. ``whisperx.load_align_model`` was
# previously called for every clip; caching its (model, metadata) result here
# mirrors the caching already done for the transcription model.
_align_model_cache = {}


def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]:
    """
    Run WhisperX on a single audio file and return word-level timestamps.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to align.
    language : str
        Language code (default: "en"), used for both the transcription
        model and the alignment model.

    Returns
    -------
    Optional[List[dict]]
        List of {"word": str, "start": float, "end": float} dicts with
        times rounded to 3 decimals. Returns None if the transcription
        model is unavailable, if no usable word was produced, or if any
        step raises (a warning is printed in that last case).
    """
    try:
        import whisperx

        model = _get_model(language)
        if model is None:
            return None

        # Transcribe first; the resulting segments are then force-aligned.
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=4)

        # Load the alignment model once per language and reuse it afterwards.
        if language not in _align_model_cache:
            _align_model_cache[language] = whisperx.load_align_model(
                language_code=language,
                device="cpu",
            )
        align_model, metadata = _align_model_cache[language]

        aligned = whisperx.align(
            result["segments"],
            align_model,
            metadata,
            audio,
            device="cpu",
            return_char_alignments=False,
        )

        # Flatten the aligned words, dropping entries that have no text or
        # are missing a start/end time.
        words = []
        for segment in aligned.get("word_segments", []):
            word = segment.get("word", "").strip()
            start = segment.get("start")
            end = segment.get("end")
            if word and start is not None and end is not None:
                words.append({
                    "word": word,
                    "start": round(float(start), 3),
                    "end": round(float(end), 3),
                })
        return words if words else None
    except Exception as e:
        # Best-effort by design: the caller falls back to fraction-based
        # timing when None is returned.
        print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow")
        return None
def align_and_save(audio_path: str, language: str = "en") -> Optional[str]:
    """
    Align audio and save word timestamps as a JSON file next to the audio.

    The output path is the audio path with its extension removed and
    "_words.json" appended (e.g. ".../postaudio-0.mp3" becomes
    ".../postaudio-0_words.json"). Only the final extension is replaced,
    so a non-".mp3" input can never resolve to the audio file itself and
    ".mp3" inside a directory name is left alone.

    Parameters
    ----------
    audio_path : str
        e.g. "assets/temp/abc123/mp3/postaudio-0.mp3"
    language : str
        Language code.

    Returns
    -------
    Optional[str]
        Path to saved JSON file, or None if alignment failed or the file
        could not be written (a warning is printed in the latter case).
    """
    words = align_audio(audio_path, language)
    if words is None:
        return None

    stem, _ext = os.path.splitext(audio_path)
    json_path = f"{stem}_words.json"

    try:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(words, f, indent=2, ensure_ascii=False)
    except OSError as e:
        # Timestamps are optional: report the failure and let the caller
        # fall back instead of aborting.
        print_substep(f"Could not save word timestamps to {json_path}: {e}", style="yellow")
        return None
    return json_path
def load_word_timestamps(audio_path: str) -> Optional[List[dict]]:
    """
    Load previously saved word timestamps for an audio file.

    The JSON path is the audio path with its extension removed and
    "_words.json" appended. Only the final extension is replaced, so a
    path that does not end in ".mp3" is never mistaken for its own
    timestamp file.

    Returns the parsed JSON content, or None if the file doesn't exist,
    cannot be read, or does not contain valid JSON.
    """
    stem, _ext = os.path.splitext(audio_path)
    json_path = f"{stem}_words.json"

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # invalid JSON and undecodable bytes. Timestamps are optional, so
        # all of these are treated the same as "no timestamps available".
        return None

@ -5,9 +5,11 @@ import tempfile
import textwrap
import threading
import time
from os.path import exists # Needs to be imported specifically
from os.path import exists
from pathlib import Path
from typing import Dict, Final, Tuple
import glob
import json
import ffmpeg
import translators
@ -44,7 +46,6 @@ class ProgressFfmpeg(threading.Thread):
def get_latest_ms_progress(self):
lines = self.output_file.readlines()
if lines:
for line in lines:
if "out_time_ms" in line:
@ -52,7 +53,6 @@ class ProgressFfmpeg(threading.Thread):
if out_time_ms_str.isnumeric():
return float(out_time_ms_str) / 1000000.0
else:
# Handle the case when "N/A" is encountered
return None
return None
@ -74,7 +74,6 @@ def name_normalize(name: str) -> str:
name = re.sub(r"(\d+)\s?\/\s?(\d+)", r"\1 of \2", name)
name = re.sub(r"(\w+)\s?\/\s?(\w+)", r"\1 or \2", name)
name = re.sub(r"\/", r"", name)
lang = settings.config["reddit"]["thread"]["post_lang"]
if lang:
print_substep("Translating filename...")
@ -119,51 +118,38 @@ def get_text_height(draw, text, font, max_width):
def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
"""
It will take the 1px from the middle of the template and will be resized (stretched) vertically to accommodate the extra height needed for the title.
"""
print_step(f"Creating fancy thumbnail for: {text}")
font_title_size = 47
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), font_title_size)
image_width, image_height = image.size
# Calculate text height to determine new image height
draw = ImageDraw.Draw(image)
text_height = get_text_height(draw, text, font, wrap)
lines = textwrap.wrap(text, width=wrap)
# This is -50 to reduce the empty space at the bottom of the image,
# change it as per your requirement if needed otherwise leave it.
new_image_height = image_height + text_height + padding * (len(lines) - 1) - 50
# Separate the image into top, middle (1px), and bottom parts
top_part_height = image_height // 2
middle_part_height = 1 # 1px height middle section
top_part_height = image_height // 2
middle_part_height = 1
bottom_part_height = image_height - top_part_height - middle_part_height
top_part = image.crop((0, 0, image_width, top_part_height))
top_part = image.crop((0, 0, image_width, top_part_height))
middle_part = image.crop((0, top_part_height, image_width, top_part_height + middle_part_height))
bottom_part = image.crop((0, top_part_height + middle_part_height, image_width, image_height))
# Stretch the middle part
new_middle_height = new_image_height - top_part_height - bottom_part_height
new_middle_height = max(1, new_image_height - top_part_height - bottom_part_height)
middle_part = middle_part.resize((image_width, new_middle_height))
# Create new image with the calculated height
new_image = Image.new("RGBA", (image_width, new_image_height))
# Paste the top, stretched middle, and bottom parts into the new image
new_image.paste(top_part, (0, 0))
new_image.paste(top_part, (0, 0))
new_image.paste(middle_part, (0, top_part_height))
new_image.paste(bottom_part, (0, top_part_height + new_middle_height))
# Draw the title text on the new image
draw = ImageDraw.Draw(new_image)
y = top_part_height + padding
for line in lines:
draw.text((120, y), line, font=font, fill=text_color, align="left")
y += get_text_height(draw, line, font, wrap) + padding
# Draw the username "PlotPulse" at the specific position
username_font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 30)
draw.text(
(205, 825),
@ -172,28 +158,44 @@ def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
fill=text_color,
align="left",
)
return new_image
def merge_background_audio(audio: ffmpeg, reddit_id: str):
    """Mix the TTS audio with the run's background track, if enabled.

    Args:
        audio (ffmpeg): The final TTS audio stream, without background.
        reddit_id (str): Identifier used to locate
            assets/temp/{reddit_id}/background.mp3.

    Returns:
        ``audio`` unchanged when the configured background volume is 0;
        otherwise an "amix" of ``audio`` and the volume-adjusted background
        track, lasting as long as the longer of the two inputs.
    """
    volume = settings.config["settings"]["background"]["background_audio_volume"]
    if volume != 0:
        # Scale the background track to the configured volume, then mix.
        background = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter(
            "volume", volume
        )
        return ffmpeg.filter([audio, background], "amix", duration="longest")
    return audio
def _load_timing_map(reddit_id: str, img_files: list, postaudio_files: list,
                     audio_clips_durations: list, title_duration: float) -> list:
    """
    Load assets/temp/{reddit_id}/timing_map.json and return its content.

    Each entry is expected to be one of:
      {"timing_type": "absolute", "clip_start": S, "clip_end": E}
          used directly as FFmpeg enable times
      {"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
          clip time computed by the caller from the start and duration
          of audio file N

    If the file is missing, a warning is printed and a 1:1 fallback is
    returned: image i is paired with audio file i for its full duration.
    The fallback is limited to the number of audio files, so it never
    yields an ``audio_idx`` with no matching audio clip when there are more
    images than clips.

    ``audio_clips_durations`` and ``title_duration`` are accepted but not
    used by this function.
    """
    timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(timing_map_path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    # Fallback: 1:1
    print_substep("timing_map.json not found — using 1:1 fallback", style="yellow")
    paired = min(len(img_files), len(postaudio_files))
    return [
        {"timing_type": "fraction", "audio_idx": i, "time_fraction": 1.0}
        for i in range(paired)
    ]
def make_final_video(
@ -202,20 +204,10 @@ def make_final_video(
reddit_obj: dict,
background_config: Dict[str, Tuple],
):
"""Gathers audio clips, gathers all screenshots, stitches them together and saves the final video to assets/temp
Args:
number_of_clips (int): Index to end at when going through the screenshots'
length (int): Length of the video
reddit_obj (dict): The reddit object that contains the posts to read.
background_config (Tuple[str, str, str, Any]): The background config to use.
"""
# settings values
W: Final[int] = int(settings.config["settings"]["resolution_w"])
H: Final[int] = int(settings.config["settings"]["resolution_h"])
opacity = settings.config["settings"]["opacity"]
reddit_id = extract_id(reddit_obj)
opacity = settings.config["settings"]["opacity"]
reddit_id = extract_id(reddit_obj)
allowOnlyTTSFolder: bool = (
settings.config["settings"]["background"]["enable_extra_audio"]
@ -223,33 +215,31 @@ def make_final_video(
)
print_step("Creating the final video 🎥")
background_clip = ffmpeg.input(prepare_background(reddit_id, W=W, H=H))
# Gather all audio clips
# ── Audio clips ───────────────────────────────────────────────────────────
audio_clips = list()
if number_of_clips == 0 and settings.config["settings"]["storymode"] == "false":
print(
"No audio clips to gather. Please use a different TTS or post."
) # This is to fix the TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
print("No audio clips to gather.")
exit()
if settings.config["settings"]["storymode"]:
if settings.config["settings"]["storymodemethod"] == 0:
audio_clips = [ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")]
audio_clips.insert(1, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio.mp3"))
elif settings.config["settings"]["storymodemethod"] == 1:
audio_clips = [
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")
for i in track(range(number_of_clips + 1), "Collecting the audio files...")
]
postaudio_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
audio_clips = [ffmpeg.input(f) for f in postaudio_files]
audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
else:
audio_clips = [
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3") for i in range(number_of_clips)
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3")
for i in range(number_of_clips)
]
audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
audio_clips_durations = [
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/{i}.mp3")["format"]["duration"])
for i in range(number_of_clips)
@ -258,6 +248,7 @@ def make_final_video(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
audio_concat = ffmpeg.concat(*audio_clips, a=1, v=0)
ffmpeg.output(
audio_concat, f"assets/temp/{reddit_id}/audio.mp3", **{"b:a": "192k"}
@ -266,27 +257,16 @@ def make_final_video(
console.log(f"[bold green] Video Will Be: {length} Seconds Long")
screenshot_width = int((W * 45) // 100)
audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
final_audio = merge_background_audio(audio, reddit_id)
image_clips = list()
Path(f"assets/temp/{reddit_id}/png").mkdir(parents=True, exist_ok=True)
# Credits to tim (beingbored)
# get the title_template image and draw a text in the middle part of it with the title of the thread
# ── Title card ────────────────────────────────────────────────────────────
title_template = Image.open("assets/title_template.png")
title = reddit_obj["thread_title"]
title = name_normalize(title)
font_color = "#000000"
padding = 5
# create_fancy_thumbnail(image, text, text_color, padding
title_img = create_fancy_thumbnail(title_template, title, font_color, padding)
title = name_normalize(reddit_obj["thread_title"])
title_img = create_fancy_thumbnail(title_template, title, "#000000", 5)
title_img.save(f"assets/temp/{reddit_id}/png/title.png")
image_clips.insert(
0,
@ -296,18 +276,17 @@ def make_final_video(
)
current_time = 0
if settings.config["settings"]["storymode"]:
audio_clips_durations = [
float(
ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")["format"]["duration"]
)
for i in range(number_of_clips)
]
audio_clips_durations.insert(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
if settings.config["settings"]["storymodemethod"] == 0:
audio_clips_durations = [
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio.mp3")["format"]["duration"])
]
audio_clips_durations.insert(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
image_clips.insert(
1,
ffmpeg.input(f"assets/temp/{reddit_id}/png/story_content.png").filter(
@ -321,20 +300,97 @@ def make_final_video(
y="(main_h-overlay_h)/2",
)
current_time += audio_clips_durations[0]
elif settings.config["settings"]["storymodemethod"] == 1:
for i in track(range(0, number_of_clips + 1), "Collecting the image files..."):
image_clips.append(
ffmpeg.input(f"assets/temp/{reddit_id}/png/img{i}.png")["v"].filter(
"scale", screenshot_width, -1
)
# ── Discover postaudio files ──────────────────────────────────────
postaudio_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
# ── Build durations ───────────────────────────────────────────────
# audio_clips_durations[0] = title
# audio_clips_durations[1+i] = postaudio-{i}
audio_clips_durations = [
float(ffmpeg.probe(f)["format"]["duration"])
for f in postaudio_files
]
title_duration = float(
ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]
)
audio_clips_durations.insert(0, title_duration)
# ── Pre-compute absolute start time per audio file ────────────────
# audio_start_times[i] = when postaudio-{i} starts in the video
audio_start_times = []
t = title_duration
for dur in audio_clips_durations[1:]:
audio_start_times.append(t)
t += dur
# ── Title card overlay ────────────────────────────────────────────
background_clip = background_clip.overlay(
image_clips[0],
enable=f"between(t,0,{title_duration})",
x="(main_w-overlay_w)/2",
y="(main_h-overlay_h)/2",
)
current_time = title_duration
# ── Load image files ──────────────────────────────────────────────
img_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/png/img*.png"),
key=lambda x: int(re.search(r"img(\d+)", x).group(1))
)
# ── Load timing map ───────────────────────────────────────────────
timing_map = _load_timing_map(
reddit_id, img_files, postaudio_files,
audio_clips_durations, title_duration
)
# ── Overlay each image ────────────────────────────────────────────
# Handles both absolute and fraction timing types cleanly.
# For fraction: track time_consumed per audio_idx
audio_time_used = {}
for i, img_file in enumerate(img_files):
if i >= len(timing_map):
break
entry = timing_map[i]
timing_type = entry.get("timing_type", "fraction")
if timing_type == "absolute":
# WhisperX aligned — use timestamps directly
clip_start = entry["clip_start"]
clip_end = entry["clip_end"]
else:
# Fraction-based — compute from audio duration
audio_idx = entry["audio_idx"]
time_fraction = entry["time_fraction"]
audio_dur = audio_clips_durations[audio_idx + 1]
display_dur = audio_dur * time_fraction
offset = audio_time_used.get(audio_idx, 0.0)
clip_start = audio_start_times[audio_idx] + offset
clip_end = clip_start + display_dur
audio_time_used[audio_idx] = offset + display_dur
img_clip = ffmpeg.input(img_file)["v"].filter(
"scale", screenshot_width, -1
)
image_clips.append(img_clip)
background_clip = background_clip.overlay(
image_clips[i],
enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
img_clip,
enable=f"between(t,{clip_start:.3f},{clip_end:.3f})",
x="(main_w-overlay_w)/2",
y="(main_h-overlay_h)/2",
)
current_time += audio_clips_durations[i]
current_time = t
else:
for i in range(0, number_of_clips + 1):
image_clips.append(
@ -343,9 +399,7 @@ def make_final_video(
)
)
image_overlay = image_clips[i].filter("colorchannelmixer", aa=opacity)
assert (
audio_clips_durations is not None
), "Please make a GitHub issue if you see this. Ping @JasonLovesDoggo on GitHub."
assert audio_clips_durations is not None
background_clip = background_clip.overlay(
image_overlay,
enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
@ -354,67 +408,48 @@ def make_final_video(
)
current_time += audio_clips_durations[i]
title = extract_id(reddit_obj, "thread_title")
idx = extract_id(reddit_obj)
title_thumb = reddit_obj["thread_title"]
filename = f"{name_normalize(title)[:100]}"
subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"])
sentiment = settings.config["settings"]["background"].get("background_video", "unknown")
# Per-video folder: results/{subreddit}/{thread_id}_{sentiment}/
# ── Output ────────────────────────────────────────────────────────────────
title_str = extract_id(reddit_obj, "thread_title")
idx = extract_id(reddit_obj)
title_thumb = reddit_obj["thread_title"]
subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"])
sentiment = settings.config["settings"]["background"].get("background_video", "unknown")
video_folder = f"./results/{subreddit}/{idx}_{sentiment}"
os.makedirs(video_folder, exist_ok=True)
if allowOnlyTTSFolder:
os.makedirs(f"{video_folder}/OnlyTTS", exist_ok=True)
# create a thumbnail for the video
settingsbackground = settings.config["settings"]["background"]
if settingsbackground["background_thumbnail"]:
if not exists(f"{video_folder}"):
os.makedirs(f"{video_folder}", exist_ok=True)
# get the first file with the .png extension from assets/backgrounds and use it as a background for the thumbnail
first_image = next(
(file for file in os.listdir("assets/backgrounds") if file.endswith(".png")),
None,
(f for f in os.listdir("assets/backgrounds") if f.endswith(".png")), None
)
if first_image is None:
print_substep("No png files found in assets/backgrounds", "red")
else:
font_family = settingsbackground["background_thumbnail_font_family"]
font_size = settingsbackground["background_thumbnail_font_size"]
font_color = settingsbackground["background_thumbnail_font_color"]
thumbnail = Image.open(f"assets/backgrounds/{first_image}")
width, height = thumbnail.size
w, h = thumbnail.size
thumbnailSave = create_thumbnail(
thumbnail,
font_family,
font_size,
font_color,
width,
height,
title_thumb,
settingsbackground["background_thumbnail_font_family"],
settingsbackground["background_thumbnail_font_size"],
settingsbackground["background_thumbnail_font_color"],
w, h, title_thumb,
)
thumbnailSave.save(f"{video_folder}/thumbnail.png")
print_substep(f"Thumbnail - Building Thumbnail in assets/temp/{reddit_id}/thumbnail.png")
text = f"Background by {background_config['video'][2]}"
background_clip = ffmpeg.drawtext(
background_clip,
text=text,
x=f"(w-text_w)",
y=f"(h-text_h)",
fontsize=5,
fontcolor="White",
text=f"Background by {background_config['video'][2]}",
x="(w-text_w)", y="(h-text_h)",
fontsize=5, fontcolor="White",
fontfile=os.path.join("fonts", "Roboto-Regular.ttf"),
)
background_clip = background_clip.filter("scale", W, H)
print_step("Rendering the video 🎥")
from tqdm import tqdm
pbar = tqdm(total=100, desc="Progress: ", bar_format="{l_bar}{bar}", unit=" %")
def on_update_example(progress) -> None:
@ -422,14 +457,11 @@ def make_final_video(
old_percentage = pbar.n
pbar.update(status - old_percentage)
defaultPath = video_folder
with ProgressFfmpeg(length, on_update_example) as progress:
path = f"{video_folder}/video.mp4"
try:
ffmpeg.output(
background_clip,
final_audio,
path,
background_clip, final_audio, path,
f="mp4",
**{
"c:v": "h264_nvenc",
@ -438,26 +470,23 @@ def make_final_video(
"threads": multiprocessing.cpu_count(),
},
).overwrite_output().global_args("-progress", progress.output_file.name).run(
quiet=True,
overwrite_output=True,
capture_stdout=False,
capture_stderr=False,
quiet=True, overwrite_output=True,
capture_stdout=False, capture_stderr=False,
)
except ffmpeg.Error as e:
print(e.stderr.decode("utf8"))
exit(1)
old_percentage = pbar.n
pbar.update(100 - old_percentage)
if allowOnlyTTSFolder:
path = f"{video_folder}/OnlyTTS/video.mp4"
# Prevent a error by limiting the path length, do not change this.
print_step("Rendering the Only TTS Video 🎥")
with ProgressFfmpeg(length, on_update_example) as progress:
try:
ffmpeg.output(
background_clip,
audio,
path,
background_clip, audio, path,
f="mp4",
**{
"c:v": "h264_nvenc",
@ -466,20 +495,18 @@ def make_final_video(
"threads": multiprocessing.cpu_count(),
},
).overwrite_output().global_args("-progress", progress.output_file.name).run(
quiet=True,
overwrite_output=True,
capture_stdout=False,
capture_stderr=False,
quiet=True, overwrite_output=True,
capture_stdout=False, capture_stderr=False,
)
except ffmpeg.Error as e:
print(e.stderr.decode("utf8"))
exit(1)
old_percentage = pbar.n
pbar.update(100 - old_percentage)
pbar.close()
save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title, idx, background_config["video"][2])
save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title_str, idx, background_config["video"][2])
print_step("Removing temporary files 🗑")
cleanups = cleanup(reddit_id)
print_substep(f"Removed {cleanups} temporary files 🗑")
print_step("Done! 🎉 The video is in the results folder 📁")
print_step("Done! 🎉 The video is in the results folder 📁")

@ -62,10 +62,10 @@ def get_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int):
if storymode and settings.config["settings"]["storymodemethod"] == 1:
print_substep("Generating images...")
return imagemaker(
theme=bgcolor,
theme=(0, 0, 0, 0),
reddit_obj=reddit_object,
txtclr=txtcolor,
transparent=transparent,
txtclr=(255, 255, 255),
transparent=True,
)
screenshot_num: int

Loading…
Cancel
Save