feat: pro caption system with WhisperX word-level alignment

Core changes:
- utils/caption_renderer.py: new single-responsibility rendering engine
  - Three display modes: aligned, single, multi
  - 8-direction stroke technique for clean text outlines
  - Transparent PNG overlays (no more solid box)
- utils/whisper_aligner.py: WhisperX forced alignment module
  - Word-level timestamps from any TTS audio
  - Graceful fallback to single mode if unavailable
- utils/imagenarator.py: refactored as thin orchestrator
  - Delegates to caption_renderer
  - Saves timing_map.json for final_video sync
- utils/sentiment_map.py: added STYLE_MAP with display_mode per sentiment
- utils/sentiment.py: stores sentiment in settings for downstream use
- TTS/engine_wrapper.py: runs WhisperX after each TTS save
- video_creation/final_video.py: reads timing_map, handles absolute + fraction timing
- video_creation/screenshot_downloader.py: clean imagemaker call

Assets:
- fonts/: added Montserrat, Nunito, Oswald, Raleway, Lato, Anton font families

Dependencies:
- requirements.txt: updated with all current dependencies
2 months ago · 076b65f04c
parent af0940045c
commit 076b65f04c
85 changed files with 1198 additions and 279 deletions
--- a/TTS/engine_wrapper.py
+++ b/TTS/engine_wrapper.py
@ -14,23 +14,11 @@ from utils import settings
 from utils.console import print_step, print_substep
 from utils.voice import sanitize_text

-DEFAULT_MAX_LENGTH: int = (
-    50  # Video length variable, edit this on your own risk. It should work, but it's not supported
-)
+DEFAULT_MAX_LENGTH: int = 50


 class TTSEngine:
-    """Calls the given TTS engine to reduce code duplication and allow multiple TTS engines.
-
-    Args:
-        tts_module            : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method.
-        reddit_object         : The reddit object that contains the posts to read.
-        path (Optional)       : The unix style path to save the mp3 files to. This must not have leading or trailing slashes.
-        max_length (Optional) : The maximum length of the mp3 files in total.
-
-    Notes:
-        tts_module must take the arguments text and filepath.
-    """
+    """Calls the given TTS engine to reduce code duplication and allow multiple TTS engines."""

    def __init__(
        self,
@ -42,18 +30,14 @@ class TTSEngine:
    ):
        self.tts_module = tts_module()
        self.reddit_object = reddit_object
-
        self.redditid = re.sub(r"[^\w\s-]", "", reddit_object["thread_id"])
        self.path = path + self.redditid + "/mp3"
        self.max_length = max_length
        self.length = 0
        self.last_clip_length = last_clip_length

-    def add_periods(
-        self,
-    ):  # adds periods to the end of paragraphs (where people often forget to put them) so tts doesn't blend sentences
+    def add_periods(self):
        for comment in self.reddit_object["comments"]:
-            # remove links
            regex_urls = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
            comment["comment_body"] = re.sub(regex_urls, " ", comment["comment_body"])
            comment["comment_body"] = comment["comment_body"].replace("\n", ". ")
@ -72,7 +56,6 @@ class TTSEngine:

        self.add_periods()
        self.call_tts("title", process_text(self.reddit_object["thread_title"]))
-        # processed_text = ##self.reddit_object["thread_post"] != ""
        idx = 0

        if settings.config["settings"]["storymode"]:
@ -84,24 +67,41 @@ class TTSEngine:
            elif settings.config["settings"]["storymodemethod"] == 1:
                for idx, text in track(enumerate(self.reddit_object["thread_post"])):
                    self.call_tts(f"postaudio-{idx}", process_text(text))
-
+                    # ── WhisperX alignment ────────────────────────────────────
+                    # Run immediately after each TTS save so word timestamps
+                    # are ready when imagemaker() runs later.
+                    # Fails silently — never blocks video generation.
+                    self._align_audio(f"postaudio-{idx}")
        else:
            for idx, comment in track(enumerate(self.reddit_object["comments"]), "Saving..."):
-                # ! Stop creating mp3 files if the length is greater than max length.
                if self.length > self.max_length and idx > 1:
                    self.length -= self.last_clip_length
                    idx -= 1
                    break
-                if (
-                    len(comment["comment_body"]) > self.tts_module.max_chars
-                ):  # Split the comment if it is too long
-                    self.split_post(comment["comment_body"], idx)  # Split the comment
-                else:  # If the comment is not too long, just call the tts engine
+                if len(comment["comment_body"]) > self.tts_module.max_chars:
+                    self.split_post(comment["comment_body"], idx)
+                else:
                    self.call_tts(f"{idx}", process_text(comment["comment_body"]))

        print_substep("Saved Text to MP3 files successfully.", style="bold green")
        return self.length, idx

+    def _align_audio(self, filename: str) -> None:
+        """
+        Run WhisperX on a saved audio file to produce word-level timestamps.
+
+        Called right after ``{self.path}/{filename}.mp3`` has been written.
+        The language passed to the aligner is the configured ``post_lang``,
+        defaulting to "en" when it is missing or empty.
+
+        Alignment is strictly best effort: any failure (WhisperX not
+        installed, model errors, unreadable audio, ...) is reported as a dim
+        console note and otherwise ignored, so video generation is never
+        blocked by it.
+
+        Args:
+            filename: Audio file name without directory or ".mp3" extension.
+        """
+        try:
+            # Imported lazily so a missing/broken WhisperX install only
+            # disables alignment instead of breaking the whole TTS module.
+            from utils.whisper_aligner import align_and_save
+
+            audio_path = f"{self.path}/{filename}.mp3"
+            lang = settings.config["reddit"]["thread"].get("post_lang", "en") or "en"
+            result = align_and_save(audio_path, language=lang)
+            if result:
+                print_substep(f"Word timestamps saved → {result}", style="dim")
+        except Exception as err:
+            # Deliberately broad: alignment must never crash the pipeline.
+            # Only the exception type is printed so that arbitrary message
+            # text cannot itself break console rendering.
+            print_substep(
+                f"Word alignment skipped for {filename} ({type(err).__name__})",
+                style="dim",
+            )
+
    def split_post(self, text: str, idx):
        split_files = []
        split_text = [
@ -114,8 +114,6 @@ class TTSEngine:

        for idy, text_cut in enumerate(split_text):
            newtext = process_text(text_cut)
-            # print(f"{idx}-{idy}: {newtext}\n")
-
            if not newtext or newtext.isspace():
                print("newtext was blank because sanitized split text resulted in none")
                continue
@ -144,7 +142,6 @@ class TTSEngine:

    def call_tts(self, filename: str, text: str):
        if settings.config["settings"]["tts"]["voice_choice"] == "googletranslate":
-            # GTTS does not have the argument 'random_voice'
            self.tts_module.run(
                text,
                filepath=f"{self.path}/{filename}.mp3",
@ -155,10 +152,6 @@ class TTSEngine:
                filepath=f"{self.path}/{filename}.mp3",
                random_voice=settings.config["settings"]["tts"]["random_voice"],
            )
-        # try:
-        #     self.length += MP3(f"{self.path}/{filename}.mp3").info.length
-        # except (MutagenError, HeaderNotFoundError):
-        #     self.length += sox.file_info.duration(f"{self.path}/{filename}.mp3")
        try:
            clip = AudioFileClip(f"{self.path}/{filename}.mp3")
            self.last_clip_length = clip.duration
@ -185,4 +178,4 @@ def process_text(text: str, clean: bool = True):
        print_substep("Translating Text...")
        translated_text = translators.translate_text(text, translator="google", to_language=lang)
        new_text = sanitize_text(translated_text)
-    return new_text
+    return new_text
--- a/fonts/Anton-Regular.ttf
+++ b/fonts/Anton-Regular.ttf
--- a/fonts/Lato-Black.ttf
+++ b/fonts/Lato-Black.ttf
--- a/fonts/Lato-BlackItalic.ttf
+++ b/fonts/Lato-BlackItalic.ttf
--- a/fonts/Lato-Bold.ttf
+++ b/fonts/Lato-Bold.ttf
--- a/fonts/Lato-BoldItalic.ttf
+++ b/fonts/Lato-BoldItalic.ttf
--- a/fonts/Lato-Italic.ttf
+++ b/fonts/Lato-Italic.ttf
--- a/fonts/Lato-Light.ttf
+++ b/fonts/Lato-Light.ttf
--- a/fonts/Lato-LightItalic.ttf
+++ b/fonts/Lato-LightItalic.ttf
--- a/fonts/Lato-Regular.ttf
+++ b/fonts/Lato-Regular.ttf
--- a/fonts/Lato-Thin.ttf
+++ b/fonts/Lato-Thin.ttf
--- a/fonts/Lato-ThinItalic.ttf
+++ b/fonts/Lato-ThinItalic.ttf
--- a/fonts/Montserrat-Black.ttf
+++ b/fonts/Montserrat-Black.ttf
--- a/fonts/Montserrat-BlackItalic.ttf
+++ b/fonts/Montserrat-BlackItalic.ttf
--- a/fonts/Montserrat-Bold.ttf
+++ b/fonts/Montserrat-Bold.ttf
--- a/fonts/Montserrat-BoldItalic.ttf
+++ b/fonts/Montserrat-BoldItalic.ttf
--- a/fonts/Montserrat-ExtraBold.ttf
+++ b/fonts/Montserrat-ExtraBold.ttf
--- a/fonts/Montserrat-ExtraBoldItalic.ttf
+++ b/fonts/Montserrat-ExtraBoldItalic.ttf
--- a/fonts/Montserrat-ExtraLight.ttf
+++ b/fonts/Montserrat-ExtraLight.ttf
--- a/fonts/Montserrat-ExtraLightItalic.ttf
+++ b/fonts/Montserrat-ExtraLightItalic.ttf
--- a/fonts/Montserrat-Italic-VariableFont_wght.ttf
+++ b/fonts/Montserrat-Italic-VariableFont_wght.ttf
--- a/fonts/Montserrat-Italic.ttf
+++ b/fonts/Montserrat-Italic.ttf
--- a/fonts/Montserrat-Light.ttf
+++ b/fonts/Montserrat-Light.ttf
--- a/fonts/Montserrat-LightItalic.ttf
+++ b/fonts/Montserrat-LightItalic.ttf
--- a/fonts/Montserrat-Medium.ttf
+++ b/fonts/Montserrat-Medium.ttf
--- a/fonts/Montserrat-MediumItalic.ttf
+++ b/fonts/Montserrat-MediumItalic.ttf
--- a/fonts/Montserrat-Regular.ttf
+++ b/fonts/Montserrat-Regular.ttf
--- a/fonts/Montserrat-SemiBold.ttf
+++ b/fonts/Montserrat-SemiBold.ttf
--- a/fonts/Montserrat-SemiBoldItalic.ttf
+++ b/fonts/Montserrat-SemiBoldItalic.ttf
--- a/fonts/Montserrat-Thin.ttf
+++ b/fonts/Montserrat-Thin.ttf
--- a/fonts/Montserrat-ThinItalic.ttf
+++ b/fonts/Montserrat-ThinItalic.ttf
--- a/fonts/Montserrat-VariableFont_wght.ttf
+++ b/fonts/Montserrat-VariableFont_wght.ttf
--- a/fonts/Nunito-Black.ttf
+++ b/fonts/Nunito-Black.ttf
--- a/fonts/Nunito-BlackItalic.ttf
+++ b/fonts/Nunito-BlackItalic.ttf
--- a/fonts/Nunito-Bold.ttf
+++ b/fonts/Nunito-Bold.ttf
--- a/fonts/Nunito-BoldItalic.ttf
+++ b/fonts/Nunito-BoldItalic.ttf
--- a/fonts/Nunito-ExtraBold.ttf
+++ b/fonts/Nunito-ExtraBold.ttf
--- a/fonts/Nunito-ExtraBoldItalic.ttf
+++ b/fonts/Nunito-ExtraBoldItalic.ttf
--- a/fonts/Nunito-ExtraLight.ttf
+++ b/fonts/Nunito-ExtraLight.ttf
--- a/fonts/Nunito-ExtraLightItalic.ttf
+++ b/fonts/Nunito-ExtraLightItalic.ttf
--- a/fonts/Nunito-Italic-VariableFont_wght.ttf
+++ b/fonts/Nunito-Italic-VariableFont_wght.ttf
--- a/fonts/Nunito-Italic.ttf
+++ b/fonts/Nunito-Italic.ttf
--- a/fonts/Nunito-Light.ttf
+++ b/fonts/Nunito-Light.ttf
--- a/fonts/Nunito-LightItalic.ttf
+++ b/fonts/Nunito-LightItalic.ttf
--- a/fonts/Nunito-Medium.ttf
+++ b/fonts/Nunito-Medium.ttf
--- a/fonts/Nunito-MediumItalic.ttf
+++ b/fonts/Nunito-MediumItalic.ttf
--- a/fonts/Nunito-Regular.ttf
+++ b/fonts/Nunito-Regular.ttf
--- a/fonts/Nunito-SemiBold.ttf
+++ b/fonts/Nunito-SemiBold.ttf
--- a/fonts/Nunito-SemiBoldItalic.ttf
+++ b/fonts/Nunito-SemiBoldItalic.ttf
--- a/fonts/Nunito-VariableFont_wght.ttf
+++ b/fonts/Nunito-VariableFont_wght.ttf
--- a/fonts/Oswald-Bold.ttf
+++ b/fonts/Oswald-Bold.ttf
--- a/fonts/Oswald-ExtraLight.ttf
+++ b/fonts/Oswald-ExtraLight.ttf
--- a/fonts/Oswald-Light.ttf
+++ b/fonts/Oswald-Light.ttf
--- a/fonts/Oswald-Medium.ttf
+++ b/fonts/Oswald-Medium.ttf
--- a/fonts/Oswald-Regular.ttf
+++ b/fonts/Oswald-Regular.ttf
--- a/fonts/Oswald-SemiBold.ttf
+++ b/fonts/Oswald-SemiBold.ttf
--- a/fonts/Oswald-VariableFont_wght.ttf
+++ b/fonts/Oswald-VariableFont_wght.ttf
--- a/fonts/Raleway-Black.ttf
+++ b/fonts/Raleway-Black.ttf
--- a/fonts/Raleway-BlackItalic.ttf
+++ b/fonts/Raleway-BlackItalic.ttf
--- a/fonts/Raleway-Bold.ttf
+++ b/fonts/Raleway-Bold.ttf
--- a/fonts/Raleway-BoldItalic.ttf
+++ b/fonts/Raleway-BoldItalic.ttf
--- a/fonts/Raleway-ExtraBold.ttf
+++ b/fonts/Raleway-ExtraBold.ttf
--- a/fonts/Raleway-ExtraBoldItalic.ttf
+++ b/fonts/Raleway-ExtraBoldItalic.ttf
--- a/fonts/Raleway-ExtraLight.ttf
+++ b/fonts/Raleway-ExtraLight.ttf
--- a/fonts/Raleway-ExtraLightItalic.ttf
+++ b/fonts/Raleway-ExtraLightItalic.ttf
--- a/fonts/Raleway-Italic-VariableFont_wght.ttf
+++ b/fonts/Raleway-Italic-VariableFont_wght.ttf
--- a/fonts/Raleway-Italic.ttf
+++ b/fonts/Raleway-Italic.ttf
--- a/fonts/Raleway-Light.ttf
+++ b/fonts/Raleway-Light.ttf
--- a/fonts/Raleway-LightItalic.ttf
+++ b/fonts/Raleway-LightItalic.ttf
--- a/fonts/Raleway-Medium.ttf
+++ b/fonts/Raleway-Medium.ttf
--- a/fonts/Raleway-MediumItalic.ttf
+++ b/fonts/Raleway-MediumItalic.ttf
--- a/fonts/Raleway-Regular.ttf
+++ b/fonts/Raleway-Regular.ttf
--- a/fonts/Raleway-SemiBold.ttf
+++ b/fonts/Raleway-SemiBold.ttf
--- a/fonts/Raleway-SemiBoldItalic.ttf
+++ b/fonts/Raleway-SemiBoldItalic.ttf
--- a/fonts/Raleway-Thin.ttf
+++ b/fonts/Raleway-Thin.ttf
--- a/fonts/Raleway-ThinItalic.ttf
+++ b/fonts/Raleway-ThinItalic.ttf
--- a/fonts/Raleway-VariableFont_wght.ttf
+++ b/fonts/Raleway-VariableFont_wght.ttf
--- a/requirements.txt
+++ b/requirements.txt
@ -1,21 +1,198 @@
+aiohappyeyeballs==2.6.2
+aiohttp==3.13.5
+aiosignal==1.4.0
+alembic==1.18.4
+annotated-doc==0.0.4
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.13.0
+asteroid-filterbanks==0.4.0
+attrs==26.1.0
+av==17.0.1
+blinker==1.9.0
+blis==1.3.3
 boto3==1.36.8
 botocore==1.36.8
+catalogue==2.0.10
+certifi==2026.5.20
+cffi==2.0.0
+charset-normalizer==3.4.7
+clean-text==0.6.0
+click==8.1.8
+cloudpathlib==0.24.0
+colorlog==6.10.1
+confection==0.1.5
+contourpy==1.3.3
+cryptography==48.0.0
+ctranslate2==4.7.2
+cycler==0.12.1
+cymem==2.0.13
+decorator==5.3.1
+dill==0.4.1
+distro==1.9.0
+einops==0.8.2
+elevenlabs==1.57.0
+emoji==1.7.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
+exejs==0.0.7
+faster-whisper==1.2.1
+ffmpeg-python==0.2.0
+filelock==3.29.0
+Flask==3.1.1
+flatbuffers==25.12.19
+fonttools==4.63.0
+frozenlist==1.8.0
+fsspec==2026.4.0
+ftfy==6.3.1
+future==1.0.0
+googleapis-common-protos==1.75.0
+greenlet==3.1.1
+grpcio==1.80.0
 gTTS==2.5.4
+h11==0.16.0
+hf-xet==1.5.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==0.36.2
+idna==3.17
+ImageIO==2.37.3
+imageio-ffmpeg==0.6.0
+itsdangerous==2.2.0
+jh2==5.0.11
+Jinja2==3.1.6
+jiter==0.15.0
+jmespath==1.1.0
+joblib==1.5.3
+julius==0.2.7
+kiwisolver==1.5.0
+langcodes==3.5.1
+lightning==2.6.5
+lightning-utilities==0.15.3
+lxml==6.1.1
+Mako==1.3.12
+markdown-it-py==4.2.0
+MarkupSafe==3.0.3
+matplotlib==3.10.9
+mdurl==0.1.2
 moviepy==2.2.1
+mpmath==1.3.0
+multidict==6.7.1
+multiprocess==0.70.19
+murmurhash==1.0.15
+networkx==3.6.1
+niquests==3.18.8
+nltk==3.9.4
+numpy==2.4.6
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.3
+nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvtx-cu12==12.8.90
+omegaconf==2.3.0
+onnxruntime==1.26.0
+openai==2.38.0
+opentelemetry-api==1.42.1
+opentelemetry-exporter-otlp==1.42.1
+opentelemetry-exporter-otlp-proto-common==1.42.1
+opentelemetry-exporter-otlp-proto-grpc==1.42.1
+opentelemetry-exporter-otlp-proto-http==1.42.1
+opentelemetry-proto==1.42.1
+opentelemetry-sdk==1.42.1
+opentelemetry-semantic-conventions==0.63b1
+optuna==4.8.0
+packaging==26.2
+pandas==3.0.3
+pathos==0.3.5
+pillow==11.3.0
 playwright==1.49.1
+pox==0.3.7
+ppft==1.7.8
 praw==7.8.1
+prawcore==2.4.0
+preshed==3.0.13
+primePy==1.3
+proglog==0.1.12
+propcache==0.5.2
+protobuf==6.33.6
+pyannote-audio==4.0.4
+pyannote-core==6.0.1
+pyannote-database==6.1.1
+pyannote-metrics==4.1
+pyannote-pipeline==4.0.0
+pyannoteai-sdk==0.4.0
+pycparser==3.0
+pydantic==2.13.4
+pydantic_core==2.46.4
+pyee==12.0.0
+Pygments==2.20.0
+pyparsing==3.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.2
+pytorch-lightning==2.6.5
+pytorch-metric-learning==2.9.0
+pyttsx3==2.98
+PyYAML==6.0.3
+qh3==1.8.1
+regex==2026.5.9
 requests==2.32.3
 rich==13.9.4
+s3transfer==0.11.3
+safetensors==0.7.0
+scikit-learn==1.8.0
+scipy==1.17.1
+setuptools==82.0.1
+shellingham==1.5.4
+six==1.17.0
+smart_open==7.6.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+spacy==3.8.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SQLAlchemy==2.0.50
+srsly==2.5.3
+sympy==1.14.0
+thinc==8.3.11
+threadpoolctl==3.6.0
+tokenizers==0.21.4
 toml==0.10.2
-translators==5.9.9
-pyttsx3==2.98
 tomlkit==0.13.2
-Flask==3.1.1
-clean-text==0.6.0
-unidecode==1.4.0
-spacy==3.8.7
-torch==2.7.0
+torch==2.8.0
+torch-audiomentations==0.12.0
+torch_pitch_shift==1.2.5
+torchaudio==2.8.0
+torchcodec==0.7.0
+torchmetrics==1.9.0
+torchvision==0.23.0
+tqdm==4.67.3
 transformers==4.52.4
-ffmpeg-python==0.2.0
-elevenlabs==1.57.0
-yt-dlp==2025.10.22
+translators==5.9.9
+triton==3.4.0
+typer==0.26.2
+typer-slim==0.24.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+Unidecode==1.4.0
+update-checker==0.18.0
+urllib3==2.7.0
+urllib3-future==2.20.907
+wasabi==1.1.3
+wassima==2.1.0
+wcwidth==0.7.0
+weasel==0.4.3
+websocket-client==1.9.0
+websockets==16.0
+Werkzeug==3.1.8
+whisperx==3.8.6
+wrapt==2.2.1
+yarl==1.24.2
+yt-dlp==2026.3.17
--- a/utils/caption_renderer.py
+++ b/utils/caption_renderer.py
@ -0,0 +1,351 @@
+"""
+caption_renderer.py
+───────────────────
+All caption rendering logic. Three display modes:
+
+  multi    → full sentence on one image (1 RenderJob per sentence)
+  single   → sentence split into word chunks (N RenderJobs per sentence)
+  aligned  → word-level timestamps from WhisperX (perfect sync, any TTS)
+
+RenderJob is the contract between this module and final_video.py.
+Two types of timing:
+
+  FRACTION-based (multi, single):
+    audio_idx + time_fraction → final_video computes absolute time
+    time_fraction = fraction of audio_clips_durations[audio_idx+1]
+
+  ABSOLUTE-based (aligned):
+    clip_start + clip_end → final_video uses directly
+    These are absolute seconds in the video timeline (after title card)
+
+final_video.py checks job["timing_type"] to know which to use.
+"""
+
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+from PIL import Image, ImageDraw, ImageFont
+
+from utils.fonts import getsize
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# RenderJob — the contract
+# ─────────────────────────────────────────────────────────────────────────────
+
+@dataclass
+class RenderJob:
+    """
+    Blueprint for exactly one caption overlay image (img{idx}.png).
+
+    How long the image stays on screen is expressed in one of two ways,
+    selected by ``timing_type``:
+
+    "fraction":
+        ``audio_idx`` names the audio clip and ``time_fraction`` is the share
+        of that clip's duration the image is shown for
+        (1.0 = the whole clip, 0.25 = a quarter of it).
+
+    "absolute":
+        ``clip_start`` and ``clip_end`` are seconds on the video timeline and
+        are meant to be used as they are, without further calculation.
+    """
+
+    idx: int
+    lines: List[str]
+    timing_type: str  # "fraction" or "absolute"
+
+    # Only meaningful when timing_type == "fraction".
+    audio_idx: int = 0
+    time_fraction: float = 1.0
+
+    # Only meaningful when timing_type == "absolute".
+    clip_start: float = 0.0
+    clip_end: float = 0.0
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Display modes
+# ─────────────────────────────────────────────────────────────────────────────
+
+DISPLAY_MODES = {"single", "multi", "aligned"}
+
+
+def render_multi_mode(
+    sentence: str,
+    style: dict,
+    audio_idx: int,
+    start_idx: int,
+) -> List[RenderJob]:
+    """
+    Full sentence on one image, wrapped into lines.
+
+    The sentence is split on whitespace and re-joined into lines of
+    ``style["words_per_chunk"]`` words each. A chunk size below 1 (or a
+    sentence without any words) disables wrapping: the sentence is emitted
+    unchanged as a single line instead of raising.
+
+    Returns a one-element list: a "fraction" RenderJob with
+    time_fraction = 1.0, numbered ``start_idx`` and tied to ``audio_idx``.
+    Best for: funny, sad, wholesome, happy.
+    """
+    wpl   = style["words_per_chunk"]
+    words = sentence.split()
+
+    lines: List[str] = []
+    if wpl >= 1:  # range() rejects a step of 0, so guard before chunking
+        lines = [" ".join(words[i:i + wpl]) for i in range(0, len(words), wpl)]
+    if not lines:
+        lines = [sentence]
+
+    return [RenderJob(
+        idx=start_idx,
+        lines=lines,
+        timing_type="fraction",
+        audio_idx=audio_idx,
+        time_fraction=1.0,
+    )]
+
+
+def render_single_mode(
+    sentence: str,
+    style: dict,
+    audio_idx: int,
+    start_idx: int,
+) -> List[RenderJob]:
+    """
+    Sentence split into word chunks, one per image.
+
+    The sentence is cut into groups of ``style["words_per_chunk"]`` words and
+    every group becomes its own "fraction" RenderJob, each shown for an equal
+    (1/N) share of the audio duration. Jobs are numbered consecutively from
+    ``start_idx``.
+
+    A chunk size below 1, or a sentence without any words, yields a single
+    job carrying the unmodified sentence instead of raising.
+    Best for: scary, dramatic, angry, mysterious.
+    """
+    wpc   = style["words_per_chunk"]
+    words = sentence.split()
+
+    chunks: List[str] = []
+    if wpc >= 1:  # range() rejects a step of 0, so guard before chunking
+        chunks = [" ".join(words[i:i + wpc]) for i in range(0, len(words), wpc)]
+    if not chunks:
+        chunks = [sentence]
+
+    fraction = 1.0 / len(chunks)
+
+    return [
+        RenderJob(
+            idx=start_idx + offset,
+            lines=[chunk],
+            timing_type="fraction",
+            audio_idx=audio_idx,
+            time_fraction=fraction,
+        )
+        for offset, chunk in enumerate(chunks)
+    ]
+
+
+def render_aligned_mode(
+    sentence: str,
+    style: dict,
+    audio_idx: int,
+    start_idx: int,
+    word_timestamps: List[dict],
+    audio_start_time: float,
+    audio_duration: float,
+) -> List[RenderJob]:
+    """
+    Word-level aligned mode using WhisperX timestamps.
+
+    Consecutive words are grouped into chunks of ``style["words_per_chunk"]``
+    words, one "absolute" RenderJob per chunk, numbered from ``start_idx``:
+
+      clip_start = start of the chunk's first word
+      clip_end   = start of the next chunk's first word, or - for the final
+                   chunk - the end of its last word
+
+    clip_end is capped at the end of the audio file and then, if necessary,
+    raised so that every caption stays visible for at least 100 ms.
+    All times are offset by ``audio_start_time`` and rounded to milliseconds.
+
+    Handling of incomplete data:
+      * entries that are not dicts or carry no text are skipped;
+      * a word without a numeric "start" inherits the end time of the word
+        before it (0.0 for the very first word) - presumably these are
+        tokens the aligner could not place; verify against whisper_aligner;
+      * a word without a numeric "end" is assumed to last 0.3 s.
+
+    Falls back to single mode when no usable word remains or when
+    words_per_chunk is below 1.
+
+    audio_start_time: absolute time in video when this audio file starts.
+    audio_duration:   duration of this audio file (upper bound for clip_end).
+    audio_idx:        only used for the single-mode fallback.
+    """
+    wpc = style["words_per_chunk"]
+
+    # Only entries with visible text can become a caption.
+    words = [
+        w for w in (word_timestamps or [])
+        if isinstance(w, dict) and str(w.get("word", "")).strip()
+    ]
+    if wpc < 1 or not words:
+        return render_single_mode(sentence, style, audio_idx, start_idx)
+
+    # Resolve a start/end time for every word, filling gaps left by the aligner.
+    starts: List[float] = []
+    ends:   List[float] = []
+    cursor = 0.0  # end time of the previous word
+    for w in words:
+        start = w.get("start")
+        if not isinstance(start, (int, float)):
+            start = cursor
+        end = w.get("end")
+        if not isinstance(end, (int, float)):
+            end = start + 0.3
+        starts.append(start)
+        ends.append(end)
+        cursor = end
+
+    audio_end = audio_start_time + audio_duration
+    total     = len(words)
+    jobs: List[RenderJob] = []
+
+    for first in range(0, total, wpc):
+        nxt  = first + wpc  # index of the first word of the following chunk
+        text = " ".join(str(w["word"]) for w in words[first:nxt])
+
+        clip_start = audio_start_time + starts[first]
+        if nxt < total:
+            # Hold this caption until the next one takes over (no gaps).
+            clip_end = audio_start_time + starts[nxt]
+        else:
+            clip_end = audio_start_time + ends[total - 1]
+
+        # Safety: never exceed audio boundary, but keep a minimum of 100 ms
+        # visibility (the minimum wins if the two rules conflict).
+        clip_end = min(clip_end, audio_end)
+        clip_end = max(clip_end, clip_start + 0.1)
+
+        jobs.append(RenderJob(
+            idx=start_idx + len(jobs),
+            lines=[text],
+            timing_type="absolute",
+            clip_start=round(clip_start, 3),
+            clip_end=round(clip_end, 3),
+        ))
+
+    return jobs
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Router
+# ─────────────────────────────────────────────────────────────────────────────
+
+def get_render_jobs(
+    sentences: List[str],
+    style: dict,
+    mp3_dir: Optional[str] = None,
+    audio_start_times: Optional[List[float]] = None,
+    audio_durations: Optional[List[float]] = None,
+) -> List[RenderJob]:
+    """
+    Route each sentence to the correct renderer.
+    Returns flat ordered list of all RenderJobs, with image indices running
+    consecutively across sentences.
+
+    The mode comes from style["display_mode"] (default "multi"); an unknown
+    value is reported and treated as "multi".
+
+    For "aligned" mode, word timestamps are loaded through
+    utils.whisper_aligner.load_word_timestamps() for
+    {mp3_dir}/postaudio-{i}.mp3. A sentence falls back to "single" mode
+    whenever aligned rendering is not possible for it: mp3_dir or the timing
+    lists were not supplied, the aligner module cannot be imported, the
+    timing lists have no entry for that sentence, or no timestamps were found.
+
+    Parameters
+    ----------
+    sentences         : one per postaudio-{i}.mp3
+    style             : STYLE_MAP entry for current sentiment
+    mp3_dir           : path to mp3 folder (needed for aligned mode)
+    audio_start_times : absolute start time of each audio in video (needed for aligned)
+    audio_durations   : duration of each audio file (needed for aligned)
+    """
+    mode = style.get("display_mode", "multi")
+
+    if mode not in DISPLAY_MODES:
+        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
+        mode = "multi"
+
+    # Resolve the timestamp loader once; None means "aligned is unavailable".
+    load_word_timestamps = None
+    if mode == "aligned" and mp3_dir and audio_start_times and audio_durations:
+        try:
+            from utils.whisper_aligner import load_word_timestamps
+        except ImportError:
+            load_word_timestamps = None
+
+    all_jobs:    List[RenderJob] = []
+    img_counter: int             = 0
+
+    for audio_idx, sentence in enumerate(sentences):
+
+        if mode == "multi":
+            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)
+
+        else:
+            word_ts = None
+            if (
+                load_word_timestamps is not None
+                and audio_idx < len(audio_start_times)
+                and audio_idx < len(audio_durations)
+            ):
+                audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
+                word_ts    = load_word_timestamps(audio_path)
+
+            if word_ts:
+                jobs = render_aligned_mode(
+                    sentence=sentence,
+                    style=style,
+                    audio_idx=audio_idx,
+                    start_idx=img_counter,
+                    word_timestamps=word_ts,
+                    audio_start_time=audio_start_times[audio_idx],
+                    audio_duration=audio_durations[audio_idx],
+                )
+            else:
+                if mode == "aligned":
+                    # WhisperX not available or failed — fall back to single mode
+                    print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
+                jobs = render_single_mode(sentence, style, audio_idx, img_counter)
+
+        all_jobs.extend(jobs)
+        img_counter += len(jobs)
+
+    return all_jobs
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Drawing primitives
+# ─────────────────────────────────────────────────────────────────────────────
+
+def measure_text_block(
+    draw: ImageDraw.ImageDraw,
+    lines: List[str],
+    font: ImageFont.FreeTypeFont,
+    line_spacing: int,
+) -> tuple:
+    """
+    Measure a block of text lines rendered with ``font``.
+
+    Returns (width, height): the width of the widest line and the summed
+    line heights plus ``line_spacing`` between each pair of adjacent lines.
+    An empty ``lines`` list measures (0, 0). ``draw`` is accepted for
+    interface compatibility but is not used for the measurement.
+    """
+    widest = 0
+    height = 0
+    final  = len(lines) - 1
+    for pos, text in enumerate(lines):
+        line_w, line_h = getsize(font, text)
+        widest = max(widest, line_w)
+        height += line_h
+        if pos != final:
+            # spacing goes between lines only, not after the last one
+            height += line_spacing
+    return widest, height
+
+
+def draw_stroked_text(
+    draw: ImageDraw.ImageDraw,
+    x: int,
+    y: int,
+    line: str,
+    font: ImageFont.FreeTypeFont,
+    fill_color: tuple,
+    stroke_color: tuple,
+    stroke_width: int,
+) -> None:
+    """
+    Draw ``line`` at (x, y) with an outline built from offset copies.
+
+    The text is first drawn in ``stroke_color`` at 16 offsets around (x, y):
+    the 8 compass directions at distance ``stroke_width`` plus 8 intermediate
+    positions at half that distance along one axis. The text is then drawn
+    once in ``fill_color`` on top.
+
+    A ``stroke_width`` of 0 or less draws the fill only, with no outline.
+    """
+    sw = stroke_width
+    if sw > 0:
+        # max(1, ...) keeps the intermediate offsets distinct from the
+        # axis-aligned ones for very thin strokes.
+        half = max(1, sw // 2)
+        offsets = [
+            (-sw, 0), (sw, 0), (0, -sw), (0, sw),
+            (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw),
+            (-sw, -half), (sw, -half), (-sw, half), (sw, half),
+            (-half, -sw), (half, -sw), (-half, sw), (half, sw),
+        ]
+        for ox, oy in offsets:
+            draw.text((x + ox, y + oy), line, font=font, fill=stroke_color)
+    draw.text((x, y), line, font=font, fill=fill_color)
+
+
+def fit_font(
+    style: dict,
+    lines: List[str],
+    canvas_w: int,
+    canvas_h: int,
+    line_spacing: int,
+    max_width_ratio: float = 0.88,
+    max_height_ratio: float = 0.45,
+) -> ImageFont.FreeTypeFont:
+    """
+    Load the style's font at the largest size whose text block fits.
+
+    Starting at ``style["font_size"]`` the size is reduced in steps of 4 until
+    the block measured by measure_text_block() is no wider than
+    ``canvas_w * max_width_ratio`` and no taller than
+    ``canvas_h * max_height_ratio``. Shrinking stops at 30 px: if nothing
+    fits above that, a 30 px font is returned regardless of fit. A requested
+    size of 30 px or less is returned as requested (never enlarged).
+
+    The font file is looked up as fonts/<style["font_file"]>, falling back to
+    fonts/Roboto-Bold.ttf when that file does not exist.
+    """
+    min_size  = 30
+    requested = style["font_size"]
+    font_path = os.path.join("fonts", style["font_file"])
+    if not os.path.exists(font_path):
+        font_path = os.path.join("fonts", "Roboto-Bold.ttf")
+    max_w = int(canvas_w * max_width_ratio)
+    max_h = int(canvas_h * max_height_ratio)
+
+    # One tiny scratch surface is enough for measuring; no need to allocate
+    # a full-size canvas for every candidate size.
+    scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1), (0, 0, 0, 0)))
+
+    font_size = requested
+    while font_size > min_size:
+        font   = ImageFont.truetype(font_path, font_size)
+        bw, bh = measure_text_block(scratch, lines, font, line_spacing)
+        if bw <= max_w and bh <= max_h:
+            return font
+        font_size -= 4
+    return ImageFont.truetype(font_path, min(requested, min_size))
+
+
+def render_job_to_image(
+    job: RenderJob,
+    style: dict,
+    canvas_w: int,
+    canvas_h: int,
+    line_spacing: int,
+) -> Image.Image:
+    """
+    Render one RenderJob onto a fully transparent RGBA canvas.
+
+    The font is chosen by fit_font(). The text block is centred vertically
+    on ``canvas_h * style["y_position"]`` and every line is centred
+    horizontally, drawn with the style's fill colour, stroke colour and
+    stroke width.
+    """
+    font   = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)
+    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
+    pen    = ImageDraw.Draw(canvas)
+
+    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
+    # y_position marks the block's vertical centre, so start half a block above it
+    cursor_y = int(canvas_h * style["y_position"]) - (block_h // 2)
+
+    for text in job.lines:
+        line_w, line_h = getsize(font, text)
+        left = (canvas_w - line_w) // 2
+        draw_stroked_text(
+            pen, left, cursor_y, text, font,
+            style["fill_color"], style["stroke_color"], style["stroke_width"],
+        )
+        cursor_y += line_h + line_spacing
+    return canvas
--- a/utils/imagenarator.py
+++ b/utils/imagenarator.py
@ -1,74 +1,156 @@
+"""
+imagenarator.py
+───────────────
+Thin orchestrator. Does exactly:
+  1. Extract sentences from reddit_obj
+  2. Probe audio durations + compute audio start times (needed for aligned mode)
+  3. Call caption_renderer.get_render_jobs()
+  4. Render each job to PNG
+  5. Save timing_map.json for final_video.py
+"""
+
+import glob
+import json
 import os
 import re
-import textwrap
+from typing import List, Optional

-from PIL import Image, ImageDraw, ImageFont
+import ffmpeg
 from rich.progress import track

 from TTS.engine_wrapper import process_text
-from utils.fonts import getheight, getsize
+from utils import settings
 from utils.id import extract_id
+from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE
+from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob
+

+LINE_SPACING: int = 20

-def draw_multiple_line_text(
-    image, text, font, text_color, padding, wrap=50, transparent=False
-) -> None:
+
+def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]:
    """
-    Draw multiline text over given image
+    Collect the caption sentences from reddit_obj["thread_post"], in order.
+
+    Each entry may be a dict (its "text" value is used), a plain string, or
+    any other object (converted with str()). Every text is passed through
+    process_text, stripped, and upper-cased when the style asks for it.
+    Entries that end up empty are dropped; if nothing is left, a single
+    "..." placeholder is returned.
    """
-    draw = ImageDraw.Draw(image)
-    font_height = getheight(font, text)
-    image_width, image_height = image.size
-    lines = textwrap.wrap(text, width=wrap)
-    y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2)
-    for line in lines:
-        line_width, line_height = getsize(font, line)
-        if transparent:
-            shadowcolor = "black"
-            for i in range(1, 5):
-                draw.text(
-                    ((image_width - line_width) / 2 - i, y - i),
-                    line,
-                    font=font,
-                    fill=shadowcolor,
-                )
-                draw.text(
-                    ((image_width - line_width) / 2 + i, y - i),
-                    line,
-                    font=font,
-                    fill=shadowcolor,
-                )
-                draw.text(
-                    ((image_width - line_width) / 2 - i, y + i),
-                    line,
-                    font=font,
-                    fill=shadowcolor,
-                )
-                draw.text(
-                    ((image_width - line_width) / 2 + i, y + i),
-                    line,
-                    font=font,
-                    fill=shadowcolor,
-                )
-        draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color)
-        y += line_height + padding
-
-
-def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None:
+    collected: List[str] = []
+    for entry in reddit_obj["thread_post"]:
+        if isinstance(entry, str):
+            raw = entry
+        elif isinstance(entry, dict):
+            raw = entry.get("text", "")
+        else:
+            raw = str(entry)
+
+        cleaned = process_text(raw, False).strip()
+        if style.get("uppercase", False):
+            cleaned = cleaned.upper()
+        if not cleaned:
+            continue
+        collected.append(cleaned)
+    return collected or ["..."]
+
+
+def _get_audio_info(mp3_dir: str) -> tuple:
+    """
+    Discover postaudio files and compute:
+      - durations list (one per postaudio file)
+      - start times list (absolute seconds in video, after title card)
+
+    Files are ordered by the number in their "postaudio-<n>" name (numeric,
+    so 10 sorts after 9). Files that match the glob but carry no number are
+    skipped instead of raising. Any duration that cannot be probed,
+    including the title's, counts as 0.0 seconds.
+
+    Returns (postaudio_files, durations, start_times)
    """
-    Render Images for video
+    numbered = []
+    for path in glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")):
+        # Match on the file name only, so a directory called e.g.
+        # "postaudio-3" can never be mistaken for the clip index.
+        match = re.search(r"postaudio-(\d+)", os.path.basename(path))
+        if match is not None:
+            numbered.append((int(match.group(1)), path))
+    postaudio_files = [
+        path for _, path in sorted(numbered, key=lambda item: item[0])
+    ]
+
+    title_path = os.path.join(mp3_dir, "title.mp3")
+    try:
+        title_duration = float(ffmpeg.probe(title_path)["format"]["duration"])
+    except Exception:
+        # Best effort: an unreadable title clip is treated as zero length.
+        title_duration = 0.0
+
+    durations = []
+    start_times = []
+    current = title_duration
+
+    for f in postaudio_files:
+        try:
+            dur = float(ffmpeg.probe(f)["format"]["duration"])
+        except Exception:
+            # Best effort: keep one entry per file so indices stay aligned.
+            dur = 0.0
+        start_times.append(current)
+        durations.append(dur)
+        current += dur
+
+    return postaudio_files, durations, start_times
+
+
+def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int:
    """
-    texts = reddit_obj["thread_post"]
-    reddit_id = extract_id(reddit_obj)
-    if transparent:
-        font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100)
-    else:
-        font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100)
-
-    size = (1920, 1080)
-
-    for idx, text in track(enumerate(texts), "Rendering Image"):
-        image = Image.new("RGBA", size, theme)
-        text = process_text(text, False)
-        draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent)
-        image.save(f"assets/temp/{reddit_id}/png/img{idx}.png")
+    Render the caption images for one thread and write their timing map.
+
+    Steps:
+        1. pick the style for the sentiment stored in settings
+        2. extract sentences and probe the audio clips
+        3. ask caption_renderer.get_render_jobs() for the List[RenderJob]
+        4. render every job to img{idx}.png
+        5. dump timing_map.json (one entry per job, in job order)
+
+    timing_map.json entry for absolute-based jobs (aligned mode):
+        {"timing_type": "absolute", "clip_start": S, "clip_end": E}
+
+    timing_map.json entry for every other job (fraction-based):
+        {"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
+
+    The theme, txtclr, padding and transparent arguments are accepted but
+    not read by this implementation.
+
+    Returns:
+        int: total number of images generated
+    """
+    # Style and canvas size come from the shared settings.
+    cfg = settings.config["settings"]
+    style = STYLE_MAP.get(cfg.get("sentiment", "dramatic"), DEFAULT_STYLE)
+    canvas_w: int = int(cfg["resolution_w"])
+    canvas_h: int = int(cfg["resolution_h"])
+    reddit_id = extract_id(reddit_obj)
+    mp3_dir = f"assets/temp/{reddit_id}/mp3"
+
+    # Inputs for the renderer: text plus audio timing (used by aligned mode).
+    sentences = _extract_sentences(reddit_obj, style)
+    _, durations, start_times = _get_audio_info(mp3_dir)
+
+    # Empty timing lists are passed on as None.
+    jobs: List[RenderJob] = get_render_jobs(
+        sentences=sentences,
+        style=style,
+        mp3_dir=mp3_dir,
+        audio_start_times=start_times or None,
+        audio_durations=durations or None,
+    )
+
+    # One transparent PNG per job, named after the job's own index.
+    for job in track(jobs, description="Rendering caption images"):
+        rendered = render_job_to_image(job, style, canvas_w, canvas_h, LINE_SPACING)
+        rendered.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png")
+
+    # Timing map, one entry per job.
+    timing_map = [
+        {
+            "timing_type": "absolute",
+            "clip_start":  job.clip_start,
+            "clip_end":    job.clip_end,
+        }
+        if job.timing_type == "absolute"
+        else {
+            "timing_type":  "fraction",
+            "audio_idx":    job.audio_idx,
+            "time_fraction": job.time_fraction,
+        }
+        for job in jobs
+    ]
+
+    with open(f"assets/temp/{reddit_id}/timing_map.json", "w") as f:
+        json.dump(timing_map, f, indent=2)
+
+    return len(jobs)
--- a/utils/sentiment.py
+++ b/utils/sentiment.py
@ -196,12 +196,16 @@ def apply_sentiment_config(reddit_object: dict) -> None:

    sentiment = detect_sentiment(reddit_object)

-    # ── Background ───────────────────────────────────────────
+    # ── Sentiment label — stored in memory so imagenarator.py can read it ────
+    # This is the key that STYLE_MAP lookups depend on at render time.
+    settings.config["settings"]["sentiment"] = sentiment
+
+    # ── Background ───────────────────────────────────────────────────────────
    bg_video, bg_audio = BACKGROUND_MAP[sentiment]
    settings.config["settings"]["background"]["background_video"] = bg_video
    settings.config["settings"]["background"]["background_audio"] = bg_audio

-    # ── Voice ────────────────────────────────────────────────
+    # ── Voice ────────────────────────────────────────────────────────────────
    voice_choice = settings.config["settings"]["tts"]["voice_choice"].lower()

    if voice_choice == "elevenlabs":
@ -213,12 +217,12 @@ def apply_sentiment_config(reddit_object: dict) -> None:
    else:
        voice = f"(voice override not supported for {voice_choice})"

-    # ── Metadata ─────────────────────────────────────────────
+    # ── Metadata ─────────────────────────────────────────────────────────────
    print_substep("Generating titles, captions and hashtags... ✍️", style="bold blue")
    metadata = generate_metadata(reddit_object, sentiment)
    save_metadata(metadata, reddit_object)

-    # ── Log ──────────────────────────────────────────────────
+    # ── Log ──────────────────────────────────────────────────────────────────
    print_substep(f"Sentiment detected  : {sentiment} 🎯", style="bold green")
    print_substep(f"Background video    : {bg_video}", style="bold blue")
    print_substep(f"Background audio    : {bg_audio if bg_audio else 'none'}", style="bold blue")
--- a/utils/sentiment_map.py
+++ b/utils/sentiment_map.py
@ -1,16 +1,14 @@
-# Maps sentiment → (background_video, background_audio)
 BACKGROUND_MAP = {
-    "sad":        ("minecraft",     "lofi"),        # slow, melancholic
-    "happy":      ("fall-guys",     "chill-summer"),# upbeat, fun
-    "angry":      ("gta",           "lofi"),        # lofi keeps intensity without distraction
-    "mysterious": ("csgo-surf",     "lofi-2"),      # lofi-2 is more atmospheric
-    "funny":      ("cluster-truck", "chill-summer"),# light and playful
-    "dramatic":   ("rocket-league", "lofi"),        # lofi under dramatic = tension
-    "wholesome":  ("steep",         "chill-summer"),# warm and positive
-    "scary":      ("minecraft-2",   "lofi-2"),      # lofi-2 is darker/moodier
+    "sad":        ("minecraft",     "lofi"),
+    "happy":      ("fall-guys",     "chill-summer"),
+    "angry":      ("gta",           "lofi"),
+    "mysterious": ("csgo-surf",     "lofi-2"),
+    "funny":      ("cluster-truck", "chill-summer"),
+    "dramatic":   ("rocket-league", "lofi"),
+    "wholesome":  ("steep",         "chill-summer"),
+    "scary":      ("minecraft-2",   "lofi-2"),
 }

-# Maps sentiment → OpenAI voice name
 OPENAI_VOICE_MAP = {
    "sad":        "nova",
    "happy":      "shimmer",
@ -22,7 +20,6 @@ OPENAI_VOICE_MAP = {
    "scary":      "onyx",
 }

-# Maps sentiment → ElevenLabs voice name
 ELEVENLABS_VOICE_MAP = {
    "sad":        "Brian - Deep, Resonant and Comforting",
    "happy":      "Jessica - Playful, Bright, Warm",
@ -34,8 +31,128 @@ ELEVENLABS_VOICE_MAP = {
    "scary":      "Harry - Fierce Warrior",
 }

-# All valid sentiment labels
 VALID_SENTIMENTS = list(BACKGROUND_MAP.keys())
+DEFAULT_SENTIMENT = "dramatic"

-# Fallback if detection fails — maps to rocket-league + lofi + alloy
-DEFAULT_SENTIMENT = "dramatic"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STYLE_MAP
+# ─────────────────────────────────────────────────────────────────────────────
+#
+# display_mode options:
+#
+#   "aligned"  → WhisperX word timestamps — perfect sync with any TTS.
+#                Falls back to "single" per sentence if timestamps unavailable.
+#                USE THIS for all sentiments once WhisperX is installed.
+#
+#   "single"   → Split sentence into word chunks, equal time per chunk.
+#                Good fallback when WhisperX is not installed.
+#
+#   "multi"    → Full sentence on one image. No splitting.
+#                Best for slow TTS or wholesome/sad content.
+#
+# words_per_chunk:
+#   In "aligned" mode: words grouped per visible chunk (3-5 recommended)
+#   In "single" mode:  words per chunk (higher = fewer chunks = slower pace)
+#   In "multi" mode:   words per line in the wrapped text block
+#
+STYLE_MAP = {
+
+    "dramatic": {
+        "font_file":       "Montserrat-ExtraBold.ttf",
+        "font_size":       95,
+        "fill_color":      (255, 255, 255, 255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    4,
+        "words_per_chunk": 4,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "scary": {
+        "font_file":       "Oswald-Bold.ttf",
+        "font_size":       95,
+        "fill_color":      (232, 244, 248, 255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    5,
+        "words_per_chunk": 3,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "angry": {
+        "font_file":       "Anton-Regular.ttf",
+        "font_size":       105,
+        "fill_color":      (255, 69,  0,   255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    5,
+        "words_per_chunk": 3,
+        "y_position":      0.65,
+        "uppercase":       True,
+        "display_mode":    "aligned",
+    },
+
+    "mysterious": {
+        "font_file":       "Raleway-Bold.ttf",
+        "font_size":       90,
+        "fill_color":      (184, 212, 232, 255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    4,
+        "words_per_chunk": 3,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "funny": {
+        "font_file":       "Nunito-ExtraBold.ttf",
+        "font_size":       90,
+        "fill_color":      (255, 230, 0,   255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    4,
+        "words_per_chunk": 5,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "sad": {
+        "font_file":       "Lato-Bold.ttf",
+        "font_size":       88,
+        "fill_color":      (220, 225, 255, 255),
+        "stroke_color":    (10,  10,  46,  255),
+        "stroke_width":    3,
+        "words_per_chunk": 5,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "wholesome": {
+        "font_file":       "Nunito-ExtraBold.ttf",
+        "font_size":       88,
+        "fill_color":      (255, 248, 231, 255),
+        "stroke_color":    (26,  10,  0,   255),
+        "stroke_width":    3,
+        "words_per_chunk": 5,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+
+    "happy": {
+        "font_file":       "Nunito-ExtraBold.ttf",
+        "font_size":       90,
+        "fill_color":      (255, 230, 0,   255),
+        "stroke_color":    (0,   0,   0,   255),
+        "stroke_width":    4,
+        "words_per_chunk": 5,
+        "y_position":      0.65,
+        "uppercase":       False,
+        "display_mode":    "aligned",
+    },
+}
+
+# Derived from DEFAULT_SENTIMENT so the fallback sentiment and the fallback
+# style cannot drift apart.
+DEFAULT_STYLE = STYLE_MAP[DEFAULT_SENTIMENT]
--- a/utils/whisper_aligner.py
+++ b/utils/whisper_aligner.py
@ -0,0 +1,168 @@
+"""
+whisper_aligner.py
+──────────────────
+Word-level timestamp extraction using WhisperX.
+
+This module runs after each TTS audio file is saved.
+It produces a word-level timestamp JSON for every postaudio-{i}.mp3.
+
+Output format (postaudio-{i}_words.json):
+[
+  {"word": "I",    "start": 0.00, "end": 0.18},
+  {"word": "told", "start": 0.18, "end": 0.42},
+  ...
+]
+
+WhisperX is used because:
+  - Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.)
+  - Free, runs locally, no API cost
+  - Word-level accuracy (not sentence-level)
+  - Fast on CPU for short audio clips
+
+If WhisperX is not installed or fails for any reason,
+this module returns None and the system falls back to
+time_fraction-based sync (single/multi mode).
+No crashes, no interruptions.
+"""
+
+import json
+import os
+from typing import List, Optional
+
+from utils.console import print_substep
+
+
+# ── Module-level cache for the WhisperX model ────────────────────────────────
+# Loading is expensive, so the model is kept here together with the language
+# it was loaded for and reused for every audio file of that language.
+_whisper_model = None
+_whisper_model_lang = None
+
+
+def _get_model(language: str = "en"):
+    """
+    Return the cached WhisperX model for ``language``, loading it on demand.
+
+    The model is (re)loaded when nothing is cached yet or when the cached
+    model was loaded for a different language. Returns None when WhisperX
+    cannot be imported (silently) or when loading fails (with a warning).
+    """
+    global _whisper_model, _whisper_model_lang
+
+    cached = _whisper_model is not None and _whisper_model_lang == language
+    if cached:
+        return _whisper_model
+
+    try:
+        import whisperx
+        print_substep("Loading WhisperX model (first run only)...", style="bold blue")
+        # "base" on CPU with int8: small enough for CPU, accurate enough for TTS.
+        options = {"device": "cpu", "compute_type": "int8", "language": language}
+        loaded = whisperx.load_model("base", **options)
+    except ImportError:
+        return None
+    except Exception as exc:
+        print_substep(f"WhisperX model load failed: {exc}", style="yellow")
+        return None
+
+    # Only a successful load replaces the cache.
+    _whisper_model, _whisper_model_lang = loaded, language
+    return loaded
+
+
+# Alignment models are as costly to load as the transcription model, so they
+# are cached per language instead of being reloaded for every audio file.
+_align_models: dict = {}
+
+
+def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]:
+    """
+    Run WhisperX on a single audio file and return word-level timestamps.
+
+    Parameters
+    ----------
+    audio_path : str
+        Path to the .mp3 file to align.
+    language : str
+        Language code (default: "en"). Matches TTS language.
+
+    Returns
+    -------
+    Optional[List[dict]]
+        List of {"word": str, "start": float, "end": float} dicts, with
+        times rounded to 3 decimals. Words without both a start and an end
+        are left out. Returns None if WhisperX is unavailable, if alignment
+        fails, or if no usable word was produced.
+    """
+    try:
+        import whisperx
+    except ImportError:
+        # WhisperX is optional: a missing install is the documented silent
+        # fallback, not a per-file alignment failure worth a warning.
+        return None
+
+    try:
+        model = _get_model(language)
+        if model is None:
+            return None
+
+        # Transcribe
+        audio = whisperx.load_audio(audio_path)
+        result = model.transcribe(audio, batch_size=4)
+
+        # Load the alignment model once per language, then reuse it.
+        if language not in _align_models:
+            _align_models[language] = whisperx.load_align_model(
+                language_code=language,
+                device="cpu",
+            )
+        align_model, metadata = _align_models[language]
+
+        # Align to get word-level timestamps
+        aligned = whisperx.align(
+            result["segments"],
+            align_model,
+            metadata,
+            audio,
+            device="cpu",
+            return_char_alignments=False,
+        )
+
+        # Flatten all words across all segments
+        words = []
+        for segment in aligned.get("word_segments", []):
+            word = segment.get("word", "").strip()
+            start = segment.get("start")
+            end = segment.get("end")
+            if word and start is not None and end is not None:
+                words.append({
+                    "word":  word,
+                    "start": round(float(start), 3),
+                    "end":   round(float(end),   3),
+                })
+
+        return words or None
+
+    except Exception as e:
+        print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow")
+        return None
+
+
+def align_and_save(audio_path: str, language: str = "en") -> Optional[str]:
+    """
+    Align audio and save word timestamps as a JSON file next to the audio.
+
+    The JSON path is the audio path with its extension replaced by
+    "_words.json", e.g. ".../postaudio-0.mp3" -> ".../postaudio-0_words.json".
+
+    Parameters
+    ----------
+    audio_path : str
+        e.g. "assets/temp/abc123/mp3/postaudio-0.mp3"
+    language : str
+        Language code.
+
+    Returns
+    -------
+    Optional[str]
+        Path to saved JSON file, or None if alignment failed.
+    """
+    words = align_audio(audio_path, language)
+
+    if words is None:
+        return None
+
+    # Swap only the final extension. A plain str.replace(".mp3", ...) would
+    # also rewrite ".mp3" inside directory names and, for a path that does
+    # not end in ".mp3", would leave the path unchanged so the JSON would
+    # overwrite the audio file itself.
+    json_path = os.path.splitext(audio_path)[0] + "_words.json"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(words, f, indent=2, ensure_ascii=False)
+
+    return json_path
+
+
+def load_word_timestamps(audio_path: str) -> Optional[List[dict]]:
+    """
+    Load previously saved word timestamps for an audio file.
+
+    The JSON path is the audio path with its extension replaced by
+    "_words.json". Returns the parsed JSON content, or None if that file
+    doesn't exist.
+    """
+    # Swap only the final extension; str.replace(".mp3", ...) would also
+    # rewrite ".mp3" occurring earlier in the path (e.g. a directory name).
+    json_path = os.path.splitext(audio_path)[0] + "_words.json"
+    if not os.path.exists(json_path):
+        return None
+    with open(json_path, "r", encoding="utf-8") as f:
+        return json.load(f)
--- a/video_creation/final_video.py
+++ b/video_creation/final_video.py
@ -5,9 +5,11 @@ import tempfile
 import textwrap
 import threading
 import time
-from os.path import exists  # Needs to be imported specifically
+from os.path import exists
 from pathlib import Path
 from typing import Dict, Final, Tuple
+import glob
+import json

 import ffmpeg
 import translators
@ -44,7 +46,6 @@ class ProgressFfmpeg(threading.Thread):

    def get_latest_ms_progress(self):
        lines = self.output_file.readlines()
-
        if lines:
            for line in lines:
                if "out_time_ms" in line:
@ -52,7 +53,6 @@ class ProgressFfmpeg(threading.Thread):
                    if out_time_ms_str.isnumeric():
                        return float(out_time_ms_str) / 1000000.0
                    else:
-                        # Handle the case when "N/A" is encountered
                        return None
        return None

@ -74,7 +74,6 @@ def name_normalize(name: str) -> str:
    name = re.sub(r"(\d+)\s?\/\s?(\d+)", r"\1 of \2", name)
    name = re.sub(r"(\w+)\s?\/\s?(\w+)", r"\1 or \2", name)
    name = re.sub(r"\/", r"", name)
-
    lang = settings.config["reddit"]["thread"]["post_lang"]
    if lang:
        print_substep("Translating filename...")
@ -119,51 +118,38 @@ def get_text_height(draw, text, font, max_width):


 def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
-    """
-    It will take the 1px from the middle of the template and will be resized (stretched) vertically to accommodate the extra height needed for the title.
-    """
    print_step(f"Creating fancy thumbnail for: {text}")
    font_title_size = 47
    font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), font_title_size)
    image_width, image_height = image.size

-    # Calculate text height to determine new image height
    draw = ImageDraw.Draw(image)
    text_height = get_text_height(draw, text, font, wrap)
    lines = textwrap.wrap(text, width=wrap)
-    # This is -50 to reduce the empty space at the bottom of the image,
-    # change it as per your requirement if needed otherwise leave it.
    new_image_height = image_height + text_height + padding * (len(lines) - 1) - 50

-    # Separate the image into top, middle (1px), and bottom parts
-    top_part_height = image_height // 2
-    middle_part_height = 1  # 1px height middle section
+    top_part_height    = image_height // 2
+    middle_part_height = 1
    bottom_part_height = image_height - top_part_height - middle_part_height

-    top_part = image.crop((0, 0, image_width, top_part_height))
+    top_part    = image.crop((0, 0, image_width, top_part_height))
    middle_part = image.crop((0, top_part_height, image_width, top_part_height + middle_part_height))
    bottom_part = image.crop((0, top_part_height + middle_part_height, image_width, image_height))

-    # Stretch the middle part
-    new_middle_height = new_image_height - top_part_height - bottom_part_height
+    new_middle_height = max(1, new_image_height - top_part_height - bottom_part_height)
    middle_part = middle_part.resize((image_width, new_middle_height))

-    # Create new image with the calculated height
    new_image = Image.new("RGBA", (image_width, new_image_height))
-
-    # Paste the top, stretched middle, and bottom parts into the new image
-    new_image.paste(top_part, (0, 0))
+    new_image.paste(top_part,    (0, 0))
    new_image.paste(middle_part, (0, top_part_height))
    new_image.paste(bottom_part, (0, top_part_height + new_middle_height))

-    # Draw the title text on the new image
    draw = ImageDraw.Draw(new_image)
    y = top_part_height + padding
    for line in lines:
        draw.text((120, y), line, font=font, fill=text_color, align="left")
        y += get_text_height(draw, line, font, wrap) + padding

-    # Draw the username "PlotPulse" at the specific position
    username_font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 30)
    draw.text(
        (205, 825),
@ -172,28 +158,44 @@ def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
        fill=text_color,
        align="left",
    )
-
    return new_image


 def merge_background_audio(audio: ffmpeg, reddit_id: str):
-    """Gather an audio and merge with assets/backgrounds/background.mp3
-    Args:
-        audio (ffmpeg): The TTS final audio but without background.
-        reddit_id (str): The ID of subreddit
-    """
+    """Mix the background track into ``audio`` at the configured volume.
+
+    Reads ``background_audio_volume`` from the background settings. When it
+    is 0, ``audio`` is returned unchanged. Otherwise
+    ``assets/temp/{reddit_id}/background.mp3`` is scaled by that volume and
+    mixed with ``audio`` through the ``amix`` filter (duration="longest").
+
+    Args:
+        audio: The audio stream to mix the background track into.
+        reddit_id (str): Name of the folder under ``assets/temp``.
+    """
    background_audio_volume = settings.config["settings"]["background"]["background_audio_volume"]
    if background_audio_volume == 0:
-        return audio  # Return the original audio
-    else:
-        # sets volume to config
-        bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter(
-            "volume",
-            background_audio_volume,
-        )
-        # Merges audio and background_audio
-        merged_audio = ffmpeg.filter([audio, bg_audio], "amix", duration="longest")
-        return merged_audio  # Return merged audio
+        return audio
+    bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter(
+        "volume", background_audio_volume,
+    )
+    return ffmpeg.filter([audio, bg_audio], "amix", duration="longest")
+
+
+def _load_timing_map(reddit_id: str, img_files: list, postaudio_files: list,
+                     audio_clips_durations: list, title_duration: float) -> list:
+    """
+    Return the timing entries stored in assets/temp/{reddit_id}/timing_map.json.
+
+    Each entry is one of:
+      {"timing_type": "absolute", "clip_start": S, "clip_end": E}
+      {"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
+
+    When the file does not exist, a warning is printed and a 1:1 fallback is
+    returned: one "fraction" entry per item of img_files, with audio_idx
+    equal to the item's position and time_fraction 1.0.
+
+    postaudio_files, audio_clips_durations and title_duration are accepted
+    but not read here.
+    """
+    timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
+
+    if not os.path.exists(timing_map_path):
+        # Fallback: 1:1
+        print_substep("timing_map.json not found — using 1:1 fallback", style="yellow")
+        fallback = []
+        for position in range(len(img_files)):
+            fallback.append(
+                {"timing_type": "fraction", "audio_idx": position, "time_fraction": 1.0}
+            )
+        return fallback
+
+    with open(timing_map_path) as f:
+        return json.load(f)


 def make_final_video(
@ -202,20 +204,10 @@ def make_final_video(
    reddit_obj: dict,
    background_config: Dict[str, Tuple],
 ):
-    """Gathers audio clips, gathers all screenshots, stitches them together and saves the final video to assets/temp
-    Args:
-        number_of_clips (int): Index to end at when going through the screenshots'
-        length (int): Length of the video
-        reddit_obj (dict): The reddit object that contains the posts to read.
-        background_config (Tuple[str, str, str, Any]): The background config to use.
-    """
-    # settings values
    W: Final[int] = int(settings.config["settings"]["resolution_w"])
    H: Final[int] = int(settings.config["settings"]["resolution_h"])
-
-    opacity = settings.config["settings"]["opacity"]
-
-    reddit_id = extract_id(reddit_obj)
+    opacity       = settings.config["settings"]["opacity"]
+    reddit_id     = extract_id(reddit_obj)

    allowOnlyTTSFolder: bool = (
        settings.config["settings"]["background"]["enable_extra_audio"]
@ -223,33 +215,31 @@ def make_final_video(
    )

    print_step("Creating the final video 🎥")
-
    background_clip = ffmpeg.input(prepare_background(reddit_id, W=W, H=H))

-    # Gather all audio clips
+    # ── Audio clips ───────────────────────────────────────────────────────────
    audio_clips = list()
    if number_of_clips == 0 and settings.config["settings"]["storymode"] == "false":
-        print(
-            "No audio clips to gather. Please use a different TTS or post."
-        )  # This is to fix the TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
+        print("No audio clips to gather.")
        exit()
+
    if settings.config["settings"]["storymode"]:
        if settings.config["settings"]["storymodemethod"] == 0:
            audio_clips = [ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")]
            audio_clips.insert(1, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio.mp3"))
        elif settings.config["settings"]["storymodemethod"] == 1:
-            audio_clips = [
-                ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")
-                for i in track(range(number_of_clips + 1), "Collecting the audio files...")
-            ]
+            postaudio_files = sorted(
+                glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
+                key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
+            )
+            audio_clips = [ffmpeg.input(f) for f in postaudio_files]
            audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
-
    else:
        audio_clips = [
-            ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3") for i in range(number_of_clips)
+            ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3")
+            for i in range(number_of_clips)
        ]
        audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
-
        audio_clips_durations = [
            float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/{i}.mp3")["format"]["duration"])
            for i in range(number_of_clips)
@ -258,6 +248,7 @@ def make_final_video(
            0,
            float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
        )
+
    audio_concat = ffmpeg.concat(*audio_clips, a=1, v=0)
    ffmpeg.output(
        audio_concat, f"assets/temp/{reddit_id}/audio.mp3", **{"b:a": "192k"}
@ -266,27 +257,16 @@ def make_final_video(
    console.log(f"[bold green] Video Will Be: {length} Seconds Long")

    screenshot_width = int((W * 45) // 100)
-    audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
+    audio       = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
    final_audio = merge_background_audio(audio, reddit_id)
-
    image_clips = list()

    Path(f"assets/temp/{reddit_id}/png").mkdir(parents=True, exist_ok=True)

-    # Credits to tim (beingbored)
-    # get the title_template image and draw a text in the middle part of it with the title of the thread
+    # ── Title card ────────────────────────────────────────────────────────────
    title_template = Image.open("assets/title_template.png")
-
-    title = reddit_obj["thread_title"]
-
-    title = name_normalize(title)
-
-    font_color = "#000000"
-    padding = 5
-
-    # create_fancy_thumbnail(image, text, text_color, padding
-    title_img = create_fancy_thumbnail(title_template, title, font_color, padding)
-
+    title          = name_normalize(reddit_obj["thread_title"])
+    title_img      = create_fancy_thumbnail(title_template, title, "#000000", 5)
    title_img.save(f"assets/temp/{reddit_id}/png/title.png")
    image_clips.insert(
        0,
@ -296,18 +276,17 @@ def make_final_video(
    )

    current_time = 0
+
    if settings.config["settings"]["storymode"]:
-        audio_clips_durations = [
-            float(
-                ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")["format"]["duration"]
-            )
-            for i in range(number_of_clips)
-        ]
-        audio_clips_durations.insert(
-            0,
-            float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
-        )
+
        if settings.config["settings"]["storymodemethod"] == 0:
+            audio_clips_durations = [
+                float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio.mp3")["format"]["duration"])
+            ]
+            audio_clips_durations.insert(
+                0,
+                float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
+            )
            image_clips.insert(
                1,
                ffmpeg.input(f"assets/temp/{reddit_id}/png/story_content.png").filter(
@ -321,20 +300,97 @@ def make_final_video(
                y="(main_h-overlay_h)/2",
            )
            current_time += audio_clips_durations[0]
+
        elif settings.config["settings"]["storymodemethod"] == 1:
-            for i in track(range(0, number_of_clips + 1), "Collecting the image files..."):
-                image_clips.append(
-                    ffmpeg.input(f"assets/temp/{reddit_id}/png/img{i}.png")["v"].filter(
-                        "scale", screenshot_width, -1
-                    )
+
+            # ── Discover postaudio files ──────────────────────────────────────
+            postaudio_files = sorted(
+                glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
+                key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
+            )
+
+            # ── Build durations ───────────────────────────────────────────────
+            # audio_clips_durations[0]   = title
+            # audio_clips_durations[1+i] = postaudio-{i}
+            audio_clips_durations = [
+                float(ffmpeg.probe(f)["format"]["duration"])
+                for f in postaudio_files
+            ]
+            title_duration = float(
+                ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]
+            )
+            audio_clips_durations.insert(0, title_duration)
+
+            # ── Pre-compute absolute start time per audio file ────────────────
+            # audio_start_times[i] = when postaudio-{i} starts in the video
+            audio_start_times = []
+            t = title_duration
+            for dur in audio_clips_durations[1:]:
+                audio_start_times.append(t)
+                t += dur
+
+            # ── Title card overlay ────────────────────────────────────────────
+            background_clip = background_clip.overlay(
+                image_clips[0],
+                enable=f"between(t,0,{title_duration})",
+                x="(main_w-overlay_w)/2",
+                y="(main_h-overlay_h)/2",
+            )
+            current_time = title_duration
+
+            # ── Load image files ──────────────────────────────────────────────
+            img_files = sorted(
+                glob.glob(f"assets/temp/{reddit_id}/png/img*.png"),
+                key=lambda x: int(re.search(r"img(\d+)", x).group(1))
+            )
+
+            # ── Load timing map ───────────────────────────────────────────────
+            timing_map = _load_timing_map(
+                reddit_id, img_files, postaudio_files,
+                audio_clips_durations, title_duration
+            )
+
+            # ── Overlay each image ────────────────────────────────────────────
+            # Handles both "absolute" and "fraction" timing types.
+            # For fraction entries, audio_time_used tracks the display time already consumed per audio_idx.
+            audio_time_used = {}
+
+            for i, img_file in enumerate(img_files):
+                if i >= len(timing_map):
+                    break
+
+                entry       = timing_map[i]
+                timing_type = entry.get("timing_type", "fraction")
+
+                if timing_type == "absolute":
+                    # WhisperX aligned — use timestamps directly
+                    clip_start = entry["clip_start"]
+                    clip_end   = entry["clip_end"]
+
+                else:
+                    # Fraction-based — compute from audio duration
+                    audio_idx     = entry["audio_idx"]
+                    time_fraction = entry["time_fraction"]
+                    audio_dur     = audio_clips_durations[audio_idx + 1]
+                    display_dur   = audio_dur * time_fraction
+                    offset        = audio_time_used.get(audio_idx, 0.0)
+                    clip_start    = audio_start_times[audio_idx] + offset
+                    clip_end      = clip_start + display_dur
+                    audio_time_used[audio_idx] = offset + display_dur
+
+                img_clip = ffmpeg.input(img_file)["v"].filter(
+                    "scale", screenshot_width, -1
                )
+                image_clips.append(img_clip)
                background_clip = background_clip.overlay(
-                    image_clips[i],
-                    enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
+                    img_clip,
+                    enable=f"between(t,{clip_start:.3f},{clip_end:.3f})",
                    x="(main_w-overlay_w)/2",
                    y="(main_h-overlay_h)/2",
                )
-                current_time += audio_clips_durations[i]
+
+            current_time = t
+
    else:
        for i in range(0, number_of_clips + 1):
            image_clips.append(
@ -343,9 +399,7 @@ def make_final_video(
                )
            )
            image_overlay = image_clips[i].filter("colorchannelmixer", aa=opacity)
-            assert (
-                audio_clips_durations is not None
-            ), "Please make a GitHub issue if you see this. Ping @JasonLovesDoggo on GitHub."
+            assert audio_clips_durations is not None
            background_clip = background_clip.overlay(
                image_overlay,
                enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
@ -354,67 +408,48 @@ def make_final_video(
            )
            current_time += audio_clips_durations[i]

-    title = extract_id(reddit_obj, "thread_title")
-    idx = extract_id(reddit_obj)
-    title_thumb = reddit_obj["thread_title"]
-
-    filename = f"{name_normalize(title)[:100]}"
-    subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"])
-    sentiment = settings.config["settings"]["background"].get("background_video", "unknown")
-
-    # Per-video folder: results/{subreddit}/{thread_id}_{sentiment}/
+    # ── Output ────────────────────────────────────────────────────────────────
+    title_str    = extract_id(reddit_obj, "thread_title")
+    idx          = extract_id(reddit_obj)
+    title_thumb  = reddit_obj["thread_title"]
+    subreddit    = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"])
+    sentiment    = settings.config["settings"]["background"].get("background_video", "unknown")
    video_folder = f"./results/{subreddit}/{idx}_{sentiment}"
    os.makedirs(video_folder, exist_ok=True)

    if allowOnlyTTSFolder:
        os.makedirs(f"{video_folder}/OnlyTTS", exist_ok=True)

-    # create a thumbnail for the video
    settingsbackground = settings.config["settings"]["background"]
-
    if settingsbackground["background_thumbnail"]:
-        if not exists(f"{video_folder}"):
-            os.makedirs(f"{video_folder}", exist_ok=True)
-        # get the first file with the .png extension from assets/backgrounds and use it as a background for the thumbnail
        first_image = next(
-            (file for file in os.listdir("assets/backgrounds") if file.endswith(".png")),
-            None,
+            (f for f in os.listdir("assets/backgrounds") if f.endswith(".png")), None
        )
        if first_image is None:
            print_substep("No png files found in assets/backgrounds", "red")
-
        else:
-            font_family = settingsbackground["background_thumbnail_font_family"]
-            font_size = settingsbackground["background_thumbnail_font_size"]
-            font_color = settingsbackground["background_thumbnail_font_color"]
            thumbnail = Image.open(f"assets/backgrounds/{first_image}")
-            width, height = thumbnail.size
+            w, h = thumbnail.size
            thumbnailSave = create_thumbnail(
                thumbnail,
-                font_family,
-                font_size,
-                font_color,
-                width,
-                height,
-                title_thumb,
+                settingsbackground["background_thumbnail_font_family"],
+                settingsbackground["background_thumbnail_font_size"],
+                settingsbackground["background_thumbnail_font_color"],
+                w, h, title_thumb,
            )
            thumbnailSave.save(f"{video_folder}/thumbnail.png")
-            print_substep(f"Thumbnail - Building Thumbnail in assets/temp/{reddit_id}/thumbnail.png")

-    text = f"Background by {background_config['video'][2]}"
    background_clip = ffmpeg.drawtext(
        background_clip,
-        text=text,
-        x=f"(w-text_w)",
-        y=f"(h-text_h)",
-        fontsize=5,
-        fontcolor="White",
+        text=f"Background by {background_config['video'][2]}",
+        x="(w-text_w)", y="(h-text_h)",
+        fontsize=5, fontcolor="White",
        fontfile=os.path.join("fonts", "Roboto-Regular.ttf"),
    )
    background_clip = background_clip.filter("scale", W, H)
+
    print_step("Rendering the video 🎥")
    from tqdm import tqdm
-
    pbar = tqdm(total=100, desc="Progress: ", bar_format="{l_bar}{bar}", unit=" %")

    def on_update_example(progress) -> None:
@ -422,14 +457,11 @@ def make_final_video(
        old_percentage = pbar.n
        pbar.update(status - old_percentage)

-    defaultPath = video_folder
    with ProgressFfmpeg(length, on_update_example) as progress:
        path = f"{video_folder}/video.mp4"
        try:
            ffmpeg.output(
-                background_clip,
-                final_audio,
-                path,
+                background_clip, final_audio, path,
                f="mp4",
                **{
                    "c:v": "h264_nvenc",
@ -438,26 +470,23 @@ def make_final_video(
                    "threads": multiprocessing.cpu_count(),
                },
            ).overwrite_output().global_args("-progress", progress.output_file.name).run(
-                quiet=True,
-                overwrite_output=True,
-                capture_stdout=False,
-                capture_stderr=False,
+                quiet=True, overwrite_output=True,
+                capture_stdout=False, capture_stderr=False,
            )
        except ffmpeg.Error as e:
            print(e.stderr.decode("utf8"))
            exit(1)
+
    old_percentage = pbar.n
    pbar.update(100 - old_percentage)
+
    if allowOnlyTTSFolder:
        path = f"{video_folder}/OnlyTTS/video.mp4"
-        # Prevent a error by limiting the path length, do not change this.
        print_step("Rendering the Only TTS Video 🎥")
        with ProgressFfmpeg(length, on_update_example) as progress:
            try:
                ffmpeg.output(
-                    background_clip,
-                    audio,
-                    path,
+                    background_clip, audio, path,
                    f="mp4",
                    **{
                        "c:v": "h264_nvenc",
@ -466,20 +495,18 @@ def make_final_video(
                        "threads": multiprocessing.cpu_count(),
                    },
                ).overwrite_output().global_args("-progress", progress.output_file.name).run(
-                    quiet=True,
-                    overwrite_output=True,
-                    capture_stdout=False,
-                    capture_stderr=False,
+                    quiet=True, overwrite_output=True,
+                    capture_stdout=False, capture_stderr=False,
                )
            except ffmpeg.Error as e:
                print(e.stderr.decode("utf8"))
                exit(1)
-
        old_percentage = pbar.n
        pbar.update(100 - old_percentage)
+
    pbar.close()
-    save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title, idx, background_config["video"][2])
+    save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title_str, idx, background_config["video"][2])
    print_step("Removing temporary files 🗑")
    cleanups = cleanup(reddit_id)
    print_substep(f"Removed {cleanups} temporary files 🗑")
-    print_step("Done! 🎉 The video is in the results folder 📁")
+    print_step("Done! 🎉 The video is in the results folder 📁")
--- a/video_creation/screenshot_downloader.py
+++ b/video_creation/screenshot_downloader.py
@ -62,10 +62,10 @@ def get_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int):
    if storymode and settings.config["settings"]["storymodemethod"] == 1:
        print_substep("Generating images...")
        return imagemaker(
-            theme=bgcolor,
+            theme=(0, 0, 0, 0),
            reddit_obj=reddit_object,
-            txtclr=txtcolor,
-            transparent=transparent,
+            txtclr=(255, 255, 255),
+            transparent=True,
        )

    screenshot_num: int