pull/2557/merge
Abdessamad Haddouche 3 weeks ago committed by GitHub
commit 71cbbacd60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,11 +1,8 @@
import random
from elevenlabs import save
from elevenlabs.client import ElevenLabs
from utils import settings
class elevenlabs:
def __init__(self):
self.max_chars = 2500
@ -17,9 +14,14 @@ class elevenlabs:
if random_voice:
voice = self.randomvoice()
else:
voice = str(settings.config["settings"]["tts"]["elevenlabs_voice_name"]).capitalize()
audio = self.client.generate(text=text, voice=voice, model="eleven_multilingual_v1")
voice_name = str(settings.config["settings"]["tts"]["elevenlabs_voice_name"])
all_voices = self.client.voices.get_all().voices
matched = [v for v in all_voices if v.name.lower() == voice_name.lower()]
if matched:
voice = matched[0].voice_id
else:
raise ValueError(f"Voice '{voice_name}' not found in your ElevenLabs account.")
audio = self.client.generate(text=text, voice=voice, model="eleven_multilingual_v2")
save(audio=audio, filename=filepath)
def initialize(self):
@ -27,12 +29,11 @@ class elevenlabs:
api_key = settings.config["settings"]["tts"]["elevenlabs_api_key"]
else:
raise ValueError(
"You didn't set an Elevenlabs API key! Please set the config variable ELEVENLABS_API_KEY to a valid API key."
"You didn't set an Elevenlabs API key!"
)
self.client = ElevenLabs(api_key=api_key)
def randomvoice(self):
if self.client is None:
self.initialize()
return random.choice(self.client.voices.get_all().voices).name
return random.choice(self.client.voices.get_all().voices).voice_id

@ -14,23 +14,11 @@ from utils import settings
from utils.console import print_step, print_substep
from utils.voice import sanitize_text
DEFAULT_MAX_LENGTH: int = (
50 # Video length variable, edit this on your own risk. It should work, but it's not supported
)
DEFAULT_MAX_LENGTH: int = 50
class TTSEngine:
"""Calls the given TTS engine to reduce code duplication and allow multiple TTS engines.
Args:
tts_module : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method.
reddit_object : The reddit object that contains the posts to read.
path (Optional) : The unix style path to save the mp3 files to. This must not have leading or trailing slashes.
max_length (Optional) : The maximum length of the mp3 files in total.
Notes:
tts_module must take the arguments text and filepath.
"""
"""Calls the given TTS engine to reduce code duplication and allow multiple TTS engines."""
def __init__(
self,
@ -42,18 +30,14 @@ class TTSEngine:
):
self.tts_module = tts_module()
self.reddit_object = reddit_object
self.redditid = re.sub(r"[^\w\s-]", "", reddit_object["thread_id"])
self.path = path + self.redditid + "/mp3"
self.max_length = max_length
self.length = 0
self.last_clip_length = last_clip_length
def add_periods(
self,
): # adds periods to the end of paragraphs (where people often forget to put them) so tts doesn't blend sentences
def add_periods(self):
for comment in self.reddit_object["comments"]:
# remove links
regex_urls = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
comment["comment_body"] = re.sub(regex_urls, " ", comment["comment_body"])
comment["comment_body"] = comment["comment_body"].replace("\n", ". ")
@ -72,7 +56,6 @@ class TTSEngine:
self.add_periods()
self.call_tts("title", process_text(self.reddit_object["thread_title"]))
# processed_text = ##self.reddit_object["thread_post"] != ""
idx = 0
if settings.config["settings"]["storymode"]:
@ -84,24 +67,41 @@ class TTSEngine:
elif settings.config["settings"]["storymodemethod"] == 1:
for idx, text in track(enumerate(self.reddit_object["thread_post"])):
self.call_tts(f"postaudio-{idx}", process_text(text))
# ── WhisperX alignment ────────────────────────────────────
# Run immediately after each TTS save so word timestamps
# are ready when imagemaker() runs later.
# Fails silently — never blocks video generation.
self._align_audio(f"postaudio-{idx}")
else:
for idx, comment in track(enumerate(self.reddit_object["comments"]), "Saving..."):
# ! Stop creating mp3 files if the length is greater than max length.
if self.length > self.max_length and idx > 1:
self.length -= self.last_clip_length
idx -= 1
break
if (
len(comment["comment_body"]) > self.tts_module.max_chars
): # Split the comment if it is too long
self.split_post(comment["comment_body"], idx) # Split the comment
else: # If the comment is not too long, just call the tts engine
if len(comment["comment_body"]) > self.tts_module.max_chars:
self.split_post(comment["comment_body"], idx)
else:
self.call_tts(f"{idx}", process_text(comment["comment_body"]))
print_substep("Saved Text to MP3 files successfully.", style="bold green")
return self.length, idx
def _align_audio(self, filename: str) -> None:
    """
    Run WhisperX alignment on one saved audio clip (best effort).

    Called right after ``postaudio-{i}.mp3`` has been written so that
    word-level timestamps exist before the caption images are rendered.

    Args:
        filename: Clip name without directory or extension; the audio is
            read from ``{self.path}/{filename}.mp3``.

    Any failure (missing optional dependency, alignment error, bad config)
    is reported as a dim console note and otherwise ignored, so alignment
    problems never stop video generation.
    """
    try:
        # Imported lazily: the aligner is optional and may not be installed.
        from utils.whisper_aligner import align_and_save

        audio_path = f"{self.path}/{filename}.mp3"
        # An empty/None post_lang in the config is treated as English.
        lang = settings.config["reddit"]["thread"].get("post_lang", "en") or "en"
        result = align_and_save(audio_path, language=lang)
        if result:
            print_substep(f"Word timestamps saved → {result}", style="dim")
    except Exception as err:
        # Deliberately non-fatal, but say why instead of hiding the failure.
        print_substep(f"Word alignment skipped for {filename}.mp3: {err}", style="dim")
def split_post(self, text: str, idx):
split_files = []
split_text = [
@ -114,8 +114,6 @@ class TTSEngine:
for idy, text_cut in enumerate(split_text):
newtext = process_text(text_cut)
# print(f"{idx}-{idy}: {newtext}\n")
if not newtext or newtext.isspace():
print("newtext was blank because sanitized split text resulted in none")
continue
@ -144,7 +142,6 @@ class TTSEngine:
def call_tts(self, filename: str, text: str):
if settings.config["settings"]["tts"]["voice_choice"] == "googletranslate":
# GTTS does not have the argument 'random_voice'
self.tts_module.run(
text,
filepath=f"{self.path}/{filename}.mp3",
@ -155,10 +152,6 @@ class TTSEngine:
filepath=f"{self.path}/{filename}.mp3",
random_voice=settings.config["settings"]["tts"]["random_voice"],
)
# try:
# self.length += MP3(f"{self.path}/{filename}.mp3").info.length
# except (MutagenError, HeaderNotFoundError):
# self.length += sox.file_info.duration(f"{self.path}/{filename}.mp3")
try:
clip = AudioFileClip(f"{self.path}/{filename}.mp3")
self.last_clip_length = clip.duration
@ -185,4 +178,4 @@ def process_text(text: str, clean: bool = True):
print_substep("Translating Text...")
translated_text = translators.translate_text(text, translator="google", to_language=lang)
new_text = sanitize_text(translated_text)
return new_text
return new_text

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -49,6 +49,15 @@ reddit_object: Dict[str, str | list]
def main(POST_ID=None) -> None:
global reddit_id, reddit_object
reddit_object = get_subreddit_threads(POST_ID)
# ── SENTIMENT DETECTION ──────────────────────────────────
if settings.config["deepseek"].get("enabled", True):
from utils.sentiment import apply_sentiment_config
apply_sentiment_config(reddit_object)
else:
print_substep("Sentiment detection disabled. Using config defaults.", style="yellow")
# ─────────────────────────────────────────────────────────
reddit_id = extract_id(reddit_object)
print_substep(f"Thread ID is {reddit_id}", style="bold blue")
length, number_of_comments = save_text_to_mp3(reddit_object)

@ -120,6 +120,7 @@ def get_subreddit_threads(POST_ID: str):
content["thread_url"] = threadurl
content["thread_title"] = submission.title
content["thread_id"] = submission.id
content["thread_subreddit"] = submission.subreddit.display_name
content["is_nsfw"] = submission.over_18
content["comments"] = []
if settings.config["settings"]["storymode"]:

@ -1,21 +1,198 @@
aiohappyeyeballs==2.6.2
aiohttp==3.13.5
aiosignal==1.4.0
alembic==1.18.4
annotated-doc==0.0.4
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.13.0
asteroid-filterbanks==0.4.0
attrs==26.1.0
av==17.0.1
blinker==1.9.0
blis==1.3.3
boto3==1.36.8
botocore==1.36.8
catalogue==2.0.10
certifi==2026.5.20
cffi==2.0.0
charset-normalizer==3.4.7
clean-text==0.6.0
click==8.1.8
cloudpathlib==0.24.0
colorlog==6.10.1
confection==0.1.5
contourpy==1.3.3
cryptography==48.0.0
ctranslate2==4.7.2
cycler==0.12.1
cymem==2.0.13
decorator==5.3.1
dill==0.4.1
distro==1.9.0
einops==0.8.2
elevenlabs==1.57.0
emoji==1.7.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
exejs==0.0.7
faster-whisper==1.2.1
ffmpeg-python==0.2.0
filelock==3.29.0
Flask==3.1.1
flatbuffers==25.12.19
fonttools==4.63.0
frozenlist==1.8.0
fsspec==2026.4.0
ftfy==6.3.1
future==1.0.0
googleapis-common-protos==1.75.0
greenlet==3.1.1
grpcio==1.80.0
gTTS==2.5.4
h11==0.16.0
hf-xet==1.5.0
httpcore==1.0.9
httpx==0.28.1
huggingface_hub==0.36.2
idna==3.17
ImageIO==2.37.3
imageio-ffmpeg==0.6.0
itsdangerous==2.2.0
jh2==5.0.11
Jinja2==3.1.6
jiter==0.15.0
jmespath==1.1.0
joblib==1.5.3
julius==0.2.7
kiwisolver==1.5.0
langcodes==3.5.1
lightning==2.6.5
lightning-utilities==0.15.3
lxml==6.1.1
Mako==1.3.12
markdown-it-py==4.2.0
MarkupSafe==3.0.3
matplotlib==3.10.9
mdurl==0.1.2
moviepy==2.2.1
mpmath==1.3.0
multidict==6.7.1
multiprocess==0.70.19
murmurhash==1.0.15
networkx==3.6.1
niquests==3.18.8
nltk==3.9.4
numpy==2.4.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
onnxruntime==1.26.0
openai==2.38.0
opentelemetry-api==1.42.1
opentelemetry-exporter-otlp==1.42.1
opentelemetry-exporter-otlp-proto-common==1.42.1
opentelemetry-exporter-otlp-proto-grpc==1.42.1
opentelemetry-exporter-otlp-proto-http==1.42.1
opentelemetry-proto==1.42.1
opentelemetry-sdk==1.42.1
opentelemetry-semantic-conventions==0.63b1
optuna==4.8.0
packaging==26.2
pandas==3.0.3
pathos==0.3.5
pillow==11.3.0
playwright==1.49.1
pox==0.3.7
ppft==1.7.8
praw==7.8.1
prawcore==2.4.0
preshed==3.0.13
primePy==1.3
proglog==0.1.12
propcache==0.5.2
protobuf==6.33.6
pyannote-audio==4.0.4
pyannote-core==6.0.1
pyannote-database==6.1.1
pyannote-metrics==4.1
pyannote-pipeline==4.0.0
pyannoteai-sdk==0.4.0
pycparser==3.0
pydantic==2.13.4
pydantic_core==2.46.4
pyee==12.0.0
Pygments==2.20.0
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
pytorch-lightning==2.6.5
pytorch-metric-learning==2.9.0
pyttsx3==2.98
PyYAML==6.0.3
qh3==1.8.1
regex==2026.5.9
requests==2.32.3
rich==13.9.4
s3transfer==0.11.3
safetensors==0.7.0
scikit-learn==1.8.0
scipy==1.17.1
setuptools==82.0.1
shellingham==1.5.4
six==1.17.0
smart_open==7.6.1
sniffio==1.3.1
sortedcontainers==2.4.0
spacy==3.8.7
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==2.0.50
srsly==2.5.3
sympy==1.14.0
thinc==8.3.11
threadpoolctl==3.6.0
tokenizers==0.21.4
toml==0.10.2
translators==5.9.9
pyttsx3==2.98
tomlkit==0.13.2
Flask==3.1.1
clean-text==0.6.0
unidecode==1.4.0
spacy==3.8.7
torch==2.7.0
torch==2.8.0
torch-audiomentations==0.12.0
torch_pitch_shift==1.2.5
torchaudio==2.8.0
torchcodec==0.7.0
torchmetrics==1.9.0
torchvision==0.23.0
tqdm==4.67.3
transformers==4.52.4
ffmpeg-python==0.2.0
elevenlabs==1.57.0
yt-dlp==2025.10.22
translators==5.9.9
triton==3.4.0
typer==0.26.2
typer-slim==0.24.0
typing-inspection==0.4.2
typing_extensions==4.15.0
Unidecode==1.4.0
update-checker==0.18.0
urllib3==2.7.0
urllib3-future==2.20.907
wasabi==1.1.3
wassima==2.1.0
wcwidth==0.7.0
weasel==0.4.3
websocket-client==1.9.0
websockets==16.0
Werkzeug==3.1.8
whisperx==3.8.6
wrapt==2.2.1
yarl==1.24.2
yt-dlp==2026.3.17

@ -47,7 +47,7 @@ background_thumbnail_font_color = { optional = true, default = "255,255,255", ex
[settings.tts]
voice_choice = { optional = false, default = "tiktok", options = ["elevenlabs", "streamlabspolly", "tiktok", "googletranslate", "awspolly", "pyttsx", "OpenAI"], example = "tiktok", explanation = "The voice platform used for TTS generation. " }
random_voice = { optional = false, type = "bool", default = true, example = true, options = [true, false,], explanation = "Randomizes the voice used for each comment" }
elevenlabs_voice_name = { optional = false, default = "Bella", example = "Bella", explanation = "The voice used for elevenlabs", options = ["Adam", "Antoni", "Arnold", "Bella", "Domi", "Elli", "Josh", "Rachel", "Sam", ] }
elevenlabs_voice_name = { optional = true, default = "Sarah - Mature, Reassuring, Confident", example = "Bella", explanation = "The voice used for elevenlabs", options = [] }
elevenlabs_api_key = { optional = true, example = "21f13f91f54d741e2ae27d2ab1b99d59", explanation = "Elevenlabs API key" }
aws_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for AWS Polly" }
streamlabs_polly_voice = { optional = false, default = "Matthew", example = "Matthew", explanation = "The voice used for Streamlabs Polly" }
@ -61,3 +61,7 @@ openai_api_url = { optional = true, default = "https://api.openai.com/v1/", exam
openai_api_key = { optional = true, example = "sk-abc123def456...", explanation = "Your OpenAI API key for TTS generation" }
openai_voice_name = { optional = false, default = "alloy", example = "alloy", explanation = "The voice used for OpenAI TTS generation", options = ["alloy", "ash", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "af_heart"] }
openai_model = { optional = false, default = "tts-1", example = "tts-1", explanation = "The model variant used for OpenAI TTS generation", options = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"] }
[deepseek]
api_key = { optional = true, default = "", explanation = "DeepSeek API key for sentiment detection. Get yours at platform.deepseek.com", example = "sk-xxxxxxxx" }
enabled = { optional = true, type = "bool", default = true, options = [true, false], explanation = "Enable or disable sentiment-aware video generation", example = true }

@ -0,0 +1,351 @@
"""
caption_renderer.py
All caption rendering logic. Three display modes:
  multi   - full sentence on one image (1 RenderJob per sentence)
  single  - sentence split into word chunks (N RenderJobs per sentence)
  aligned - word-level timestamps from WhisperX (perfect sync, any TTS)
RenderJob is the contract between this module and final_video.py.
Two types of timing:
  FRACTION-based (multi, single):
    audio_idx + time_fraction; final_video computes the absolute time
    time_fraction = fraction of audio_clips_durations[audio_idx+1]
  ABSOLUTE-based (aligned):
    clip_start + clip_end; final_video uses them directly
    These are absolute seconds in the video timeline (after title card)
final_video.py checks job["timing_type"] to know which to use.
"""
import os
from dataclasses import dataclass, field
from typing import List, Optional
from PIL import Image, ImageDraw, ImageFont
from utils.fonts import getsize
# ─────────────────────────────────────────────────────────────────────────────
# RenderJob — the contract
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RenderJob:
    """
    One caption image (img{idx}.png) together with its timing data.

    ``timing_type`` selects which group of timing fields is meaningful:

    * ``"fraction"`` - ``audio_idx`` and ``time_fraction`` apply. A
      ``time_fraction`` of 1.0 means the image is shown for the whole
      audio file; 0.25 means a quarter of it.
    * ``"absolute"`` - ``clip_start`` and ``clip_end`` apply, given as
      absolute seconds in the video timeline.
    """

    # Identity and content
    idx: int  # image number, as used in the img{idx}.png file name
    lines: List[str]  # text lines drawn on the image
    timing_type: str  # either "fraction" or "absolute"

    # Meaningful when timing_type == "fraction"
    audio_idx: int = 0
    time_fraction: float = 1.0

    # Meaningful when timing_type == "absolute"
    clip_start: float = 0.0
    clip_end: float = 0.0
# ─────────────────────────────────────────────────────────────────────────────
# Display modes
# ─────────────────────────────────────────────────────────────────────────────
# Names accepted for style["display_mode"].
DISPLAY_MODES = {
    "aligned",
    "multi",
    "single",
}
def render_multi_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Build the single RenderJob that shows a whole sentence on one image.

    The sentence is split on whitespace and re-joined into lines of at most
    ``style["words_per_chunk"]`` words each.

    Args:
        sentence: Caption text for one audio file.
        style: Style entry; ``words_per_chunk`` is used as words per line.
        audio_idx: Index of the audio file this caption belongs to.
        start_idx: Image index assigned to the job.

    Returns:
        A one-element list holding a fraction-timed job with
        ``time_fraction=1.0``.
    """
    words_per_line = style["words_per_chunk"]
    words = sentence.split()

    if words_per_line >= 1:
        lines = [
            " ".join(words[pos:pos + words_per_line])
            for pos in range(0, len(words), words_per_line)
        ]
    else:
        # range() rejects a step of zero; a non-positive value means
        # "no wrapping", so the sentence is kept as a single line below.
        lines = []

    if not lines:
        lines = [sentence]

    return [
        RenderJob(
            idx=start_idx,
            lines=lines,
            timing_type="fraction",
            audio_idx=audio_idx,
            time_fraction=1.0,
        )
    ]
def render_single_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Split a sentence into word chunks and emit one RenderJob per chunk.

    Each chunk holds at most ``style["words_per_chunk"]`` words and is
    given an equal share (1/N) of the audio duration.

    Args:
        sentence: Caption text for one audio file.
        style: Style entry providing ``words_per_chunk``.
        audio_idx: Index of the audio file this caption belongs to.
        start_idx: Image index of the first job; later jobs count up from it.

    Returns:
        Fraction-timed jobs in reading order. There is always at least one:
        a sentence that yields no chunks is emitted whole.
    """
    chunk_size = style["words_per_chunk"]
    words = sentence.split()

    chunks: List[str] = []
    if chunk_size >= 1:
        chunks = [
            " ".join(words[pos:pos + chunk_size])
            for pos in range(0, len(words), chunk_size)
        ]
    # A non-positive chunk size (range() rejects a zero step) or a sentence
    # with no words falls back to showing the sentence as one chunk.
    if not chunks:
        chunks = [sentence]

    share = 1.0 / len(chunks)
    return [
        RenderJob(
            idx=start_idx + offset,
            lines=[chunk],
            timing_type="fraction",
            audio_idx=audio_idx,
            time_fraction=share,
        )
        for offset, chunk in enumerate(chunks)
    ]
def render_aligned_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
    word_timestamps: List[dict],
    audio_start_time: float,
    audio_duration: float,
) -> List[RenderJob]:
    """
    Build absolutely-timed RenderJobs from word-level timestamps.

    Consecutive words are grouped into chunks of ``style["words_per_chunk"]``.
    A chunk starts at the ``start`` of its first word. It ends where the next
    chunk starts, or for the final chunk at the ``end`` of its last word
    (``start + 0.3`` when no end is recorded). The end is capped at the end
    of the audio file and then raised, if needed, so the chunk stays visible
    for at least 100 ms.

    Args:
        sentence: Caption text, used only for the single-mode fallback.
        style: Style entry providing ``words_per_chunk``.
        audio_idx: Index of the audio file (used by the fallback).
        start_idx: Image index of the first job.
        word_timestamps: Dicts with ``word`` and ``start`` (seconds relative
            to the audio file) and optionally ``end``.
        audio_start_time: Absolute video time at which this audio starts.
        audio_duration: Duration of this audio file in seconds.

    Returns:
        Absolute-timed jobs, or the result of ``render_single_mode`` when the
        timestamps are empty or malformed (an entry lacking ``word`` or
        ``start``) or the chunk size is not positive.
    """
    wpc = style["words_per_chunk"]

    def _usable(entry) -> bool:
        # An entry can only be placed on the timeline with text and a start.
        return (
            isinstance(entry, dict)
            and "word" in entry
            and entry.get("start") is not None
        )

    if wpc < 1 or not word_timestamps or not all(_usable(w) for w in word_timestamps):
        return render_single_mode(sentence, style, audio_idx, start_idx)

    audio_end = audio_start_time + audio_duration  # same for every chunk
    total = len(word_timestamps)
    jobs: List[RenderJob] = []

    for chunk_start in range(0, total, wpc):
        next_start = chunk_start + wpc
        chunk_words = word_timestamps[chunk_start:next_start]
        text = " ".join(w["word"] for w in chunk_words)
        clip_start = audio_start_time + chunk_words[0]["start"]

        if next_start < total:
            # Hold this chunk on screen until the next one begins.
            clip_end = audio_start_time + word_timestamps[next_start]["start"]
        else:
            last_word = chunk_words[-1]
            last_end = last_word.get("end")
            if last_end is None:
                last_end = last_word["start"] + 0.3
            clip_end = audio_start_time + last_end

        clip_end = min(clip_end, audio_end)  # never run past the audio
        clip_end = max(clip_end, clip_start + 0.1)  # minimum 100ms visibility

        jobs.append(
            RenderJob(
                idx=start_idx + len(jobs),
                lines=[text],
                timing_type="absolute",
                clip_start=round(clip_start, 3),
                clip_end=round(clip_end, 3),
            )
        )

    return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx)
# ─────────────────────────────────────────────────────────────────────────────
# Router
# ─────────────────────────────────────────────────────────────────────────────
def get_render_jobs(
    sentences: List[str],
    style: dict,
    mp3_dir: Optional[str] = None,
    audio_start_times: Optional[List[float]] = None,
    audio_durations: Optional[List[float]] = None,
) -> List[RenderJob]:
    """
    Route every sentence to the renderer selected by ``style["display_mode"]``.

    An unknown or missing display mode is treated as ``"multi"``. In
    ``"aligned"`` mode the word timestamps for sentence ``i`` are loaded via
    ``load_word_timestamps`` for ``{mp3_dir}/postaudio-{i}.mp3``. If the
    timestamps, the mp3 directory or the timing entry for that sentence are
    unavailable, that sentence is rendered in ``"single"`` mode instead.

    Args:
        sentences: One caption per ``postaudio-{i}.mp3``, in order.
        style: Style entry for the current sentiment.
        mp3_dir: Folder holding the audio files (aligned mode only).
        audio_start_times: Absolute start time of each audio file in the
            video (aligned mode only).
        audio_durations: Duration of each audio file (aligned mode only).

    Returns:
        A flat, ordered list of all RenderJobs with consecutive ``idx``.
    """
    mode = style.get("display_mode", "multi")
    if mode not in DISPLAY_MODES:
        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
        mode = "multi"

    can_align = bool(
        mode == "aligned" and mp3_dir and audio_start_times and audio_durations
    )
    if can_align:
        # Imported lazily so the aligner is only needed in aligned mode.
        from utils.whisper_aligner import load_word_timestamps

    all_jobs: List[RenderJob] = []
    img_counter: int = 0

    for audio_idx, sentence in enumerate(sentences):
        if mode == "multi":
            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)
        elif mode == "single":
            jobs = render_single_mode(sentence, style, audio_idx, img_counter)
        else:
            word_ts = None
            # Only sentences with a matching timing entry can be aligned;
            # extra sentences must not index past the timing lists.
            if (
                can_align
                and audio_idx < len(audio_start_times)
                and audio_idx < len(audio_durations)
            ):
                audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
                word_ts = load_word_timestamps(audio_path)

            if word_ts:
                jobs = render_aligned_mode(
                    sentence=sentence,
                    style=style,
                    audio_idx=audio_idx,
                    start_idx=img_counter,
                    word_timestamps=word_ts,
                    audio_start_time=audio_start_times[audio_idx],
                    audio_duration=audio_durations[audio_idx],
                )
            else:
                # No usable timestamps for this sentence: use single mode.
                print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
                jobs = render_single_mode(sentence, style, audio_idx, img_counter)

        all_jobs.extend(jobs)
        img_counter += len(jobs)

    return all_jobs
# ─────────────────────────────────────────────────────────────────────────────
# Drawing primitives
# ─────────────────────────────────────────────────────────────────────────────
def measure_text_block(
    draw: ImageDraw.ImageDraw,
    lines: List[str],
    font: ImageFont.FreeTypeFont,
    line_spacing: int,
) -> tuple:
    """
    Measure a stack of text lines.

    Returns ``(width, height)``: the width of the widest line, and the sum
    of all line heights plus ``line_spacing`` between adjacent lines. An
    empty list measures ``(0, 0)``. Sizes come from ``getsize(font, line)``;
    the ``draw`` argument is accepted but not consulted.
    """
    widest = 0
    stacked = 0
    last_pos = len(lines) - 1
    for pos, text in enumerate(lines):
        line_w, line_h = getsize(font, text)
        widest = max(widest, line_w)
        stacked += line_h
        if pos != last_pos:
            # Gap only between lines, not after the final one.
            stacked += line_spacing
    return widest, stacked
def draw_stroked_text(
    draw: ImageDraw.ImageDraw,
    x: int,
    y: int,
    line: str,
    font: ImageFont.FreeTypeFont,
    fill_color: tuple,
    stroke_color: tuple,
    stroke_width: int,
) -> None:
    """
    Draw one line of text with an outline.

    The outline is produced by drawing the text in ``stroke_color`` at a ring
    of offsets around ``(x, y)`` (axis, diagonal and half-step positions at
    distance ``stroke_width``), then drawing the text in ``fill_color`` on
    top.

    Args:
        draw: Drawing context of the target image.
        x, y: Top-left position of the text.
        line: Text to draw.
        font: Font used for both outline and fill.
        fill_color: Colour of the text itself.
        stroke_color: Colour of the outline.
        stroke_width: Outline thickness in pixels; a non-positive value
            draws the text without any outline.
    """
    sw = stroke_width
    if sw > 0:
        half = max(1, sw // 2)
        # dict.fromkeys drops repeated offsets (they coincide for small
        # widths) while keeping the drawing order stable.
        offsets = dict.fromkeys([
            (-sw, 0), (sw, 0), (0, -sw), (0, sw),
            (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw),
            (-sw, -half), (sw, -half), (-sw, half), (sw, half),
            (-half, -sw), (half, -sw), (-half, sw), (half, sw),
        ])
        for ox, oy in offsets:
            draw.text((x + ox, y + oy), line, font=font, fill=stroke_color)
    draw.text((x, y), line, font=font, fill=fill_color)
def fit_font(
    style: dict,
    lines: List[str],
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
    max_width_ratio: float = 0.88,
    max_height_ratio: float = 0.45,
) -> ImageFont.FreeTypeFont:
    """
    Pick the largest font size at which the text block fits the canvas.

    Starting from ``style["font_size"]``, the size is reduced in steps of 4
    until the measured block is no wider than ``canvas_w * max_width_ratio``
    and no taller than ``canvas_h * max_height_ratio``. If no size above 30
    fits, a size-30 font is returned.

    Args:
        style: Style entry providing ``font_size`` and ``font_file`` (looked
            up under ``fonts/``; ``Roboto-Bold.ttf`` is used when the file
            does not exist).
        lines: Text lines that will be drawn.
        canvas_w: Canvas width in pixels.
        canvas_h: Canvas height in pixels.
        line_spacing: Vertical gap between lines in pixels.
        max_width_ratio: Share of the canvas width the block may occupy.
        max_height_ratio: Share of the canvas height the block may occupy.

    Returns:
        The loaded font at the chosen size.
    """
    min_font_size = 30
    shrink_step = 4

    font_path = os.path.join("fonts", style["font_file"])
    if not os.path.exists(font_path):
        font_path = os.path.join("fonts", "Roboto-Bold.ttf")

    max_w = int(canvas_w * max_width_ratio)
    max_h = int(canvas_h * max_height_ratio)

    # One scratch drawing context serves every trial size.
    scratch = ImageDraw.Draw(Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0)))

    font_size = style["font_size"]
    while font_size > min_font_size:
        font = ImageFont.truetype(font_path, font_size)
        block_w, block_h = measure_text_block(scratch, lines, font, line_spacing)
        if block_w <= max_w and block_h <= max_h:
            return font
        font_size -= shrink_step

    return ImageFont.truetype(font_path, min_font_size)
def render_job_to_image(
    job: RenderJob,
    style: dict,
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
) -> Image.Image:
    """
    Render one RenderJob onto a fully transparent RGBA canvas.

    The text block is centred horizontally line by line, and centred
    vertically on ``canvas_h * style["y_position"]``. Each line is drawn
    with the style's fill colour, stroke colour and stroke width.
    """
    font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)
    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
    # Start half a block above the anchor so the block is centred on it.
    top = int(canvas_h * style["y_position"]) - (block_h // 2)

    for text in job.lines:
        text_w, text_h = getsize(font, text)
        left = (canvas_w - text_w) // 2
        draw_stroked_text(
            pen,
            left,
            top,
            text,
            font,
            style["fill_color"],
            style["stroke_color"],
            style["stroke_width"],
        )
        top += text_h + line_spacing

    return canvas

@ -1,74 +1,156 @@
"""
imagenarator.py
Thin orchestrator. Does exactly:
1. Extract sentences from reddit_obj
2. Probe audio durations + compute audio start times (needed for aligned mode)
3. Call caption_renderer.get_render_jobs()
4. Render each job to PNG
5. Save timing_map.json for final_video.py
"""
import glob
import json
import os
import re
import textwrap
from typing import List, Optional
from PIL import Image, ImageDraw, ImageFont
import ffmpeg
from rich.progress import track
from TTS.engine_wrapper import process_text
from utils.fonts import getheight, getsize
from utils import settings
from utils.id import extract_id
from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE
from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob
LINE_SPACING: int = 20
def draw_multiple_line_text(
image, text, font, text_color, padding, wrap=50, transparent=False
) -> None:
def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]:
"""
Draw multiline text over given image
Extract sentences from thread_post.
One sentence per postaudio-{i}.mp3 order preserved.
"""
draw = ImageDraw.Draw(image)
font_height = getheight(font, text)
image_width, image_height = image.size
lines = textwrap.wrap(text, width=wrap)
y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2)
for line in lines:
line_width, line_height = getsize(font, line)
if transparent:
shadowcolor = "black"
for i in range(1, 5):
draw.text(
((image_width - line_width) / 2 - i, y - i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 + i, y - i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 - i, y + i),
line,
font=font,
fill=shadowcolor,
)
draw.text(
((image_width - line_width) / 2 + i, y + i),
line,
font=font,
fill=shadowcolor,
)
draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color)
y += line_height + padding
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None:
raw_texts = reddit_obj["thread_post"]
sentences: List[str] = []
for item in raw_texts:
if isinstance(item, dict):
text = item.get("text", "")
elif isinstance(item, str):
text = item
else:
text = str(item)
text = process_text(text, False).strip()
if style.get("uppercase", False):
text = text.upper()
if text:
sentences.append(text)
return sentences if sentences else ["..."]
def _get_audio_info(mp3_dir: str) -> tuple:
"""
Discover postaudio files and compute:
- durations list (one per postaudio file)
- start times list (absolute seconds in video, after title card)
Returns (postaudio_files, durations, start_times)
"""
Render Images for video
postaudio_files = sorted(
glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
title_path = os.path.join(mp3_dir, "title.mp3")
try:
title_duration = float(ffmpeg.probe(title_path)["format"]["duration"])
except Exception:
title_duration = 0.0
durations = []
start_times = []
current = title_duration
for f in postaudio_files:
try:
dur = float(ffmpeg.probe(f)["format"]["duration"])
except Exception:
dur = 0.0
start_times.append(current)
durations.append(dur)
current += dur
return postaudio_files, durations, start_times
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int:
"""
texts = reddit_obj["thread_post"]
reddit_id = extract_id(reddit_obj)
if transparent:
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100)
else:
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100)
size = (1920, 1080)
for idx, text in track(enumerate(texts), "Rendering Image"):
image = Image.new("RGBA", size, theme)
text = process_text(text, False)
draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent)
image.save(f"assets/temp/{reddit_id}/png/img{idx}.png")
Render caption images for the video.
Flow:
sentences + audio info
caption_renderer.get_render_jobs()
List[RenderJob]
each RenderJob transparent PNG (img{idx}.png)
timing_map.json saved for final_video.py
timing_map.json entry for fraction-based jobs:
{"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
timing_map.json entry for absolute-based jobs (aligned mode):
{"timing_type": "absolute", "clip_start": S, "clip_end": E}
Returns:
int: total number of images generated
"""
# 1. Style
sentiment = settings.config["settings"].get("sentiment", "dramatic")
style = STYLE_MAP.get(sentiment, DEFAULT_STYLE)
CANVAS_W: int = int(settings.config["settings"]["resolution_w"])
CANVAS_H: int = int(settings.config["settings"]["resolution_h"])
reddit_id = extract_id(reddit_obj)
mp3_dir = f"assets/temp/{reddit_id}/mp3"
# 2. Extract sentences
sentences = _extract_sentences(reddit_obj, style)
# 3. Get audio timing info (needed for aligned mode)
_, durations, start_times = _get_audio_info(mp3_dir)
# 4. Get render jobs
jobs: List[RenderJob] = get_render_jobs(
sentences=sentences,
style=style,
mp3_dir=mp3_dir,
audio_start_times=start_times if start_times else None,
audio_durations=durations if durations else None,
)
# 5. Render each job to a transparent PNG
for job in track(jobs, description="Rendering caption images"):
image = render_job_to_image(job, style, CANVAS_W, CANVAS_H, LINE_SPACING)
image.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png")
# 6. Save timing map
timing_map = []
for job in jobs:
if job.timing_type == "absolute":
timing_map.append({
"timing_type": "absolute",
"clip_start": job.clip_start,
"clip_end": job.clip_end,
})
else:
timing_map.append({
"timing_type": "fraction",
"audio_idx": job.audio_idx,
"time_fraction": job.time_fraction,
})
timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
with open(timing_map_path, "w") as f:
json.dump(timing_map, f, indent=2)
return len(jobs)

@ -0,0 +1,231 @@
import json
import os
from openai import OpenAI
from utils import settings
from utils.console import print_step, print_substep
from utils.sentiment_map import (
BACKGROUND_MAP,
OPENAI_VOICE_MAP,
ELEVENLABS_VOICE_MAP,
VALID_SENTIMENTS,
DEFAULT_SENTIMENT,
)
def _get_client() -> OpenAI:
    """Return an OpenAI client that talks to the DeepSeek endpoint using the configured API key."""
    return OpenAI(
        base_url="https://api.deepseek.com",
        api_key=settings.config["deepseek"]["api_key"],
    )
def _extract_text(reddit_object: dict) -> tuple:
title = reddit_object.get("thread_title", "")
post = reddit_object.get("thread_post", "")
if isinstance(post, list):
post = " ".join([p.get("text", "") for p in post if isinstance(p, dict)])
return title, post
def detect_sentiment(reddit_object: dict) -> str:
    """Classify a reddit post into one of ``VALID_SENTIMENTS`` via DeepSeek.

    The title plus the first 500 characters of the post body are sent to the
    ``deepseek-chat`` model. The reply is normalised (case, surrounding
    whitespace, quotes and punctuation) before it is validated.

    Args:
        reddit_object: Thread data passed to ``_extract_text``.

    Returns:
        A label from ``VALID_SENTIMENTS``. ``DEFAULT_SENTIMENT`` is returned
        when no API key is configured, when the reply is not a known label,
        or when any exception is raised along the way (best-effort: this
        function never raises).
    """
    try:
        api_key = settings.config["deepseek"]["api_key"]
        if not api_key:
            print_substep("No DeepSeek API key found. Using default sentiment.", style="yellow")
            return DEFAULT_SENTIMENT
        title, post = _extract_text(reddit_object)
        text = f"Title: {title}\nPost: {post[:500]}"
        # Build the label list from VALID_SENTIMENTS so the prompt can never
        # drift from the set the reply is validated against.
        allowed_labels = ", ".join(VALID_SENTIMENTS)
        client = _get_client()
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a sentiment classifier for Reddit stories. "
                        "Classify the post into exactly one of these labels: "
                        f"{allowed_labels}. "
                        "Respond with only the label, nothing else. No punctuation, no explanation."
                    ),
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
            max_tokens=10,
            temperature=0,
        )
        reply = response.choices[0].message.content or ""
        # Models occasionally answer "Sad." or "'sad'" despite the
        # instructions; strip that decoration instead of discarding a
        # perfectly usable label.
        label = reply.strip().lower().strip("\"'`*.,;:!? \t\r\n")
        if label not in VALID_SENTIMENTS:
            print_substep(
                f"DeepSeek returned unexpected label '{label}'. Using default: {DEFAULT_SENTIMENT}",
                style="yellow",
            )
            return DEFAULT_SENTIMENT
        return label
    except Exception as e:
        # Deliberate catch-all: sentiment detection is optional and must not
        # abort the video pipeline.
        print_substep(f"Sentiment detection failed: {e}. Using default: {DEFAULT_SENTIMENT}", style="yellow")
        return DEFAULT_SENTIMENT
def generate_metadata(reddit_object: dict, sentiment: str) -> dict:
    """Generate social-media metadata for a story with one DeepSeek call.

    The model is asked for a JSON object holding ``youtube_title``,
    ``youtube_description``, ``tiktok_caption``, ``instagram_caption``,
    ``facebook_caption`` and ``hashtags``. This function only builds the
    dict; writing it to disk is done by ``save_metadata``.

    Args:
        reddit_object: Thread data passed to ``_extract_text``.
        sentiment: Sentiment label; embedded in the prompt and stored under
            the ``"sentiment"`` key of the result.

    Returns:
        The metadata dict. Keys the model left out are filled from
        ``_fallback_metadata`` so every value keeps its expected type
        (``hashtags`` stays a list). The complete fallback dict is returned
        when no API key is configured or when anything fails.
    """
    try:
        api_key = settings.config["deepseek"]["api_key"]
        if not api_key:
            return _fallback_metadata(reddit_object, sentiment)
        title, post = _extract_text(reddit_object)
        text = f"Title: {title}\nPost: {post[:800]}"
        channel_name = settings.config["settings"].get("channel_name", "Reddit Tales")
        client = _get_client()
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a social media content creator specializing in Reddit story videos. "
                        "Generate engaging titles, captions, and hashtags for a Reddit story video. "
                        "Return ONLY a valid JSON object with these exact keys: "
                        "youtube_title, youtube_description, tiktok_caption, instagram_caption, facebook_caption, hashtags. "
                        "hashtags must be a list of strings. "
                        "Keep youtube_title under 70 characters. "
                        "Keep tiktok_caption under 150 characters including hashtags. "
                        "Make content engaging and click-worthy. "
                        f"The channel name is '{channel_name}'. "
                        f"The story mood is: {sentiment}. "
                        "No markdown, no explanation, just the JSON object."
                    ),
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
            max_tokens=600,
            temperature=0.7,
        )
        raw = (response.choices[0].message.content or "").strip()
        # Strip a surrounding markdown code fence (``` or ```json) if present.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
            raw = raw.strip()
        metadata = json.loads(raw)
        if not isinstance(metadata, dict):
            # A bare list or string is valid JSON but useless here; route it
            # to the fallback through the handler below.
            raise ValueError("DeepSeek response is not a JSON object")
        # Fill gaps from the fallback instead of "" so that, in particular,
        # a missing "hashtags" entry is still a list.
        for key, default in _fallback_metadata(reddit_object, sentiment).items():
            metadata.setdefault(key, default)
        metadata["sentiment"] = sentiment
        return metadata
    except Exception as e:
        # Deliberate catch-all: metadata is optional and must not abort the run.
        print_substep(f"Metadata generation failed: {e}. Using fallback.", style="yellow")
        return _fallback_metadata(reddit_object, sentiment)
def _fallback_metadata(reddit_object: dict, sentiment: str) -> dict:
"""Basic fallback metadata if DeepSeek fails."""
title = reddit_object.get("thread_title", "Reddit Story")
return {
"sentiment": sentiment,
"youtube_title": title[:70],
"youtube_description": f"{title}\n\n#reddit #stories",
"tiktok_caption": f"{title[:100]} #reddit #storytime",
"instagram_caption": f"{title[:100]} #reddit #stories",
"facebook_caption": title,
"hashtags": ["#reddit", "#storytime", "#stories", "#redditstories"],
}
def save_metadata(metadata: dict, reddit_object: dict) -> None:
    """Write *metadata* to ``metadata.json`` inside the per-video results folder.

    The folder is ``results/<subreddit>/<thread_id>_<background_video>`` and is
    created when missing. Any failure is reported through ``print_substep``
    and swallowed, so this function never raises.
    """
    try:
        configured_subreddit = settings.config["reddit"]["thread"]["subreddit"]
        subreddit = reddit_object.get("thread_subreddit", configured_subreddit)
        thread_id = reddit_object.get("thread_id", "unknown")
        background_cfg = settings.config["settings"]["background"]
        sentiment_bg = background_cfg.get("background_video", "unknown")

        video_folder = f"results/{subreddit}/{thread_id}_{sentiment_bg}"
        os.makedirs(video_folder, exist_ok=True)

        filepath = f"{video_folder}/metadata.json"
        with open(filepath, "w", encoding="utf-8") as out:
            json.dump(metadata, out, indent=2, ensure_ascii=False)
    except Exception as err:
        print_substep(f"Failed to save metadata: {err}", style="yellow")
    else:
        print_substep(f"Metadata saved → {filepath}", style="bold green")
def apply_sentiment_config(reddit_object: dict) -> None:
    """Detect the story sentiment and apply it to the in-memory configuration.

    Steps, in order: detect the sentiment, store it under
    ``settings.config["settings"]["sentiment"]``, override the background
    video/audio and (for ElevenLabs or OpenAI) the TTS voice name, generate
    the metadata, save it, and log a summary.

    Only ``settings.config`` is modified; this function does not write to
    config.toml, so the overrides last for the current run only.
    """
    print_step("Detecting sentiment and generating metadata... 🎭")
    sentiment = detect_sentiment(reddit_object)

    # Sentiment label, kept in memory for later style lookups.
    run_settings = settings.config["settings"]
    run_settings["sentiment"] = sentiment

    # Background: must be set before save_metadata runs, because the results
    # folder name is derived from background_video.
    bg_video, bg_audio = BACKGROUND_MAP[sentiment]
    background_cfg = run_settings["background"]
    background_cfg["background_video"] = bg_video
    background_cfg["background_audio"] = bg_audio

    # Voice: engine name -> (voice table, config key that receives the voice).
    tts_cfg = run_settings["tts"]
    voice_choice = tts_cfg["voice_choice"].lower()
    voice_tables = {
        "elevenlabs": (ELEVENLABS_VOICE_MAP, "elevenlabs_voice_name"),
        "openai": (OPENAI_VOICE_MAP, "openai_voice_name"),
    }
    if voice_choice in voice_tables:
        voice_map, config_key = voice_tables[voice_choice]
        voice = voice_map[sentiment]
        tts_cfg[config_key] = voice
    else:
        voice = f"(voice override not supported for {voice_choice})"

    # Metadata
    print_substep("Generating titles, captions and hashtags... ✍️", style="bold blue")
    metadata = generate_metadata(reddit_object, sentiment)
    save_metadata(metadata, reddit_object)

    # Log
    print_substep(f"Sentiment detected : {sentiment} 🎯", style="bold green")
    print_substep(f"Background video : {bg_video}", style="bold blue")
    print_substep(f"Background audio : {bg_audio or 'none'}", style="bold blue")
    print_substep(f"Voice : {voice}", style="bold blue")
    print_substep(f"YouTube title : {metadata['youtube_title']}", style="bold blue")
    print_substep(f"TikTok caption : {metadata['tiktok_caption']}", style="bold blue")

@ -0,0 +1,158 @@
# One row per sentiment, in the order the labels are offered to the classifier:
# (label, background video, background audio, OpenAI voice, ElevenLabs voice)
_SENTIMENT_PRESETS = (
    ("sad", "minecraft", "lofi", "nova", "Brian - Deep, Resonant and Comforting"),
    ("happy", "fall-guys", "chill-summer", "shimmer", "Jessica - Playful, Bright, Warm"),
    ("angry", "gta", "lofi", "onyx", "Adam - Dominant, Firm"),
    ("mysterious", "csgo-surf", "lofi-2", "echo", "Callum - Husky Trickster"),
    ("funny", "cluster-truck", "chill-summer", "fable", "Laura - Enthusiast, Quirky Attitude"),
    ("dramatic", "rocket-league", "lofi", "alloy", "George - Warm, Captivating Storyteller"),
    ("wholesome", "steep", "chill-summer", "nova", "Matilda - Knowledgable, Professional"),
    ("scary", "minecraft-2", "lofi-2", "onyx", "Harry - Fierce Warrior"),
)

# sentiment -> (background video name, background audio name)
BACKGROUND_MAP = {
    label: (video, audio) for label, video, audio, _, _ in _SENTIMENT_PRESETS
}

# sentiment -> OpenAI TTS voice name
OPENAI_VOICE_MAP = {
    label: openai_voice for label, _, _, openai_voice, _ in _SENTIMENT_PRESETS
}

# sentiment -> ElevenLabs voice name
ELEVENLABS_VOICE_MAP = {
    label: eleven_voice for label, _, _, _, eleven_voice in _SENTIMENT_PRESETS
}

# Every label a sentiment detector may return, in declaration order.
VALID_SENTIMENTS = list(BACKGROUND_MAP)
DEFAULT_SENTIMENT = "dramatic"
# ─────────────────────────────────────────────────────────────────────────────
# STYLE_MAP
# ─────────────────────────────────────────────────────────────────────────────
#
# display_mode options:
#
# "aligned" → WhisperX word timestamps — perfect sync with any TTS.
# Falls back to "single" per sentence if timestamps unavailable.
# USE THIS for all sentiments once WhisperX is installed.
#
# "single" → Split sentence into word chunks, equal time per chunk.
# Good fallback when WhisperX is not installed.
#
# "multi" → Full sentence on one image. No splitting.
# Best for slow TTS or wholesome/sad content.
#
# words_per_chunk:
# In "aligned" mode: words grouped per visible chunk (3-5 recommended)
# In "single" mode: words per chunk (higher = fewer chunks = slower pace)
# In "multi" mode: words per line in the wrapped text block
#
def _caption_style(
    font_file,
    font_size,
    fill_color,
    *,
    stroke_width,
    words_per_chunk,
    stroke_color=(0, 0, 0, 255),
    uppercase=False,
):
    """Build one STYLE_MAP entry.

    Every sentiment shares the same vertical position (0.65) and the
    "aligned" display mode; only the arguments vary between entries.
    A fresh dict is returned on each call.
    """
    return {
        "font_file": font_file,
        "font_size": font_size,
        "fill_color": fill_color,
        "stroke_color": stroke_color,
        "stroke_width": stroke_width,
        "words_per_chunk": words_per_chunk,
        "y_position": 0.65,
        "uppercase": uppercase,
        "display_mode": "aligned",
    }


# sentiment -> caption rendering style (colours are RGBA tuples)
STYLE_MAP = {
    "dramatic": _caption_style(
        "Montserrat-ExtraBold.ttf", 95, (255, 255, 255, 255),
        stroke_width=4, words_per_chunk=4,
    ),
    "scary": _caption_style(
        "Oswald-Bold.ttf", 95, (232, 244, 248, 255),
        stroke_width=5, words_per_chunk=3,
    ),
    "angry": _caption_style(
        "Anton-Regular.ttf", 105, (255, 69, 0, 255),
        stroke_width=5, words_per_chunk=3, uppercase=True,
    ),
    "mysterious": _caption_style(
        "Raleway-Bold.ttf", 90, (184, 212, 232, 255),
        stroke_width=4, words_per_chunk=3,
    ),
    "funny": _caption_style(
        "Nunito-ExtraBold.ttf", 90, (255, 230, 0, 255),
        stroke_width=4, words_per_chunk=5,
    ),
    "sad": _caption_style(
        "Lato-Bold.ttf", 88, (220, 225, 255, 255),
        stroke_width=3, words_per_chunk=5, stroke_color=(10, 10, 46, 255),
    ),
    "wholesome": _caption_style(
        "Nunito-ExtraBold.ttf", 88, (255, 248, 231, 255),
        stroke_width=3, words_per_chunk=5, stroke_color=(26, 10, 0, 255),
    ),
    "happy": _caption_style(
        "Nunito-ExtraBold.ttf", 90, (255, 230, 0, 255),
        stroke_width=4, words_per_chunk=5,
    ),
}

# Style used when the sentiment has no entry of its own.
DEFAULT_STYLE = STYLE_MAP["dramatic"]

@ -0,0 +1,168 @@
"""
whisper_aligner.py
Word-level timestamp extraction using WhisperX.
This module runs after each TTS audio file is saved.
It produces a word-level timestamp JSON for every postaudio-{i}.mp3.
Output format (postaudio-{i}_words.json):
[
{"word": "I", "start": 0.00, "end": 0.18},
{"word": "told", "start": 0.18, "end": 0.42},
...
]
WhisperX is used because:
- Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.)
- Free, runs locally, no API cost
- Word-level accuracy (not sentence-level)
- Fast on CPU for short audio clips
If WhisperX is not installed or fails for any reason,
this module returns None and the system falls back to
time_fraction-based sync (single/multi mode).
No crashes, no interruptions.
"""
import json
import os
from typing import List, Optional
from utils.console import print_substep
# ── WhisperX model is loaded once and reused across all audio files ───────────
# Loading is expensive (~2-3s). We cache it as a module-level singleton.
_whisper_model = None
_whisper_model_lang = None
def _get_model(language: str = "en"):
"""
Lazy-load WhisperX model. Loaded once per run, reused for all audio files.
Returns None if WhisperX is not installed.
"""
global _whisper_model, _whisper_model_lang
if _whisper_model is not None and _whisper_model_lang == language:
return _whisper_model
try:
import whisperx
print_substep("Loading WhisperX model (first run only)...", style="bold blue")
_whisper_model = whisperx.load_model(
"base", # small enough for CPU, accurate enough for TTS
device="cpu",
compute_type="int8",
language=language,
)
_whisper_model_lang = language
return _whisper_model
except ImportError:
return None
except Exception as e:
print_substep(f"WhisperX model load failed: {e}", style="yellow")
return None
# Alignment models keyed by language code. Like the transcription model they
# are expensive to load, so each one is loaded once and reused for every clip.
_align_model_cache: dict = {}


def _get_align_model(whisperx, language: str):
    """Return the cached ``(align_model, metadata)`` pair for *language*."""
    if language not in _align_model_cache:
        _align_model_cache[language] = whisperx.load_align_model(
            language_code=language,
            device="cpu",
        )
    return _align_model_cache[language]


def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]:
    """Run WhisperX on one audio file and return word-level timestamps.

    Args:
        audio_path: Path to the audio file to align.
        language: Language code used for both transcription and alignment
            (default ``"en"``).

    Returns:
        A list of ``{"word": str, "start": float, "end": float}`` dicts with
        times rounded to milliseconds, or None when WhisperX is not
        installed, the model cannot be loaded, alignment fails, or no word
        carries both a start and an end time. Never raises.
    """
    try:
        import whisperx

        model = _get_model(language)
        if model is None:
            return None

        # Transcribe, then align the transcript to get per-word timestamps.
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=4)
        align_model, metadata = _get_align_model(whisperx, language)
        aligned = whisperx.align(
            result["segments"],
            align_model,
            metadata,
            audio,
            device="cpu",
            return_char_alignments=False,
        )

        # Keep only the words that carry both timestamps.
        words = []
        for segment in aligned.get("word_segments", []):
            word = segment.get("word", "").strip()
            start = segment.get("start")
            end = segment.get("end")
            if word and start is not None and end is not None:
                words.append({
                    "word": word,
                    "start": round(float(start), 3),
                    "end": round(float(end), 3),
                })
        return words if words else None
    except ImportError:
        # WhisperX is an optional dependency: stay silent, exactly like
        # _get_model does, instead of warning once per audio file.
        return None
    except Exception as e:
        print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow")
        return None
def align_and_save(audio_path: str, language: str = "en") -> Optional[str]:
    """Align an audio file and save its word timestamps next to it.

    The JSON file name is the audio path with its extension replaced by
    ``_words.json`` (``postaudio-0.mp3`` -> ``postaudio-0_words.json``).

    Args:
        audio_path: e.g. ``"assets/temp/abc123/mp3/postaudio-0.mp3"``.
        language: Language code passed to ``align_audio``.

    Returns:
        Path of the saved JSON file, or None if alignment failed or the file
        could not be written.
    """
    words = align_audio(audio_path, language)
    if words is None:
        return None

    # Only swap the final extension. str.replace(".mp3", ...) would also
    # rewrite ".mp3" inside directory names, and for a path without ".mp3"
    # it would return the audio path itself, overwriting the audio with JSON.
    stem, _ext = os.path.splitext(audio_path)
    json_path = f"{stem}_words.json"
    try:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(words, f, indent=2, ensure_ascii=False)
    except OSError as e:
        # Timestamps are optional; report the problem and let the caller
        # fall back to fraction-based timing.
        print_substep(f"Could not save word timestamps to {json_path}: {e}", style="yellow")
        return None
    return json_path
def load_word_timestamps(audio_path: str) -> Optional[List[dict]]:
    """Load previously saved word timestamps for an audio file.

    The JSON file name is the audio path with its extension replaced by
    ``_words.json`` (``postaudio-0.mp3`` -> ``postaudio-0_words.json``).

    Args:
        audio_path: Path of the audio file the timestamps belong to.

    Returns:
        The parsed JSON content, or None if the file does not exist, cannot
        be read, or does not contain valid JSON.
    """
    # Only swap the final extension; str.replace(".mp3", ...) would also
    # rewrite ".mp3" occurring elsewhere in the path.
    stem, _ext = os.path.splitext(audio_path)
    json_path = f"{stem}_words.json"
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. An
        # unreadable file is treated like a missing one.
        print_substep(f"Could not read word timestamps from {json_path}: {e}", style="yellow")
        return None

@ -5,9 +5,11 @@ import tempfile
import textwrap
import threading
import time
from os.path import exists # Needs to be imported specifically
from os.path import exists
from pathlib import Path
from typing import Dict, Final, Tuple
import glob
import json
import ffmpeg
import translators
@ -44,7 +46,6 @@ class ProgressFfmpeg(threading.Thread):
def get_latest_ms_progress(self):
lines = self.output_file.readlines()
if lines:
for line in lines:
if "out_time_ms" in line:
@ -52,7 +53,6 @@ class ProgressFfmpeg(threading.Thread):
if out_time_ms_str.isnumeric():
return float(out_time_ms_str) / 1000000.0
else:
# Handle the case when "N/A" is encountered
return None
return None
@ -74,7 +74,6 @@ def name_normalize(name: str) -> str:
name = re.sub(r"(\d+)\s?\/\s?(\d+)", r"\1 of \2", name)
name = re.sub(r"(\w+)\s?\/\s?(\w+)", r"\1 or \2", name)
name = re.sub(r"\/", r"", name)
lang = settings.config["reddit"]["thread"]["post_lang"]
if lang:
print_substep("Translating filename...")
@ -119,51 +118,38 @@ def get_text_height(draw, text, font, max_width):
def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
"""
It will take the 1px from the middle of the template and will be resized (stretched) vertically to accommodate the extra height needed for the title.
"""
print_step(f"Creating fancy thumbnail for: {text}")
font_title_size = 47
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), font_title_size)
image_width, image_height = image.size
# Calculate text height to determine new image height
draw = ImageDraw.Draw(image)
text_height = get_text_height(draw, text, font, wrap)
lines = textwrap.wrap(text, width=wrap)
# This is -50 to reduce the empty space at the bottom of the image,
# change it as per your requirement if needed otherwise leave it.
new_image_height = image_height + text_height + padding * (len(lines) - 1) - 50
# Separate the image into top, middle (1px), and bottom parts
top_part_height = image_height // 2
middle_part_height = 1 # 1px height middle section
top_part_height = image_height // 2
middle_part_height = 1
bottom_part_height = image_height - top_part_height - middle_part_height
top_part = image.crop((0, 0, image_width, top_part_height))
top_part = image.crop((0, 0, image_width, top_part_height))
middle_part = image.crop((0, top_part_height, image_width, top_part_height + middle_part_height))
bottom_part = image.crop((0, top_part_height + middle_part_height, image_width, image_height))
# Stretch the middle part
new_middle_height = new_image_height - top_part_height - bottom_part_height
new_middle_height = max(1, new_image_height - top_part_height - bottom_part_height)
middle_part = middle_part.resize((image_width, new_middle_height))
# Create new image with the calculated height
new_image = Image.new("RGBA", (image_width, new_image_height))
# Paste the top, stretched middle, and bottom parts into the new image
new_image.paste(top_part, (0, 0))
new_image.paste(top_part, (0, 0))
new_image.paste(middle_part, (0, top_part_height))
new_image.paste(bottom_part, (0, top_part_height + new_middle_height))
# Draw the title text on the new image
draw = ImageDraw.Draw(new_image)
y = top_part_height + padding
for line in lines:
draw.text((120, y), line, font=font, fill=text_color, align="left")
y += get_text_height(draw, line, font, wrap) + padding
# Draw the username "PlotPulse" at the specific position
username_font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 30)
draw.text(
(205, 825),
@ -172,28 +158,44 @@ def create_fancy_thumbnail(image, text, text_color, padding, wrap=35):
fill=text_color,
align="left",
)
return new_image
def merge_background_audio(audio: ffmpeg, reddit_id: str):
"""Gather an audio and merge with assets/backgrounds/background.mp3
Args:
audio (ffmpeg): The TTS final audio but without background.
reddit_id (str): The ID of subreddit
"""
background_audio_volume = settings.config["settings"]["background"]["background_audio_volume"]
if background_audio_volume == 0:
return audio # Return the original audio
else:
# sets volume to config
bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter(
"volume",
background_audio_volume,
)
# Merges audio and background_audio
merged_audio = ffmpeg.filter([audio, bg_audio], "amix", duration="longest")
return merged_audio # Return merged audio
return audio
bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter(
"volume", background_audio_volume,
)
return ffmpeg.filter([audio, bg_audio], "amix", duration="longest")
def _load_timing_map(reddit_id: str, img_files: list, postaudio_files: list,
audio_clips_durations: list, title_duration: float) -> list:
"""
Load timing_map.json written by imagemaker().
Each entry is one of:
{"timing_type": "absolute", "clip_start": S, "clip_end": E}
used directly as FFmpeg enable times
{"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
clip time computed as: audio_start[N] + accumulated_fraction * audio_dur[N]
Falls back to 1:1 mapping if file missing.
"""
timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
if os.path.exists(timing_map_path):
with open(timing_map_path) as f:
return json.load(f)
# Fallback: 1:1
print_substep("timing_map.json not found — using 1:1 fallback", style="yellow")
return [
{"timing_type": "fraction", "audio_idx": i, "time_fraction": 1.0}
for i in range(len(img_files))
]
def make_final_video(
@ -202,20 +204,10 @@ def make_final_video(
reddit_obj: dict,
background_config: Dict[str, Tuple],
):
"""Gathers audio clips, gathers all screenshots, stitches them together and saves the final video to assets/temp
Args:
number_of_clips (int): Index to end at when going through the screenshots'
length (int): Length of the video
reddit_obj (dict): The reddit object that contains the posts to read.
background_config (Tuple[str, str, str, Any]): The background config to use.
"""
# settings values
W: Final[int] = int(settings.config["settings"]["resolution_w"])
H: Final[int] = int(settings.config["settings"]["resolution_h"])
opacity = settings.config["settings"]["opacity"]
reddit_id = extract_id(reddit_obj)
opacity = settings.config["settings"]["opacity"]
reddit_id = extract_id(reddit_obj)
allowOnlyTTSFolder: bool = (
settings.config["settings"]["background"]["enable_extra_audio"]
@ -223,33 +215,31 @@ def make_final_video(
)
print_step("Creating the final video 🎥")
background_clip = ffmpeg.input(prepare_background(reddit_id, W=W, H=H))
# Gather all audio clips
# ── Audio clips ───────────────────────────────────────────────────────────
audio_clips = list()
if number_of_clips == 0 and settings.config["settings"]["storymode"] == "false":
print(
"No audio clips to gather. Please use a different TTS or post."
) # This is to fix the TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
print("No audio clips to gather.")
exit()
if settings.config["settings"]["storymode"]:
if settings.config["settings"]["storymodemethod"] == 0:
audio_clips = [ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")]
audio_clips.insert(1, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio.mp3"))
elif settings.config["settings"]["storymodemethod"] == 1:
audio_clips = [
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")
for i in track(range(number_of_clips + 1), "Collecting the audio files...")
]
postaudio_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
audio_clips = [ffmpeg.input(f) for f in postaudio_files]
audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
else:
audio_clips = [
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3") for i in range(number_of_clips)
ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3")
for i in range(number_of_clips)
]
audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3"))
audio_clips_durations = [
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/{i}.mp3")["format"]["duration"])
for i in range(number_of_clips)
@ -258,35 +248,25 @@ def make_final_video(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
audio_concat = ffmpeg.concat(*audio_clips, a=1, v=0)
ffmpeg.output(
audio_concat, f"assets/temp/{reddit_id}/audio.mp3", **{"b:a": "192k"}
).overwrite_output().run(quiet=True)
).overwrite_output().run(quiet=False)
console.log(f"[bold green] Video Will Be: {length} Seconds Long")
screenshot_width = int((W * 45) // 100)
audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3")
final_audio = merge_background_audio(audio, reddit_id)
image_clips = list()
Path(f"assets/temp/{reddit_id}/png").mkdir(parents=True, exist_ok=True)
# Credits to tim (beingbored)
# get the title_template image and draw a text in the middle part of it with the title of the thread
# ── Title card ────────────────────────────────────────────────────────────
title_template = Image.open("assets/title_template.png")
title = reddit_obj["thread_title"]
title = name_normalize(title)
font_color = "#000000"
padding = 5
# create_fancy_thumbnail(image, text, text_color, padding
title_img = create_fancy_thumbnail(title_template, title, font_color, padding)
title = name_normalize(reddit_obj["thread_title"])
title_img = create_fancy_thumbnail(title_template, title, "#000000", 5)
title_img.save(f"assets/temp/{reddit_id}/png/title.png")
image_clips.insert(
0,
@ -296,18 +276,17 @@ def make_final_video(
)
current_time = 0
if settings.config["settings"]["storymode"]:
audio_clips_durations = [
float(
ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")["format"]["duration"]
)
for i in range(number_of_clips)
]
audio_clips_durations.insert(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
if settings.config["settings"]["storymodemethod"] == 0:
audio_clips_durations = [
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio.mp3")["format"]["duration"])
]
audio_clips_durations.insert(
0,
float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]),
)
image_clips.insert(
1,
ffmpeg.input(f"assets/temp/{reddit_id}/png/story_content.png").filter(
@ -321,20 +300,99 @@ def make_final_video(
y="(main_h-overlay_h)/2",
)
current_time += audio_clips_durations[0]
elif settings.config["settings"]["storymodemethod"] == 1:
for i in track(range(0, number_of_clips + 1), "Collecting the image files..."):
image_clips.append(
ffmpeg.input(f"assets/temp/{reddit_id}/png/img{i}.png")["v"].filter(
"scale", screenshot_width, -1
)
# ── Discover postaudio files ──────────────────────────────────────
postaudio_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"),
key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1))
)
# ── Build durations ───────────────────────────────────────────────
# audio_clips_durations[0] = title
# audio_clips_durations[1+i] = postaudio-{i}
audio_clips_durations = [
float(ffmpeg.probe(f)["format"]["duration"])
for f in postaudio_files
]
title_duration = float(
ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]
)
audio_clips_durations.insert(0, title_duration)
# ── Pre-compute absolute start time per audio file ────────────────
# audio_start_times[i] = when postaudio-{i} starts in the video
audio_start_times = []
t = title_duration
for dur in audio_clips_durations[1:]:
audio_start_times.append(t)
t += dur
# ── Title card overlay ────────────────────────────────────────────
background_clip = background_clip.overlay(
image_clips[0],
enable=f"between(t,0,{title_duration})",
x="(main_w-overlay_w)/2",
y="(main_h-overlay_h)/2",
)
current_time = title_duration
# ── Load image files ──────────────────────────────────────────────
img_files = sorted(
glob.glob(f"assets/temp/{reddit_id}/png/img*.png"),
key=lambda x: int(re.search(r"img(\d+)", x).group(1))
)
# ── Load timing map ───────────────────────────────────────────────
timing_map = _load_timing_map(
reddit_id, img_files, postaudio_files,
audio_clips_durations, title_duration
)
# ── Overlay each image ────────────────────────────────────────────
# Handles both absolute and fraction timing types cleanly.
# For fraction: track time_consumed per audio_idx
audio_time_used = {}
for i, img_file in enumerate(img_files):
if i >= len(timing_map):
break
entry = timing_map[i]
timing_type = entry.get("timing_type", "fraction")
if timing_type == "absolute":
# WhisperX aligned — use timestamps directly
clip_start = entry["clip_start"]
clip_end = entry["clip_end"]
else:
# Fraction-based — compute from audio duration
audio_idx = entry["audio_idx"]
time_fraction = entry["time_fraction"]
if audio_idx + 1 >= len(audio_clips_durations):
break
audio_dur = audio_clips_durations[audio_idx + 1]
display_dur = audio_dur * time_fraction
offset = audio_time_used.get(audio_idx, 0.0)
clip_start = audio_start_times[audio_idx] + offset
clip_end = clip_start + display_dur
audio_time_used[audio_idx] = offset + display_dur
img_clip = ffmpeg.input(img_file)["v"].filter(
"scale", screenshot_width, -1
)
image_clips.append(img_clip)
background_clip = background_clip.overlay(
image_clips[i],
enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
img_clip,
enable=f"between(t,{clip_start:.3f},{clip_end:.3f})",
x="(main_w-overlay_w)/2",
y="(main_h-overlay_h)/2",
)
current_time += audio_clips_durations[i]
current_time = t
else:
for i in range(0, number_of_clips + 1):
image_clips.append(
@ -343,9 +401,7 @@ def make_final_video(
)
)
image_overlay = image_clips[i].filter("colorchannelmixer", aa=opacity)
assert (
audio_clips_durations is not None
), "Please make a GitHub issue if you see this. Ping @JasonLovesDoggo on GitHub."
assert audio_clips_durations is not None
background_clip = background_clip.overlay(
image_overlay,
enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})",
@ -354,70 +410,48 @@ def make_final_video(
)
current_time += audio_clips_durations[i]
title = extract_id(reddit_obj, "thread_title")
idx = extract_id(reddit_obj)
title_thumb = reddit_obj["thread_title"]
filename = f"{name_normalize(title)[:251]}"
subreddit = settings.config["reddit"]["thread"]["subreddit"]
if not exists(f"./results/{subreddit}"):
print_substep("The 'results' folder could not be found so it was automatically created.")
os.makedirs(f"./results/{subreddit}")
# ── Output ────────────────────────────────────────────────────────────────
title_str = extract_id(reddit_obj, "thread_title")
idx = extract_id(reddit_obj)
title_thumb = reddit_obj["thread_title"]
subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"])
sentiment = settings.config["settings"]["background"].get("background_video", "unknown")
video_folder = f"./results/{subreddit}/{idx}_{sentiment}"
os.makedirs(video_folder, exist_ok=True)
if not exists(f"./results/{subreddit}/OnlyTTS") and allowOnlyTTSFolder:
print_substep("The 'OnlyTTS' folder could not be found so it was automatically created.")
os.makedirs(f"./results/{subreddit}/OnlyTTS")
if allowOnlyTTSFolder:
os.makedirs(f"{video_folder}/OnlyTTS", exist_ok=True)
# create a thumbnail for the video
settingsbackground = settings.config["settings"]["background"]
if settingsbackground["background_thumbnail"]:
if not exists(f"./results/{subreddit}/thumbnails"):
print_substep(
"The 'results/thumbnails' folder could not be found so it was automatically created."
)
os.makedirs(f"./results/{subreddit}/thumbnails")
# get the first file with the .png extension from assets/backgrounds and use it as a background for the thumbnail
first_image = next(
(file for file in os.listdir("assets/backgrounds") if file.endswith(".png")),
None,
(f for f in os.listdir("assets/backgrounds") if f.endswith(".png")), None
)
if first_image is None:
print_substep("No png files found in assets/backgrounds", "red")
else:
font_family = settingsbackground["background_thumbnail_font_family"]
font_size = settingsbackground["background_thumbnail_font_size"]
font_color = settingsbackground["background_thumbnail_font_color"]
thumbnail = Image.open(f"assets/backgrounds/{first_image}")
width, height = thumbnail.size
w, h = thumbnail.size
thumbnailSave = create_thumbnail(
thumbnail,
font_family,
font_size,
font_color,
width,
height,
title_thumb,
settingsbackground["background_thumbnail_font_family"],
settingsbackground["background_thumbnail_font_size"],
settingsbackground["background_thumbnail_font_color"],
w, h, title_thumb,
)
thumbnailSave.save(f"./assets/temp/{reddit_id}/thumbnail.png")
print_substep(f"Thumbnail - Building Thumbnail in assets/temp/{reddit_id}/thumbnail.png")
thumbnailSave.save(f"{video_folder}/thumbnail.png")
text = f"Background by {background_config['video'][2]}"
background_clip = ffmpeg.drawtext(
background_clip,
text=text,
x=f"(w-text_w)",
y=f"(h-text_h)",
fontsize=5,
fontcolor="White",
text=f"Background by {background_config['video'][2]}",
x="(w-text_w)", y="(h-text_h)",
fontsize=5, fontcolor="White",
fontfile=os.path.join("fonts", "Roboto-Regular.ttf"),
)
background_clip = background_clip.filter("scale", W, H)
print_step("Rendering the video 🎥")
from tqdm import tqdm
pbar = tqdm(total=100, desc="Progress: ", bar_format="{l_bar}{bar}", unit=" %")
def on_update_example(progress) -> None:
@ -425,17 +459,11 @@ def make_final_video(
old_percentage = pbar.n
pbar.update(status - old_percentage)
defaultPath = f"results/{subreddit}"
with ProgressFfmpeg(length, on_update_example) as progress:
path = defaultPath + f"/{filename}"
path = (
path[:251] + ".mp4"
) # Prevent a error by limiting the path length, do not change this.
path = f"{video_folder}/video.mp4"
try:
ffmpeg.output(
background_clip,
final_audio,
path,
background_clip, final_audio, path,
f="mp4",
**{
"c:v": "h264_nvenc",
@ -444,28 +472,23 @@ def make_final_video(
"threads": multiprocessing.cpu_count(),
},
).overwrite_output().global_args("-progress", progress.output_file.name).run(
quiet=True,
overwrite_output=True,
capture_stdout=False,
capture_stderr=False,
quiet=True, overwrite_output=True,
capture_stdout=False, capture_stderr=False,
)
except ffmpeg.Error as e:
print(e.stderr.decode("utf8"))
exit(1)
old_percentage = pbar.n
pbar.update(100 - old_percentage)
if allowOnlyTTSFolder:
path = defaultPath + f"/OnlyTTS/{filename}"
path = (
path[:251] + ".mp4"
) # Prevent a error by limiting the path length, do not change this.
path = f"{video_folder}/OnlyTTS/video.mp4"
print_step("Rendering the Only TTS Video 🎥")
with ProgressFfmpeg(length, on_update_example) as progress:
try:
ffmpeg.output(
background_clip,
audio,
path,
background_clip, audio, path,
f="mp4",
**{
"c:v": "h264_nvenc",
@ -474,20 +497,18 @@ def make_final_video(
"threads": multiprocessing.cpu_count(),
},
).overwrite_output().global_args("-progress", progress.output_file.name).run(
quiet=True,
overwrite_output=True,
capture_stdout=False,
capture_stderr=False,
quiet=True, overwrite_output=True,
capture_stdout=False, capture_stderr=False,
)
except ffmpeg.Error as e:
print(e.stderr.decode("utf8"))
exit(1)
old_percentage = pbar.n
pbar.update(100 - old_percentage)
pbar.close()
save_data(subreddit, filename + ".mp4", title, idx, background_config["video"][2])
save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title_str, idx, background_config["video"][2])
print_step("Removing temporary files 🗑")
cleanups = cleanup(reddit_id)
print_substep(f"Removed {cleanups} temporary files 🗑")
print_step("Done! 🎉 The video is in the results folder 📁")
print_step("Done! 🎉 The video is in the results folder 📁")

@ -62,10 +62,10 @@ def get_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int):
if storymode and settings.config["settings"]["storymodemethod"] == 1:
print_substep("Generating images...")
return imagemaker(
theme=bgcolor,
theme=(0, 0, 0, 0),
reddit_obj=reddit_object,
txtclr=txtcolor,
transparent=transparent,
txtclr=(255, 255, 255),
transparent=True,
)
screenshot_num: int

Loading…
Cancel
Save