Merge 2c5f2c594a into 569f25098a
commit
71cbbacd60
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,21 +1,198 @@
|
||||
aiohappyeyeballs==2.6.2
|
||||
aiohttp==3.13.5
|
||||
aiosignal==1.4.0
|
||||
alembic==1.18.4
|
||||
annotated-doc==0.0.4
|
||||
annotated-types==0.7.0
|
||||
antlr4-python3-runtime==4.9.3
|
||||
anyio==4.13.0
|
||||
asteroid-filterbanks==0.4.0
|
||||
attrs==26.1.0
|
||||
av==17.0.1
|
||||
blinker==1.9.0
|
||||
blis==1.3.3
|
||||
boto3==1.36.8
|
||||
botocore==1.36.8
|
||||
catalogue==2.0.10
|
||||
certifi==2026.5.20
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.7
|
||||
clean-text==0.6.0
|
||||
click==8.1.8
|
||||
cloudpathlib==0.24.0
|
||||
colorlog==6.10.1
|
||||
confection==0.1.5
|
||||
contourpy==1.3.3
|
||||
cryptography==48.0.0
|
||||
ctranslate2==4.7.2
|
||||
cycler==0.12.1
|
||||
cymem==2.0.13
|
||||
decorator==5.3.1
|
||||
dill==0.4.1
|
||||
distro==1.9.0
|
||||
einops==0.8.2
|
||||
elevenlabs==1.57.0
|
||||
emoji==1.7.0
|
||||
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
|
||||
exejs==0.0.7
|
||||
faster-whisper==1.2.1
|
||||
ffmpeg-python==0.2.0
|
||||
filelock==3.29.0
|
||||
Flask==3.1.1
|
||||
flatbuffers==25.12.19
|
||||
fonttools==4.63.0
|
||||
frozenlist==1.8.0
|
||||
fsspec==2026.4.0
|
||||
ftfy==6.3.1
|
||||
future==1.0.0
|
||||
googleapis-common-protos==1.75.0
|
||||
greenlet==3.1.1
|
||||
grpcio==1.80.0
|
||||
gTTS==2.5.4
|
||||
h11==0.16.0
|
||||
hf-xet==1.5.0
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
huggingface_hub==0.36.2
|
||||
idna==3.17
|
||||
ImageIO==2.37.3
|
||||
imageio-ffmpeg==0.6.0
|
||||
itsdangerous==2.2.0
|
||||
jh2==5.0.11
|
||||
Jinja2==3.1.6
|
||||
jiter==0.15.0
|
||||
jmespath==1.1.0
|
||||
joblib==1.5.3
|
||||
julius==0.2.7
|
||||
kiwisolver==1.5.0
|
||||
langcodes==3.5.1
|
||||
lightning==2.6.5
|
||||
lightning-utilities==0.15.3
|
||||
lxml==6.1.1
|
||||
Mako==1.3.12
|
||||
markdown-it-py==4.2.0
|
||||
MarkupSafe==3.0.3
|
||||
matplotlib==3.10.9
|
||||
mdurl==0.1.2
|
||||
moviepy==2.2.1
|
||||
mpmath==1.3.0
|
||||
multidict==6.7.1
|
||||
multiprocess==0.70.19
|
||||
murmurhash==1.0.15
|
||||
networkx==3.6.1
|
||||
niquests==3.18.8
|
||||
nltk==3.9.4
|
||||
numpy==2.4.6
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
nvidia-nccl-cu12==2.27.3
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
omegaconf==2.3.0
|
||||
onnxruntime==1.26.0
|
||||
openai==2.38.0
|
||||
opentelemetry-api==1.42.1
|
||||
opentelemetry-exporter-otlp==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-common==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.42.1
|
||||
opentelemetry-exporter-otlp-proto-http==1.42.1
|
||||
opentelemetry-proto==1.42.1
|
||||
opentelemetry-sdk==1.42.1
|
||||
opentelemetry-semantic-conventions==0.63b1
|
||||
optuna==4.8.0
|
||||
packaging==26.2
|
||||
pandas==3.0.3
|
||||
pathos==0.3.5
|
||||
pillow==11.3.0
|
||||
playwright==1.49.1
|
||||
pox==0.3.7
|
||||
ppft==1.7.8
|
||||
praw==7.8.1
|
||||
prawcore==2.4.0
|
||||
preshed==3.0.13
|
||||
primePy==1.3
|
||||
proglog==0.1.12
|
||||
propcache==0.5.2
|
||||
protobuf==6.33.6
|
||||
pyannote-audio==4.0.4
|
||||
pyannote-core==6.0.1
|
||||
pyannote-database==6.1.1
|
||||
pyannote-metrics==4.1
|
||||
pyannote-pipeline==4.0.0
|
||||
pyannoteai-sdk==0.4.0
|
||||
pycparser==3.0
|
||||
pydantic==2.13.4
|
||||
pydantic_core==2.46.4
|
||||
pyee==12.0.0
|
||||
Pygments==2.20.0
|
||||
pyparsing==3.3.2
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.2.2
|
||||
pytorch-lightning==2.6.5
|
||||
pytorch-metric-learning==2.9.0
|
||||
pyttsx3==2.98
|
||||
PyYAML==6.0.3
|
||||
qh3==1.8.1
|
||||
regex==2026.5.9
|
||||
requests==2.32.3
|
||||
rich==13.9.4
|
||||
s3transfer==0.11.3
|
||||
safetensors==0.7.0
|
||||
scikit-learn==1.8.0
|
||||
scipy==1.17.1
|
||||
setuptools==82.0.1
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
smart_open==7.6.1
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
spacy==3.8.7
|
||||
spacy-legacy==3.0.12
|
||||
spacy-loggers==1.0.5
|
||||
SQLAlchemy==2.0.50
|
||||
srsly==2.5.3
|
||||
sympy==1.14.0
|
||||
thinc==8.3.11
|
||||
threadpoolctl==3.6.0
|
||||
tokenizers==0.21.4
|
||||
toml==0.10.2
|
||||
translators==5.9.9
|
||||
pyttsx3==2.98
|
||||
tomlkit==0.13.2
|
||||
Flask==3.1.1
|
||||
clean-text==0.6.0
|
||||
unidecode==1.4.0
|
||||
spacy==3.8.7
|
||||
torch==2.7.0
|
||||
torch==2.8.0
|
||||
torch-audiomentations==0.12.0
|
||||
torch_pitch_shift==1.2.5
|
||||
torchaudio==2.8.0
|
||||
torchcodec==0.7.0
|
||||
torchmetrics==1.9.0
|
||||
torchvision==0.23.0
|
||||
tqdm==4.67.3
|
||||
transformers==4.52.4
|
||||
ffmpeg-python==0.2.0
|
||||
elevenlabs==1.57.0
|
||||
yt-dlp==2025.10.22
|
||||
translators==5.9.9
|
||||
triton==3.4.0
|
||||
typer==0.26.2
|
||||
typer-slim==0.24.0
|
||||
typing-inspection==0.4.2
|
||||
typing_extensions==4.15.0
|
||||
Unidecode==1.4.0
|
||||
update-checker==0.18.0
|
||||
urllib3==2.7.0
|
||||
urllib3-future==2.20.907
|
||||
wasabi==1.1.3
|
||||
wassima==2.1.0
|
||||
wcwidth==0.7.0
|
||||
weasel==0.4.3
|
||||
websocket-client==1.9.0
|
||||
websockets==16.0
|
||||
Werkzeug==3.1.8
|
||||
whisperx==3.8.6
|
||||
wrapt==2.2.1
|
||||
yarl==1.24.2
|
||||
yt-dlp==2026.3.17
|
||||
|
||||
@ -0,0 +1,351 @@
|
||||
"""
|
||||
caption_renderer.py
|
||||
───────────────────
|
||||
All caption rendering logic. Three display modes:
|
||||
|
||||
multi → full sentence on one image (1 RenderJob per sentence)
|
||||
single → sentence split into word chunks (N RenderJobs per sentence)
|
||||
aligned → word-level timestamps from WhisperX (perfect sync, any TTS)
|
||||
|
||||
RenderJob is the contract between this module and final_video.py.
|
||||
Two types of timing:
|
||||
|
||||
FRACTION-based (multi, single):
|
||||
audio_idx + time_fraction → final_video computes absolute time
|
||||
time_fraction = fraction of audio_clips_durations[audio_idx+1]
|
||||
|
||||
ABSOLUTE-based (aligned):
|
||||
clip_start + clip_end → final_video uses directly
|
||||
These are absolute seconds in the video timeline (after title card)
|
||||
|
||||
final_video.py checks job["timing_type"] to know which to use.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from utils.fonts import getsize
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# RenderJob — the contract
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class RenderJob:
    """
    One caption image (written as img{idx}.png) together with its timing.

    ``timing_type`` selects which group of timing fields is meaningful:

      "fraction"  ->  ``audio_idx`` + ``time_fraction``.  The consumer derives
                      the on-screen time from the referenced audio clip:
                      1.0 means the whole clip, 0.25 a quarter of it.
      "absolute"  ->  ``clip_start`` + ``clip_end``, given in seconds on the
                      video timeline and meant to be used as-is.

    The fields of the group that is not selected keep their defaults.
    """

    # Identity and content.
    idx: int                     # image number used in the img{idx}.png name
    lines: List[str]             # caption text, one entry per rendered line
    timing_type: str             # "fraction" or "absolute"

    # Used when timing_type == "fraction".
    audio_idx: int = 0
    time_fraction: float = 1.0

    # Used when timing_type == "absolute".
    clip_start: float = 0.0
    clip_end: float = 0.0
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Display modes
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
DISPLAY_MODES = {"single", "multi", "aligned"}
|
||||
|
||||
|
||||
def render_multi_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Put the whole sentence on a single image, wrapped into several lines.

    The sentence is split on whitespace and re-joined into lines of
    ``style["words_per_chunk"]`` words each.  Exactly one fraction-timed
    RenderJob is returned, covering the full audio clip (time_fraction 1.0).

    Intended for the calmer moods: funny, sad, wholesome, happy.
    """
    tokens = sentence.split()
    per_line = style["words_per_chunk"]

    wrapped = []
    for offset in range(0, len(tokens), per_line):
        wrapped.append(" ".join(tokens[offset:offset + per_line]))

    # A sentence without any words still yields one line (the raw sentence).
    if len(wrapped) == 0:
        wrapped = [sentence]

    job = RenderJob(
        idx=start_idx,
        lines=wrapped,
        timing_type="fraction",
        audio_idx=audio_idx,
        time_fraction=1.0,
    )
    return [job]
|
||||
|
||||
|
||||
def render_single_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
) -> List[RenderJob]:
    """
    Break the sentence into word chunks and give every chunk its own image.

    Chunks hold ``style["words_per_chunk"]`` words.  With N chunks, each
    resulting RenderJob is fraction-timed with an equal 1/N share of the
    audio clip.  Image indices run consecutively from ``start_idx``.

    Intended for the punchier moods: scary, dramatic, angry, mysterious.
    """
    chunk_size = style["words_per_chunk"]
    tokens = sentence.split()

    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        piece = " ".join(tokens[offset:offset + chunk_size])
        if piece.strip():
            pieces.append(piece)

    # Nothing to split: show the raw sentence as the one and only chunk.
    if not pieces:
        pieces = [sentence]

    share = 1.0 / len(pieces)

    jobs = []
    for position, piece in enumerate(pieces):
        jobs.append(
            RenderJob(
                idx=start_idx + position,
                lines=[piece],
                timing_type="fraction",
                audio_idx=audio_idx,
                time_fraction=share,
            )
        )
    return jobs
|
||||
|
||||
|
||||
def render_aligned_mode(
    sentence: str,
    style: dict,
    audio_idx: int,
    start_idx: int,
    word_timestamps: List[dict],
    audio_start_time: float,
    audio_duration: float,
) -> List[RenderJob]:
    """
    Word-level aligned mode driven by per-word timestamps.

    Consecutive entries of ``word_timestamps`` are grouped into chunks of
    ``style["words_per_chunk"]`` words; every chunk becomes one
    absolute-timed RenderJob:

      clip_start = audio_start_time + start of the chunk's first word
      clip_end   = audio_start_time + start of the next chunk's first word,
                   or, for the final chunk, the end of its last word
                   (start + 0.3 s when that word has no "end")

    ``clip_end`` is capped at the end of the audio file and then raised, if
    necessary, so that every caption stays visible for at least 100 ms.

    Parameters
    ----------
    sentence          : caption text, used only by the single-mode fallback
    style             : style entry; must provide "words_per_chunk"
    audio_idx         : index of the audio file (used by the fallback)
    start_idx         : image index of the first job produced
    word_timestamps   : dicts with "word", "start" and optionally "end",
                        times relative to the start of the audio file
    audio_start_time  : absolute time in the video when this audio starts
    audio_duration    : duration of this audio file in seconds

    Falls back to single mode when the timestamps are empty or malformed
    (an entry lacks "word"/"start", or a time is not numeric).
    """
    wpc = style["words_per_chunk"]

    if not word_timestamps:
        return render_single_mode(sentence, style, audio_idx, start_idx)

    total = len(word_timestamps)
    # Loop-invariant: no caption may run past the end of this audio file.
    audio_end = audio_start_time + audio_duration

    jobs: List[RenderJob] = []
    try:
        for chunk_start in range(0, total, wpc):
            chunk_words = word_timestamps[chunk_start:chunk_start + wpc]

            text = " ".join(w["word"] for w in chunk_words)
            clip_start = audio_start_time + chunk_words[0]["start"]

            next_chunk = chunk_start + wpc
            if next_chunk < total:
                # Hold the caption until the next chunk begins (no gaps).
                clip_end = audio_start_time + word_timestamps[next_chunk]["start"]
            else:
                last_word = chunk_words[-1]
                if "end" in last_word:
                    last_end = last_word["end"]
                else:
                    last_end = last_word["start"] + 0.3
                clip_end = audio_start_time + last_end

            clip_end = min(clip_end, audio_end)
            clip_end = max(clip_end, clip_start + 0.1)  # minimum 100ms visibility

            jobs.append(RenderJob(
                idx=start_idx + len(jobs),
                lines=[text],
                timing_type="absolute",
                clip_start=round(clip_start, 3),
                clip_end=round(clip_end, 3),
            ))
    except (KeyError, TypeError):
        # Malformed entry (missing key or non-numeric time): the documented
        # contract is to degrade to evenly split single mode, not to crash.
        return render_single_mode(sentence, style, audio_idx, start_idx)

    return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Router
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_render_jobs(
    sentences: List[str],
    style: dict,
    mp3_dir: Optional[str] = None,
    audio_start_times: Optional[List[float]] = None,
    audio_durations: Optional[List[float]] = None,
) -> List[RenderJob]:
    """
    Route each sentence to the renderer selected by ``style["display_mode"]``.

    Returns one flat, ordered list of RenderJobs whose ``idx`` values run
    consecutively from 0 across all sentences.

    An unknown display mode is reported and treated as "multi".

    In "aligned" mode the word timestamps for sentence ``i`` are loaded via
    ``utils.whisper_aligner.load_word_timestamps`` for
    ``{mp3_dir}/postaudio-{i}.mp3``.  A sentence falls back to "single" mode
    when its timestamps cannot be used: none were loaded, ``mp3_dir`` or the
    timing lists were not supplied, or the timing lists have no entry for
    that sentence.

    Parameters
    ----------
    sentences         : one per postaudio-{i}.mp3
    style             : STYLE_MAP entry for current sentiment
    mp3_dir           : path to mp3 folder (needed for aligned mode)
    audio_start_times : absolute start time of each audio in video (needed for aligned)
    audio_durations   : duration of each audio file (needed for aligned)
    """
    mode = style.get("display_mode", "multi")

    if mode not in DISPLAY_MODES:
        print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'")
        mode = "multi"

    all_jobs: List[RenderJob] = []
    img_counter: int = 0

    for audio_idx, sentence in enumerate(sentences):

        if mode == "aligned":
            word_ts = None
            # Timing is usable only if everything was supplied AND both
            # lists actually cover this sentence (there may be fewer audio
            # files than sentences).
            has_timing = bool(
                mp3_dir
                and audio_start_times
                and audio_durations
                and audio_idx < len(audio_start_times)
                and audio_idx < len(audio_durations)
            )
            if has_timing:
                # Imported lazily so the aligner is only required when
                # aligned mode is really used.
                from utils.whisper_aligner import load_word_timestamps
                audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3")
                word_ts = load_word_timestamps(audio_path)

            if word_ts:
                jobs = render_aligned_mode(
                    sentence=sentence,
                    style=style,
                    audio_idx=audio_idx,
                    start_idx=img_counter,
                    word_timestamps=word_ts,
                    audio_start_time=audio_start_times[audio_idx],
                    audio_duration=audio_durations[audio_idx],
                )
            else:
                # No usable timestamps for this sentence: fall back to single mode
                print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode")
                jobs = render_single_mode(sentence, style, audio_idx, img_counter)

        elif mode == "single":
            jobs = render_single_mode(sentence, style, audio_idx, img_counter)

        else:
            jobs = render_multi_mode(sentence, style, audio_idx, img_counter)

        all_jobs.extend(jobs)
        img_counter += len(jobs)

    return all_jobs
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Drawing primitives
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def measure_text_block(
    draw: ImageDraw.ImageDraw,
    lines: List[str],
    font: ImageFont.FreeTypeFont,
    line_spacing: int,
) -> tuple:
    """
    Return ``(width, height)`` of a block of text lines set in ``font``.

    Width is that of the widest line (never below 0).  Height is the sum of
    all line heights plus ``line_spacing`` between neighbouring lines; no
    spacing is added after the final line.  An empty list measures (0, 0).

    ``draw`` is accepted for interface compatibility but is not used; all
    measuring goes through ``getsize(font, line)``.
    """
    widest = 0
    block_height = 0
    final_pos = len(lines) - 1

    for pos, text in enumerate(lines):
        line_w, line_h = getsize(font, text)
        widest = max(widest, line_w)
        block_height += line_h
        if pos != final_pos:
            block_height += line_spacing

    return widest, block_height
|
||||
|
||||
|
||||
def draw_stroked_text(
    draw: ImageDraw.ImageDraw,
    x: int,
    y: int,
    line: str,
    font: ImageFont.FreeTypeFont,
    fill_color: tuple,
    stroke_color: tuple,
    stroke_width: int,
) -> None:
    """
    Draw ``line`` at ``(x, y)`` with an outline.

    The outline is produced by stamping the text in ``stroke_color`` at 16
    offsets around the target position (4 cardinal, 4 diagonal and 8
    in-between points at half the stroke width, at least 1 px), after which
    the text itself is drawn on top in ``fill_color``.
    """
    full = stroke_width
    half = max(1, full // 2)

    ring = [(-full, 0), (full, 0), (0, -full), (0, full)]
    # Corners, then the points halfway between corners and cardinals.
    ring += [(dx, dy) for dy in (-full, full) for dx in (-full, full)]
    ring += [(dx, dy) for dy in (-half, half) for dx in (-full, full)]
    ring += [(dx, dy) for dy in (-full, full) for dx in (-half, half)]

    for dx, dy in ring:
        draw.text((x + dx, y + dy), line, font=font, fill=stroke_color)

    draw.text((x, y), line, font=font, fill=fill_color)
|
||||
|
||||
|
||||
def fit_font(
    style: dict,
    lines: List[str],
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
    max_width_ratio: float = 0.88,
    max_height_ratio: float = 0.45,
    min_font_size: int = 30,
    size_step: int = 4,
) -> ImageFont.FreeTypeFont:
    """
    Load the style's font at the largest size at which ``lines`` fit.

    Starting from ``style["font_size"]``, the size is reduced by
    ``size_step`` until the measured text block is no wider than
    ``canvas_w * max_width_ratio`` and no taller than
    ``canvas_h * max_height_ratio``.  Sizes at or below ``min_font_size``
    are never tried; if nothing fits (or the style already starts at or
    below the minimum) the font is returned at exactly ``min_font_size``.

    The font file is ``fonts/<style["font_file"]>``; when that file does
    not exist, ``fonts/Roboto-Bold.ttf`` is used instead.
    """
    font_size = style["font_size"]
    font_path = os.path.join("fonts", style["font_file"])
    if not os.path.exists(font_path):
        font_path = os.path.join("fonts", "Roboto-Bold.ttf")

    max_w = int(canvas_w * max_width_ratio)
    max_h = int(canvas_h * max_height_ratio)

    # measure_text_block only needs a draw handle to satisfy its signature,
    # so create one tiny scratch surface once instead of allocating a
    # full-canvas RGBA image on every iteration.
    scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1), (0, 0, 0, 0)))

    while font_size > min_font_size:
        font = ImageFont.truetype(font_path, font_size)
        bw, bh = measure_text_block(scratch, lines, font, line_spacing)
        if bw <= max_w and bh <= max_h:
            return font
        font_size -= size_step

    return ImageFont.truetype(font_path, min_font_size)
|
||||
|
||||
|
||||
def render_job_to_image(
    job: RenderJob,
    style: dict,
    canvas_w: int,
    canvas_h: int,
    line_spacing: int,
) -> Image.Image:
    """
    Render one RenderJob onto a fully transparent RGBA canvas.

    The font size is chosen by ``fit_font``.  The text block is centred
    vertically on ``canvas_h * style["y_position"]`` and every line is
    centred horizontally, then drawn with the style's fill colour, stroke
    colour and stroke width.
    """
    font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing)

    canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    # Only the block height is needed to place the first line.
    _, block_h = measure_text_block(pen, job.lines, font, line_spacing)
    top = int(canvas_h * style["y_position"]) - (block_h // 2)

    for text in job.lines:
        text_w, text_h = getsize(font, text)
        left = (canvas_w - text_w) // 2
        draw_stroked_text(
            pen,
            left,
            top,
            text,
            font,
            style["fill_color"],
            style["stroke_color"],
            style["stroke_width"],
        )
        top += text_h + line_spacing

    return canvas
|
||||
@ -1,74 +1,156 @@
|
||||
"""
|
||||
imagenarator.py
|
||||
───────────────
|
||||
Thin orchestrator. Does exactly:
|
||||
1. Extract sentences from reddit_obj
|
||||
2. Probe audio durations + compute audio start times (needed for aligned mode)
|
||||
3. Call caption_renderer.get_render_jobs()
|
||||
4. Render each job to PNG
|
||||
5. Save timing_map.json for final_video.py
|
||||
"""
|
||||
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import textwrap
|
||||
from typing import List, Optional
|
||||
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import ffmpeg
|
||||
from rich.progress import track
|
||||
|
||||
from TTS.engine_wrapper import process_text
|
||||
from utils.fonts import getheight, getsize
|
||||
from utils import settings
|
||||
from utils.id import extract_id
|
||||
from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE
|
||||
from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob
|
||||
|
||||
|
||||
LINE_SPACING: int = 20
|
||||
|
||||
def draw_multiple_line_text(
|
||||
image, text, font, text_color, padding, wrap=50, transparent=False
|
||||
) -> None:
|
||||
|
||||
def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]:
|
||||
"""
|
||||
Draw multiline text over given image
|
||||
Extract sentences from thread_post.
|
||||
One sentence per postaudio-{i}.mp3 — order preserved.
|
||||
"""
|
||||
draw = ImageDraw.Draw(image)
|
||||
font_height = getheight(font, text)
|
||||
image_width, image_height = image.size
|
||||
lines = textwrap.wrap(text, width=wrap)
|
||||
y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2)
|
||||
for line in lines:
|
||||
line_width, line_height = getsize(font, line)
|
||||
if transparent:
|
||||
shadowcolor = "black"
|
||||
for i in range(1, 5):
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 - i, y - i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 + i, y - i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 - i, y + i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(
|
||||
((image_width - line_width) / 2 + i, y + i),
|
||||
line,
|
||||
font=font,
|
||||
fill=shadowcolor,
|
||||
)
|
||||
draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color)
|
||||
y += line_height + padding
|
||||
|
||||
|
||||
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None:
|
||||
raw_texts = reddit_obj["thread_post"]
|
||||
sentences: List[str] = []
|
||||
for item in raw_texts:
|
||||
if isinstance(item, dict):
|
||||
text = item.get("text", "")
|
||||
elif isinstance(item, str):
|
||||
text = item
|
||||
else:
|
||||
text = str(item)
|
||||
text = process_text(text, False).strip()
|
||||
if style.get("uppercase", False):
|
||||
text = text.upper()
|
||||
if text:
|
||||
sentences.append(text)
|
||||
return sentences if sentences else ["..."]
|
||||
|
||||
|
||||
def _probe_duration(path: str) -> float:
    """Return the duration of *path* in seconds, or 0.0 if it cannot be probed."""
    try:
        return float(ffmpeg.probe(path)["format"]["duration"])
    except Exception:
        # Deliberate best-effort: a missing or unreadable file (or an
        # unavailable probe tool) counts as zero length instead of aborting.
        return 0.0


def _get_audio_info(mp3_dir: str) -> tuple:
    """
    Discover the postaudio files in ``mp3_dir`` and compute their timing.

    Only files named ``postaudio-<number>.mp3`` are considered; they are
    ordered by that number.  Files matching ``postaudio-*.mp3`` without a
    numeric index are ignored.

    Start times are cumulative: the first file starts at the duration of
    ``title.mp3`` (0.0 if it cannot be probed) and every following file
    starts where the previous one ends.

    Returns (postaudio_files, durations, start_times) - three lists of
    equal length, one entry per postaudio file.
    """
    numbered = []
    for path in glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")):
        # Match on the file name only, so digits in a directory name can
        # never be mistaken for the audio index.
        found = re.search(r"postaudio-(\d+)\.mp3$", os.path.basename(path))
        if found:
            numbered.append((int(found.group(1)), path))
    numbered.sort(key=lambda pair: pair[0])
    postaudio_files = [path for _, path in numbered]

    durations = []
    start_times = []
    current = _probe_duration(os.path.join(mp3_dir, "title.mp3"))

    for audio_file in postaudio_files:
        dur = _probe_duration(audio_file)
        start_times.append(current)
        durations.append(dur)
        current += dur

    return postaudio_files, durations, start_times
|
||||
|
||||
|
||||
def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int:
|
||||
"""
|
||||
texts = reddit_obj["thread_post"]
|
||||
reddit_id = extract_id(reddit_obj)
|
||||
if transparent:
|
||||
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100)
|
||||
else:
|
||||
font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100)
|
||||
|
||||
size = (1920, 1080)
|
||||
|
||||
for idx, text in track(enumerate(texts), "Rendering Image"):
|
||||
image = Image.new("RGBA", size, theme)
|
||||
text = process_text(text, False)
|
||||
draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent)
|
||||
image.save(f"assets/temp/{reddit_id}/png/img{idx}.png")
|
||||
Render caption images for the video.
|
||||
|
||||
Flow:
|
||||
sentences + audio info
|
||||
→ caption_renderer.get_render_jobs()
|
||||
→ List[RenderJob]
|
||||
each RenderJob → transparent PNG (img{idx}.png)
|
||||
timing_map.json → saved for final_video.py
|
||||
|
||||
timing_map.json entry for fraction-based jobs:
|
||||
{"timing_type": "fraction", "audio_idx": N, "time_fraction": F}
|
||||
|
||||
timing_map.json entry for absolute-based jobs (aligned mode):
|
||||
{"timing_type": "absolute", "clip_start": S, "clip_end": E}
|
||||
|
||||
Returns:
|
||||
int: total number of images generated
|
||||
"""
|
||||
# 1. Style
|
||||
sentiment = settings.config["settings"].get("sentiment", "dramatic")
|
||||
style = STYLE_MAP.get(sentiment, DEFAULT_STYLE)
|
||||
CANVAS_W: int = int(settings.config["settings"]["resolution_w"])
|
||||
CANVAS_H: int = int(settings.config["settings"]["resolution_h"])
|
||||
reddit_id = extract_id(reddit_obj)
|
||||
mp3_dir = f"assets/temp/{reddit_id}/mp3"
|
||||
|
||||
# 2. Extract sentences
|
||||
sentences = _extract_sentences(reddit_obj, style)
|
||||
|
||||
# 3. Get audio timing info (needed for aligned mode)
|
||||
_, durations, start_times = _get_audio_info(mp3_dir)
|
||||
|
||||
# 4. Get render jobs
|
||||
jobs: List[RenderJob] = get_render_jobs(
|
||||
sentences=sentences,
|
||||
style=style,
|
||||
mp3_dir=mp3_dir,
|
||||
audio_start_times=start_times if start_times else None,
|
||||
audio_durations=durations if durations else None,
|
||||
)
|
||||
|
||||
# 5. Render each job to a transparent PNG
|
||||
for job in track(jobs, description="Rendering caption images"):
|
||||
image = render_job_to_image(job, style, CANVAS_W, CANVAS_H, LINE_SPACING)
|
||||
image.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png")
|
||||
|
||||
# 6. Save timing map
|
||||
timing_map = []
|
||||
for job in jobs:
|
||||
if job.timing_type == "absolute":
|
||||
timing_map.append({
|
||||
"timing_type": "absolute",
|
||||
"clip_start": job.clip_start,
|
||||
"clip_end": job.clip_end,
|
||||
})
|
||||
else:
|
||||
timing_map.append({
|
||||
"timing_type": "fraction",
|
||||
"audio_idx": job.audio_idx,
|
||||
"time_fraction": job.time_fraction,
|
||||
})
|
||||
|
||||
timing_map_path = f"assets/temp/{reddit_id}/timing_map.json"
|
||||
with open(timing_map_path, "w") as f:
|
||||
json.dump(timing_map, f, indent=2)
|
||||
|
||||
return len(jobs)
|
||||
@ -0,0 +1,231 @@
|
||||
import json
|
||||
import os
|
||||
from openai import OpenAI
|
||||
from utils import settings
|
||||
from utils.console import print_step, print_substep
|
||||
from utils.sentiment_map import (
|
||||
BACKGROUND_MAP,
|
||||
OPENAI_VOICE_MAP,
|
||||
ELEVENLABS_VOICE_MAP,
|
||||
VALID_SENTIMENTS,
|
||||
DEFAULT_SENTIMENT,
|
||||
)
|
||||
|
||||
|
||||
def _get_client() -> OpenAI:
    """
    Build an OpenAI-SDK client that talks to the DeepSeek endpoint.

    The API key is read from ``settings.config["deepseek"]["api_key"]``.
    """
    deepseek_cfg = settings.config["deepseek"]
    client = OpenAI(
        api_key=deepseek_cfg["api_key"],
        base_url="https://api.deepseek.com",
    )
    return client
|
||||
|
||||
|
||||
def _extract_text(reddit_object: dict) -> tuple:
    """
    Return ``(title, post)`` taken from a reddit object.

    ``thread_title`` is returned unchanged ("" when absent).  ``thread_post``
    may be a plain string or a list; a list is flattened into one
    space-separated string.  List items may be dicts (their "text" value is
    used), strings, or any other value (converted with ``str``).  Empty
    items are skipped and a missing or ``None`` post becomes "".
    """
    title = reddit_object.get("thread_title", "")
    post = reddit_object.get("thread_post", "")

    if post is None:
        post = ""
    elif isinstance(post, list):
        parts = []
        for item in post:
            # Posts arrive either as {"text": ...} dicts or as bare strings;
            # both forms carry body text and must be kept.
            text = item.get("text", "") if isinstance(item, dict) else item
            if text:
                parts.append(str(text))
        post = " ".join(parts)

    return title, post
|
||||
|
||||
|
||||
def detect_sentiment(reddit_object: dict) -> str:
    """
    Ask DeepSeek to classify the post and return the sentiment label.

    The title and the first 500 characters of the post body are sent to the
    "deepseek-chat" model.  The reply is normalised (lower-cased, with
    surrounding whitespace, quotes and punctuation removed) and accepted
    only if it is a member of VALID_SENTIMENTS.

    Returns DEFAULT_SENTIMENT when no API key is configured, when the reply
    is not a valid label, or when anything goes wrong during the request.
    """
    try:
        api_key = settings.config["deepseek"]["api_key"]
        if not api_key:
            print_substep("No DeepSeek API key found. Using default sentiment.", style="yellow")
            return DEFAULT_SENTIMENT

        title, post = _extract_text(reddit_object)
        text = f"Title: {title}\nPost: {post[:500]}"

        client = _get_client()

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a sentiment classifier for Reddit stories. "
                        "Classify the post into exactly one of these labels: "
                        "sad, happy, angry, mysterious, funny, dramatic, wholesome, scary. "
                        "Respond with only the label, nothing else. No punctuation, no explanation."
                    ),
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
            max_tokens=10,
            temperature=0,
        )

        # The message content may be None, and models frequently decorate a
        # one-word answer ("Sad.", '"sad"', "**sad**"); normalise before
        # matching so such replies are not thrown away.
        raw_label = response.choices[0].message.content or ""
        label = raw_label.strip().lower().strip(" \t\r\n.,:;!?\"'`*")

        if label not in VALID_SENTIMENTS:
            print_substep(
                f"DeepSeek returned unexpected label '{label}'. Using default: {DEFAULT_SENTIMENT}",
                style="yellow",
            )
            return DEFAULT_SENTIMENT

        return label

    except Exception as e:
        # Top-level boundary: sentiment is optional, so any failure (network,
        # configuration, unexpected response shape) degrades to the default.
        print_substep(f"Sentiment detection failed: {e}. Using default: {DEFAULT_SENTIMENT}", style="yellow")
        return DEFAULT_SENTIMENT
|
||||
|
||||
|
||||
def generate_metadata(reddit_object: dict, sentiment: str) -> dict:
    """
    Generate YouTube title/description, TikTok/Instagram/Facebook captions
    and hashtags in a single DeepSeek call.

    The title and the first 800 characters of the post body are sent to the
    "deepseek-chat" model, which is asked for a bare JSON object.  A
    Markdown code fence around the reply is removed before parsing.

    The returned dict always contains the keys youtube_title,
    youtube_description, tiktok_caption, instagram_caption,
    facebook_caption, hashtags and sentiment.  Missing text keys default to
    "" and ``hashtags`` is always a list of strings.

    This function does not write anything to disk.  Falls back to
    ``_fallback_metadata`` when no API key is configured or on any error.
    """
    try:
        api_key = settings.config["deepseek"]["api_key"]
        if not api_key:
            return _fallback_metadata(reddit_object, sentiment)

        title, post = _extract_text(reddit_object)
        text = f"Title: {title}\nPost: {post[:800]}"
        channel_name = settings.config["settings"].get("channel_name", "Reddit Tales")

        client = _get_client()

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a social media content creator specializing in Reddit story videos. "
                        "Generate engaging titles, captions, and hashtags for a Reddit story video. "
                        "Return ONLY a valid JSON object with these exact keys: "
                        "youtube_title, youtube_description, tiktok_caption, instagram_caption, facebook_caption, hashtags. "
                        "hashtags must be a list of strings. "
                        "Keep youtube_title under 70 characters. "
                        "Keep tiktok_caption under 150 characters including hashtags. "
                        "Make content engaging and click-worthy. "
                        f"The channel name is '{channel_name}'. "
                        f"The story mood is: {sentiment}. "
                        "No markdown, no explanation, just the JSON object."
                    ),
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
            max_tokens=600,
            temperature=0.7,
        )

        # The message content may be None; treat that like an empty reply.
        raw = (response.choices[0].message.content or "").strip()

        # Strip markdown code blocks if present
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            # Drop the optional language tag, whatever its letter case.
            if raw[:4].lower() == "json":
                raw = raw[4:]
            raw = raw.strip()

        metadata = json.loads(raw)
        if not isinstance(metadata, dict):
            raise ValueError("model reply is not a JSON object")

        # Validate all required keys exist
        text_keys = [
            "youtube_title", "youtube_description",
            "tiktok_caption", "instagram_caption",
            "facebook_caption",
        ]
        for key in text_keys:
            if key not in metadata:
                metadata[key] = ""

        # hashtags is documented as a list of strings: keep that type even
        # when the model omits the key or returns one space-separated string.
        hashtags = metadata.get("hashtags")
        if isinstance(hashtags, str):
            hashtags = hashtags.split()
        elif not isinstance(hashtags, list):
            hashtags = []
        metadata["hashtags"] = hashtags

        metadata["sentiment"] = sentiment
        return metadata

    except Exception as e:
        # Top-level boundary: metadata is optional, so any failure (network,
        # invalid JSON, unexpected response shape) degrades to the fallback.
        print_substep(f"Metadata generation failed: {e}. Using fallback.", style="yellow")
        return _fallback_metadata(reddit_object, sentiment)
|
||||
|
||||
|
||||
def _fallback_metadata(reddit_object: dict, sentiment: str) -> dict:
    """Build basic metadata from the thread title when DeepSeek generation fails.

    Parameters
    ----------
    reddit_object : dict
        Thread data; only the "thread_title" key is read.
    sentiment : str
        Sentiment label, copied into the result unchanged.

    Returns
    -------
    dict
        A dict with "sentiment", "youtube_title", "youtube_description",
        "tiktok_caption", "instagram_caption", "facebook_caption" and
        "hashtags" (a list of strings).
    """
    # This is the last resort used from an exception handler, so it must not
    # raise itself: a missing, None or empty title becomes a placeholder
    # instead of failing on the slices below.
    title = reddit_object.get("thread_title") or "Reddit Story"
    caption_title = title[:100]
    return {
        "sentiment": sentiment,
        "youtube_title": title[:70],
        "youtube_description": f"{title}\n\n#reddit #stories",
        "tiktok_caption": f"{caption_title} #reddit #storytime",
        "instagram_caption": f"{caption_title} #reddit #stories",
        "facebook_caption": title,
        "hashtags": ["#reddit", "#storytime", "#stories", "#redditstories"],
    }
|
||||
|
||||
|
||||
def save_metadata(metadata: dict, reddit_object: dict) -> None:
    """Write *metadata* to ``metadata.json`` in the per-video results folder.

    The folder is ``results/<subreddit>/<thread_id>_<background_video>``; the
    subreddit falls back to the configured one, the thread id and background
    name fall back to "unknown". Any failure is reported through
    ``print_substep`` and swallowed, so saving is best-effort.
    """
    try:
        subreddit_name = reddit_object.get(
            "thread_subreddit",
            settings.config["reddit"]["thread"]["subreddit"],
        )
        post_id = reddit_object.get("thread_id", "unknown")
        background_cfg = settings.config["settings"]["background"]
        background_name = background_cfg.get("background_video", "unknown")

        # Create the target folder first; it may not exist yet for this video.
        out_dir = f"results/{subreddit_name}/{post_id}_{background_name}"
        os.makedirs(out_dir, exist_ok=True)

        out_path = f"{out_dir}/metadata.json"
        with open(out_path, "w", encoding="utf-8") as handle:
            json.dump(metadata, handle, indent=2, ensure_ascii=False)

        print_substep(f"Metadata saved → {out_path}", style="bold green")
    except Exception as exc:
        print_substep(f"Failed to save metadata: {exc}", style="yellow")
|
||||
|
||||
|
||||
def apply_sentiment_config(reddit_object: dict) -> None:
    """Detect the story sentiment and apply it to this run.

    Overrides the in-memory background and TTS voice settings, generates the
    metadata and saves it to disk. config.toml is never written, so every
    change lasts for the current run only.
    """
    print_step("Detecting sentiment and generating metadata... 🎭")

    sentiment = detect_sentiment(reddit_object)
    run_settings = settings.config["settings"]

    # Keep the label in the in-memory config; per the original note,
    # imagenarator.py reads it for its STYLE_MAP lookups at render time.
    run_settings["sentiment"] = sentiment

    # Background video/audio pair chosen for this sentiment.
    bg_video, bg_audio = BACKGROUND_MAP[sentiment]
    background_cfg = run_settings["background"]
    background_cfg["background_video"] = bg_video
    background_cfg["background_audio"] = bg_audio

    # Voice: only the ElevenLabs and OpenAI engines have a per-sentiment table.
    tts_cfg = run_settings["tts"]
    voice_choice = tts_cfg["voice_choice"].lower()
    voice_overrides = {
        "elevenlabs": ("elevenlabs_voice_name", ELEVENLABS_VOICE_MAP),
        "openai": ("openai_voice_name", OPENAI_VOICE_MAP),
    }
    if voice_choice in voice_overrides:
        config_key, voice_table = voice_overrides[voice_choice]
        voice = voice_table[sentiment]
        tts_cfg[config_key] = voice
    else:
        voice = f"(voice override not supported for {voice_choice})"

    # Metadata is produced after the overrides because save_metadata reads the
    # background name back from the config to build the output folder.
    print_substep("Generating titles, captions and hashtags... ✍️", style="bold blue")
    metadata = generate_metadata(reddit_object, sentiment)
    save_metadata(metadata, reddit_object)

    # Summary of everything that was decided for this run.
    print_substep(f"Sentiment detected : {sentiment} 🎯", style="bold green")
    summary_lines = (
        f"Background video : {bg_video}",
        f"Background audio : {bg_audio or 'none'}",
        f"Voice : {voice}",
        f"YouTube title : {metadata['youtube_title']}",
        f"TikTok caption : {metadata['tiktok_caption']}",
    )
    for line in summary_lines:
        print_substep(line, style="bold blue")
|
||||
@ -0,0 +1,158 @@
|
||||
# One row per sentiment:
# (sentiment, background video, background audio, OpenAI voice, ElevenLabs voice)
# Row order defines the key order of every map below and of VALID_SENTIMENTS.
_SENTIMENT_ROWS = (
    ("sad", "minecraft", "lofi", "nova", "Brian - Deep, Resonant and Comforting"),
    ("happy", "fall-guys", "chill-summer", "shimmer", "Jessica - Playful, Bright, Warm"),
    ("angry", "gta", "lofi", "onyx", "Adam - Dominant, Firm"),
    ("mysterious", "csgo-surf", "lofi-2", "echo", "Callum - Husky Trickster"),
    ("funny", "cluster-truck", "chill-summer", "fable", "Laura - Enthusiast, Quirky Attitude"),
    ("dramatic", "rocket-league", "lofi", "alloy", "George - Warm, Captivating Storyteller"),
    ("wholesome", "steep", "chill-summer", "nova", "Matilda - Knowledgable, Professional"),
    ("scary", "minecraft-2", "lofi-2", "onyx", "Harry - Fierce Warrior"),
)

# sentiment -> (background video name, background audio name)
BACKGROUND_MAP = {row[0]: (row[1], row[2]) for row in _SENTIMENT_ROWS}

# sentiment -> OpenAI TTS voice name
OPENAI_VOICE_MAP = {row[0]: row[3] for row in _SENTIMENT_ROWS}

# sentiment -> ElevenLabs voice name
ELEVENLABS_VOICE_MAP = {row[0]: row[4] for row in _SENTIMENT_ROWS}

VALID_SENTIMENTS = list(BACKGROUND_MAP)
DEFAULT_SENTIMENT = "dramatic"
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
# STYLE_MAP — caption look per sentiment
# ─────────────────────────────────────────────────────────────────────────────
#
# display_mode:
#   "aligned"  Uses WhisperX word timestamps, so captions sync with any TTS.
#              Falls back to "single" per sentence when timestamps are
#              unavailable. Preferred for every sentiment once WhisperX is
#              installed.
#   "single"   Splits a sentence into word chunks with equal time per chunk.
#              A good fallback when WhisperX is not installed.
#   "multi"    Shows the full sentence on one image, without splitting.
#              Best for slow TTS or wholesome/sad content.
#
# words_per_chunk:
#   "aligned"  words grouped per visible chunk (3-5 recommended)
#   "single"   words per chunk (higher = fewer chunks = slower pace)
#   "multi"    words per line in the wrapped text block
#
def _style(
    font_file,
    font_size,
    fill_color,
    *,
    stroke_color=(0, 0, 0, 255),
    stroke_width=4,
    words_per_chunk=5,
    y_position=0.65,
    uppercase=False,
    display_mode="aligned",
):
    """Return one STYLE_MAP entry as a fresh dict with the standard key order."""
    return {
        "font_file": font_file,
        "font_size": font_size,
        "fill_color": fill_color,
        "stroke_color": stroke_color,
        "stroke_width": stroke_width,
        "words_per_chunk": words_per_chunk,
        "y_position": y_position,
        "uppercase": uppercase,
        "display_mode": display_mode,
    }


STYLE_MAP = {
    "dramatic": _style(
        "Montserrat-ExtraBold.ttf", 95, (255, 255, 255, 255),
        words_per_chunk=4,
    ),
    "scary": _style(
        "Oswald-Bold.ttf", 95, (232, 244, 248, 255),
        stroke_width=5, words_per_chunk=3,
    ),
    "angry": _style(
        "Anton-Regular.ttf", 105, (255, 69, 0, 255),
        stroke_width=5, words_per_chunk=3, uppercase=True,
    ),
    "mysterious": _style(
        "Raleway-Bold.ttf", 90, (184, 212, 232, 255),
        words_per_chunk=3,
    ),
    "funny": _style("Nunito-ExtraBold.ttf", 90, (255, 230, 0, 255)),
    "sad": _style(
        "Lato-Bold.ttf", 88, (220, 225, 255, 255),
        stroke_color=(10, 10, 46, 255), stroke_width=3,
    ),
    "wholesome": _style(
        "Nunito-ExtraBold.ttf", 88, (255, 248, 231, 255),
        stroke_color=(26, 10, 0, 255), stroke_width=3,
    ),
    "happy": _style("Nunito-ExtraBold.ttf", 90, (255, 230, 0, 255)),
}

# Style used when no sentiment-specific entry applies.
DEFAULT_STYLE = STYLE_MAP["dramatic"]
|
||||
@ -0,0 +1,168 @@
|
||||
"""
|
||||
whisper_aligner.py
|
||||
──────────────────
|
||||
Word-level timestamp extraction using WhisperX.
|
||||
|
||||
This module runs after each TTS audio file is saved.
|
||||
It produces a word-level timestamp JSON for every postaudio-{i}.mp3.
|
||||
|
||||
Output format (postaudio-{i}_words.json):
|
||||
[
|
||||
{"word": "I", "start": 0.00, "end": 0.18},
|
||||
{"word": "told", "start": 0.18, "end": 0.42},
|
||||
...
|
||||
]
|
||||
|
||||
WhisperX is used because:
|
||||
- Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.)
|
||||
- Free, runs locally, no API cost
|
||||
- Word-level accuracy (not sentence-level)
|
||||
- Fast on CPU for short audio clips
|
||||
|
||||
If WhisperX is not installed or fails for any reason,
|
||||
this module returns None and the system falls back to
|
||||
time_fraction-based sync (single/multi mode).
|
||||
No crashes, no interruptions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from utils.console import print_substep
|
||||
|
||||
|
||||
# The WhisperX model is expensive to load, so a single instance is kept at
# module level together with the language it was loaded for.
_whisper_model = None
_whisper_model_lang = None


def _get_model(language: str = "en"):
    """Return the cached WhisperX model for *language*, loading it on demand.

    The cached instance is reused while the requested language matches the one
    it was loaded for. Returns None when WhisperX cannot be imported or the
    model fails to load; only the latter prints a warning.
    """
    global _whisper_model, _whisper_model_lang

    cache_is_valid = _whisper_model is not None and _whisper_model_lang == language
    if cache_is_valid:
        return _whisper_model

    try:
        import whisperx

        print_substep("Loading WhisperX model (first run only)...", style="bold blue")
        # "base" is small enough for CPU use yet accurate enough for TTS audio.
        loaded = whisperx.load_model(
            "base",
            device="cpu",
            compute_type="int8",
            language=language,
        )
    except ImportError:
        # WhisperX is optional; its absence is not worth a warning.
        return None
    except Exception as exc:
        print_substep(f"WhisperX model load failed: {exc}", style="yellow")
        return None

    _whisper_model = loaded
    _whisper_model_lang = language
    return loaded
|
||||
|
||||
|
||||
# Alignment models keyed by language code. Loading one is costly, so each
# (model, metadata) pair is loaded once and reused for every later clip.
_align_model_cache = {}


def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]:
    """
    Run WhisperX on a single audio file and return word-level timestamps.

    Parameters
    ----------
    audio_path : str
        Path to the .mp3 file to align.
    language : str
        Language code (default: "en"). Matches TTS language.

    Returns
    -------
    Optional[List[dict]]
        List of {"word": str, "start": float, "end": float} dicts, with the
        times rounded to 3 decimals. Returns None if WhisperX is unavailable,
        alignment fails, or no word carries both timestamps.
    """
    try:
        import whisperx
    except ImportError:
        # WhisperX is optional: stay silent like _get_model does, instead of
        # printing a failure warning for every single clip.
        return None

    try:
        model = _get_model(language)
        if model is None:
            return None

        # Transcribe first; the alignment step refines the segments to words.
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=4)

        # Load the alignment model only once per language. A failed load is
        # not cached, so the next clip simply retries.
        if language not in _align_model_cache:
            _align_model_cache[language] = whisperx.load_align_model(
                language_code=language,
                device="cpu",
            )
        align_model, align_metadata = _align_model_cache[language]

        aligned = whisperx.align(
            result["segments"],
            align_model,
            align_metadata,
            audio,
            device="cpu",
            return_char_alignments=False,
        )

        # Flatten all words across all segments, skipping entries that lack
        # text or either timestamp.
        words = []
        for segment in aligned.get("word_segments", []):
            word = segment.get("word", "").strip()
            start = segment.get("start")
            end = segment.get("end")
            if word and start is not None and end is not None:
                words.append({
                    "word": word,
                    "start": round(float(start), 3),
                    "end": round(float(end), 3),
                })

        # An empty result is reported as None so callers fall back uniformly.
        return words or None

    except Exception as e:
        print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow")
        return None
|
||||
|
||||
|
||||
def align_and_save(audio_path: str, language: str = "en") -> Optional[str]:
    """
    Align audio and save word timestamps as a JSON file next to the audio.

    Parameters
    ----------
    audio_path : str
        e.g. "assets/temp/abc123/mp3/postaudio-0.mp3"
    language : str
        Language code.

    Returns
    -------
    Optional[str]
        Path to saved JSON file, or None if alignment failed.

    Raises
    ------
    OSError
        If the JSON file cannot be written.
    """
    words = align_audio(audio_path, language)
    if words is None:
        return None

    # Swap only the final extension. A plain str.replace(".mp3", ...) would
    # also rewrite ".mp3" inside directory names and, for a non-.mp3 path,
    # return the audio path itself, so the JSON would overwrite the audio.
    json_path = os.path.splitext(audio_path)[0] + "_words.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(words, f, indent=2, ensure_ascii=False)

    return json_path
|
||||
|
||||
|
||||
def load_word_timestamps(audio_path: str) -> Optional[List[dict]]:
    """
    Load previously saved word timestamps for an audio file.

    The timestamps are read from "<audio path without extension>_words.json".
    Returns None if that file doesn't exist.
    """
    # Swap only the final extension. A plain str.replace(".mp3", ...) would
    # also rewrite ".mp3" inside directory names and, for a non-.mp3 path,
    # point at the audio file itself.
    json_path = os.path.splitext(audio_path)[0] + "_words.json"
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
|
||||
Loading…
Reference in new issue