diff --git a/TTS/engine_wrapper.py b/TTS/engine_wrapper.py index 1026a6d..d86e767 100644 --- a/TTS/engine_wrapper.py +++ b/TTS/engine_wrapper.py @@ -14,23 +14,11 @@ from utils import settings from utils.console import print_step, print_substep from utils.voice import sanitize_text -DEFAULT_MAX_LENGTH: int = ( - 50 # Video length variable, edit this on your own risk. It should work, but it's not supported -) +DEFAULT_MAX_LENGTH: int = 50 class TTSEngine: - """Calls the given TTS engine to reduce code duplication and allow multiple TTS engines. - - Args: - tts_module : The TTS module. Your module should handle the TTS itself and saving to the given path under the run method. - reddit_object : The reddit object that contains the posts to read. - path (Optional) : The unix style path to save the mp3 files to. This must not have leading or trailing slashes. - max_length (Optional) : The maximum length of the mp3 files in total. - - Notes: - tts_module must take the arguments text and filepath. - """ + """Calls the given TTS engine to reduce code duplication and allow multiple TTS engines.""" def __init__( self, @@ -42,18 +30,14 @@ class TTSEngine: ): self.tts_module = tts_module() self.reddit_object = reddit_object - self.redditid = re.sub(r"[^\w\s-]", "", reddit_object["thread_id"]) self.path = path + self.redditid + "/mp3" self.max_length = max_length self.length = 0 self.last_clip_length = last_clip_length - def add_periods( - self, - ): # adds periods to the end of paragraphs (where people often forget to put them) so tts doesn't blend sentences + def add_periods(self): for comment in self.reddit_object["comments"]: - # remove links regex_urls = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*" comment["comment_body"] = re.sub(regex_urls, " ", comment["comment_body"]) comment["comment_body"] = comment["comment_body"].replace("\n", ". ") @@ -72,7 +56,6 @@ class TTSEngine: self.add_periods() self.call_tts("title", process_text(self.reddit_object["thread_title"])) - # processed_text = ##self.reddit_object["thread_post"] != "" idx = 0 if settings.config["settings"]["storymode"]: @@ -84,24 +67,41 @@ class TTSEngine: elif settings.config["settings"]["storymodemethod"] == 1: for idx, text in track(enumerate(self.reddit_object["thread_post"])): self.call_tts(f"postaudio-{idx}", process_text(text)) - + # ── WhisperX alignment ──────────────────────────────────── + # Run immediately after each TTS save so word timestamps + # are ready when imagemaker() runs later. + # Fails silently — never blocks video generation. + self._align_audio(f"postaudio-{idx}") else: for idx, comment in track(enumerate(self.reddit_object["comments"]), "Saving..."): - # ! Stop creating mp3 files if the length is greater than max length. if self.length > self.max_length and idx > 1: self.length -= self.last_clip_length idx -= 1 break - if ( - len(comment["comment_body"]) > self.tts_module.max_chars - ): # Split the comment if it is too long - self.split_post(comment["comment_body"], idx) # Split the comment - else: # If the comment is not too long, just call the tts engine + if len(comment["comment_body"]) > self.tts_module.max_chars: + self.split_post(comment["comment_body"], idx) + else: self.call_tts(f"{idx}", process_text(comment["comment_body"])) print_substep("Saved Text to MP3 files successfully.", style="bold green") return self.length, idx + def _align_audio(self, filename: str) -> None: + """ + Run WhisperX on a saved audio file to produce word-level timestamps. + Called immediately after each postaudio-{i}.mp3 is saved. + Fails silently — system falls back to time_fraction mode if unavailable. + """ + try: + from utils.whisper_aligner import align_and_save + audio_path = f"{self.path}/{filename}.mp3" + lang = settings.config["reddit"]["thread"].get("post_lang", "en") or "en" + result = align_and_save(audio_path, language=lang) + if result: + print_substep(f"Word timestamps saved → {result}", style="dim") + except Exception: + pass # Never crash on alignment failure + def split_post(self, text: str, idx): split_files = [] split_text = [ @@ -114,8 +114,6 @@ class TTSEngine: for idy, text_cut in enumerate(split_text): newtext = process_text(text_cut) - # print(f"{idx}-{idy}: {newtext}\n") - if not newtext or newtext.isspace(): print("newtext was blank because sanitized split text resulted in none") continue @@ -144,7 +142,6 @@ class TTSEngine: def call_tts(self, filename: str, text: str): if settings.config["settings"]["tts"]["voice_choice"] == "googletranslate": - # GTTS does not have the argument 'random_voice' self.tts_module.run( text, filepath=f"{self.path}/{filename}.mp3", @@ -155,10 +152,6 @@ class TTSEngine: filepath=f"{self.path}/{filename}.mp3", random_voice=settings.config["settings"]["tts"]["random_voice"], ) - # try: - # self.length += MP3(f"{self.path}/{filename}.mp3").info.length - # except (MutagenError, HeaderNotFoundError): - # self.length += sox.file_info.duration(f"{self.path}/{filename}.mp3") try: clip = AudioFileClip(f"{self.path}/{filename}.mp3") self.last_clip_length = clip.duration @@ -185,4 +178,4 @@ def process_text(text: str, clean: bool = True): print_substep("Translating Text...") translated_text = translators.translate_text(text, translator="google", to_language=lang) new_text = sanitize_text(translated_text) - return new_text + return new_text \ No newline at end of file diff --git a/fonts/Anton-Regular.ttf b/fonts/Anton-Regular.ttf new file mode 100644 index 0000000..3eb0e15 Binary files /dev/null and b/fonts/Anton-Regular.ttf differ diff --git a/fonts/Lato-Black.ttf b/fonts/Lato-Black.ttf new file mode 100644 index 0000000..4340502 Binary files /dev/null and b/fonts/Lato-Black.ttf differ diff --git a/fonts/Lato-BlackItalic.ttf b/fonts/Lato-BlackItalic.ttf new file mode 100644 index 0000000..4df1555 Binary files /dev/null and b/fonts/Lato-BlackItalic.ttf differ diff --git a/fonts/Lato-Bold.ttf b/fonts/Lato-Bold.ttf new file mode 100644 index 0000000..016068b Binary files /dev/null and b/fonts/Lato-Bold.ttf differ diff --git a/fonts/Lato-BoldItalic.ttf b/fonts/Lato-BoldItalic.ttf new file mode 100644 index 0000000..a05d503 Binary files /dev/null and b/fonts/Lato-BoldItalic.ttf differ diff --git a/fonts/Lato-Italic.ttf b/fonts/Lato-Italic.ttf new file mode 100644 index 0000000..0d0f69e Binary files /dev/null and b/fonts/Lato-Italic.ttf differ diff --git a/fonts/Lato-Light.ttf b/fonts/Lato-Light.ttf new file mode 100644 index 0000000..dfa72ce Binary files /dev/null and b/fonts/Lato-Light.ttf differ diff --git a/fonts/Lato-LightItalic.ttf b/fonts/Lato-LightItalic.ttf new file mode 100644 index 0000000..12f2b6c Binary files /dev/null and b/fonts/Lato-LightItalic.ttf differ diff --git a/fonts/Lato-Regular.ttf b/fonts/Lato-Regular.ttf new file mode 100644 index 0000000..bb2e887 Binary files /dev/null and b/fonts/Lato-Regular.ttf differ diff --git a/fonts/Lato-Thin.ttf b/fonts/Lato-Thin.ttf new file mode 100644 index 0000000..ba58da1 Binary files /dev/null and b/fonts/Lato-Thin.ttf differ diff --git a/fonts/Lato-ThinItalic.ttf b/fonts/Lato-ThinItalic.ttf new file mode 100644 index 0000000..4d82766 Binary files /dev/null and b/fonts/Lato-ThinItalic.ttf differ diff --git a/fonts/Montserrat-Black.ttf b/fonts/Montserrat-Black.ttf new file mode 100644 index 0000000..2fab7ab Binary files /dev/null and b/fonts/Montserrat-Black.ttf differ diff --git a/fonts/Montserrat-BlackItalic.ttf b/fonts/Montserrat-BlackItalic.ttf new file mode 100644 index 0000000..04d3b47 Binary files /dev/null and b/fonts/Montserrat-BlackItalic.ttf differ diff --git a/fonts/Montserrat-Bold.ttf b/fonts/Montserrat-Bold.ttf new file mode 100644 index 0000000..4033587 Binary files /dev/null and b/fonts/Montserrat-Bold.ttf differ diff --git a/fonts/Montserrat-BoldItalic.ttf b/fonts/Montserrat-BoldItalic.ttf new file mode 100644 index 0000000..0cc5c2c Binary files /dev/null and b/fonts/Montserrat-BoldItalic.ttf differ diff --git a/fonts/Montserrat-ExtraBold.ttf b/fonts/Montserrat-ExtraBold.ttf new file mode 100644 index 0000000..476ec30 Binary files /dev/null and b/fonts/Montserrat-ExtraBold.ttf differ diff --git a/fonts/Montserrat-ExtraBoldItalic.ttf b/fonts/Montserrat-ExtraBoldItalic.ttf new file mode 100644 index 0000000..a1ac9a9 Binary files /dev/null and b/fonts/Montserrat-ExtraBoldItalic.ttf differ diff --git a/fonts/Montserrat-ExtraLight.ttf b/fonts/Montserrat-ExtraLight.ttf new file mode 100644 index 0000000..efaeab0 Binary files /dev/null and b/fonts/Montserrat-ExtraLight.ttf differ diff --git a/fonts/Montserrat-ExtraLightItalic.ttf b/fonts/Montserrat-ExtraLightItalic.ttf new file mode 100644 index 0000000..a8d18de Binary files /dev/null and b/fonts/Montserrat-ExtraLightItalic.ttf differ diff --git a/fonts/Montserrat-Italic-VariableFont_wght.ttf b/fonts/Montserrat-Italic-VariableFont_wght.ttf new file mode 100644 index 0000000..76e3f9d Binary files /dev/null and b/fonts/Montserrat-Italic-VariableFont_wght.ttf differ diff --git a/fonts/Montserrat-Italic.ttf b/fonts/Montserrat-Italic.ttf new file mode 100644 index 0000000..5f08df0 Binary files /dev/null and b/fonts/Montserrat-Italic.ttf differ diff --git a/fonts/Montserrat-Light.ttf b/fonts/Montserrat-Light.ttf new file mode 100644 index 0000000..881f12d Binary files /dev/null and b/fonts/Montserrat-Light.ttf differ diff --git a/fonts/Montserrat-LightItalic.ttf b/fonts/Montserrat-LightItalic.ttf new file mode 100644 index 0000000..b2991d0 Binary files /dev/null and b/fonts/Montserrat-LightItalic.ttf differ diff --git a/fonts/Montserrat-Medium.ttf b/fonts/Montserrat-Medium.ttf new file mode 100644 index 0000000..c9a39ea Binary files /dev/null and b/fonts/Montserrat-Medium.ttf differ diff --git a/fonts/Montserrat-MediumItalic.ttf b/fonts/Montserrat-MediumItalic.ttf new file mode 100644 index 0000000..086dd6e Binary files /dev/null and b/fonts/Montserrat-MediumItalic.ttf differ diff --git a/fonts/Montserrat-Regular.ttf b/fonts/Montserrat-Regular.ttf new file mode 100644 index 0000000..895e220 Binary files /dev/null and b/fonts/Montserrat-Regular.ttf differ diff --git a/fonts/Montserrat-SemiBold.ttf b/fonts/Montserrat-SemiBold.ttf new file mode 100644 index 0000000..161477a Binary files /dev/null and b/fonts/Montserrat-SemiBold.ttf differ diff --git a/fonts/Montserrat-SemiBoldItalic.ttf b/fonts/Montserrat-SemiBoldItalic.ttf new file mode 100644 index 0000000..73dc6c6 Binary files /dev/null and b/fonts/Montserrat-SemiBoldItalic.ttf differ diff --git a/fonts/Montserrat-Thin.ttf b/fonts/Montserrat-Thin.ttf new file mode 100644 index 0000000..c9cf195 Binary files /dev/null and b/fonts/Montserrat-Thin.ttf differ diff --git a/fonts/Montserrat-ThinItalic.ttf b/fonts/Montserrat-ThinItalic.ttf new file mode 100644 index 0000000..e6dfc05 Binary files /dev/null and b/fonts/Montserrat-ThinItalic.ttf differ diff --git a/fonts/Montserrat-VariableFont_wght.ttf b/fonts/Montserrat-VariableFont_wght.ttf new file mode 100644 index 0000000..451e692 Binary files /dev/null and b/fonts/Montserrat-VariableFont_wght.ttf differ diff --git a/fonts/Nunito-Black.ttf b/fonts/Nunito-Black.ttf new file mode 100644 index 0000000..99491f8 Binary files /dev/null and b/fonts/Nunito-Black.ttf differ diff --git a/fonts/Nunito-BlackItalic.ttf b/fonts/Nunito-BlackItalic.ttf new file mode 100644 index 0000000..6004938 Binary files /dev/null and b/fonts/Nunito-BlackItalic.ttf differ diff --git a/fonts/Nunito-Bold.ttf b/fonts/Nunito-Bold.ttf new file mode 100644 index 0000000..6909689 Binary files /dev/null and b/fonts/Nunito-Bold.ttf differ diff --git a/fonts/Nunito-BoldItalic.ttf b/fonts/Nunito-BoldItalic.ttf new file mode 100644 index 0000000..2479c36 Binary files /dev/null and b/fonts/Nunito-BoldItalic.ttf differ diff --git a/fonts/Nunito-ExtraBold.ttf b/fonts/Nunito-ExtraBold.ttf new file mode 100644 index 0000000..6f4ccde Binary files /dev/null and b/fonts/Nunito-ExtraBold.ttf differ diff --git a/fonts/Nunito-ExtraBoldItalic.ttf b/fonts/Nunito-ExtraBoldItalic.ttf new file mode 100644 index 0000000..a82e6a2 Binary files /dev/null and b/fonts/Nunito-ExtraBoldItalic.ttf differ diff --git a/fonts/Nunito-ExtraLight.ttf b/fonts/Nunito-ExtraLight.ttf new file mode 100644 index 0000000..96711f9 Binary files /dev/null and b/fonts/Nunito-ExtraLight.ttf differ diff --git a/fonts/Nunito-ExtraLightItalic.ttf b/fonts/Nunito-ExtraLightItalic.ttf new file mode 100644 index 0000000..ff043a4 Binary files /dev/null and b/fonts/Nunito-ExtraLightItalic.ttf differ diff --git a/fonts/Nunito-Italic-VariableFont_wght.ttf b/fonts/Nunito-Italic-VariableFont_wght.ttf new file mode 100644 index 0000000..4622420 Binary files /dev/null and b/fonts/Nunito-Italic-VariableFont_wght.ttf differ diff --git a/fonts/Nunito-Italic.ttf b/fonts/Nunito-Italic.ttf new file mode 100644 index 0000000..97fd169 Binary files /dev/null and b/fonts/Nunito-Italic.ttf differ diff --git a/fonts/Nunito-Light.ttf b/fonts/Nunito-Light.ttf new file mode 100644 index 0000000..fb050fc Binary files /dev/null and b/fonts/Nunito-Light.ttf differ diff --git a/fonts/Nunito-LightItalic.ttf b/fonts/Nunito-LightItalic.ttf new file mode 100644 index 0000000..0914950 Binary files /dev/null and b/fonts/Nunito-LightItalic.ttf differ diff --git a/fonts/Nunito-Medium.ttf b/fonts/Nunito-Medium.ttf new file mode 100644 index 0000000..a6993eb Binary files /dev/null and b/fonts/Nunito-Medium.ttf differ diff --git a/fonts/Nunito-MediumItalic.ttf b/fonts/Nunito-MediumItalic.ttf new file mode 100644 index 0000000..1913632 Binary files /dev/null and b/fonts/Nunito-MediumItalic.ttf differ diff --git a/fonts/Nunito-Regular.ttf b/fonts/Nunito-Regular.ttf new file mode 100644 index 0000000..be80c3f Binary files /dev/null and b/fonts/Nunito-Regular.ttf differ diff --git a/fonts/Nunito-SemiBold.ttf b/fonts/Nunito-SemiBold.ttf new file mode 100644 index 0000000..06f29ea Binary files /dev/null and b/fonts/Nunito-SemiBold.ttf differ diff --git a/fonts/Nunito-SemiBoldItalic.ttf b/fonts/Nunito-SemiBoldItalic.ttf new file mode 100644 index 0000000..5af8133 Binary files /dev/null and b/fonts/Nunito-SemiBoldItalic.ttf differ diff --git a/fonts/Nunito-VariableFont_wght.ttf b/fonts/Nunito-VariableFont_wght.ttf new file mode 100644 index 0000000..10387be Binary files /dev/null and b/fonts/Nunito-VariableFont_wght.ttf differ diff --git a/fonts/Oswald-Bold.ttf b/fonts/Oswald-Bold.ttf new file mode 100644 index 0000000..cf01c4d Binary files /dev/null and b/fonts/Oswald-Bold.ttf differ diff --git a/fonts/Oswald-ExtraLight.ttf b/fonts/Oswald-ExtraLight.ttf new file mode 100644 index 0000000..2d127c1 Binary files /dev/null and b/fonts/Oswald-ExtraLight.ttf differ diff --git a/fonts/Oswald-Light.ttf b/fonts/Oswald-Light.ttf new file mode 100644 index 0000000..d6e1171 Binary files /dev/null and b/fonts/Oswald-Light.ttf differ diff --git a/fonts/Oswald-Medium.ttf b/fonts/Oswald-Medium.ttf new file mode 100644 index 0000000..9ef04ac Binary files /dev/null and b/fonts/Oswald-Medium.ttf differ diff --git a/fonts/Oswald-Regular.ttf b/fonts/Oswald-Regular.ttf new file mode 100644 index 0000000..5cc7b31 Binary files /dev/null and b/fonts/Oswald-Regular.ttf differ diff --git a/fonts/Oswald-SemiBold.ttf b/fonts/Oswald-SemiBold.ttf new file mode 100644 index 0000000..72495ed Binary files /dev/null and b/fonts/Oswald-SemiBold.ttf differ diff --git a/fonts/Oswald-VariableFont_wght.ttf b/fonts/Oswald-VariableFont_wght.ttf new file mode 100644 index 0000000..c89bdf6 Binary files /dev/null and b/fonts/Oswald-VariableFont_wght.ttf differ diff --git a/fonts/Raleway-Black.ttf b/fonts/Raleway-Black.ttf new file mode 100644 index 0000000..658de36 Binary files /dev/null and b/fonts/Raleway-Black.ttf differ diff --git a/fonts/Raleway-BlackItalic.ttf b/fonts/Raleway-BlackItalic.ttf new file mode 100644 index 0000000..99d169c Binary files /dev/null and b/fonts/Raleway-BlackItalic.ttf differ diff --git a/fonts/Raleway-Bold.ttf b/fonts/Raleway-Bold.ttf new file mode 100644 index 0000000..92d30ce Binary files /dev/null and b/fonts/Raleway-Bold.ttf differ diff --git a/fonts/Raleway-BoldItalic.ttf b/fonts/Raleway-BoldItalic.ttf new file mode 100644 index 0000000..cdf44e1 Binary files /dev/null and b/fonts/Raleway-BoldItalic.ttf differ diff --git a/fonts/Raleway-ExtraBold.ttf b/fonts/Raleway-ExtraBold.ttf new file mode 100644 index 0000000..4b0371b Binary files /dev/null and b/fonts/Raleway-ExtraBold.ttf differ diff --git a/fonts/Raleway-ExtraBoldItalic.ttf b/fonts/Raleway-ExtraBoldItalic.ttf new file mode 100644 index 0000000..49f5cb1 Binary files /dev/null and b/fonts/Raleway-ExtraBoldItalic.ttf differ diff --git a/fonts/Raleway-ExtraLight.ttf b/fonts/Raleway-ExtraLight.ttf new file mode 100644 index 0000000..d682219 Binary files /dev/null and b/fonts/Raleway-ExtraLight.ttf differ diff --git a/fonts/Raleway-ExtraLightItalic.ttf b/fonts/Raleway-ExtraLightItalic.ttf new file mode 100644 index 0000000..2d47fbf Binary files /dev/null and b/fonts/Raleway-ExtraLightItalic.ttf differ diff --git a/fonts/Raleway-Italic-VariableFont_wght.ttf b/fonts/Raleway-Italic-VariableFont_wght.ttf new file mode 100644 index 0000000..64f5882 Binary files /dev/null and b/fonts/Raleway-Italic-VariableFont_wght.ttf differ diff --git a/fonts/Raleway-Italic.ttf b/fonts/Raleway-Italic.ttf new file mode 100644 index 0000000..0f8107e Binary files /dev/null and b/fonts/Raleway-Italic.ttf differ diff --git a/fonts/Raleway-Light.ttf b/fonts/Raleway-Light.ttf new file mode 100644 index 0000000..23f7fb1 Binary files /dev/null and b/fonts/Raleway-Light.ttf differ diff --git a/fonts/Raleway-LightItalic.ttf b/fonts/Raleway-LightItalic.ttf new file mode 100644 index 0000000..187619e Binary files /dev/null and b/fonts/Raleway-LightItalic.ttf differ diff --git a/fonts/Raleway-Medium.ttf b/fonts/Raleway-Medium.ttf new file mode 100644 index 0000000..b46bd18 Binary files /dev/null and b/fonts/Raleway-Medium.ttf differ diff --git a/fonts/Raleway-MediumItalic.ttf b/fonts/Raleway-MediumItalic.ttf new file mode 100644 index 0000000..aabbfe9 Binary files /dev/null and b/fonts/Raleway-MediumItalic.ttf differ diff --git a/fonts/Raleway-Regular.ttf b/fonts/Raleway-Regular.ttf new file mode 100644 index 0000000..ec9b483 Binary files /dev/null and b/fonts/Raleway-Regular.ttf differ diff --git a/fonts/Raleway-SemiBold.ttf b/fonts/Raleway-SemiBold.ttf new file mode 100644 index 0000000..d228458 Binary files /dev/null and b/fonts/Raleway-SemiBold.ttf differ diff --git a/fonts/Raleway-SemiBoldItalic.ttf b/fonts/Raleway-SemiBoldItalic.ttf new file mode 100644 index 0000000..6c5bbc1 Binary files /dev/null and b/fonts/Raleway-SemiBoldItalic.ttf differ diff --git a/fonts/Raleway-Thin.ttf b/fonts/Raleway-Thin.ttf new file mode 100644 index 0000000..ed48751 Binary files /dev/null and b/fonts/Raleway-Thin.ttf differ diff --git a/fonts/Raleway-ThinItalic.ttf b/fonts/Raleway-ThinItalic.ttf new file mode 100644 index 0000000..a431542 Binary files /dev/null and b/fonts/Raleway-ThinItalic.ttf differ diff --git a/fonts/Raleway-VariableFont_wght.ttf b/fonts/Raleway-VariableFont_wght.ttf new file mode 100644 index 0000000..8aa2226 Binary files /dev/null and b/fonts/Raleway-VariableFont_wght.ttf differ diff --git a/requirements.txt b/requirements.txt index 7aa38ee..048f115 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,198 @@ +aiohappyeyeballs==2.6.2 +aiohttp==3.13.5 +aiosignal==1.4.0 +alembic==1.18.4 +annotated-doc==0.0.4 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.13.0 +asteroid-filterbanks==0.4.0 +attrs==26.1.0 +av==17.0.1 +blinker==1.9.0 +blis==1.3.3 boto3==1.36.8 botocore==1.36.8 +catalogue==2.0.10 +certifi==2026.5.20 +cffi==2.0.0 +charset-normalizer==3.4.7 +clean-text==0.6.0 +click==8.1.8 +cloudpathlib==0.24.0 +colorlog==6.10.1 +confection==0.1.5 +contourpy==1.3.3 +cryptography==48.0.0 +ctranslate2==4.7.2 +cycler==0.12.1 +cymem==2.0.13 +decorator==5.3.1 +dill==0.4.1 +distro==1.9.0 +einops==0.8.2 +elevenlabs==1.57.0 +emoji==1.7.0 +en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 +exejs==0.0.7 +faster-whisper==1.2.1 +ffmpeg-python==0.2.0 +filelock==3.29.0 +Flask==3.1.1 +flatbuffers==25.12.19 +fonttools==4.63.0 +frozenlist==1.8.0 +fsspec==2026.4.0 +ftfy==6.3.1 +future==1.0.0 +googleapis-common-protos==1.75.0 +greenlet==3.1.1 +grpcio==1.80.0 gTTS==2.5.4 +h11==0.16.0 +hf-xet==1.5.0 +httpcore==1.0.9 +httpx==0.28.1 +huggingface_hub==0.36.2 +idna==3.17 +ImageIO==2.37.3 +imageio-ffmpeg==0.6.0 +itsdangerous==2.2.0 +jh2==5.0.11 +Jinja2==3.1.6 +jiter==0.15.0 +jmespath==1.1.0 +joblib==1.5.3 +julius==0.2.7 +kiwisolver==1.5.0 +langcodes==3.5.1 +lightning==2.6.5 +lightning-utilities==0.15.3 +lxml==6.1.1 +Mako==1.3.12 +markdown-it-py==4.2.0 +MarkupSafe==3.0.3 +matplotlib==3.10.9 +mdurl==0.1.2 moviepy==2.2.1 +mpmath==1.3.0 +multidict==6.7.1 +multiprocess==0.70.19 +murmurhash==1.0.15 +networkx==3.6.1 +niquests==3.18.8 +nltk==3.9.4 +numpy==2.4.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +omegaconf==2.3.0 +onnxruntime==1.26.0 +openai==2.38.0 +opentelemetry-api==1.42.1 +opentelemetry-exporter-otlp==1.42.1 +opentelemetry-exporter-otlp-proto-common==1.42.1 +opentelemetry-exporter-otlp-proto-grpc==1.42.1 +opentelemetry-exporter-otlp-proto-http==1.42.1 +opentelemetry-proto==1.42.1 +opentelemetry-sdk==1.42.1 +opentelemetry-semantic-conventions==0.63b1 +optuna==4.8.0 +packaging==26.2 +pandas==3.0.3 +pathos==0.3.5 +pillow==11.3.0 playwright==1.49.1 +pox==0.3.7 +ppft==1.7.8 praw==7.8.1 +prawcore==2.4.0 +preshed==3.0.13 +primePy==1.3 +proglog==0.1.12 +propcache==0.5.2 +protobuf==6.33.6 +pyannote-audio==4.0.4 +pyannote-core==6.0.1 +pyannote-database==6.1.1 +pyannote-metrics==4.1 +pyannote-pipeline==4.0.0 +pyannoteai-sdk==0.4.0 +pycparser==3.0 +pydantic==2.13.4 +pydantic_core==2.46.4 +pyee==12.0.0 +Pygments==2.20.0 +pyparsing==3.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.2 +pytorch-lightning==2.6.5 +pytorch-metric-learning==2.9.0 +pyttsx3==2.98 +PyYAML==6.0.3 +qh3==1.8.1 +regex==2026.5.9 requests==2.32.3 rich==13.9.4 +s3transfer==0.11.3 +safetensors==0.7.0 +scikit-learn==1.8.0 +scipy==1.17.1 +setuptools==82.0.1 +shellingham==1.5.4 +six==1.17.0 +smart_open==7.6.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +spacy==3.8.7 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +SQLAlchemy==2.0.50 +srsly==2.5.3 +sympy==1.14.0 +thinc==8.3.11 +threadpoolctl==3.6.0 +tokenizers==0.21.4 toml==0.10.2 -translators==5.9.9 -pyttsx3==2.98 tomlkit==0.13.2 -Flask==3.1.1 -clean-text==0.6.0 -unidecode==1.4.0 -spacy==3.8.7 -torch==2.7.0 +torch==2.8.0 +torch-audiomentations==0.12.0 +torch_pitch_shift==1.2.5 +torchaudio==2.8.0 +torchcodec==0.7.0 +torchmetrics==1.9.0 +torchvision==0.23.0 +tqdm==4.67.3 transformers==4.52.4 -ffmpeg-python==0.2.0 -elevenlabs==1.57.0 -yt-dlp==2025.10.22 +translators==5.9.9 +triton==3.4.0 +typer==0.26.2 +typer-slim==0.24.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +Unidecode==1.4.0 +update-checker==0.18.0 +urllib3==2.7.0 +urllib3-future==2.20.907 +wasabi==1.1.3 +wassima==2.1.0 +wcwidth==0.7.0 +weasel==0.4.3 +websocket-client==1.9.0 +websockets==16.0 +Werkzeug==3.1.8 +whisperx==3.8.6 +wrapt==2.2.1 +yarl==1.24.2 +yt-dlp==2026.3.17 diff --git a/utils/caption_renderer.py b/utils/caption_renderer.py new file mode 100644 index 0000000..84a1a94 --- /dev/null +++ b/utils/caption_renderer.py @@ -0,0 +1,351 @@ +""" +caption_renderer.py +─────────────────── +All caption rendering logic. Three display modes: + + multi → full sentence on one image (1 RenderJob per sentence) + single → sentence split into word chunks (N RenderJobs per sentence) + aligned → word-level timestamps from WhisperX (perfect sync, any TTS) + +RenderJob is the contract between this module and final_video.py. +Two types of timing: + + FRACTION-based (multi, single): + audio_idx + time_fraction → final_video computes absolute time + time_fraction = fraction of audio_clips_durations[audio_idx+1] + + ABSOLUTE-based (aligned): + clip_start + clip_end → final_video uses directly + These are absolute seconds in the video timeline (after title card) + +final_video.py checks job["timing_type"] to know which to use. +""" + +import os +from dataclasses import dataclass, field +from typing import List, Optional + +from PIL import Image, ImageDraw, ImageFont + +from utils.fonts import getsize + + +# ───────────────────────────────────────────────────────────────────────────── +# RenderJob — the contract +# ───────────────────────────────────────────────────────────────────────────── + +@dataclass +class RenderJob: + """ + Describes exactly one output image (img{idx}.png). + + timing_type = "fraction": + audio_idx + time_fraction used by final_video to compute display time. + time_fraction = 1.0 means shown for full audio file duration. + time_fraction = 0.25 means shown for 25% of audio file duration. + + timing_type = "absolute": + clip_start + clip_end are absolute seconds in the video timeline. + final_video uses these directly — no calculation needed. + """ + idx: int + lines: List[str] + timing_type: str # "fraction" or "absolute" + + # fraction-based fields + audio_idx: int = 0 + time_fraction: float = 1.0 + + # absolute-based fields + clip_start: float = 0.0 + clip_end: float = 0.0 + + +# ───────────────────────────────────────────────────────────────────────────── +# Display modes +# ───────────────────────────────────────────────────────────────────────────── + +DISPLAY_MODES = {"single", "multi", "aligned"} + + +def render_multi_mode( + sentence: str, + style: dict, + audio_idx: int, + start_idx: int, +) -> List[RenderJob]: + """ + Full sentence on one image, wrapped into lines. + One RenderJob, time_fraction = 1.0. + Best for: funny, sad, wholesome, happy. + """ + words = sentence.split() + wpl = style["words_per_chunk"] + lines = [" ".join(words[i:i + wpl]) for i in range(0, len(words), wpl)] + if not lines: + lines = [sentence] + + return [RenderJob( + idx=start_idx, + lines=lines, + timing_type="fraction", + audio_idx=audio_idx, + time_fraction=1.0, + )] + + +def render_single_mode( + sentence: str, + style: dict, + audio_idx: int, + start_idx: int, +) -> List[RenderJob]: + """ + Sentence split into word chunks, one per image. + Each shown for (1/N) of the audio duration. + Best for: scary, dramatic, angry, mysterious. + """ + wpc = style["words_per_chunk"] + words = sentence.split() + raw = [" ".join(words[i:i + wpc]) for i in range(0, len(words), wpc)] + raw = [c for c in raw if c.strip()] or [sentence] + + n = len(raw) + fraction = 1.0 / n + + return [ + RenderJob( + idx=start_idx + i, + lines=[chunk], + timing_type="fraction", + audio_idx=audio_idx, + time_fraction=fraction, + ) + for i, chunk in enumerate(raw) + ] + + +def render_aligned_mode( + sentence: str, + style: dict, + audio_idx: int, + start_idx: int, + word_timestamps: List[dict], + audio_start_time: float, + audio_duration: float, +) -> List[RenderJob]: + """ + Word-level aligned mode using WhisperX timestamps. + + Groups consecutive words into chunks of words_per_chunk words. + Each chunk's clip_start = timestamp of first word in chunk. + Each chunk's clip_end = timestamp of last word in chunk + its duration. + + audio_start_time: absolute time in video when this audio file starts. + audio_duration: duration of this audio file (used as fallback end time). + + Falls back to single mode if timestamps are empty or malformed. + """ + wpc = style["words_per_chunk"] + + if not word_timestamps: + return render_single_mode(sentence, style, audio_idx, start_idx) + + # Group word timestamps into chunks of wpc words + jobs = [] + n = len(word_timestamps) + + for chunk_start in range(0, n, wpc): + chunk_words = word_timestamps[chunk_start:chunk_start + wpc] + if not chunk_words: + continue + + text = " ".join(w["word"] for w in chunk_words) + clip_start = audio_start_time + chunk_words[0]["start"] + + # clip_end = end of last word in chunk, + # or start of next chunk if available, capped at audio end + if chunk_start + wpc < n: + clip_end = audio_start_time + word_timestamps[chunk_start + wpc]["start"] + else: + last_end = chunk_words[-1].get("end", chunk_words[-1]["start"] + 0.3) + clip_end = audio_start_time + last_end + + # Safety: never exceed audio boundary + audio_end = audio_start_time + audio_duration + clip_end = min(clip_end, audio_end) + clip_end = max(clip_end, clip_start + 0.1) # minimum 100ms visibility + + jobs.append(RenderJob( + idx=start_idx + len(jobs), + lines=[text], + timing_type="absolute", + clip_start=round(clip_start, 3), + clip_end=round(clip_end, 3), + )) + + return jobs if jobs else render_single_mode(sentence, style, audio_idx, start_idx) + + +# ───────────────────────────────────────────────────────────────────────────── +# Router +# ───────────────────────────────────────────────────────────────────────────── + +def get_render_jobs( + sentences: List[str], + style: dict, + mp3_dir: Optional[str] = None, + audio_start_times: Optional[List[float]] = None, + audio_durations: Optional[List[float]] = None, +) -> List[RenderJob]: + """ + Route each sentence to the correct renderer. + Returns flat ordered list of all RenderJobs. + + For "aligned" mode, loads word timestamps from + {mp3_dir}/postaudio-{i}_words.json written by engine_wrapper. + Falls back to "single" mode per sentence if timestamps missing. + + Parameters + ---------- + sentences : one per postaudio-{i}.mp3 + style : STYLE_MAP entry for current sentiment + mp3_dir : path to mp3 folder (needed for aligned mode) + audio_start_times : absolute start time of each audio in video (needed for aligned) + audio_durations : duration of each audio file (needed for aligned) + """ + mode = style.get("display_mode", "multi") + + if mode not in DISPLAY_MODES: + print(f"[caption_renderer] Unknown display_mode '{mode}', using 'multi'") + mode = "multi" + + all_jobs: List[RenderJob] = [] + img_counter: int = 0 + + for audio_idx, sentence in enumerate(sentences): + + if mode == "aligned" and mp3_dir and audio_start_times and audio_durations: + # Try to load word timestamps for this sentence + from utils.whisper_aligner import load_word_timestamps + audio_path = os.path.join(mp3_dir, f"postaudio-{audio_idx}.mp3") + word_ts = load_word_timestamps(audio_path) + + if word_ts: + jobs = render_aligned_mode( + sentence=sentence, + style=style, + audio_idx=audio_idx, + start_idx=img_counter, + word_timestamps=word_ts, + audio_start_time=audio_start_times[audio_idx], + audio_duration=audio_durations[audio_idx], + ) + else: + # WhisperX not available or failed — fall back to single mode + print(f"[caption_renderer] No timestamps for sentence {audio_idx}, using single mode") + jobs = render_single_mode(sentence, style, audio_idx, img_counter) + + elif mode == "single": + jobs = render_single_mode(sentence, style, audio_idx, img_counter) + + else: + jobs = render_multi_mode(sentence, style, audio_idx, img_counter) + + all_jobs.extend(jobs) + img_counter += len(jobs) + + return all_jobs + + +# ───────────────────────────────────────────────────────────────────────────── +# Drawing primitives +# ───────────────────────────────────────────────────────────────────────────── + +def measure_text_block( + draw: ImageDraw.ImageDraw, + lines: List[str], + font: ImageFont.FreeTypeFont, + line_spacing: int, +) -> tuple: + max_w = 0 + total_h = 0 + for i, line in enumerate(lines): + w, h = getsize(font, line) + if w > max_w: + max_w = w + total_h += h + if i < len(lines) - 1: + total_h += line_spacing + return max_w, total_h + + +def draw_stroked_text( + draw: ImageDraw.ImageDraw, + x: int, + y: int, + line: str, + font: ImageFont.FreeTypeFont, + fill_color: tuple, + stroke_color: tuple, + stroke_width: int, +) -> None: + sw = stroke_width + half = max(1, sw // 2) + offsets = [ + (-sw, 0), (sw, 0), (0, -sw), (0, sw), + (-sw, -sw), (sw, -sw), (-sw, sw), (sw, sw), + (-sw, -half), (sw, -half), (-sw, half), (sw, half), + (-half, -sw), (half, -sw), (-half, sw), (half, sw), + ] + for ox, oy in offsets: + draw.text((x + ox, y + oy), line, font=font, fill=stroke_color) + draw.text((x, y), line, font=font, fill=fill_color) + + +def fit_font( + style: dict, + lines: List[str], + canvas_w: int, + canvas_h: int, + line_spacing: int, + max_width_ratio: float = 0.88, + max_height_ratio: float = 0.45, +) -> ImageFont.FreeTypeFont: + font_size = style["font_size"] + font_path = os.path.join("fonts", style["font_file"]) + if not os.path.exists(font_path): + font_path = os.path.join("fonts", "Roboto-Bold.ttf") + max_w = int(canvas_w * max_width_ratio) + max_h = int(canvas_h * max_height_ratio) + while font_size > 30: + font = ImageFont.truetype(font_path, font_size) + dummy = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0)) + dummy_d = ImageDraw.Draw(dummy) + bw, bh = measure_text_block(dummy_d, lines, font, line_spacing) + if bw <= max_w and bh <= max_h: + return font + font_size -= 4 + return ImageFont.truetype(font_path, 30) + + +def render_job_to_image( + job: RenderJob, + style: dict, + canvas_w: int, + canvas_h: int, + line_spacing: int, +) -> Image.Image: + font = fit_font(style, job.lines, canvas_w, canvas_h, line_spacing) + image = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0)) + draw = ImageDraw.Draw(image) + bw, bh = measure_text_block(draw, job.lines, font, line_spacing) + anchor_y = int(canvas_h * style["y_position"]) - (bh // 2) + cy = anchor_y + for line in job.lines: + w, h = getsize(font, line) + x = (canvas_w - w) // 2 + draw_stroked_text(draw, x, cy, line, font, + style["fill_color"], style["stroke_color"], style["stroke_width"]) + cy += h + line_spacing + return image \ No newline at end of file diff --git a/utils/imagenarator.py b/utils/imagenarator.py index ad75331..3b1e37f 100644 --- a/utils/imagenarator.py +++ b/utils/imagenarator.py @@ -1,74 +1,156 @@ +""" +imagenarator.py +─────────────── +Thin orchestrator. Does exactly: + 1. Extract sentences from reddit_obj + 2. Probe audio durations + compute audio start times (needed for aligned mode) + 3. Call caption_renderer.get_render_jobs() + 4. Render each job to PNG + 5. Save timing_map.json for final_video.py +""" + +import glob +import json import os import re -import textwrap +from typing import List, Optional -from PIL import Image, ImageDraw, ImageFont +import ffmpeg from rich.progress import track from TTS.engine_wrapper import process_text -from utils.fonts import getheight, getsize +from utils import settings from utils.id import extract_id +from utils.sentiment_map import STYLE_MAP, DEFAULT_STYLE +from utils.caption_renderer import get_render_jobs, render_job_to_image, RenderJob + +LINE_SPACING: int = 20 -def draw_multiple_line_text( - image, text, font, text_color, padding, wrap=50, transparent=False -) -> None: + +def _extract_sentences(reddit_obj: dict, style: dict) -> List[str]: """ - Draw multiline text over given image + Extract sentences from thread_post. + One sentence per postaudio-{i}.mp3 — order preserved. """ - draw = ImageDraw.Draw(image) - font_height = getheight(font, text) - image_width, image_height = image.size - lines = textwrap.wrap(text, width=wrap) - y = (image_height / 2) - (((font_height + (len(lines) * padding) / len(lines)) * len(lines)) / 2) - for line in lines: - line_width, line_height = getsize(font, line) - if transparent: - shadowcolor = "black" - for i in range(1, 5): - draw.text( - ((image_width - line_width) / 2 - i, y - i), - line, - font=font, - fill=shadowcolor, - ) - draw.text( - ((image_width - line_width) / 2 + i, y - i), - line, - font=font, - fill=shadowcolor, - ) - draw.text( - ((image_width - line_width) / 2 - i, y + i), - line, - font=font, - fill=shadowcolor, - ) - draw.text( - ((image_width - line_width) / 2 + i, y + i), - line, - font=font, - fill=shadowcolor, - ) - draw.text(((image_width - line_width) / 2, y), line, font=font, fill=text_color) - y += line_height + padding - - -def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> None: + raw_texts = reddit_obj["thread_post"] + sentences: List[str] = [] + for item in raw_texts: + if isinstance(item, dict): + text = item.get("text", "") + elif isinstance(item, str): + text = item + else: + text = str(item) + text = process_text(text, False).strip() + if style.get("uppercase", False): + text = text.upper() + if text: + sentences.append(text) + return sentences if sentences else ["..."] + + +def _get_audio_info(mp3_dir: str) -> tuple: + """ + Discover postaudio files and compute: + - durations list (one per postaudio file) + - start times list (absolute seconds in video, after title card) + + Returns (postaudio_files, durations, start_times) """ - Render Images for video + postaudio_files = sorted( + glob.glob(os.path.join(mp3_dir, "postaudio-*.mp3")), + key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1)) + ) + + title_path = os.path.join(mp3_dir, "title.mp3") + try: + title_duration = float(ffmpeg.probe(title_path)["format"]["duration"]) + except Exception: + title_duration = 0.0 + + durations = [] + start_times = [] + current = title_duration + + for f in postaudio_files: + try: + dur = float(ffmpeg.probe(f)["format"]["duration"]) + except Exception: + dur = 0.0 + start_times.append(current) + durations.append(dur) + current += dur + + return postaudio_files, durations, start_times + + +def imagemaker(theme, reddit_obj: dict, txtclr, padding=5, transparent=False) -> int: """ - texts = reddit_obj["thread_post"] - reddit_id = extract_id(reddit_obj) - if transparent: - font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 100) - else: - font = ImageFont.truetype(os.path.join("fonts", "Roboto-Regular.ttf"), 100) - - size = (1920, 1080) - - for idx, text in track(enumerate(texts), "Rendering Image"): - image = Image.new("RGBA", size, theme) - text = process_text(text, False) - draw_multiple_line_text(image, text, font, txtclr, padding, wrap=30, transparent=transparent) - image.save(f"assets/temp/{reddit_id}/png/img{idx}.png") + Render caption images for the video. + + Flow: + sentences + audio info + → caption_renderer.get_render_jobs() + → List[RenderJob] + each RenderJob → transparent PNG (img{idx}.png) + timing_map.json → saved for final_video.py + + timing_map.json entry for fraction-based jobs: + {"timing_type": "fraction", "audio_idx": N, "time_fraction": F} + + timing_map.json entry for absolute-based jobs (aligned mode): + {"timing_type": "absolute", "clip_start": S, "clip_end": E} + + Returns: + int: total number of images generated + """ + # 1. Style + sentiment = settings.config["settings"].get("sentiment", "dramatic") + style = STYLE_MAP.get(sentiment, DEFAULT_STYLE) + CANVAS_W: int = int(settings.config["settings"]["resolution_w"]) + CANVAS_H: int = int(settings.config["settings"]["resolution_h"]) + reddit_id = extract_id(reddit_obj) + mp3_dir = f"assets/temp/{reddit_id}/mp3" + + # 2. Extract sentences + sentences = _extract_sentences(reddit_obj, style) + + # 3. Get audio timing info (needed for aligned mode) + _, durations, start_times = _get_audio_info(mp3_dir) + + # 4. Get render jobs + jobs: List[RenderJob] = get_render_jobs( + sentences=sentences, + style=style, + mp3_dir=mp3_dir, + audio_start_times=start_times if start_times else None, + audio_durations=durations if durations else None, + ) + + # 5. Render each job to a transparent PNG + for job in track(jobs, description="Rendering caption images"): + image = render_job_to_image(job, style, CANVAS_W, CANVAS_H, LINE_SPACING) + image.save(f"assets/temp/{reddit_id}/png/img{job.idx}.png") + + # 6. Save timing map + timing_map = [] + for job in jobs: + if job.timing_type == "absolute": + timing_map.append({ + "timing_type": "absolute", + "clip_start": job.clip_start, + "clip_end": job.clip_end, + }) + else: + timing_map.append({ + "timing_type": "fraction", + "audio_idx": job.audio_idx, + "time_fraction": job.time_fraction, + }) + + timing_map_path = f"assets/temp/{reddit_id}/timing_map.json" + with open(timing_map_path, "w") as f: + json.dump(timing_map, f, indent=2) + + return len(jobs) \ No newline at end of file diff --git a/utils/sentiment.py b/utils/sentiment.py index 362688d..92e9d6f 100644 --- a/utils/sentiment.py +++ b/utils/sentiment.py @@ -196,12 +196,16 @@ def apply_sentiment_config(reddit_object: dict) -> None: sentiment = detect_sentiment(reddit_object) - # ── Background ─────────────────────────────────────────── + # ── Sentiment label — stored in memory so imagenarator.py can read it ──── + # This is the key that STYLE_MAP lookups depend on at render time. + settings.config["settings"]["sentiment"] = sentiment + + # ── Background ─────────────────────────────────────────────────────────── bg_video, bg_audio = BACKGROUND_MAP[sentiment] settings.config["settings"]["background"]["background_video"] = bg_video settings.config["settings"]["background"]["background_audio"] = bg_audio - # ── Voice ──────────────────────────────────────────────── + # ── Voice ──────────────────────────────────────────────────────────────── voice_choice = settings.config["settings"]["tts"]["voice_choice"].lower() if voice_choice == "elevenlabs": @@ -213,12 +217,12 @@ def apply_sentiment_config(reddit_object: dict) -> None: else: voice = f"(voice override not supported for {voice_choice})" - # ── Metadata ───────────────────────────────────────────── + # ── Metadata ───────────────────────────────────────────────────────────── print_substep("Generating titles, captions and hashtags... ✍️", style="bold blue") metadata = generate_metadata(reddit_object, sentiment) save_metadata(metadata, reddit_object) - # ── Log ────────────────────────────────────────────────── + # ── Log ────────────────────────────────────────────────────────────────── print_substep(f"Sentiment detected : {sentiment} 🎯", style="bold green") print_substep(f"Background video : {bg_video}", style="bold blue") print_substep(f"Background audio : {bg_audio if bg_audio else 'none'}", style="bold blue") diff --git a/utils/sentiment_map.py b/utils/sentiment_map.py index 8259c80..f39ec63 100644 --- a/utils/sentiment_map.py +++ b/utils/sentiment_map.py @@ -1,16 +1,14 @@ -# Maps sentiment → (background_video, background_audio) BACKGROUND_MAP = { - "sad": ("minecraft", "lofi"), # slow, melancholic - "happy": ("fall-guys", "chill-summer"),# upbeat, fun - "angry": ("gta", "lofi"), # lofi keeps intensity without distraction - "mysterious": ("csgo-surf", "lofi-2"), # lofi-2 is more atmospheric - "funny": ("cluster-truck", "chill-summer"),# light and playful - "dramatic": ("rocket-league", "lofi"), # lofi under dramatic = tension - "wholesome": ("steep", "chill-summer"),# warm and positive - "scary": ("minecraft-2", "lofi-2"), # lofi-2 is darker/moodier + "sad": ("minecraft", "lofi"), + "happy": ("fall-guys", "chill-summer"), + "angry": ("gta", "lofi"), + "mysterious": ("csgo-surf", "lofi-2"), + "funny": ("cluster-truck", "chill-summer"), + "dramatic": ("rocket-league", "lofi"), + "wholesome": ("steep", "chill-summer"), + "scary": ("minecraft-2", "lofi-2"), } -# Maps sentiment → OpenAI voice name OPENAI_VOICE_MAP = { "sad": "nova", "happy": "shimmer", @@ -22,7 +20,6 @@ OPENAI_VOICE_MAP = { "scary": "onyx", } -# Maps sentiment → ElevenLabs voice name ELEVENLABS_VOICE_MAP = { "sad": "Brian - Deep, Resonant and Comforting", "happy": "Jessica - Playful, Bright, Warm", @@ -34,8 +31,128 @@ ELEVENLABS_VOICE_MAP = { "scary": "Harry - Fierce Warrior", } -# All valid sentiment labels VALID_SENTIMENTS = list(BACKGROUND_MAP.keys()) +DEFAULT_SENTIMENT = "dramatic" -# Fallback if detection fails — maps to rocket-league + lofi + alloy -DEFAULT_SENTIMENT = "dramatic" \ No newline at end of file + +# ───────────────────────────────────────────────────────────────────────────── +# STYLE_MAP +# ───────────────────────────────────────────────────────────────────────────── +# +# display_mode options: +# +# "aligned" → WhisperX word timestamps — perfect sync with any TTS. +# Falls back to "single" per sentence if timestamps unavailable. +# USE THIS for all sentiments once WhisperX is installed. +# +# "single" → Split sentence into word chunks, equal time per chunk. +# Good fallback when WhisperX is not installed. +# +# "multi" → Full sentence on one image. No splitting. +# Best for slow TTS or wholesome/sad content. +# +# words_per_chunk: +# In "aligned" mode: words grouped per visible chunk (3-5 recommended) +# In "single" mode: words per chunk (higher = fewer chunks = slower pace) +# In "multi" mode: words per line in the wrapped text block +# +STYLE_MAP = { + + "dramatic": { + "font_file": "Montserrat-ExtraBold.ttf", + "font_size": 95, + "fill_color": (255, 255, 255, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 4, + "words_per_chunk": 4, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "scary": { + "font_file": "Oswald-Bold.ttf", + "font_size": 95, + "fill_color": (232, 244, 248, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 5, + "words_per_chunk": 3, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "angry": { + "font_file": "Anton-Regular.ttf", + "font_size": 105, + "fill_color": (255, 69, 0, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 5, + "words_per_chunk": 3, + "y_position": 0.65, + "uppercase": True, + "display_mode": "aligned", + }, + + "mysterious": { + "font_file": "Raleway-Bold.ttf", + "font_size": 90, + "fill_color": (184, 212, 232, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 4, + "words_per_chunk": 3, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "funny": { + "font_file": "Nunito-ExtraBold.ttf", + "font_size": 90, + "fill_color": (255, 230, 0, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 4, + "words_per_chunk": 5, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "sad": { + "font_file": "Lato-Bold.ttf", + "font_size": 88, + "fill_color": (220, 225, 255, 255), + "stroke_color": (10, 10, 46, 255), + "stroke_width": 3, + "words_per_chunk": 5, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "wholesome": { + "font_file": "Nunito-ExtraBold.ttf", + "font_size": 88, + "fill_color": (255, 248, 231, 255), + "stroke_color": (26, 10, 0, 255), + "stroke_width": 3, + "words_per_chunk": 5, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, + + "happy": { + "font_file": "Nunito-ExtraBold.ttf", + "font_size": 90, + "fill_color": (255, 230, 0, 255), + "stroke_color": (0, 0, 0, 255), + "stroke_width": 4, + "words_per_chunk": 5, + "y_position": 0.65, + "uppercase": False, + "display_mode": "aligned", + }, +} + +DEFAULT_STYLE = STYLE_MAP["dramatic"] \ No newline at end of file diff --git a/utils/whisper_aligner.py b/utils/whisper_aligner.py new file mode 100644 index 0000000..c3ada9d --- /dev/null +++ b/utils/whisper_aligner.py @@ -0,0 +1,168 @@ +""" +whisper_aligner.py +────────────────── +Word-level timestamp extraction using WhisperX. + +This module runs after each TTS audio file is saved. +It produces a word-level timestamp JSON for every postaudio-{i}.mp3. + +Output format (postaudio-{i}_words.json): +[ + {"word": "I", "start": 0.00, "end": 0.18}, + {"word": "told", "start": 0.18, "end": 0.42}, + ... +] + +WhisperX is used because: + - Works with ANY TTS engine (Google, OpenAI, ElevenLabs, etc.) + - Free, runs locally, no API cost + - Word-level accuracy (not sentence-level) + - Fast on CPU for short audio clips + +If WhisperX is not installed or fails for any reason, +this module returns None and the system falls back to +time_fraction-based sync (single/multi mode). +No crashes, no interruptions. +""" + +import json +import os +from typing import List, Optional + +from utils.console import print_substep + + +# ── WhisperX model is loaded once and reused across all audio files ─────────── +# Loading is expensive (~2-3s). We cache it as a module-level singleton. +_whisper_model = None +_whisper_model_lang = None + + +def _get_model(language: str = "en"): + """ + Lazy-load WhisperX model. Loaded once per run, reused for all audio files. + Returns None if WhisperX is not installed. + """ + global _whisper_model, _whisper_model_lang + + if _whisper_model is not None and _whisper_model_lang == language: + return _whisper_model + + try: + import whisperx + print_substep("Loading WhisperX model (first run only)...", style="bold blue") + _whisper_model = whisperx.load_model( + "base", # small enough for CPU, accurate enough for TTS + device="cpu", + compute_type="int8", + language=language, + ) + _whisper_model_lang = language + return _whisper_model + except ImportError: + return None + except Exception as e: + print_substep(f"WhisperX model load failed: {e}", style="yellow") + return None + + +def align_audio(audio_path: str, language: str = "en") -> Optional[List[dict]]: + """ + Run WhisperX on a single audio file and return word-level timestamps. + + Parameters + ---------- + audio_path : str + Path to the .mp3 file to align. + language : str + Language code (default: "en"). Matches TTS language. + + Returns + ------- + Optional[List[dict]] + List of {"word": str, "start": float, "end": float} dicts. + Returns None if WhisperX is unavailable or alignment fails. + """ + try: + import whisperx + + model = _get_model(language) + if model is None: + return None + + # Transcribe + align + audio = whisperx.load_audio(audio_path) + result = model.transcribe(audio, batch_size=4) + + # Align to get word-level timestamps + align_model, metadata = whisperx.load_align_model( + language_code=language, + device="cpu", + ) + aligned = whisperx.align( + result["segments"], + align_model, + metadata, + audio, + device="cpu", + return_char_alignments=False, + ) + + # Flatten all words across all segments + words = [] + for segment in aligned.get("word_segments", []): + word = segment.get("word", "").strip() + start = segment.get("start") + end = segment.get("end") + if word and start is not None and end is not None: + words.append({ + "word": word, + "start": round(float(start), 3), + "end": round(float(end), 3), + }) + + return words if words else None + + except Exception as e: + print_substep(f"WhisperX alignment failed for {audio_path}: {e}", style="yellow") + return None + + +def align_and_save(audio_path: str, language: str = "en") -> Optional[str]: + """ + Align audio and save word timestamps as a JSON file next to the audio. + + Parameters + ---------- + audio_path : str + e.g. "assets/temp/abc123/mp3/postaudio-0.mp3" + language : str + Language code. + + Returns + ------- + Optional[str] + Path to saved JSON file, or None if alignment failed. + """ + words = align_audio(audio_path, language) + + if words is None: + return None + + json_path = audio_path.replace(".mp3", "_words.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(words, f, indent=2, ensure_ascii=False) + + return json_path + + +def load_word_timestamps(audio_path: str) -> Optional[List[dict]]: + """ + Load previously saved word timestamps for an audio file. + Returns None if the file doesn't exist. + """ + json_path = audio_path.replace(".mp3", "_words.json") + if not os.path.exists(json_path): + return None + with open(json_path, "r", encoding="utf-8") as f: + return json.load(f) \ No newline at end of file diff --git a/video_creation/final_video.py b/video_creation/final_video.py index 5764fde..a0e38a2 100644 --- a/video_creation/final_video.py +++ b/video_creation/final_video.py @@ -5,9 +5,11 @@ import tempfile import textwrap import threading import time -from os.path import exists # Needs to be imported specifically +from os.path import exists from pathlib import Path from typing import Dict, Final, Tuple +import glob +import json import ffmpeg import translators @@ -44,7 +46,6 @@ class ProgressFfmpeg(threading.Thread): def get_latest_ms_progress(self): lines = self.output_file.readlines() - if lines: for line in lines: if "out_time_ms" in line: @@ -52,7 +53,6 @@ class ProgressFfmpeg(threading.Thread): if out_time_ms_str.isnumeric(): return float(out_time_ms_str) / 1000000.0 else: - # Handle the case when "N/A" is encountered return None return None @@ -74,7 +74,6 @@ def name_normalize(name: str) -> str: name = re.sub(r"(\d+)\s?\/\s?(\d+)", r"\1 of \2", name) name = re.sub(r"(\w+)\s?\/\s?(\w+)", r"\1 or \2", name) name = re.sub(r"\/", r"", name) - lang = settings.config["reddit"]["thread"]["post_lang"] if lang: print_substep("Translating filename...") @@ -119,51 +118,38 @@ def get_text_height(draw, text, font, max_width): def create_fancy_thumbnail(image, text, text_color, padding, wrap=35): - """ - It will take the 1px from the middle of the template and will be resized (stretched) vertically to accommodate the extra height needed for the title. - """ print_step(f"Creating fancy thumbnail for: {text}") font_title_size = 47 font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), font_title_size) image_width, image_height = image.size - # Calculate text height to determine new image height draw = ImageDraw.Draw(image) text_height = get_text_height(draw, text, font, wrap) lines = textwrap.wrap(text, width=wrap) - # This is -50 to reduce the empty space at the bottom of the image, - # change it as per your requirement if needed otherwise leave it. new_image_height = image_height + text_height + padding * (len(lines) - 1) - 50 - # Separate the image into top, middle (1px), and bottom parts - top_part_height = image_height // 2 - middle_part_height = 1 # 1px height middle section + top_part_height = image_height // 2 + middle_part_height = 1 bottom_part_height = image_height - top_part_height - middle_part_height - top_part = image.crop((0, 0, image_width, top_part_height)) + top_part = image.crop((0, 0, image_width, top_part_height)) middle_part = image.crop((0, top_part_height, image_width, top_part_height + middle_part_height)) bottom_part = image.crop((0, top_part_height + middle_part_height, image_width, image_height)) - # Stretch the middle part - new_middle_height = new_image_height - top_part_height - bottom_part_height + new_middle_height = max(1, new_image_height - top_part_height - bottom_part_height) middle_part = middle_part.resize((image_width, new_middle_height)) - # Create new image with the calculated height new_image = Image.new("RGBA", (image_width, new_image_height)) - - # Paste the top, stretched middle, and bottom parts into the new image - new_image.paste(top_part, (0, 0)) + new_image.paste(top_part, (0, 0)) new_image.paste(middle_part, (0, top_part_height)) new_image.paste(bottom_part, (0, top_part_height + new_middle_height)) - # Draw the title text on the new image draw = ImageDraw.Draw(new_image) y = top_part_height + padding for line in lines: draw.text((120, y), line, font=font, fill=text_color, align="left") y += get_text_height(draw, line, font, wrap) + padding - # Draw the username "PlotPulse" at the specific position username_font = ImageFont.truetype(os.path.join("fonts", "Roboto-Bold.ttf"), 30) draw.text( (205, 825), @@ -172,28 +158,44 @@ def create_fancy_thumbnail(image, text, text_color, padding, wrap=35): fill=text_color, align="left", ) - return new_image def merge_background_audio(audio: ffmpeg, reddit_id: str): - """Gather an audio and merge with assets/backgrounds/background.mp3 - Args: - audio (ffmpeg): The TTS final audio but without background. - reddit_id (str): The ID of subreddit - """ background_audio_volume = settings.config["settings"]["background"]["background_audio_volume"] if background_audio_volume == 0: - return audio # Return the original audio - else: - # sets volume to config - bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter( - "volume", - background_audio_volume, - ) - # Merges audio and background_audio - merged_audio = ffmpeg.filter([audio, bg_audio], "amix", duration="longest") - return merged_audio # Return merged audio + return audio + bg_audio = ffmpeg.input(f"assets/temp/{reddit_id}/background.mp3").filter( + "volume", background_audio_volume, + ) + return ffmpeg.filter([audio, bg_audio], "amix", duration="longest") + + +def _load_timing_map(reddit_id: str, img_files: list, postaudio_files: list, + audio_clips_durations: list, title_duration: float) -> list: + """ + Load timing_map.json written by imagemaker(). + + Each entry is one of: + {"timing_type": "absolute", "clip_start": S, "clip_end": E} + → used directly as FFmpeg enable times + + {"timing_type": "fraction", "audio_idx": N, "time_fraction": F} + → clip time computed as: audio_start[N] + accumulated_fraction * audio_dur[N] + + Falls back to 1:1 mapping if file missing. + """ + timing_map_path = f"assets/temp/{reddit_id}/timing_map.json" + if os.path.exists(timing_map_path): + with open(timing_map_path) as f: + return json.load(f) + + # Fallback: 1:1 + print_substep("timing_map.json not found — using 1:1 fallback", style="yellow") + return [ + {"timing_type": "fraction", "audio_idx": i, "time_fraction": 1.0} + for i in range(len(img_files)) + ] def make_final_video( @@ -202,20 +204,10 @@ def make_final_video( reddit_obj: dict, background_config: Dict[str, Tuple], ): - """Gathers audio clips, gathers all screenshots, stitches them together and saves the final video to assets/temp - Args: - number_of_clips (int): Index to end at when going through the screenshots' - length (int): Length of the video - reddit_obj (dict): The reddit object that contains the posts to read. - background_config (Tuple[str, str, str, Any]): The background config to use. - """ - # settings values W: Final[int] = int(settings.config["settings"]["resolution_w"]) H: Final[int] = int(settings.config["settings"]["resolution_h"]) - - opacity = settings.config["settings"]["opacity"] - - reddit_id = extract_id(reddit_obj) + opacity = settings.config["settings"]["opacity"] + reddit_id = extract_id(reddit_obj) allowOnlyTTSFolder: bool = ( settings.config["settings"]["background"]["enable_extra_audio"] @@ -223,33 +215,31 @@ def make_final_video( ) print_step("Creating the final video 🎥") - background_clip = ffmpeg.input(prepare_background(reddit_id, W=W, H=H)) - # Gather all audio clips + # ── Audio clips ─────────────────────────────────────────────────────────── audio_clips = list() if number_of_clips == 0 and settings.config["settings"]["storymode"] == "false": - print( - "No audio clips to gather. Please use a different TTS or post." - ) # This is to fix the TypeError: unsupported operand type(s) for +: 'int' and 'NoneType' + print("No audio clips to gather.") exit() + if settings.config["settings"]["storymode"]: if settings.config["settings"]["storymodemethod"] == 0: audio_clips = [ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")] audio_clips.insert(1, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio.mp3")) elif settings.config["settings"]["storymodemethod"] == 1: - audio_clips = [ - ffmpeg.input(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3") - for i in track(range(number_of_clips + 1), "Collecting the audio files...") - ] + postaudio_files = sorted( + glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"), + key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1)) + ) + audio_clips = [ffmpeg.input(f) for f in postaudio_files] audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")) - else: audio_clips = [ - ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3") for i in range(number_of_clips) + ffmpeg.input(f"assets/temp/{reddit_id}/mp3/{i}.mp3") + for i in range(number_of_clips) ] audio_clips.insert(0, ffmpeg.input(f"assets/temp/{reddit_id}/mp3/title.mp3")) - audio_clips_durations = [ float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/{i}.mp3")["format"]["duration"]) for i in range(number_of_clips) @@ -258,6 +248,7 @@ def make_final_video( 0, float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]), ) + audio_concat = ffmpeg.concat(*audio_clips, a=1, v=0) ffmpeg.output( audio_concat, f"assets/temp/{reddit_id}/audio.mp3", **{"b:a": "192k"} @@ -266,27 +257,16 @@ def make_final_video( console.log(f"[bold green] Video Will Be: {length} Seconds Long") screenshot_width = int((W * 45) // 100) - audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3") + audio = ffmpeg.input(f"assets/temp/{reddit_id}/audio.mp3") final_audio = merge_background_audio(audio, reddit_id) - image_clips = list() Path(f"assets/temp/{reddit_id}/png").mkdir(parents=True, exist_ok=True) - # Credits to tim (beingbored) - # get the title_template image and draw a text in the middle part of it with the title of the thread + # ── Title card ──────────────────────────────────────────────────────────── title_template = Image.open("assets/title_template.png") - - title = reddit_obj["thread_title"] - - title = name_normalize(title) - - font_color = "#000000" - padding = 5 - - # create_fancy_thumbnail(image, text, text_color, padding - title_img = create_fancy_thumbnail(title_template, title, font_color, padding) - + title = name_normalize(reddit_obj["thread_title"]) + title_img = create_fancy_thumbnail(title_template, title, "#000000", 5) title_img.save(f"assets/temp/{reddit_id}/png/title.png") image_clips.insert( 0, @@ -296,18 +276,17 @@ def make_final_video( ) current_time = 0 + if settings.config["settings"]["storymode"]: - audio_clips_durations = [ - float( - ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio-{i}.mp3")["format"]["duration"] - ) - for i in range(number_of_clips) - ] - audio_clips_durations.insert( - 0, - float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]), - ) + if settings.config["settings"]["storymodemethod"] == 0: + audio_clips_durations = [ + float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/postaudio.mp3")["format"]["duration"]) + ] + audio_clips_durations.insert( + 0, + float(ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"]), + ) image_clips.insert( 1, ffmpeg.input(f"assets/temp/{reddit_id}/png/story_content.png").filter( @@ -321,20 +300,97 @@ def make_final_video( y="(main_h-overlay_h)/2", ) current_time += audio_clips_durations[0] + elif settings.config["settings"]["storymodemethod"] == 1: - for i in track(range(0, number_of_clips + 1), "Collecting the image files..."): - image_clips.append( - ffmpeg.input(f"assets/temp/{reddit_id}/png/img{i}.png")["v"].filter( - "scale", screenshot_width, -1 - ) + + # ── Discover postaudio files ────────────────────────────────────── + postaudio_files = sorted( + glob.glob(f"assets/temp/{reddit_id}/mp3/postaudio-*.mp3"), + key=lambda x: int(re.search(r"postaudio-(\d+)", x).group(1)) + ) + + # ── Build durations ─────────────────────────────────────────────── + # audio_clips_durations[0] = title + # audio_clips_durations[1+i] = postaudio-{i} + audio_clips_durations = [ + float(ffmpeg.probe(f)["format"]["duration"]) + for f in postaudio_files + ] + title_duration = float( + ffmpeg.probe(f"assets/temp/{reddit_id}/mp3/title.mp3")["format"]["duration"] + ) + audio_clips_durations.insert(0, title_duration) + + # ── Pre-compute absolute start time per audio file ──────────────── + # audio_start_times[i] = when postaudio-{i} starts in the video + audio_start_times = [] + t = title_duration + for dur in audio_clips_durations[1:]: + audio_start_times.append(t) + t += dur + + # ── Title card overlay ──────────────────────────────────────────── + background_clip = background_clip.overlay( + image_clips[0], + enable=f"between(t,0,{title_duration})", + x="(main_w-overlay_w)/2", + y="(main_h-overlay_h)/2", + ) + current_time = title_duration + + # ── Load image files ────────────────────────────────────────────── + img_files = sorted( + glob.glob(f"assets/temp/{reddit_id}/png/img*.png"), + key=lambda x: int(re.search(r"img(\d+)", x).group(1)) + ) + + # ── Load timing map ─────────────────────────────────────────────── + timing_map = _load_timing_map( + reddit_id, img_files, postaudio_files, + audio_clips_durations, title_duration + ) + + # ── Overlay each image ──────────────────────────────────────────── + # Handles both absolute and fraction timing types cleanly. + # For fraction: track time_consumed per audio_idx + audio_time_used = {} + + for i, img_file in enumerate(img_files): + if i >= len(timing_map): + break + + entry = timing_map[i] + timing_type = entry.get("timing_type", "fraction") + + if timing_type == "absolute": + # WhisperX aligned — use timestamps directly + clip_start = entry["clip_start"] + clip_end = entry["clip_end"] + + else: + # Fraction-based — compute from audio duration + audio_idx = entry["audio_idx"] + time_fraction = entry["time_fraction"] + audio_dur = audio_clips_durations[audio_idx + 1] + display_dur = audio_dur * time_fraction + offset = audio_time_used.get(audio_idx, 0.0) + clip_start = audio_start_times[audio_idx] + offset + clip_end = clip_start + display_dur + audio_time_used[audio_idx] = offset + display_dur + + img_clip = ffmpeg.input(img_file)["v"].filter( + "scale", screenshot_width, -1 ) + image_clips.append(img_clip) background_clip = background_clip.overlay( - image_clips[i], - enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})", + img_clip, + enable=f"between(t,{clip_start:.3f},{clip_end:.3f})", x="(main_w-overlay_w)/2", y="(main_h-overlay_h)/2", ) - current_time += audio_clips_durations[i] + + current_time = t + else: for i in range(0, number_of_clips + 1): image_clips.append( @@ -343,9 +399,7 @@ def make_final_video( ) ) image_overlay = image_clips[i].filter("colorchannelmixer", aa=opacity) - assert ( - audio_clips_durations is not None - ), "Please make a GitHub issue if you see this. Ping @JasonLovesDoggo on GitHub." + assert audio_clips_durations is not None background_clip = background_clip.overlay( image_overlay, enable=f"between(t,{current_time},{current_time + audio_clips_durations[i]})", @@ -354,67 +408,48 @@ def make_final_video( ) current_time += audio_clips_durations[i] - title = extract_id(reddit_obj, "thread_title") - idx = extract_id(reddit_obj) - title_thumb = reddit_obj["thread_title"] - - filename = f"{name_normalize(title)[:100]}" - subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"]) - sentiment = settings.config["settings"]["background"].get("background_video", "unknown") - - # Per-video folder: results/{subreddit}/{thread_id}_{sentiment}/ + # ── Output ──────────────────────────────────────────────────────────────── + title_str = extract_id(reddit_obj, "thread_title") + idx = extract_id(reddit_obj) + title_thumb = reddit_obj["thread_title"] + subreddit = reddit_obj.get("thread_subreddit", settings.config["reddit"]["thread"]["subreddit"]) + sentiment = settings.config["settings"]["background"].get("background_video", "unknown") video_folder = f"./results/{subreddit}/{idx}_{sentiment}" os.makedirs(video_folder, exist_ok=True) if allowOnlyTTSFolder: os.makedirs(f"{video_folder}/OnlyTTS", exist_ok=True) - # create a thumbnail for the video settingsbackground = settings.config["settings"]["background"] - if settingsbackground["background_thumbnail"]: - if not exists(f"{video_folder}"): - os.makedirs(f"{video_folder}", exist_ok=True) - # get the first file with the .png extension from assets/backgrounds and use it as a background for the thumbnail first_image = next( - (file for file in os.listdir("assets/backgrounds") if file.endswith(".png")), - None, + (f for f in os.listdir("assets/backgrounds") if f.endswith(".png")), None ) if first_image is None: print_substep("No png files found in assets/backgrounds", "red") - else: - font_family = settingsbackground["background_thumbnail_font_family"] - font_size = settingsbackground["background_thumbnail_font_size"] - font_color = settingsbackground["background_thumbnail_font_color"] thumbnail = Image.open(f"assets/backgrounds/{first_image}") - width, height = thumbnail.size + w, h = thumbnail.size thumbnailSave = create_thumbnail( thumbnail, - font_family, - font_size, - font_color, - width, - height, - title_thumb, + settingsbackground["background_thumbnail_font_family"], + settingsbackground["background_thumbnail_font_size"], + settingsbackground["background_thumbnail_font_color"], + w, h, title_thumb, ) thumbnailSave.save(f"{video_folder}/thumbnail.png") - print_substep(f"Thumbnail - Building Thumbnail in assets/temp/{reddit_id}/thumbnail.png") - text = f"Background by {background_config['video'][2]}" background_clip = ffmpeg.drawtext( background_clip, - text=text, - x=f"(w-text_w)", - y=f"(h-text_h)", - fontsize=5, - fontcolor="White", + text=f"Background by {background_config['video'][2]}", + x="(w-text_w)", y="(h-text_h)", + fontsize=5, fontcolor="White", fontfile=os.path.join("fonts", "Roboto-Regular.ttf"), ) background_clip = background_clip.filter("scale", W, H) + print_step("Rendering the video 🎥") from tqdm import tqdm - pbar = tqdm(total=100, desc="Progress: ", bar_format="{l_bar}{bar}", unit=" %") def on_update_example(progress) -> None: @@ -422,14 +457,11 @@ def make_final_video( old_percentage = pbar.n pbar.update(status - old_percentage) - defaultPath = video_folder with ProgressFfmpeg(length, on_update_example) as progress: path = f"{video_folder}/video.mp4" try: ffmpeg.output( - background_clip, - final_audio, - path, + background_clip, final_audio, path, f="mp4", **{ "c:v": "h264_nvenc", @@ -438,26 +470,23 @@ def make_final_video( "threads": multiprocessing.cpu_count(), }, ).overwrite_output().global_args("-progress", progress.output_file.name).run( - quiet=True, - overwrite_output=True, - capture_stdout=False, - capture_stderr=False, + quiet=True, overwrite_output=True, + capture_stdout=False, capture_stderr=False, ) except ffmpeg.Error as e: print(e.stderr.decode("utf8")) exit(1) + old_percentage = pbar.n pbar.update(100 - old_percentage) + if allowOnlyTTSFolder: path = f"{video_folder}/OnlyTTS/video.mp4" - # Prevent a error by limiting the path length, do not change this. print_step("Rendering the Only TTS Video 🎥") with ProgressFfmpeg(length, on_update_example) as progress: try: ffmpeg.output( - background_clip, - audio, - path, + background_clip, audio, path, f="mp4", **{ "c:v": "h264_nvenc", @@ -466,20 +495,18 @@ def make_final_video( "threads": multiprocessing.cpu_count(), }, ).overwrite_output().global_args("-progress", progress.output_file.name).run( - quiet=True, - overwrite_output=True, - capture_stdout=False, - capture_stderr=False, + quiet=True, overwrite_output=True, + capture_stdout=False, capture_stderr=False, ) except ffmpeg.Error as e: print(e.stderr.decode("utf8")) exit(1) - old_percentage = pbar.n pbar.update(100 - old_percentage) + pbar.close() - save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title, idx, background_config["video"][2]) + save_data(subreddit, f"{idx}_{sentiment}/video.mp4", title_str, idx, background_config["video"][2]) print_step("Removing temporary files 🗑") cleanups = cleanup(reddit_id) print_substep(f"Removed {cleanups} temporary files 🗑") - print_step("Done! 🎉 The video is in the results folder 📁") + print_step("Done! 🎉 The video is in the results folder 📁") \ No newline at end of file diff --git a/video_creation/screenshot_downloader.py b/video_creation/screenshot_downloader.py index 8dafaf6..dcddc05 100644 --- a/video_creation/screenshot_downloader.py +++ b/video_creation/screenshot_downloader.py @@ -62,10 +62,10 @@ def get_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int): if storymode and settings.config["settings"]["storymodemethod"] == 1: print_substep("Generating images...") return imagemaker( - theme=bgcolor, + theme=(0, 0, 0, 0), reddit_obj=reddit_object, - txtclr=txtcolor, - transparent=transparent, + txtclr=(255, 255, 255), + transparent=True, ) screenshot_num: int