From 1186efbe4d15df83c16b57701d3a8ecc2002d868 Mon Sep 17 00:00:00 2001 From: Hong Phuc Date: Mon, 25 May 2026 16:20:49 +0700 Subject: [PATCH] chore: cleanup code --- .gitignore | 1 + .gitmodules | 3 - CLAUDE.md | 129 +++++++++++------------ TTS/supertonic_tts.py | 49 +++++++-- platforms/threads/scraper.py | 15 +++ utils/.config.template.toml | 1 + vendor/FullyAutomatedRedditVideoMakerBot | 1 - 7 files changed, 121 insertions(+), 78 deletions(-) delete mode 100644 .gitmodules delete mode 160000 vendor/FullyAutomatedRedditVideoMakerBot diff --git a/.gitignore b/.gitignore index fa8e08c..675fe1b 100644 --- a/.gitignore +++ b/.gitignore @@ -274,3 +274,4 @@ pipeline-ui-*.png .agents .config .local +.gstack/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index c5bf4d4..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "vendor/FullyAutomatedRedditVideoMakerBot"] - path = vendor/FullyAutomatedRedditVideoMakerBot - url = https://github.com/raga70/FullyAutomatedRedditVideoMakerBot.git diff --git a/CLAUDE.md b/CLAUDE.md index 4baa332..c968927 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,19 +4,19 @@ **VideoMakerBot** — Automated short-form video creator from social media content. -**Status:** Production-ready, actively maintained (v3.4.0) -**Language:** Python 3.14+ (host + Docker image) -**Runtime:** **Docker only** — all CLI, GUI, and test invocations go through `docker compose`. Do not invoke `python` on the host. +**Status:** Production (v3.4.0) +**Language:** Python 3.14+ (host + Docker) +**Runtime:** **Docker only** — CLI, GUI, test go through `docker compose`. Never `python` on host. **Platforms:** Reddit (PRAW API), Threads (Graph API + Web Scraping) ### Core Mission -Transforms social media threads (post + comments/replies) into complete short-form videos with: +Transforms social media threads (post + comments/replies) into short-form videos: - AI-generated speech (7+ TTS providers) -- UI screenshots (Playwright, headless Chromium pre-installed in image) +- UI screenshots (Playwright, headless Chromium in image) - Background video/audio overlays -- FFmpeg composition & output (Linux ffmpeg with full filter set, including `drawtext`) +- FFmpeg composition & output (Linux ffmpeg, full filter set + `drawtext`) - Optional YouTube upload -- Modern web UI (Tailwind CSS + DaisyUI + Lucide + vanilla ES6) on `localhost:4000` +- Web UI (Tailwind CSS + DaisyUI + Lucide + vanilla ES6) on `localhost:4000` --- @@ -195,7 +195,7 @@ client_secret_path = "" # Path to youtube_client_secret.json ### Threads — Web Scraping (discovery_method = "scrape") **DOM Structure:** -- Threads.net uses **div-based card layout** — NO `
` elements anywhere +- Threads.net uses **div-based card layout** — NO `
` elements - Feed posts: `a[href*="/post/"]` links inside `
` cards (class contains `x1a2a7pz`) - Post pages: same structure; main post link appears first, replies follow - Screenshots: Use `a[href*="/post/"]` → ancestor div card, NOT `page.locator("article")` @@ -212,7 +212,7 @@ Last 1-4: engagement metrics (likes, replies, reposts, quotes) - Numbers can be plain ("266") or abbreviated ("1K", "2.5M") - `likes` = first trailing number, `replies` = second, `reposts` = third - `min_engagement` filters by `likes + reposts` total -- Posts are sorted by engagement descending before selection +- Posts sorted by engagement descending before selection **Login Flow:** - Threads uses Instagram auth (`threads.net/login`) @@ -220,11 +220,11 @@ Last 1-4: engagement metrics (likes, replies, reposts, quotes) - Button: `get_by_role("button", name="Log in", exact=True).first` - After click: `page.wait_for_url("https://www.threads.net/", timeout=15000)` — event-wait, not fixed delay - Cookies cached at `video_creation/data/cookie-threads.json` -- Login logic is shared via `platforms/threads/auth.py` +- Login logic shared via `platforms/threads/auth.py` **API Limitation:** - Graph API v1.0 only accesses YOUR OWN posts — no trending/discovery -- Scraping bypasses this entirely — no API token needed +- Scraping bypasses this — no API token needed ### Threads — Graph API (discovery_method = "api") @@ -252,48 +252,47 @@ Last 1-4: engagement metrics (likes, replies, reposts, quotes) 5. **Default to `googletranslate` TTS** for headless containers — no API key, fast, free 6. **Use `libx264` encoder** — `h264_nvenc` is NVIDIA-only and not available in the slim image 7. **Test both Threads discovery methods:** `api` and `scrape` -8. **Bind-mount preserves state** — edits to `config.toml`, `results/`, `assets/temp/`, `video_creation/data/`, and the `utils/background_*.json` catalogs persist across container runs -9. **GUI must bind to `0.0.0.0`** in Docker (already enforced via `GUI_HOST=0.0.0.0` env) -10. **Use `/video/` to serve renders** — the route looks up the file by id in `videos.json`, sanitizes the `Content-Disposition` filename, and avoids 404s caused by literal newlines in titles +8. **Bind-mount preserves state** — edits to `config.toml`, `results/`, `assets/temp/`, `video_creation/data/`, and `utils/background_*.json` catalogs persist across container runs +9. **GUI must bind to `0.0.0.0`** in Docker (enforced via `GUI_HOST=0.0.0.0` env) +10. **Use `/video/` to serve renders** — the route looks up the file by id in `videos.json`, sanitizes `Content-Disposition` filename, avoids 404s from literal newlines in titles ### ❌ DON'T: 1. **Don't run `python GUI.py` or `python main.py` on the host** — Docker is the only supported path -2. **Don't use `
` selectors** on Threads.net — the DOM is div-based +2. **Don't use `
` selectors** on Threads.net — DOM is div-based 3. **Don't hardcode `h264_nvenc`** — use `libx264` for cross-platform compatibility 4. **Don't import platform modules directly** in main.py/utils 5. **Don't assume config keys exist** without `.get()` fallback -6. **Don't reintroduce jQuery, Bootstrap, or ClipboardJS** — the UI is vanilla ES6 + Tailwind + DaisyUI + Lucide -7. **Don't write to `utils/backgrounds.json`** — it is a legacy empty file. Use `utils/background_videos.json` and `utils/background_audios.json` +6. **Don't reintroduce jQuery, Bootstrap, or ClipboardJS** — UI is vanilla ES6 + Tailwind + DaisyUI + Lucide +7. **Don't write to `utils/backgrounds.json`** — legacy empty file. Use `utils/background_videos.json` and `utils/background_audios.json` ### 🔒 Security (hardened May 2026) -1. **No `eval()`** — use `{"int": int, "float": float, "bool": bool, "str": str}` dict dispatch for type coercion. `utils/settings.py` has module-level `_TYPE_COERCION`. -2. **No `os.system()`** — use `subprocess.run([...])` with argument lists. No shell interpretation of paths. +1. **No `eval()`** — use `{"int": int, "float": float, "bool": bool, "str": str}` dict dispatch. `utils/settings.py` has module-level `_TYPE_COERCION`. +2. **No `os.system()`** — use `subprocess.run([...])` with argument lists. No shell interpretation. 3. **No `shell=True`** — removed from all `subprocess.run()` and `Popen()` calls. -4. **No bare `except:`** — always catch specific exception types. Bare excepts swallow `KeyboardInterrupt` and `SystemExit`. -5. **Redact secrets before printing** — `main.py` error handler deep-copies config and masks all credential fields before logging. -6. **Settings page secrets** — `GUI.py` redacts API keys/passwords from the data dict passed to `settings.html`. Sensitive fields show as `********`. -7. **CSRF protection** — `GUI.py` has `@app.before_request` that checks `Origin` header on all mutating requests. -8. **Security headers** — `X-Content-Type-Options: nosniff`, `X-Frame-Options: DENY` on every response. -9. **Flask secret key** — loaded from `FLASK_SECRET_KEY` env var, falls back to `os.urandom(32)` per startup. -10. **Docker non-root** — container runs as `appuser`, not root. -11. **Path traversal** — `/video/` uses `Path.resolve().relative_to()` guard; `add_background()` sanitizes citation with `re.sub(r"[./\\\\]", "_", citation)`. -12. **No hardcoded credentials** in source — all secrets loaded from `config.toml` (gitignored). Rotate passwords regularly. +4. **No bare `except:`** — catch specific exception types. Bare excepts swallow `KeyboardInterrupt` and `SystemExit`. +5. **Redact secrets before printing** — `main.py` error handler deep-copies config and masks all credential fields. `GUI.py` redacts API keys/passwords from settings page data. Sensitive fields show as `********`. +6. **CSRF protection** — `GUI.py` `@app.before_request` checks `Origin` header on all mutating requests. +7. **Security headers** — `X-Content-Type-Options: nosniff`, `X-Frame-Options: DENY` on every response. +8. **Flask secret key** — `FLASK_SECRET_KEY` env var, fallback `os.urandom(32)` per startup. +9. **Docker non-root** — container runs as `appuser`, not root. +10. **Path traversal** — `/video/` uses `Path.resolve().relative_to()` guard; `add_background()` sanitizes citation with `re.sub(r"[./\\\\]", "_", citation)`. +11. **No hardcoded credentials** in source — all secrets from `config.toml` (gitignored). Rotate passwords regularly. --- ## Web UI (Flask, served by `gui` service) -- **Stack:** Tailwind CSS, DaisyUI, Lucide Icons, vanilla ES6 (no jQuery, no Bootstrap, no ClipboardJS) +- **Stack:** Tailwind CSS, DaisyUI, Lucide Icons, vanilla ES6 (no jQuery, Bootstrap, ClipboardJS) - **Routes:** - - `/` — Video Library; cards show source-post link, download, and copy-link buttons - - `/video/` — serves the rendered mp4 by id (lookup via `videos.json`); guards path-traversal and sanitizes the filename for `Content-Disposition` + - `/` — Video Library; cards show source-post link, download, copy-link buttons + - `/video/` — serves rendered mp4 by id (lookup via `videos.json`); path-traversal guard, sanitized `Content-Disposition` - `/backgrounds` — Background Manager UI - - `/backgrounds.json` — serves `utils/background_videos.json` (the videos catalog) - - `/background/add`, `/background/delete` — POST endpoints; mutate **both** `utils/background_videos.json` and the `settings.background.background_video.options` array in `utils/.config.template.toml` + - `/backgrounds.json` — serves `utils/background_videos.json` (videos catalog) + - `/background/add`, `/background/delete` — POST; mutate **both** `utils/background_videos.json` and `settings.background.background_video.options` in `utils/.config.template.toml` - `/settings` — config editor; loads from `config.toml`, validates against `utils/.config.template.toml`, persists via `utils/gui_utils.modify_settings` (preserves comments/formatting via `tomlkit`) -- **HTML escaping:** the `h()` helper in `index.html` escapes `& " < >` for any user-controlled string embedded in attributes — use it for any new dynamic data on the Library page +- **HTML escaping:** `h()` helper in `index.html` escapes `& " < >` for user-controlled strings in attributes --- @@ -303,19 +302,19 @@ Last 1-4: engagement metrics (likes, replies, reposts, quotes) |------|---------| | `main.py` | CLI entry; pipeline orchestration via factory | | `platforms/__init__.py` | Factory dispatch (platform + discovery_method) | -| `platforms/threads/scraper.py` | **NEW** — Web scraping fetcher with engagement parsing | -| `platforms/threads/auth.py` | **NEW** — Shared Playwright login + cookie management | +| `platforms/threads/scraper.py` | Web scraping fetcher with engagement parsing | +| `platforms/threads/auth.py` | Shared Playwright login + cookie management | | `platforms/threads/fetcher.py` | Graph API client (own posts only) | | `platforms/threads/screenshot.py` | Div-based Threads screenshotter | -| `video_creation/final_video.py` | FFmpeg composition (libx264, platform-aware output); exports `get_output_path()` for shared path computation | +| `video_creation/final_video.py` | FFmpeg composition (libx264, platform-aware output); exports `get_output_path()` | | `video_creation/background.py` | Background downloader (local files + yt-dlp); prefers already-downloaded videos | | `video_creation/youtube_uploader.py` | OAuth2 YouTube upload | | `TTS/engine_wrapper.py` | TTS provider abstraction + TikTok→pyttsx3 fallback; single-pass ffmpeg concat | | `TTS/TikTok.py` | Hardened TikTok TTS with graceful error handling | -| `reddit/subreddit.py` | PRAW Reddit fetcher with auto-2FA; retry-depth limit (50) on submission search | +| `reddit/subreddit.py` | PRAW Reddit fetcher with auto-2FA; retry-depth limit (50) | | `utils/settings.py` | Config loading + interactive validation; uses `_TYPE_COERCION` dict (no eval) | | `utils/videos.py` | Video dedup tracking (`check_done`, `check_done_by_id`, `save_data` with truncate) | -| `utils/.config.template.toml` | Config schema (also drives Settings page validation) | +| `utils/.config.template.toml` | Config schema (drives Settings page validation) | | `utils/background_videos.json` | Background video manifest (served at `/backgrounds.json`) | | `utils/background_audios.json` | Background audio manifest | | `utils/gui_utils.py` | `add_background`, `delete_background`, `modify_settings`, `get_checks` (no eval) | @@ -329,52 +328,52 @@ Last 1-4: engagement metrics (likes, replies, reposts, quotes) ## Debugging Tips ### FFmpeg "Unknown encoder 'h264_nvenc'" -→ Use `libx264`. Find-and-replace `h264_nvenc` → `libx264` in `video_creation/final_video.py`. The slim image does not ship with NVIDIA encoders. +→ Use `libx264`. Find-and-replace `h264_nvenc` → `libx264` in `video_creation/final_video.py`. Slim image doesn't ship NVIDIA encoders. ### yt-dlp "Requested format is not available" -→ Bump the pinned version in `requirements.txt` and rebuild (`docker compose build`). Also prefer `best[height<=1080]` over `bestvideo` in `video_creation/background.py` — many videos lack video-only streams. +→ Bump pinned version in `requirements.txt` and rebuild (`docker compose build`). Prefer `best[height<=1080]` over `bestvideo` in `video_creation/background.py` — many videos lack video-only streams. ### Threads screenshots fail ("Main post article not found") -→ Threads.net uses div cards, not `
`. Ensure screenshot code uses `a[href*="/post/"]` → ancestor div approach. +→ Threads.net uses div cards, not `
`. Use `a[href*="/post/"]` → ancestor div approach. ### Config validator EOFError in non-interactive mode -→ `check_toml()` prompts for ALL platform sections regardless of `platform` setting. Either fill all required fields, edit through `/settings`, or pre-populate `config.toml` before `docker compose run cli`. +→ `check_toml()` prompts for ALL platform sections regardless of `platform` setting. Fill all required fields, edit through `/settings`, or pre-populate `config.toml` before `docker compose run cli`. ### Playwright timeout on Threads login -→ Cookies corrupted. Delete `video_creation/data/cookie-threads.json` for fresh login (the file is bind-mounted, so deleting on host clears the container too). Also confirm selectors: button uses `exact=True` due to multiple "Log in" buttons. +→ Cookies corrupted. Delete `video_creation/data/cookie-threads.json` for fresh login (file is bind-mounted, host delete clears container too). Confirm selectors: button uses `exact=True` for multiple "Log in" buttons. ### No viral posts found → Lower `min_engagement` in config. Most Threads feed posts have <100 likes — 10000 filters almost everything. ### Background Manager grid is empty -→ `/backgrounds.json` must serve `utils/background_videos.json` (split catalog), **not** the legacy `utils/backgrounds.json` (empty `{}`). Verify in `GUI.py:backgrounds_json`. +→ `/backgrounds.json` must serve `utils/background_videos.json` (split catalog), **not** legacy `utils/backgrounds.json` (empty `{}`). Verify in `GUI.py:backgrounds_json`. ### `/video/` returns 404 -→ The route looks up the entry in `video_creation/data/videos.json` by `id` and resolves the file under `results//.mp4`. Confirm both the JSON entry and the file exist; the file may have been pruned. +→ Route looks up entry in `video_creation/data/videos.json` by `id`, resolves file under `results//.mp4`. Confirm both JSON entry and file exist; file may have been pruned. ### JS "Unexpected end of input" on Library page -→ Any user-controlled string interpolated into an HTML attribute must go through the `h()` helper in `index.html`. Avoid inline `onclick=` with `${JSON.stringify(...)}`. +→ User-controlled strings in HTML attributes must go through `h()` helper in `index.html`. Avoid inline `onclick=` with `${JSON.stringify(...)}`. ### Stale image after editing `requirements.txt` or `Dockerfile` -→ `docker compose build` to rebuild. Code changes alone do NOT need a rebuild because the repo root is bind-mounted to `/app`. +→ `docker compose build` to rebuild. Code-only changes don't need rebuild — repo root is bind-mounted to `/app`. ### Python bytecode caching in long-running GUI container -→ The GUI process caches imported modules in `sys.modules`. After editing pipeline code (`final_video.py`, `background.py`, `screenshot.py`), restart the GUI (`docker compose restart gui`) or trigger a pipeline run which now calls `importlib.reload()` on all pipeline modules automatically. +→ GUI caches imported modules in `sys.modules`. After editing pipeline code, restart GUI (`docker compose restart gui`) or trigger pipeline run which calls `importlib.reload()` on pipeline modules. ### Reddit image template appearing in Threads videos -→ Verify `platform` in config.toml is `"threads"` (not `"reddit"`). The `if platform == "reddit"` guard in `final_video.py` blocks the Reddit template. If it still appears, restart the GUI container to flush Python bytecode cache. +→ Verify `platform` in config.toml is `"threads"` (not `"reddit"`). The `if platform == "reddit"` guard in `final_video.py` blocks Reddit template. Restart GUI container to flush Python bytecode cache. ### Background video download fails (yt-dlp HTTP 403) -→ `get_background_config()` now prefers already-downloaded videos. Set `background_video` in config.toml to a downloaded video name (check `assets/backgrounds/video/`). If empty, it randomly picks from downloaded videos first. +→ `get_background_config()` prefers already-downloaded videos. Set `background_video` in config.toml to a downloaded video name (check `assets/backgrounds/video/`). If empty, randomly picks from downloaded videos first. ### TTS output has wrong number of audio clips -→ `engine_wrapper.run()` returns `idx + 1` (count, not last index). If you're getting one fewer clip than expected, check the return value consumers — they should treat it as a count. +→ `engine_wrapper.run()` returns `idx + 1` (count, not last index). If getting one fewer clip than expected, check return value consumers — treat as count. ### videos.json corruption (trailing garbage after save) -→ Fixed: `save_data()` now calls `raw_vids.truncate()` after `json.dump()`. If you have an existing corrupted file, delete `video_creation/data/videos.json` and it will be recreated. +→ Fixed: `save_data()` calls `raw_vids.truncate()` after `json.dump()`. Delete `video_creation/data/videos.json` if existing file is corrupted. ### Infinite recursion in Reddit post discovery -→ Fixed: `get_subreddit_threads()` has a retry-depth limit of 50. If you hit this, your subreddit may have no undone posts — try a different subreddit or clear `videos.json`. +→ Fixed: `get_subreddit_threads()` has retry-depth limit of 50. If hit, subreddit may have no undone posts — try different subreddit or clear `videos.json`. --- @@ -388,43 +387,43 @@ docker compose build docker compose up gui # → http://localhost:4000 -# Run the GUI in the background +# Run the GUI in background docker compose up -d gui docker compose logs -f gui docker compose down -# Run the CLI pipeline (one-off, removed on exit) +# Run CLI pipeline (one-off, removed on exit) docker compose run --rm cli docker compose run --rm cli python main.py -# Run the test suite +# Run test suite docker compose run --rm test -# Open a shell in a fresh container for ad-hoc commands +# Shell in fresh container for ad-hoc commands docker compose run --rm --entrypoint /bin/bash gui -# inside: python -m py_compile main.py platforms/threads/scraper.py +# inside: python -m py_compile main.py platforms/threads/scraper.py -# Tail a running GUI container +# Tail running GUI container docker compose exec gui ls /app/results/threads/ ``` -> Anything that needs `pip install`, `playwright install`, or `apt-get` belongs in `Dockerfile` followed by `docker compose build` — never run those on the host. +> Anything needing `pip install`, `playwright install`, or `apt-get` belongs in `Dockerfile` + `docker compose build` — never on host. --- ## Recent Changes (May 2026 Security Hardening) -**eval() removal:** All `eval(checks["type"])(value)` patterns replaced with `{"int": int, "float": float, "bool": bool, "str": str}` dict dispatch in `utils/settings.py`, `utils/console.py`, `utils/gui_utils.py`. +**eval() removal:** `eval(checks["type"])(value)` replaced with `{"int": int, "float": float, "bool": bool, "str": str}` dict dispatch in `utils/settings.py`, `utils/console.py`, `utils/gui_utils.py`. -**os.system() removal:** `TTS/engine_wrapper.py:split_post` now uses `subprocess.run([...])` with argument lists. `utils/posttextparser.py` spacy download uses `subprocess.run([sys.executable, "-m", "spacy", ...])`. +**os.system() removal:** `TTS/engine_wrapper.py:split_post` uses `subprocess.run([...])` with argument lists. `utils/posttextparser.py` spacy download uses `subprocess.run([sys.executable, "-m", "spacy", ...])`. **shell=True removal:** All `subprocess.run(..., shell=True)` and `Popen(..., shell=True)` replaced with argument lists in `main.py` and `utils/ffmpeg_install.py`. -**Credential leak prevention:** `main.py` error handler deep-copies config and redacts all secrets before printing. `GUI.py` masks sensitive keys as `********` in settings page data. +**Credential leak prevention:** `main.py` error handler deep-copies config and redacts all secrets. `GUI.py` masks sensitive keys as `********` in settings page data. **CSRF + security headers:** `GUI.py` checks `Origin` header on POST/PUT/DELETE. `X-Content-Type-Options`, `X-Frame-Options` headers added. -**Docker hardening:** Container runs as `appuser` (non-root). Digest pinning + pip version comments added for production. +**Docker hardening:** Container runs as `appuser` (non-root). Digest pinning + pip version comments added. **Bug fixes (18 total):** - Config overwrite crash (config=None after empty file write) diff --git a/TTS/supertonic_tts.py b/TTS/supertonic_tts.py index 3264b58..78a0172 100644 --- a/TTS/supertonic_tts.py +++ b/TTS/supertonic_tts.py @@ -1,4 +1,5 @@ import random +import re import subprocess import tempfile from pathlib import Path @@ -7,6 +8,11 @@ from supertonic import TTS from utils import settings +# Zero-width and invisible formatting characters rejected by supertonic. +# ​ ZWSP, ‌ ZWNJ, ‍ ZWJ, ‎ LTR, ‏ RTL, +# ⁠ word joiner,  BOM +_ZW_RE = re.compile("[​-‏⁠]") + class SupertonicTTS: def __init__(self): @@ -17,18 +23,43 @@ class SupertonicTTS: output_path = Path(filepath) output_path.parent.mkdir(parents=True, exist_ok=True) + # Strip known invisible characters first, then retry on any others + # that supertonic rejects — social media text has unpredictable Unicode. + text = _ZW_RE.sub("", text) + tts_settings = settings.config["settings"].get("tts", {}) voice_style = self._voice_style(tts_settings, random_voice) + synth_kwargs = { + "voice_style": voice_style, + "lang": tts_settings.get("supertonic_lang", "na"), + "total_steps": int(tts_settings.get("supertonic_steps", 8)), + "speed": float(tts_settings.get("supertonic_speed", 1.05)), + "max_chunk_length": self.max_chars, + "verbose": False, + } - wav, _duration = self.tts.synthesize( - text, - voice_style=voice_style, - lang=tts_settings.get("supertonic_lang", "na"), - total_steps=int(tts_settings.get("supertonic_steps", 8)), - speed=float(tts_settings.get("supertonic_speed", 1.05)), - max_chunk_length=self.max_chars, - verbose=False, - ) + for _ in range(3): + try: + wav, _duration = self.tts.synthesize(text, **synth_kwargs) + break + except ValueError as e: + msg = str(e) + if "unsupported character" not in msg: + raise + # Extract and strip the reported characters + chars = re.findall(r"'(.+?)'", msg) + if not chars: + raise + for c in chars: + text = text.replace(c, "") + if not text.strip(): + raise ValueError( + "Text is empty after stripping unsupported characters" + ) from e + else: + raise RuntimeError( + "Supertonic still rejecting characters after 3 retries" + ) with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav: wav_path = Path(temp_wav.name) diff --git a/platforms/threads/scraper.py b/platforms/threads/scraper.py index 3cfc715..ec6ee28 100644 --- a/platforms/threads/scraper.py +++ b/platforms/threads/scraper.py @@ -317,6 +317,17 @@ def _contains_blocked(text: str, blocked_raw: str) -> bool: return any(word in text_lower for word in blocked) +def _is_english_text(text: str) -> bool: + """Return True if text is predominantly English (Latin alphabet).""" + if not text or not text.strip(): + return False + alpha_chars = [c for c in text if c.isalpha()] + if not alpha_chars: + return False + latin = sum(1 for c in alpha_chars if "a" <= c.lower() <= "z") + return latin / len(alpha_chars) >= 0.70 + + def _filter_candidates(posts: list[dict]) -> list[dict]: """Filter feed posts by engagement, blocked words, and duplicates. @@ -337,6 +348,8 @@ def _filter_candidates(posts: list[dict]) -> list[dict]: continue if not post["body"] or len(post["body"].strip()) < 10: continue + if t_config.get("english_only", False) and not _is_english_text(post["body"]): + continue # Age filter if max_age_hours is not None: post_hours = _parse_timestamp_to_hours(post.get("timestamp", "")) @@ -502,6 +515,8 @@ def _build_content_object(post: dict, replies: list[dict]) -> dict: continue if _contains_blocked(body, blocked_raw): continue + if t_config.get("english_only", False) and not _is_english_text(body): + continue if not (min_len <= len(body) <= max_len): continue sanitised = sanitize_text(body) diff --git a/utils/.config.template.toml b/utils/.config.template.toml index 849f895..392cb8e 100644 --- a/utils/.config.template.toml +++ b/utils/.config.template.toml @@ -40,6 +40,7 @@ min_engagement = { default = 0, optional = true, nmin = 0, type = "int", explana max_post_age = { optional = true, default = "", options = ["", "1h", "6h", "24h", "3d", "7d", "30d"], type = "str", explanation = "Maximum age of posts to consider. Empty = no limit.", example = "7d" } search_queries = { optional = true, default = "news,politics,trending", type = "str", explanation = "Comma-separated search queries to find trending content on Threads. Combined with main feed results.", example = "news,politics,viral" } blocked_words = { optional = true, default = "", type = "str", explanation = "Comma-separated list of blocked words/phrases. Posts and replies containing any of these will be skipped.", example = "nsfw, spoiler, politics" } +english_only = { optional = true, default = false, options = [true, false], type = "bool", explanation = "Only include English-language posts and replies (Latin alphabet heuristic). Set to true for English-only content." } [youtube] enabled = { optional = true, type = "bool", default = false, options = [true, false], explanation = "Enable automatic YouTube upload after video creation" } diff --git a/vendor/FullyAutomatedRedditVideoMakerBot b/vendor/FullyAutomatedRedditVideoMakerBot deleted file mode 160000 index 6e5c9ff..0000000 --- a/vendor/FullyAutomatedRedditVideoMakerBot +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6e5c9ffacc5ac601cb596cfaa72fc947824a2ca3