Update scheduler to 1 video per 3h with title dedup to prevent duplicates

- Changed default cron from every 6h to every 3h (8 videos/day)
- Added utils/title_history.py: tracks used titles in JSON
- threads_client.py: skips threads with already-used titles
- scheduler/pipeline.py: saves title after successful video creation
- main.py: saves title in manual mode too
- Updated config template with new scheduler defaults

Agent-Logs-Url: https://github.com/thaitien280401-stack/RedditVideoMakerBot/sessions/17c7c41c-cf86-4279-88b8-01cba23ee763
Co-authored-by: thaitien280401-stack <271128961+thaitien280401-stack@users.noreply.github.com>
2 months ago · cb0fddf072
parent 2c6fa251e6
commit cb0fddf072
6 changed files with 138 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -242,6 +242,7 @@ reddit-bot-351418-5560ebc49cac.json
 /.idea
 *.pyc
 video_creation/data/videos.json
+video_creation/data/title_history.json
 video_creation/data/envvars.txt

 config.toml
--- a/main.py
+++ b/main.py
@@ -95,6 +95,13 @@ def main_threads(POST_ID=None) -> None:
    chop_background(bg_config, length, thread_object)
    make_final_video(number_of_comments, length, thread_object, bg_config)

+    # Lưu title vào lịch sử để tránh tạo trùng lặp
+    from utils.title_history import save_title
+    title = thread_object.get("thread_title", "")
+    tid = thread_object.get("thread_id", "")
+    if title:
+        save_title(title=title, thread_id=tid, source="threads")
+

 def main_threads_with_upload(POST_ID=None) -> None:
    """Pipeline đầy đủ: Threads → Video → Upload lên các platform."""
--- a/scheduler/pipeline.py
+++ b/scheduler/pipeline.py
@@ -17,6 +17,7 @@ from utils import settings
 from utils.cleanup import cleanup
 from utils.console import print_markdown, print_step, print_substep
 from utils.id import extract_id
+from utils.title_history import save_title


 def run_pipeline(post_id: Optional[str] = None) -> Optional[str]:
@@ -128,6 +129,13 @@ def run_pipeline(post_id: Optional[str] = None) -> Optional[str]:
                    print_substep(f"  ❌ {platform}: Thất bại", style="bold red")

        print_step("✅ Pipeline hoàn tất!")
+
+        # Lưu title vào lịch sử để tránh tạo trùng lặp
+        title = thread_object.get("thread_title", "")
+        tid = thread_object.get("thread_id", "")
+        if title:
+            save_title(title=title, thread_id=tid, source="threads")
+
        return video_path

    except Exception as e:
@@ -158,8 +166,8 @@ def run_scheduled():
        return

    timezone = scheduler_config.get("timezone", "Asia/Ho_Chi_Minh")
-    cron_expression = scheduler_config.get("cron", "0 */6 * * *")  # Mặc định mỗi 6 giờ
-    max_videos_per_day = scheduler_config.get("max_videos_per_day", 4)
+    cron_expression = scheduler_config.get("cron", "0 */3 * * *")  # Mặc định mỗi 3 giờ (8 lần/ngày: 00, 03, 06, 09, 12, 15, 18, 21h)
+    max_videos_per_day = scheduler_config.get("max_videos_per_day", 8)

    # Parse cron expression
    cron_parts = cron_expression.split()
--- a/threads/threads_client.py
+++ b/threads/threads_client.py
@@ -12,6 +12,7 @@ import requests

 from utils import settings
 from utils.console import print_step, print_substep
+from utils.title_history import is_title_used
 from utils.videos import check_done
 from utils.voice import sanitize_text

@@ -158,7 +159,7 @@ def get_threads_posts(POST_ID: str = None) -> dict:
            keyword_list = [k.strip() for k in keywords.split(",") if k.strip()]
            threads_list = client.search_threads_by_keyword(threads_list, keyword_list)

-        # Chọn thread phù hợp (chưa tạo video, đủ replies)
+        # Chọn thread phù hợp (chưa tạo video, đủ replies, title chưa dùng)
        thread = None
        for t in threads_list:
            thread_id = t.get("id", "")
@@ -166,6 +167,14 @@ def get_threads_posts(POST_ID: str = None) -> dict:
            text = t.get("text", "")
            if not text or _contains_blocked_words(text):
                continue
+            # Kiểm tra title đã được sử dụng chưa (tránh trùng lặp)
+            title_candidate = text[:200] if len(text) > 200 else text
+            if is_title_used(title_candidate):
+                print_substep(
+                    f"Bỏ qua thread đã tạo video: {text[:50]}...",
+                    style="bold yellow",
+                )
+                continue
            # Kiểm tra số lượng replies
            try:
                replies = client.get_thread_replies(thread_id, limit=min_comments + 5)
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@@ -83,9 +83,9 @@ access_token = { optional = true, default = "", explanation = "Facebook Page Acc

 [scheduler]
 enabled = { optional = true, type = "bool", default = false, options = [true, false], explanation = "Bật lên lịch tự động" }
-cron = { optional = true, default = "0 */6 * * *", explanation = "Cron expression. Mặc định: mỗi 6 giờ", example = "0 8,14,20 * * *" }
+cron = { optional = true, default = "0 */3 * * *", explanation = "Cron expression. Mặc định: mỗi 3 giờ (8 video/ngày)", example = "0 */3 * * *" }
 timezone = { optional = true, default = "Asia/Ho_Chi_Minh", explanation = "Múi giờ", example = "Asia/Ho_Chi_Minh" }
-max_videos_per_day = { optional = true, default = 4, type = "int", nmin = 1, nmax = 20, explanation = "Số video tối đa/ngày" }
+max_videos_per_day = { optional = true, default = 8, type = "int", nmin = 1, nmax = 50, explanation = "Số video tối đa/ngày. Mặc định: 8 (mỗi 3 giờ × 1 video)" }

 # ===== LEGACY REDDIT CONFIG =====

--- a/utils/title_history.py
+++ b/utils/title_history.py
@@ -0,0 +1,108 @@
+"""
+Title History - Lưu và kiểm tra các title đã được sử dụng để tránh trùng lặp.
+
+Lưu trữ danh sách title đã tạo video vào file JSON.
+Khi chọn thread mới, kiểm tra xem title đã được sử dụng chưa.
+"""
+
+import json
+import os
+import time
+from typing import Optional
+
+from utils.console import print_substep
+
+TITLE_HISTORY_PATH = "./video_creation/data/title_history.json"
+
+
+def _ensure_file_exists() -> None:
+    """Tạo file title_history.json nếu chưa tồn tại."""
+    os.makedirs(os.path.dirname(TITLE_HISTORY_PATH), exist_ok=True)
+    if not os.path.exists(TITLE_HISTORY_PATH):
+        with open(TITLE_HISTORY_PATH, "w", encoding="utf-8") as f:
+            json.dump([], f)
+
+
+def load_title_history() -> list:
+    """Đọc danh sách title đã sử dụng.
+
+    Returns:
+        Danh sách các dict chứa thông tin title đã dùng.
+    """
+    _ensure_file_exists()
+    try:
+        with open(TITLE_HISTORY_PATH, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (json.JSONDecodeError, ValueError):
+        return []
+
+
+def is_title_used(title: str) -> bool:
+    """Kiểm tra xem title đã được sử dụng chưa.
+
+    So sánh bằng cách chuẩn hóa (lowercase, strip) để tránh trùng lặp
+    do khác biệt chữ hoa/thường hoặc khoảng trắng.
+
+    Args:
+        title: Title cần kiểm tra.
+
+    Returns:
+        True nếu title đã được sử dụng, False nếu chưa.
+    """
+    if not title or not title.strip():
+        return False
+
+    history = load_title_history()
+    normalized_title = title.strip().lower()
+
+    for entry in history:
+        saved_title = entry.get("title", "").strip().lower()
+        if saved_title == normalized_title:
+            return True
+
+    return False
+
+
+def save_title(title: str, thread_id: str = "", source: str = "threads") -> None:
+    """Lưu title đã sử dụng vào lịch sử.
+
+    Args:
+        title: Title của video đã tạo.
+        thread_id: ID của thread (để tham chiếu).
+        source: Nguồn nội dung (threads/reddit).
+    """
+    if not title or not title.strip():
+        return
+
+    _ensure_file_exists()
+
+    history = load_title_history()
+
+    # Kiểm tra trùng trước khi lưu
+    normalized_title = title.strip().lower()
+    for entry in history:
+        if entry.get("title", "").strip().lower() == normalized_title:
+            print_substep(f"Title đã tồn tại trong lịch sử, bỏ qua: {title[:50]}...", style="dim")
+            return
+
+    entry = {
+        "title": title.strip(),
+        "thread_id": thread_id,
+        "source": source,
+        "created_at": int(time.time()),
+    }
+    history.append(entry)
+
+    with open(TITLE_HISTORY_PATH, "w", encoding="utf-8") as f:
+        json.dump(history, f, ensure_ascii=False, indent=4)
+
+    print_substep(f"Đã lưu title vào lịch sử: {title[:50]}...", style="bold green")
+
+
+def get_title_count() -> int:
+    """Đếm số title đã sử dụng.
+
+    Returns:
+        Số lượng title trong lịch sử.
+    """
+    return len(load_title_history())