Merge pull request #4 from thaitien280401-stack/copilot/fetch-content-from-threads

feat: Google Trends fallback for Threads content sourcing
1 month ago · 1c5b309709
parent e696279ea5 069e3d4e3d
commit 1c5b309709
3 changed files with 510 additions and 4 deletions
--- a/threads/google_trends.py
+++ b/threads/google_trends.py
@ -0,0 +1,330 @@
+"""
+Google Trends Integration - Lấy từ khóa trending từ Google Trends.
+
+Sử dụng RSS feed công khai của Google Trends để lấy các từ khóa
+đang thịnh hành tại Việt Nam, sau đó dùng các từ khóa này để tìm
+bài viết trên Threads.
+
+Flow:
+1. Lấy trending keywords từ Google Trends RSS (geo=VN)
+2. Dùng Playwright tìm bài viết trên Threads theo từ khóa
+3. Trả về danh sách bài viết phù hợp
+"""
+
+import xml.etree.ElementTree as ET
+from typing import Dict, List, Optional
+from urllib.parse import quote_plus
+
+import requests
+from playwright.sync_api import (
+    TimeoutError as PlaywrightTimeoutError,
+    sync_playwright,
+)
+
+from utils.console import print_step, print_substep
+
+# Google Trends daily trending RSS endpoint; the country is selected with a
+# ``geo`` query parameter added by the caller.
+# NOTE(review): confirm this endpoint is still served by Google before relying
+# on it as a fallback source.
+_GOOGLE_TRENDS_RSS_URL = "https://trends.google.com/trends/trendingsearches/daily/rss"
+# Timeout handed to requests.get() when downloading the RSS feed.
+_RSS_REQUEST_TIMEOUT = 15
+
+# Playwright settings (reuse from trending.py)
+# Timeout for page navigation and for waiting on the load state.
+_PAGE_LOAD_TIMEOUT_MS = 30_000
+# Fixed pause after the page loads, before results are scraped.
+_CONTENT_WAIT_MS = 3_000
+# Number of scroll steps performed to load more search results.
+_SCROLL_ITERATIONS = 3
+
+# Browser context options used for every Threads search session.
+_BROWSER_VIEWPORT = {"width": 1280, "height": 900}
+_BROWSER_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/131.0.0.0 Safari/537.36"
+)
+_BROWSER_LOCALE = "vi-VN"
+
+# Threads search URL template; ``{query}`` receives the URL-quoted keyword.
+_THREADS_SEARCH_URL = "https://www.threads.net/search?q={query}&serp_type=default"
+
+
+class GoogleTrendsError(Exception):
+    """Raised when trending data cannot be retrieved from Google Trends."""
+
+
+def get_google_trending_keywords(
+    geo: str = "VN",
+    limit: int = 10,
+) -> List[Dict[str, str]]:
+    """Lấy danh sách từ khóa trending từ Google Trends RSS feed.
+
+    Args:
+        geo: Mã quốc gia (mặc định: VN cho Việt Nam).
+        limit: Số từ khóa tối đa cần lấy.
+
+    Returns:
+        Danh sách dict chứa ``{title, traffic, news_url}``.
+
+    Raises:
+        GoogleTrendsError: Nếu không thể lấy dữ liệu từ Google Trends.
+    """
+    print_substep(
+        f"🔍 Đang lấy từ khóa trending từ Google Trends (geo={geo})...",
+        style="bold blue",
+    )
+
+    url = f"{_GOOGLE_TRENDS_RSS_URL}?geo={geo}"
+    try:
+        response = requests.get(url, timeout=_RSS_REQUEST_TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException as exc:
+        raise GoogleTrendsError(
+            f"Không thể kết nối Google Trends RSS: {exc}"
+        ) from exc
+
+    try:
+        root = ET.fromstring(response.content)
+    except ET.ParseError as exc:
+        raise GoogleTrendsError(
+            f"Không thể parse Google Trends RSS XML: {exc}"
+        ) from exc
+
+    # RSS structure: <rss><channel><item>...</item></channel></rss>
+    # Google Trends uses ht: namespace for traffic data
+    namespaces = {"ht": "https://trends.google.com/trends/trendingsearches/daily"}
+
+    keywords: List[Dict[str, str]] = []
+    for item in root.iter("item"):
+        if len(keywords) >= limit:
+            break
+
+        title_elem = item.find("title")
+        title = title_elem.text.strip() if title_elem is not None and title_elem.text else ""
+        if not title:
+            continue
+
+        # Approximate traffic (e.g., "200,000+")
+        traffic_elem = item.find("ht:approx_traffic", namespaces)
+        traffic = traffic_elem.text.strip() if traffic_elem is not None and traffic_elem.text else ""
+
+        # News item URL (optional)
+        news_url = ""
+        news_item = item.find("ht:news_item", namespaces)
+        if news_item is not None:
+            news_url_elem = news_item.find("ht:news_item_url", namespaces)
+            news_url = (
+                news_url_elem.text.strip()
+                if news_url_elem is not None and news_url_elem.text
+                else ""
+            )
+
+        keywords.append({
+            "title": title,
+            "traffic": traffic,
+            "news_url": news_url,
+        })
+
+    if not keywords:
+        raise GoogleTrendsError(
+            f"Không tìm thấy từ khóa trending nào từ Google Trends (geo={geo})."
+        )
+
+    kw_preview = ", ".join(k["title"][:30] for k in keywords[:5])
+    suffix = "..." if len(keywords) > 5 else ""
+    print_substep(
+        f"✅ Tìm thấy {len(keywords)} từ khóa trending: {kw_preview}{suffix}",
+        style="bold green",
+    )
+    return keywords
+
+
+def search_threads_by_query(
+    query: str,
+    max_threads: int = 10,
+) -> List[Dict[str, str]]:
+    """Tìm bài viết trên Threads theo từ khóa bằng Playwright.
+
+    Mở trang tìm kiếm Threads và trích xuất bài viết từ kết quả.
+
+    Args:
+        query: Từ khóa tìm kiếm.
+        max_threads: Số bài viết tối đa cần lấy.
+
+    Returns:
+        Danh sách thread dicts: ``{text, username, permalink, shortcode, keyword}``.
+    """
+    import re
+
+    search_url = _THREADS_SEARCH_URL.format(query=quote_plus(query))
+    threads: List[Dict[str, str]] = []
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            viewport=_BROWSER_VIEWPORT,
+            user_agent=_BROWSER_USER_AGENT,
+            locale=_BROWSER_LOCALE,
+        )
+        page = context.new_page()
+
+        try:
+            page.goto(search_url, timeout=_PAGE_LOAD_TIMEOUT_MS)
+            page.wait_for_load_state("domcontentloaded", timeout=_PAGE_LOAD_TIMEOUT_MS)
+            page.wait_for_timeout(_CONTENT_WAIT_MS)
+
+            # Scroll to load more content
+            for _ in range(_SCROLL_ITERATIONS):
+                page.evaluate("window.scrollBy(0, window.innerHeight)")
+                page.wait_for_timeout(1000)
+
+            # Extract posts from search results
+            seen_shortcodes: set = set()
+            post_links = page.query_selector_all('a[href*="/post/"]')
+
+            for link in post_links:
+                if len(threads) >= max_threads:
+                    break
+                try:
+                    href = link.get_attribute("href") or ""
+                    sc_match = re.search(r"/post/([A-Za-z0-9_-]+)", href)
+                    if not sc_match:
+                        continue
+                    shortcode = sc_match.group(1)
+                    if shortcode in seen_shortcodes:
+                        continue
+                    seen_shortcodes.add(shortcode)
+
+                    # Username from URL: /@username/post/...
+                    user_match = re.search(r"/@([^/]+)/post/", href)
+                    username = user_match.group(1) if user_match else "unknown"
+
+                    # Get post text from parent container
+                    text = _get_post_text_from_link(link)
+                    if not text or len(text) < 10:
+                        continue
+
+                    permalink = (
+                        f"https://www.threads.net{href}"
+                        if href.startswith("/")
+                        else href
+                    )
+                    threads.append({
+                        "text": text,
+                        "username": username,
+                        "permalink": permalink,
+                        "shortcode": shortcode,
+                        "keyword": query,
+                    })
+                except Exception:
+                    continue
+
+        except PlaywrightTimeoutError:
+            print_substep(
+                f"⚠️ Timeout khi tìm kiếm Threads cho từ khóa: {query}",
+                style="bold yellow",
+            )
+        except Exception as exc:
+            print_substep(
+                f"⚠️ Lỗi tìm kiếm Threads cho '{query}': {exc}",
+                style="bold yellow",
+            )
+        finally:
+            browser.close()
+
+    return threads
+
+
+def _get_post_text_from_link(link_handle) -> str:
+    """Walk up the DOM from a link element to extract post text content."""
+    try:
+        container = link_handle.evaluate_handle(
+            """el => {
+                let node = el;
+                for (let i = 0; i < 10; i++) {
+                    node = node.parentElement;
+                    if (!node) return el.parentElement || el;
+                    const text = node.innerText || '';
+                    if (text.length > 30 && (
+                        node.getAttribute('role') === 'article' ||
+                        node.tagName === 'ARTICLE' ||
+                        node.dataset && node.dataset.testid
+                    )) {
+                        return node;
+                    }
+                }
+                return el.parentElement
+                    ? el.parentElement.parentElement || el.parentElement
+                    : el;
+            }"""
+        )
+        raw = container.inner_text().strip() if container else ""
+    except Exception:
+        return ""
+
+    if not raw:
+        return ""
+
+    # Clean: remove short metadata lines (timestamps, UI buttons, etc.)
+    _skip = {"Trả lời", "Thích", "Chia sẻ", "Repost", "Quote", "...", "•"}
+    cleaned_lines: list = []
+    for line in raw.split("\n"):
+        line = line.strip()
+        if not line or len(line) < 3:
+            continue
+        if line in _skip:
+            continue
+        # Skip standalone @username lines
+        if line.startswith("@") and " " not in line and len(line) < 30:
+            continue
+        cleaned_lines.append(line)
+    return "\n".join(cleaned_lines)
+
+
+def get_threads_from_google_trends(
+    geo: str = "VN",
+    max_keywords: int = 5,
+    max_threads_per_keyword: int = 10,
+) -> List[Dict[str, str]]:
+    """Lấy bài viết Threads dựa trên từ khóa trending từ Google Trends.
+
+    Kết hợp Google Trends + Threads search:
+    1. Lấy từ khóa trending từ Google Trends
+    2. Tìm bài viết trên Threads theo từng từ khóa
+
+    Args:
+        geo: Mã quốc gia cho Google Trends.
+        max_keywords: Số từ khóa tối đa cần duyệt.
+        max_threads_per_keyword: Số bài viết tối đa từ mỗi từ khóa.
+
+    Returns:
+        Danh sách thread dicts.
+
+    Raises:
+        GoogleTrendsError: Nếu không lấy được từ khóa từ Google Trends.
+    """
+    print_step("🌐 Đang lấy bài viết từ Threads dựa trên Google Trends...")
+
+    keywords = get_google_trending_keywords(geo=geo, limit=max_keywords)
+    all_threads: List[Dict[str, str]] = []
+
+    for kw in keywords:
+        keyword_title = kw["title"]
+        print_substep(
+            f"  🔎 Đang tìm trên Threads: '{keyword_title}'...",
+            style="bold blue",
+        )
+        found = search_threads_by_query(
+            query=keyword_title,
+            max_threads=max_threads_per_keyword,
+        )
+        all_threads.extend(found)
+        print_substep(
+            f"  📝 '{keyword_title}': {len(found)} bài viết",
+            style="bold blue",
+        )
+
+        # Stop early if we have enough threads
+        if len(all_threads) >= max_threads_per_keyword * 2:
+            break
+
+    print_substep(
+        f"✅ Tổng cộng {len(all_threads)} bài viết từ Google Trends keywords",
+        style="bold green",
+    )
+    return all_threads
--- a/threads/threads_client.py
+++ b/threads/threads_client.py
@ -376,6 +376,141 @@ def _get_trending_content(
    return content


+def _get_google_trends_content(
+    max_comment_length: int,
+    min_comment_length: int,
+) -> Optional[dict]:
+    """Lấy nội dung từ Threads dựa trên từ khóa trending của Google Trends.
+
+    Kết hợp Google Trends (lấy từ khóa) + Playwright (tìm bài viết trên Threads).
+    Trả về None nếu không thể lấy content (để fallback sang user threads).
+    """
+    from threads.google_trends import (
+        GoogleTrendsError,
+        get_threads_from_google_trends,
+    )
+    from threads.trending import scrape_thread_replies
+
+    try:
+        google_threads = get_threads_from_google_trends()
+    except GoogleTrendsError as e:
+        print_substep(f"⚠️ Lỗi lấy Google Trends: {e}", style="bold yellow")
+        return None
+    except Exception as e:
+        print_substep(
+            f"⚠️ Lỗi không mong đợi khi lấy Google Trends: {e}",
+            style="bold yellow",
+        )
+        return None
+
+    if not google_threads:
+        print_substep(
+            "⚠️ Không tìm thấy bài viết Threads nào từ Google Trends keywords.",
+            style="bold yellow",
+        )
+        return None
+
+    # Chọn thread phù hợp (chưa tạo video, không chứa từ bị chặn)
+    thread = None
+    for t in google_threads:
+        text = t.get("text", "")
+        if not text or _contains_blocked_words(text):
+            continue
+        title_candidate = text[:_MAX_TITLE_LENGTH]
+        if is_title_used(title_candidate):
+            print_substep(
+                f"Bỏ qua thread đã tạo video: {text[:50]}...",
+                style="bold yellow",
+            )
+            continue
+        thread = t
+        break
+
+    if thread is None:
+        if google_threads:
+            thread = google_threads[0]
+        else:
+            return None
+
+    thread_text = thread.get("text", "")
+    thread_username = thread.get("username", "unknown")
+    thread_url = thread.get("permalink", "")
+    shortcode = thread.get("shortcode", "")
+    keyword = thread.get("keyword", "")
+
+    # Dùng keyword làm tiêu đề video nếu có
+    display_title = keyword if keyword else thread_text[:_MAX_TITLE_LENGTH]
+
+    print_substep(
+        f"Video sẽ được tạo từ Google Trends: {display_title[:100]}...",
+        style="bold green",
+    )
+    print_substep(f"Thread URL: {thread_url}", style="bold green")
+    print_substep(f"Tác giả: @{thread_username}", style="bold blue")
+    print_substep(f"Từ khóa Google Trends: {keyword}", style="bold blue")
+
+    content: dict = {
+        "thread_url": thread_url,
+        "thread_title": display_title,
+        "thread_id": re.sub(
+            r"[^\w\s-]", "",
+            shortcode or f"gtrends_{hash(thread_text) % 10**8}",
+        ),
+        "thread_author": f"@{thread_username}",
+        "is_nsfw": False,
+        "thread_post": thread_text,
+        "comments": [],
+    }
+
+    if not settings.config["settings"].get("storymode", False):
+        # Lấy replies bằng scraping
+        try:
+            if thread_url:
+                raw_replies = scrape_thread_replies(thread_url, limit=50)
+            else:
+                raw_replies = []
+        except Exception as exc:
+            print_substep(
+                f"⚠️ Lỗi lấy replies (Google Trends): {exc}",
+                style="bold yellow",
+            )
+            raw_replies = []
+
+        for idx, reply in enumerate(raw_replies):
+            reply_text = reply.get("text", "")
+            reply_username = reply.get("username", "unknown")
+
+            if not reply_text or _contains_blocked_words(reply_text):
+                continue
+
+            sanitised = sanitize_text(reply_text)
+            if not sanitised or sanitised.strip() == "":
+                continue
+
+            if len(reply_text) > max_comment_length:
+                continue
+            if len(reply_text) < min_comment_length:
+                continue
+
+            content["comments"].append(
+                {
+                    "comment_body": reply_text,
+                    "comment_url": "",
+                    "comment_id": re.sub(
+                        r"[^\w\s-]", "", f"gtrends_reply_{idx}"
+                    ),
+                    "comment_author": f"@{reply_username}",
+                }
+            )
+
+    print_substep(
+        f"Đã lấy nội dung từ Google Trends thành công! "
+        f"({len(content.get('comments', []))} replies)",
+        style="bold green",
+    )
+    return content
+
+
 def get_threads_posts(POST_ID: str = None) -> dict:
    """Lấy nội dung từ Threads để tạo video.

@ -449,9 +584,35 @@ def get_threads_posts(POST_ID: str = None) -> dict:
        )
        if content is not None:
            return content
-        # Fallback: nếu trending thất bại, tiếp tục dùng user threads
+        # Fallback: trending thất bại → thử Google Trends
+        print_substep(
+            "⚠️ Trending không khả dụng, thử lấy từ Google Trends...",
+            style="bold yellow",
+        )
+        content = _get_google_trends_content(
+            max_comment_length=max_comment_length,
+            min_comment_length=min_comment_length,
+        )
+        if content is not None:
+            return content
        print_substep(
-            "⚠️ Trending không khả dụng, chuyển sang lấy từ user threads...",
+            "⚠️ Google Trends cũng không khả dụng, chuyển sang user threads...",
+            style="bold yellow",
+        )
+
+    # ------------------------------------------------------------------
+    # Source: google_trends  –  Lấy bài viết dựa trên Google Trends
+    # ------------------------------------------------------------------
+    if source == "google_trends" and not POST_ID:
+        content = _get_google_trends_content(
+            max_comment_length=max_comment_length,
+            min_comment_length=min_comment_length,
+        )
+        if content is not None:
+            return content
+        # Fallback: Google Trends thất bại → tiếp tục dùng user threads
+        print_substep(
+            "⚠️ Google Trends không khả dụng, chuyển sang lấy từ user threads...",
            style="bold yellow",
        )

@ -467,13 +628,28 @@ def get_threads_posts(POST_ID: str = None) -> dict:
        threads_list = client.get_user_threads(user_id=target_user, limit=25)

        if not threads_list:
+            print_substep(
+                "⚠️ Không tìm thấy threads từ user API!\n"
+                f"   - User ID đang dùng: {target_user}\n"
+                "   Đang thử lấy bài viết từ Google Trends...",
+                style="bold yellow",
+            )
+            # Fallback cuối cùng: thử Google Trends khi user threads cũng thất bại
+            if source != "google_trends":  # Tránh gọi lại nếu đã thử
+                content = _get_google_trends_content(
+                    max_comment_length=max_comment_length,
+                    min_comment_length=min_comment_length,
+                )
+                if content is not None:
+                    return content
            print_substep(
                "❌ Không tìm thấy threads nào!\n"
                "   Kiểm tra các nguyên nhân sau:\n"
                f"   - User ID đang dùng: {target_user}\n"
                "   - User này có bài viết công khai không?\n"
                "   - Token có quyền threads_basic_read?\n"
-                "   - Token có đúng cho user_id này không?",
+                "   - Token có đúng cho user_id này không?\n"
+                "   - Google Trends fallback cũng không tìm thấy bài viết.",
                style="bold red",
            )
            raise ValueError(
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -5,7 +5,7 @@ access_token = { optional = false, nmin = 10, explanation = "Threads API access
 user_id = { optional = false, nmin = 1, explanation = "Threads user ID của bạn", example = "12345678" }

 [threads.thread]
-source = { optional = true, default = "user", options = ["user", "trending"], explanation = "Nguồn lấy bài viết: 'user' (từ user cụ thể) hoặc 'trending' (từ Trending now). Mặc định: user", example = "user" }
+source = { optional = true, default = "user", options = ["user", "trending", "google_trends"], explanation = "Nguồn lấy bài viết: 'user' (từ user cụ thể), 'trending' (từ Trending now), hoặc 'google_trends' (từ khóa Google Trends). Mặc định: user", example = "user" }
 target_user_id = { optional = true, default = "", explanation = "ID user muốn lấy threads. Để trống dùng user của bạn. Chỉ dùng khi source = 'user'.", example = "87654321" }
 post_id = { optional = true, default = "", explanation = "ID cụ thể của thread. Để trống để tự động chọn.", example = "18050000000000000" }
 keywords = { optional = true, default = "", type = "str", explanation = "Từ khóa lọc threads, phân cách bằng dấu phẩy.", example = "viral, trending, hài hước" }