# Source: RedditVideoMakerBot/threads/trending.py
"""
Threads Trending Scraper - Lấy bài viết từ mục "Trending now" trên Threads.
Threads API chính thức không cung cấp endpoint cho trending topics.
Module này sử dụng Playwright để scrape nội dung trending từ giao diện web Threads.
Flow:
1. Mở trang tìm kiếm Threads (https://www.threads.net/search)
2. Trích xuất trending topic links
3. Truy cập từng topic để lấy danh sách bài viết
4. Truy cập bài viết để lấy replies (nếu cần)
"""
import re
from typing import Dict, List
from playwright.sync_api import (
Page,
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
from utils.console import print_step, print_substep
# Entry point for scraping: Threads' search page, which lists "Trending now".
THREADS_SEARCH_URL = "https://www.threads.net/search"
# Maximum wait for a page navigation/load, in milliseconds.
_PAGE_LOAD_TIMEOUT_MS = 30_000
# Fixed pause after load so client-rendered content can appear, in milliseconds.
_CONTENT_WAIT_MS = 3_000
# Screens to scroll on a thread-detail page to lazy-load more replies.
_REPLY_SCROLL_ITERATIONS = 5
# Screens to scroll on a topic page to lazy-load more posts.
_TOPIC_SCROLL_ITERATIONS = 2
# Shared browser context settings
_BROWSER_VIEWPORT = {"width": 1280, "height": 900}
# Desktop Chrome user agent; presumably chosen to look like a regular
# browser session — TODO confirm it is still required.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
# Vietnamese locale — the UI-label skip lists in this module ("Trả lời",
# "Thích", ...) match the Vietnamese interface this locale produces.
_BROWSER_LOCALE = "vi-VN"
class TrendingScrapeError(Exception):
    """Raised when trending content cannot be scraped from Threads."""
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _extract_topic_links(page: Page, limit: int) -> List[Dict[str, str]]:
    """Collect trending-topic links from the search page DOM.

    Scans every anchor whose href contains ``/search?q=`` and uses the
    first non-blank line of its text as the topic title, stopping once
    ``limit`` entries have been gathered.

    Returns:
        List of ``{"title": ..., "url": ...}`` dicts (url is absolute).
    """
    collected: List[Dict[str, str]] = []
    for anchor in page.query_selector_all('a[href*="/search?q="]'):
        if len(collected) >= limit:
            break
        try:
            href = anchor.get_attribute("href") or ""
            label = anchor.inner_text().strip()
            if not label or not href:
                continue
            # First non-blank line of the anchor text is the topic title.
            title = next(
                (part.strip() for part in label.split("\n") if part.strip()), ""
            )
            if not title:
                continue
            full_url = (
                f"https://www.threads.net{href}" if href.startswith("/") else href
            )
            collected.append({"title": title, "url": full_url})
        except Exception:
            # Element may have gone stale mid-read — skip it.
            continue
    return collected
def _extract_post_links(page: Page, limit: int) -> List[Dict[str, str]]:
    """Collect unique post entries from anchors pointing at ``/post/`` URLs.

    De-duplicates by shortcode, derives the author from the URL path, and
    skips posts whose extracted text is shorter than 10 characters.

    Returns:
        List of ``{text, username, permalink, shortcode}`` dicts.
    """
    results: List[Dict[str, str]] = []
    seen: set = set()
    for anchor in page.query_selector_all('a[href*="/post/"]'):
        if len(results) >= limit:
            break
        try:
            href = anchor.get_attribute("href") or ""
            code_match = re.search(r"/post/([A-Za-z0-9_-]+)", href)
            if code_match is None:
                continue
            code = code_match.group(1)
            if code in seen:
                continue
            seen.add(code)
            # Author handle is embedded in the URL: /@username/post/...
            author_match = re.search(r"/@([^/]+)/post/", href)
            author = author_match.group(1) if author_match else "unknown"
            # Climb the DOM to find the surrounding post container's text.
            body = _get_post_text(anchor)
            if not body or len(body) < 10:
                continue  # too short to be a usable post
            link = (
                f"https://www.threads.net{href}" if href.startswith("/") else href
            )
            results.append(
                {
                    "text": body,
                    "username": author,
                    "permalink": link,
                    "shortcode": code,
                }
            )
        except Exception:
            continue
    return results
def _get_post_text(link_handle) -> str:
    """Climb the DOM from a link element and return the post's cleaned text.

    Walks up at most 10 ancestors looking for an article-like container,
    then strips UI chrome (buttons, timestamps, bare @handles) from its
    inner text. Returns an empty string on any failure.
    """
    try:
        container = link_handle.evaluate_handle(
            """el => {
            let node = el;
            for (let i = 0; i < 10; i++) {
                node = node.parentElement;
                if (!node) return el.parentElement || el;
                const text = node.innerText || '';
                if (text.length > 30 && (
                    node.getAttribute('role') === 'article' ||
                    node.tagName === 'ARTICLE' ||
                    node.dataset && node.dataset.testid
                )) {
                    return node;
                }
            }
            return el.parentElement ? el.parentElement.parentElement || el.parentElement : el;
            }"""
        )
        raw = container.inner_text().strip() if container else ""
    except Exception:
        return ""
    if not raw:
        return ""
    # UI labels (Vietnamese interface) that are metadata, not post content.
    _skip = {"Trả lời", "Thích", "Chia sẻ", "Repost", "Quote", "...", ""}

    def _is_content(candidate: str) -> bool:
        """Keep only lines that look like real post text."""
        if len(candidate) < 3 or candidate in _skip:
            return False
        # Drop standalone @username lines (short, no spaces).
        if candidate.startswith("@") and " " not in candidate and len(candidate) < 30:
            return False
        return True

    kept = [
        stripped
        for stripped in (piece.strip() for piece in raw.split("\n"))
        if stripped and _is_content(stripped)
    ]
    return "\n".join(kept)
def _extract_replies(page: Page, limit: int) -> List[Dict[str, str]]:
    """Collect reply entries from an open thread-detail page.

    Scrolls a few screens first so lazy-loaded replies enter the DOM,
    then reads every article element after the first (the main post).

    Returns:
        List of ``{"text": ..., "username": ...}`` dicts.
    """
    gathered: List[Dict[str, str]] = []
    # Scroll to load more replies
    for _ in range(_REPLY_SCROLL_ITERATIONS):
        page.evaluate("window.scrollBy(0, window.innerHeight)")
        page.wait_for_timeout(1000)
    # UI labels (Vietnamese interface) to strip from reply text.
    _skip = {"Trả lời", "Thích", "Chia sẻ", "Repost", "...", ""}
    nodes = page.query_selector_all('div[role="article"], article')
    for position, node in enumerate(nodes):
        if position == 0:
            continue  # Skip main post
        if len(gathered) >= limit:
            break
        try:
            body = node.inner_text().strip()
            if not body or len(body) < 5:
                continue
            # Author handle from the first profile link inside the reply.
            author = "unknown"
            profile = node.query_selector('a[href^="/@"]')
            if profile:
                profile_href = profile.get_attribute("href") or ""
                handle = re.match(r"/@([^/]+)", profile_href)
                if handle:
                    author = handle.group(1)
            # Drop blank, tiny, and UI-label lines.
            kept = [
                piece.strip()
                for piece in body.split("\n")
                if piece.strip()
                and len(piece.strip()) > 3
                and piece.strip() not in _skip
            ]
            cleaned = "\n".join(kept)
            if cleaned:
                gathered.append({"text": cleaned, "username": author})
        except Exception:
            continue
    return gathered
def _create_browser_context(playwright):
    """Launch a headless Chromium browser and a context with module defaults.

    Returns:
        ``(browser, context)`` tuple; the caller is responsible for
        closing the browser.
    """
    chromium = playwright.chromium.launch(headless=True)
    ctx = chromium.new_context(
        locale=_BROWSER_LOCALE,
        user_agent=_BROWSER_USER_AGENT,
        viewport=_BROWSER_VIEWPORT,
    )
    return chromium, ctx
def _scroll_page(page: Page, times: int = _TOPIC_SCROLL_ITERATIONS) -> None:
    """Scroll down ``times`` screens, pausing 1s each so lazy content loads."""
    remaining = times
    while remaining > 0:
        page.evaluate("window.scrollBy(0, window.innerHeight)")
        page.wait_for_timeout(1000)
        remaining -= 1
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def _scrape_topic_posts(
    page: Page, topic: Dict[str, str], max_threads: int
) -> List[Dict[str, str]]:
    """Visit one trending-topic page and extract its posts.

    Navigates the shared ``page`` to ``topic["url"]``, scrolls to trigger
    lazy loading, and tags every extracted post with the topic title.
    Playwright exceptions (timeouts included) propagate to the caller,
    which decides how to report per-topic failures.
    """
    page.goto(topic["url"], timeout=_PAGE_LOAD_TIMEOUT_MS)
    page.wait_for_load_state("domcontentloaded", timeout=_PAGE_LOAD_TIMEOUT_MS)
    page.wait_for_timeout(_CONTENT_WAIT_MS)
    # Use the module constant instead of a hard-coded scroll count.
    _scroll_page(page, times=_TOPIC_SCROLL_ITERATIONS)
    threads = _extract_post_links(page, limit=max_threads)
    for thread in threads:
        thread["topic_title"] = topic["title"]
    return threads


def get_trending_threads(
    max_topics: int = 5,
    max_threads_per_topic: int = 10,
) -> List[Dict[str, str]]:
    """Fetch posts from the "Trending now" topics on Threads.

    Opens a single Playwright session, discovers trending topics on the
    search page, and extracts posts from each topic in turn.

    Args:
        max_topics: Maximum number of trending topics to visit.
        max_threads_per_topic: Maximum number of posts per topic.

    Returns:
        List of thread dicts: ``{text, username, permalink, shortcode, topic_title}``.

    Raises:
        TrendingScrapeError: If the trending page cannot be scraped at all.
            Failures on individual topics are logged and skipped instead.
    """
    print_step("🔥 Đang lấy bài viết từ Trending now trên Threads...")
    all_threads: List[Dict[str, str]] = []
    with sync_playwright() as p:
        browser, context = _create_browser_context(p)
        page = context.new_page()
        try:
            # Step 1: Navigate to the search page (lists trending topics).
            page.goto(THREADS_SEARCH_URL, timeout=_PAGE_LOAD_TIMEOUT_MS)
            page.wait_for_load_state(
                "domcontentloaded", timeout=_PAGE_LOAD_TIMEOUT_MS
            )
            page.wait_for_timeout(_CONTENT_WAIT_MS)
            # Step 2: Extract trending topics
            topics = _extract_topic_links(page, limit=max_topics)
            if not topics:
                raise TrendingScrapeError(
                    "Không tìm thấy trending topics trên Threads. "
                    "Có thể Threads đã thay đổi giao diện hoặc yêu cầu đăng nhập."
                )
            topic_names = ", ".join(t["title"][:30] for t in topics[:3])
            suffix = "..." if len(topics) > 3 else ""
            print_substep(
                f"🔥 Tìm thấy {len(topics)} trending topics: {topic_names}{suffix}",
                style="bold blue",
            )
            # Step 3: Visit each topic; one failing topic must not abort
            # the rest of the run.
            for topic in topics:
                try:
                    threads = _scrape_topic_posts(
                        page, topic, max_threads_per_topic
                    )
                    all_threads.extend(threads)
                    print_substep(
                        f" 📝 Topic '{topic['title'][:30]}': "
                        f"{len(threads)} bài viết",
                        style="bold blue",
                    )
                except PlaywrightTimeoutError:
                    print_substep(
                        f" ⚠️ Timeout topic '{topic['title'][:30]}'",
                        style="bold yellow",
                    )
                except Exception as exc:
                    print_substep(
                        f" ⚠️ Lỗi topic '{topic['title'][:30]}': {exc}",
                        style="bold yellow",
                    )
        except TrendingScrapeError:
            raise
        except PlaywrightTimeoutError as exc:
            raise TrendingScrapeError(
                "Timeout khi tải trang Threads. Kiểm tra kết nối mạng."
            ) from exc
        except Exception as exc:
            raise TrendingScrapeError(
                f"Lỗi khi scrape trending: {exc}"
            ) from exc
        finally:
            # Always release the browser, even when re-raising.
            browser.close()
    print_substep(
        f"✅ Tổng cộng {len(all_threads)} bài viết từ trending",
        style="bold green",
    )
    return all_threads
def scrape_thread_replies(
    thread_url: str, limit: int = 50
) -> List[Dict[str, str]]:
    """Scrape the replies of a single thread from its public web page.

    Used when the official Threads API cannot be applied (for example,
    the thread does not belong to the authenticated user).

    Args:
        thread_url: URL of the thread on Threads.
        limit: Maximum number of replies to collect.

    Returns:
        List of reply dicts: ``{text, username}``; empty on failure.
    """
    print_substep(f"💬 Đang lấy replies từ: {thread_url[:60]}...")
    collected: List[Dict[str, str]] = []
    with sync_playwright() as pw:
        browser, context = _create_browser_context(pw)
        tab = context.new_page()
        try:
            tab.goto(thread_url, timeout=_PAGE_LOAD_TIMEOUT_MS)
            tab.wait_for_load_state(
                "domcontentloaded", timeout=_PAGE_LOAD_TIMEOUT_MS
            )
            tab.wait_for_timeout(_CONTENT_WAIT_MS)
            collected = _extract_replies(tab, limit=limit)
        except PlaywrightTimeoutError:
            # Best-effort: report the timeout and fall through with [].
            print_substep(
                "⚠️ Timeout khi tải thread", style="bold yellow"
            )
        except Exception as exc:
            print_substep(
                f"⚠️ Lỗi lấy replies: {exc}", style="bold yellow"
            )
        finally:
            browser.close()
    print_substep(f"💬 Đã lấy {len(collected)} replies", style="bold blue")
    return collected