- Web scraper (platforms/threads/scraper.py) with div-based card parsing - Multi-source discovery: For You feed + configurable search queries - Engagement filtering (min_engagement) and post age filter (max_post_age) - Shared Playwright auth module (platforms/threads/auth.py) - Migrated ffmpeg-python to av (PyAV) for in-process media probing - Video composition uses subprocess ffmpeg (av filter graph segfault workaround) - Updated CLAUDE.md with Threads scraping and macOS-specific notes Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>pull/2551/head
parent
89b1c0d0e5
commit
5e183d8e2c
@ -0,0 +1,95 @@
|
||||
"""Shared Playwright authentication for Threads.net.
|
||||
|
||||
Used by both the screenshotter (screenshot.py) and the web scraper (scraper.py).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import Browser, BrowserContext, Page, ViewportSize
|
||||
|
||||
from utils import settings
|
||||
from utils.console import print_substep
|
||||
|
||||
THREADS_LOGIN_URL = "https://www.threads.net/login"
|
||||
THREADS_COOKIE_FILE = "./video_creation/data/cookie-threads.json"
|
||||
DEFAULT_USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
|
||||
def login_to_threads(page: Page, _context: BrowserContext) -> None:
|
||||
"""Log into threads.net via Instagram credentials and persist session cookies."""
|
||||
username = settings.config["threads"]["creds"].get("username", "").strip()
|
||||
password = settings.config["threads"]["creds"].get("password", "").strip()
|
||||
|
||||
if not username or not password:
|
||||
raise RuntimeError(
|
||||
"Threads login requires credentials. "
|
||||
"Set threads.creds.username and threads.creds.password in config.toml"
|
||||
)
|
||||
|
||||
print_substep("Logging into Threads (via Instagram)...")
|
||||
page.goto(THREADS_LOGIN_URL, timeout=0)
|
||||
page.wait_for_load_state("networkidle")
|
||||
|
||||
page.locator('input[autocomplete="username"]').fill(username)
|
||||
page.locator('input[autocomplete="current-password"]').fill(password)
|
||||
page.get_by_role("button", name="Log in", exact=True).first.click()
|
||||
|
||||
page.wait_for_timeout(6000)
|
||||
|
||||
cookies = _context.cookies()
|
||||
Path(THREADS_COOKIE_FILE).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(THREADS_COOKIE_FILE, "w") as f:
|
||||
json.dump(cookies, f)
|
||||
|
||||
print_substep("Logged into Threads and saved session cookies.", style="bold green")
|
||||
|
||||
|
||||
def ensure_authenticated_context(browser: Browser, **kwargs) -> BrowserContext:
|
||||
"""Create a Playwright browser context with Threads session cookies loaded.
|
||||
|
||||
Loads saved cookies from cookie-threads.json. If no valid session exists,
|
||||
performs a fresh login and persists the cookies.
|
||||
|
||||
Keyword arguments override defaults for locale, viewport, device_scale_factor,
|
||||
color_scheme, and user_agent.
|
||||
"""
|
||||
theme = settings.config["settings"]["theme"]
|
||||
W = int(settings.config["settings"]["resolution_w"])
|
||||
H = int(settings.config["settings"]["resolution_h"])
|
||||
dsf = (W // 600) + 1
|
||||
|
||||
defaults = {
|
||||
"locale": "en-US",
|
||||
"color_scheme": "dark" if theme == "dark" else "light",
|
||||
"viewport": ViewportSize(width=W, height=H),
|
||||
"device_scale_factor": dsf,
|
||||
"user_agent": DEFAULT_USER_AGENT,
|
||||
}
|
||||
defaults.update(kwargs)
|
||||
|
||||
context = browser.new_context(**defaults)
|
||||
|
||||
cookie_path = Path(THREADS_COOKIE_FILE)
|
||||
if cookie_path.exists():
|
||||
try:
|
||||
with open(cookie_path, encoding="utf-8") as f:
|
||||
saved_cookies = json.load(f)
|
||||
context.add_cookies(saved_cookies)
|
||||
print_substep("Loaded saved Threads session cookies.")
|
||||
except (json.JSONDecodeError, IOError):
|
||||
print_substep("Saved cookies corrupted. Logging in fresh...")
|
||||
page = context.new_page()
|
||||
login_to_threads(page, context)
|
||||
page.close()
|
||||
else:
|
||||
print_substep("No saved cookies found. Logging in...")
|
||||
page = context.new_page()
|
||||
login_to_threads(page, context)
|
||||
page.close()
|
||||
|
||||
return context
|
||||
@ -0,0 +1,587 @@
|
||||
"""Web scraping-based trending post discovery for Threads.net.
|
||||
|
||||
Bypasses the Meta Graph API (which only accesses your own posts) by using Playwright
|
||||
to scrape threads.net directly — the "For You" feed, post pages, and replies.
|
||||
Returns the standard content_object dict consumed by the rest of the pipeline.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from playwright.sync_api import BrowserContext, Locator, sync_playwright
|
||||
|
||||
from platforms.threads.auth import ensure_authenticated_context
|
||||
from utils import settings
|
||||
from utils.console import print_step, print_substep
|
||||
from utils.voice import sanitize_text
|
||||
from utils.videos import check_done_by_id
|
||||
|
||||
FEED_URL = "https://www.threads.net"
|
||||
SCROLL_DELAY_MS = 2000
|
||||
MAX_FEED_SCROLLS = 36
|
||||
POST_LINK_SELECTOR = 'a[href*="/post/"]'
|
||||
CARD_XPATH = 'xpath=ancestor::div[contains(@class, "x1a2a7pz")][1]'
|
||||
|
||||
|
||||
def _post_id_from_url(url: str) -> str:
|
||||
return url.rstrip("/").split("/")[-1]
|
||||
|
||||
|
||||
def _to_absolute_url(href: str) -> str:
|
||||
if href.startswith("http"):
|
||||
return href
|
||||
return "https://www.threads.net" + href
|
||||
|
||||
|
||||
def _parse_abbreviated_number(s: str) -> int:
|
||||
"""Parse abbreviated numbers like '1K', '2.5M' into integers."""
|
||||
s = s.strip().upper().replace(",", "")
|
||||
if not s:
|
||||
return 0
|
||||
multipliers = {"K": 1_000, "M": 1_000_000}
|
||||
if s[-1] in multipliers:
|
||||
try:
|
||||
return int(float(s[:-1]) * multipliers[s[-1]])
|
||||
except ValueError:
|
||||
return 0
|
||||
try:
|
||||
return int(s)
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
|
||||
def _parse_card_text(text: str) -> dict:
|
||||
"""Parse a Threads card's raw text into structured data.
|
||||
|
||||
Threads card format:
|
||||
line 0: username
|
||||
line 1: timestamp (e.g. "14h", "1d")
|
||||
lines 2..N: post body text
|
||||
last 1-4 lines: engagement metrics (likes, replies, reposts, quotes)
|
||||
|
||||
Returns dict with keys: username, timestamp, body, likes, replies, reposts
|
||||
"""
|
||||
if not text:
|
||||
return {"username": "", "timestamp": "", "body": "", "likes": 0, "replies": 0, "reposts": 0}
|
||||
|
||||
lines = text.strip().split("\n")
|
||||
if len(lines) < 3:
|
||||
return {"username": "", "timestamp": "", "body": text, "likes": 0, "replies": 0, "reposts": 0}
|
||||
|
||||
username = lines[0].strip()
|
||||
timestamp = lines[1].strip()
|
||||
|
||||
# Find where engagement metrics start (trailing numeric/abbreviated lines)
|
||||
metric_start = len(lines)
|
||||
for i in range(len(lines) - 1, 1, -1):
|
||||
line = lines[i].strip()
|
||||
if re.match(r'^[\d,.]+[KkMm]?$', line):
|
||||
metric_start = i
|
||||
else:
|
||||
break
|
||||
|
||||
# Body is everything between timestamp and metrics
|
||||
body_lines = lines[2:metric_start]
|
||||
body = "\n".join(body_lines).strip()
|
||||
|
||||
# Parse engagement metrics from the end
|
||||
metrics = lines[metric_start:]
|
||||
likes = 0
|
||||
replies_count = 0
|
||||
reposts = 0
|
||||
|
||||
if len(metrics) >= 1:
|
||||
likes = _parse_abbreviated_number(metrics[0])
|
||||
if len(metrics) >= 2:
|
||||
replies_count = _parse_abbreviated_number(metrics[1])
|
||||
if len(metrics) >= 3:
|
||||
reposts = _parse_abbreviated_number(metrics[2])
|
||||
|
||||
return {
|
||||
"username": username,
|
||||
"timestamp": timestamp,
|
||||
"body": body,
|
||||
"likes": likes,
|
||||
"replies": replies_count,
|
||||
"reposts": reposts,
|
||||
}
|
||||
|
||||
|
||||
def _extract_text_from_card(link: Locator) -> str:
|
||||
"""Walk up from a post link to the card container and extract its raw text."""
|
||||
try:
|
||||
card = link.locator(CARD_XPATH)
|
||||
if card.count():
|
||||
return card.first.inner_text(timeout=3000).strip()
|
||||
except Exception:
|
||||
pass
|
||||
return ""
|
||||
|
||||
|
||||
# --- Feed scraping ---
|
||||
|
||||
|
||||
def _scrape_feed_posts(context: BrowserContext, max_scrolls: int = MAX_FEED_SCROLLS) -> list[dict]:
|
||||
"""Navigate to threads.net feed, scroll, extract post metadata with engagement metrics."""
|
||||
print_step("Scraping Threads trending feed...")
|
||||
page = context.new_page()
|
||||
posts: list[dict] = []
|
||||
seen_ids: set[str] = set()
|
||||
|
||||
try:
|
||||
page.goto(FEED_URL, timeout=0)
|
||||
page.wait_for_timeout(4000)
|
||||
|
||||
last_height = 0
|
||||
|
||||
for i in range(max_scrolls):
|
||||
links = page.locator(POST_LINK_SELECTOR).all()
|
||||
new_found = 0
|
||||
|
||||
for link in links:
|
||||
href = link.get_attribute("href")
|
||||
if not href:
|
||||
continue
|
||||
post_id = _post_id_from_url(href)
|
||||
if post_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(post_id)
|
||||
|
||||
raw_text = _extract_text_from_card(link)
|
||||
parsed = _parse_card_text(raw_text)
|
||||
|
||||
posts.append({
|
||||
"url": _to_absolute_url(href),
|
||||
"text": raw_text,
|
||||
"body": parsed["body"],
|
||||
"username": parsed["username"],
|
||||
"timestamp": parsed["timestamp"],
|
||||
"likes": parsed["likes"],
|
||||
"replies_shown": parsed["replies"],
|
||||
"reposts": parsed["reposts"],
|
||||
"post_id": post_id,
|
||||
})
|
||||
new_found += 1
|
||||
|
||||
if new_found > 0:
|
||||
top = posts[-1]
|
||||
print_substep(
|
||||
f"Scroll {i + 1}: +{new_found} posts | top: "
|
||||
f"♥{top['likes']:,} 💬{top['replies_shown']} 🔁{top['reposts']} "
|
||||
f"'{top['body'][:50]}...'",
|
||||
style="dim",
|
||||
)
|
||||
|
||||
if new_found == 0 and i > 5:
|
||||
break
|
||||
|
||||
page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
|
||||
page.wait_for_timeout(SCROLL_DELAY_MS)
|
||||
|
||||
new_height = page.evaluate("document.body.scrollHeight")
|
||||
if new_height == last_height:
|
||||
break
|
||||
last_height = new_height
|
||||
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
print_substep(f"Scraped {len(posts)} posts from feed.", style="bold green")
|
||||
return posts
|
||||
|
||||
|
||||
def _scrape_search_page(context: BrowserContext, query: str, max_scrolls: int = 5) -> list[dict]:
|
||||
"""Search Threads for a query and scrape the results.
|
||||
|
||||
Uses the same card extraction as the main feed.
|
||||
"""
|
||||
print_step(f"Scraping Threads search: '{query}'...")
|
||||
page = context.new_page()
|
||||
posts: list[dict] = []
|
||||
seen_ids: set[str] = set()
|
||||
search_url = f"https://www.threads.net/search?q={query}&serp_type=tags"
|
||||
|
||||
try:
|
||||
page.goto(search_url, timeout=0)
|
||||
page.wait_for_timeout(4000)
|
||||
|
||||
for i in range(max_scrolls):
|
||||
links = page.locator(POST_LINK_SELECTOR).all()
|
||||
new_found = 0
|
||||
|
||||
for link in links:
|
||||
href = link.get_attribute("href")
|
||||
if not href:
|
||||
continue
|
||||
post_id = _post_id_from_url(href)
|
||||
if post_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(post_id)
|
||||
|
||||
raw_text = _extract_text_from_card(link)
|
||||
parsed = _parse_card_text(raw_text)
|
||||
|
||||
posts.append({
|
||||
"url": _to_absolute_url(href),
|
||||
"text": raw_text,
|
||||
"body": parsed["body"],
|
||||
"username": parsed["username"],
|
||||
"timestamp": parsed["timestamp"],
|
||||
"likes": parsed["likes"],
|
||||
"replies_shown": parsed["replies"],
|
||||
"reposts": parsed["reposts"],
|
||||
"post_id": post_id,
|
||||
})
|
||||
new_found += 1
|
||||
|
||||
if new_found == 0:
|
||||
break
|
||||
|
||||
page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
|
||||
page.wait_for_timeout(SCROLL_DELAY_MS)
|
||||
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
print_substep(f"Search '{query}': {len(posts)} posts.", style="dim")
|
||||
return posts
|
||||
|
||||
|
||||
# --- Candidate filtering ---
|
||||
|
||||
|
||||
def _parse_timestamp_to_hours(ts: str) -> float | None:
|
||||
"""Convert a Threads timestamp like '14h', '1d', '3d' to hours.
|
||||
|
||||
Returns None if the format is unrecognized.
|
||||
"""
|
||||
if not ts:
|
||||
return None
|
||||
ts = ts.strip().lower()
|
||||
if ts.endswith("h"):
|
||||
try:
|
||||
return float(ts[:-1])
|
||||
except ValueError:
|
||||
return None
|
||||
elif ts.endswith("d"):
|
||||
try:
|
||||
return float(ts[:-1]) * 24
|
||||
except ValueError:
|
||||
return None
|
||||
elif ts.endswith("w"):
|
||||
try:
|
||||
return float(ts[:-1]) * 24 * 7
|
||||
except ValueError:
|
||||
return None
|
||||
elif ts.endswith("m") and not ts.endswith("min"):
|
||||
try:
|
||||
return float(ts[:-1]) * 24 * 30
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _age_from_config() -> float | None:
|
||||
"""Parse max_post_age config value into hours. Returns None if disabled."""
|
||||
raw = settings.config["threads"]["thread"].get("max_post_age", "")
|
||||
if not raw:
|
||||
return None
|
||||
return _parse_timestamp_to_hours(raw)
|
||||
|
||||
|
||||
def _contains_blocked(text: str, blocked_raw: str) -> bool:
|
||||
if not blocked_raw:
|
||||
return False
|
||||
blocked = [w.strip().lower() for w in blocked_raw.split(",") if w.strip()]
|
||||
text_lower = text.lower()
|
||||
return any(word in text_lower for word in blocked)
|
||||
|
||||
|
||||
def _filter_candidates(posts: list[dict]) -> list[dict]:
|
||||
"""Filter feed posts by engagement, blocked words, and duplicates.
|
||||
|
||||
Sorts by total engagement (likes + replies) descending so the most
|
||||
viral posts are tried first.
|
||||
"""
|
||||
t_config = settings.config["threads"]["thread"]
|
||||
blocked_raw = t_config.get("blocked_words", "")
|
||||
min_engagement = int(t_config.get("min_engagement", 0))
|
||||
|
||||
max_age_hours = _age_from_config()
|
||||
|
||||
candidates = []
|
||||
for post in posts:
|
||||
if check_done_by_id(post["post_id"]):
|
||||
continue
|
||||
if _contains_blocked(post["body"], blocked_raw):
|
||||
continue
|
||||
if not post["body"] or len(post["body"].strip()) < 10:
|
||||
continue
|
||||
# Age filter
|
||||
if max_age_hours is not None:
|
||||
post_hours = _parse_timestamp_to_hours(post.get("timestamp", ""))
|
||||
if post_hours is not None and post_hours > max_age_hours:
|
||||
continue
|
||||
total_engagement = post.get("likes", 0) + post.get("reposts", 0)
|
||||
if total_engagement < min_engagement:
|
||||
continue
|
||||
post["_total_engagement"] = total_engagement
|
||||
candidates.append(post)
|
||||
|
||||
# Sort by engagement descending — most viral first
|
||||
candidates.sort(key=lambda p: p.get("_total_engagement", 0), reverse=True)
|
||||
|
||||
age_str = f", max age ≤{max_age_hours}h" if max_age_hours else ""
|
||||
if min_engagement > 0:
|
||||
print_substep(
|
||||
f"Filtered {len(posts)} posts -> {len(candidates)} viral candidates "
|
||||
f"(min ♥+🔁 ≥ {min_engagement:,}{age_str})",
|
||||
style="dim",
|
||||
)
|
||||
else:
|
||||
print_substep(
|
||||
f"Filtered {len(posts)} posts -> {len(candidates)} candidates"
|
||||
f"{' (max age ≤' + str(max_age_hours) + 'h)' if max_age_hours else ''}",
|
||||
style="dim",
|
||||
)
|
||||
return candidates
|
||||
|
||||
|
||||
# --- Reply scraping on post pages ---
|
||||
|
||||
|
||||
def _scrape_post_replies(context: BrowserContext, post_url: str, max_replies: int = 100) -> list[dict]:
|
||||
"""Navigate to a post page, scroll to load replies, extract reply data.
|
||||
|
||||
Uses _parse_card_text to separate reply body from metadata (username, timestamp, etc.).
|
||||
"""
|
||||
page = context.new_page()
|
||||
replies: list[dict] = []
|
||||
seen_ids: set[str] = set()
|
||||
main_post_id = _post_id_from_url(post_url)
|
||||
|
||||
try:
|
||||
page.goto(post_url, timeout=0)
|
||||
page.wait_for_timeout(4000)
|
||||
|
||||
stable_count = 0
|
||||
last_count = 0
|
||||
|
||||
for _ in range(15):
|
||||
links = page.locator(POST_LINK_SELECTOR).all()
|
||||
|
||||
for link in links:
|
||||
href = link.get_attribute("href")
|
||||
if not href:
|
||||
continue
|
||||
reply_id = _post_id_from_url(href)
|
||||
if reply_id == main_post_id:
|
||||
continue
|
||||
if reply_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(reply_id)
|
||||
|
||||
raw_text = _extract_text_from_card(link)
|
||||
if not raw_text:
|
||||
continue
|
||||
|
||||
parsed = _parse_card_text(raw_text)
|
||||
cleaned_body = parsed["body"]
|
||||
|
||||
replies.append({
|
||||
"comment_body": cleaned_body,
|
||||
"comment_url": _to_absolute_url(href),
|
||||
"comment_id": reply_id,
|
||||
})
|
||||
|
||||
if len(replies) >= max_replies:
|
||||
break
|
||||
|
||||
if len(replies) >= max_replies:
|
||||
break
|
||||
|
||||
if len(replies) == last_count:
|
||||
stable_count += 1
|
||||
if stable_count >= 3:
|
||||
break
|
||||
else:
|
||||
stable_count = 0
|
||||
last_count = len(replies)
|
||||
|
||||
page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
|
||||
page.wait_for_timeout(1500)
|
||||
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
return replies
|
||||
|
||||
|
||||
def _scrape_main_post_text(context: BrowserContext, post_url: str) -> str:
|
||||
"""Extract and clean the main post text from a post page."""
|
||||
page = context.new_page()
|
||||
try:
|
||||
page.goto(post_url, timeout=0)
|
||||
page.wait_for_timeout(3000)
|
||||
|
||||
links = page.locator(POST_LINK_SELECTOR).all()
|
||||
for link in links:
|
||||
href = link.get_attribute("href")
|
||||
if href and _post_id_from_url(href) == _post_id_from_url(post_url):
|
||||
raw = _extract_text_from_card(link)
|
||||
if raw:
|
||||
parsed = _parse_card_text(raw)
|
||||
return parsed["body"] or raw
|
||||
return ""
|
||||
finally:
|
||||
page.close()
|
||||
|
||||
|
||||
# --- Content object builder ---
|
||||
|
||||
|
||||
def _build_content_object(post: dict, replies: list[dict]) -> dict:
|
||||
"""Build the standard content_object from scraped post + replies.
|
||||
|
||||
Uses cleaned body text for title and comment bodies.
|
||||
"""
|
||||
t_config = settings.config["threads"]["thread"]
|
||||
max_len = int(t_config["max_reply_length"])
|
||||
min_len = int(t_config["min_reply_length"])
|
||||
blocked_raw = t_config.get("blocked_words", "")
|
||||
|
||||
storymode = settings.config["settings"].get("storymode", False)
|
||||
|
||||
# Use cleaned body text for the title, fall back to raw text
|
||||
title = post.get("body") or post.get("text") or ""
|
||||
|
||||
content: dict = {
|
||||
"thread_id": post["post_id"],
|
||||
"thread_title": title[:280],
|
||||
"thread_url": post["url"],
|
||||
"is_nsfw": False,
|
||||
"thread_category": "threads",
|
||||
"comments": [],
|
||||
}
|
||||
|
||||
if storymode:
|
||||
content["thread_post"] = title
|
||||
print_substep("Storymode: using post text as thread_post.", style="dim")
|
||||
return content
|
||||
|
||||
for reply in replies:
|
||||
body = reply.get("comment_body", "").strip()
|
||||
if not body:
|
||||
continue
|
||||
if _contains_blocked(body, blocked_raw):
|
||||
continue
|
||||
if not (min_len <= len(body) <= max_len):
|
||||
continue
|
||||
sanitised = sanitize_text(body)
|
||||
if not sanitised:
|
||||
continue
|
||||
|
||||
content["comments"].append({
|
||||
"comment_body": body,
|
||||
"comment_url": reply["comment_url"],
|
||||
"comment_id": reply["comment_id"],
|
||||
})
|
||||
|
||||
return content
|
||||
|
||||
|
||||
# --- Main entry point ---
|
||||
|
||||
|
||||
def get_trending_threads_content(POST_ID: Optional[str] = None) -> dict:
|
||||
"""Discover trending Threads posts via web scraping and return a content_object."""
|
||||
print_step("Discovering trending Threads content via web scraping...")
|
||||
|
||||
min_replies = int(settings.config["threads"]["thread"]["min_replies"])
|
||||
min_engagement = int(settings.config["threads"]["thread"].get("min_engagement", 0))
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=True)
|
||||
try:
|
||||
context = ensure_authenticated_context(browser)
|
||||
|
||||
if POST_ID:
|
||||
post_url = f"https://www.threads.net/t/{POST_ID}"
|
||||
post = {"url": post_url, "post_id": POST_ID, "text": "", "body": ""}
|
||||
replies = _scrape_post_replies(context, post_url)
|
||||
content = _build_content_object(post, replies)
|
||||
if content["comments"] or content.get("thread_post"):
|
||||
return content
|
||||
raise RuntimeError(
|
||||
f"No replies found for post {POST_ID}. "
|
||||
f"Minimum required: {min_replies}."
|
||||
)
|
||||
|
||||
# Scrape from multiple sources: main feed + trending search queries
|
||||
posts = _scrape_feed_posts(context)
|
||||
# Also search for popular topics to find high-engagement content
|
||||
trending_queries = settings.config["threads"]["thread"].get(
|
||||
"search_queries", "news,politics,trending"
|
||||
)
|
||||
for query in trending_queries.split(","):
|
||||
query = query.strip()
|
||||
if query:
|
||||
try:
|
||||
search_posts = _scrape_search_page(context, query)
|
||||
# Merge avoiding duplicates
|
||||
existing_ids = {p["post_id"] for p in posts}
|
||||
for sp in search_posts:
|
||||
if sp["post_id"] not in existing_ids:
|
||||
posts.append(sp)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not posts:
|
||||
raise RuntimeError("No posts found in feed. Try again later.")
|
||||
|
||||
candidates = _filter_candidates(posts)
|
||||
if not candidates:
|
||||
raise RuntimeError(
|
||||
f"No eligible posts in feed after filtering. "
|
||||
f"Try lowering min_engagement (currently {min_engagement:,}) "
|
||||
f"or min_replies (currently {min_replies})."
|
||||
)
|
||||
|
||||
for i, candidate in enumerate(candidates):
|
||||
eng = candidate.get("_total_engagement", 0)
|
||||
print_substep(
|
||||
f"Trying #{i + 1}: ♥{candidate['likes']:,} "
|
||||
f"💬{candidate['replies_shown']} "
|
||||
f"'{candidate['body'][:60]}...'",
|
||||
style="dim",
|
||||
)
|
||||
try:
|
||||
replies = _scrape_post_replies(context, candidate["url"])
|
||||
if len(replies) >= min_replies:
|
||||
if not candidate.get("body") or len(candidate.get("body", "")) < 50:
|
||||
full_text = _scrape_main_post_text(context, candidate["url"])
|
||||
if full_text:
|
||||
candidate["body"] = full_text
|
||||
content = _build_content_object(candidate, replies)
|
||||
title_preview = content["thread_title"][:60]
|
||||
print_substep(
|
||||
f"Selected: '{title_preview}...' "
|
||||
f"♥{candidate['likes']:,} 💬{len(content['comments'])} replies",
|
||||
style="bold green",
|
||||
)
|
||||
return content
|
||||
print_substep(
|
||||
f" Only {len(replies)} replies (need {min_replies}). Trying next...",
|
||||
style="yellow",
|
||||
)
|
||||
except Exception as e:
|
||||
print_substep(f" Failed: {e}. Trying next...", style="yellow")
|
||||
continue
|
||||
|
||||
raise RuntimeError(
|
||||
f"No eligible posts with {min_replies}+ replies found "
|
||||
f"after trying {len(candidates)} candidates."
|
||||
)
|
||||
|
||||
finally:
|
||||
browser.close()
|
||||
Loading…
Reference in new issue