You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
RedditVideoMakerBot/platforms/threads/scraper.py

588 lines
19 KiB

"""Web scraping-based trending post discovery for Threads.net.
Bypasses the Meta Graph API (which only accesses your own posts) by using Playwright
to scrape threads.net directly — the "For You" feed, post pages, and replies.
Returns the standard content_object dict consumed by the rest of the pipeline.
"""
import re
from typing import Optional
from playwright.sync_api import BrowserContext, Locator, sync_playwright
from platforms.threads.auth import ensure_authenticated_context
from utils import settings
from utils.console import print_step, print_substep
from utils.voice import sanitize_text
from utils.videos import check_done_by_id
# Base URL of Threads; the feed scraper navigates here first.
FEED_URL = "https://www.threads.net"
# Pause, in milliseconds, after each feed/search scroll before the page is read again.
SCROLL_DELAY_MS = 2000
# Default upper bound on scroll iterations when scraping the main feed.
MAX_FEED_SCROLLS = 36
# CSS selector matching anchors whose href contains "/post/".
POST_LINK_SELECTOR = 'a[href*="/post/"]'
# XPath from a post link to its nearest ancestor <div> whose class contains "x1a2a7pz".
# NOTE(review): "x1a2a7pz" looks like a generated class name — confirm it is stable.
CARD_XPATH = 'xpath=ancestor::div[contains(@class, "x1a2a7pz")][1]'
def _post_id_from_url(url: str) -> str:
return url.rstrip("/").split("/")[-1]
def _to_absolute_url(href: str) -> str:
if href.startswith("http"):
return href
return "https://www.threads.net" + href
def _parse_abbreviated_number(s: str) -> int:
"""Parse abbreviated numbers like '1K', '2.5M' into integers."""
s = s.strip().upper().replace(",", "")
if not s:
return 0
multipliers = {"K": 1_000, "M": 1_000_000}
if s[-1] in multipliers:
try:
return int(float(s[:-1]) * multipliers[s[-1]])
except ValueError:
return 0
try:
return int(s)
except ValueError:
return 0
def _parse_card_text(text: str) -> dict:
"""Parse a Threads card's raw text into structured data.
Threads card format:
line 0: username
line 1: timestamp (e.g. "14h", "1d")
lines 2..N: post body text
last 1-4 lines: engagement metrics (likes, replies, reposts, quotes)
Returns dict with keys: username, timestamp, body, likes, replies, reposts
"""
if not text:
return {"username": "", "timestamp": "", "body": "", "likes": 0, "replies": 0, "reposts": 0}
lines = text.strip().split("\n")
if len(lines) < 3:
return {"username": "", "timestamp": "", "body": text, "likes": 0, "replies": 0, "reposts": 0}
username = lines[0].strip()
timestamp = lines[1].strip()
# Find where engagement metrics start (trailing numeric/abbreviated lines)
metric_start = len(lines)
for i in range(len(lines) - 1, 1, -1):
line = lines[i].strip()
if re.match(r'^[\d,.]+[KkMm]?$', line):
metric_start = i
else:
break
# Body is everything between timestamp and metrics
body_lines = lines[2:metric_start]
body = "\n".join(body_lines).strip()
# Parse engagement metrics from the end
metrics = lines[metric_start:]
likes = 0
replies_count = 0
reposts = 0
if len(metrics) >= 1:
likes = _parse_abbreviated_number(metrics[0])
if len(metrics) >= 2:
replies_count = _parse_abbreviated_number(metrics[1])
if len(metrics) >= 3:
reposts = _parse_abbreviated_number(metrics[2])
return {
"username": username,
"timestamp": timestamp,
"body": body,
"likes": likes,
"replies": replies_count,
"reposts": reposts,
}
def _extract_text_from_card(link: Locator) -> str:
    """Return the stripped inner text of the card that contains *link*.

    The card is located by following CARD_XPATH from the link. An empty
    string is returned when no card matches or when any lookup raises.
    """
    text = ""
    try:
        container = link.locator(CARD_XPATH)
        if container.count():
            text = container.first.inner_text(timeout=3000).strip()
    except Exception:
        # Best-effort extraction: a failed lookup is reported as "no text".
        text = ""
    return text
# --- Feed scraping ---
def _scrape_feed_posts(context: BrowserContext, max_scrolls: int = MAX_FEED_SCROLLS) -> list[dict]:
    """Open the Threads feed, scroll through it and collect one record per post link.

    Each record carries the absolute URL, the raw card text, the parsed
    body/username/timestamp, the parsed like/reply/repost counts and the
    post id. Links whose post id was already seen are skipped.

    Scrolling stops after ``max_scrolls`` iterations, when an iteration past
    loop index 5 finds nothing new, or when the page height stops changing.
    The page is always closed, even if scraping raises.
    """
    print_step("Scraping Threads trending feed...")
    page = context.new_page()
    collected: list[dict] = []
    known_ids: set[str] = set()
    try:
        page.goto(FEED_URL, timeout=0)
        page.wait_for_timeout(4000)
        previous_height = 0
        for scroll_index in range(max_scrolls):
            added = 0
            for anchor in page.locator(POST_LINK_SELECTOR).all():
                href = anchor.get_attribute("href")
                if not href:
                    continue
                post_id = _post_id_from_url(href)
                if post_id in known_ids:
                    continue
                known_ids.add(post_id)
                card_text = _extract_text_from_card(anchor)
                fields = _parse_card_text(card_text)
                record = {
                    "url": _to_absolute_url(href),
                    "text": card_text,
                    "body": fields["body"],
                    "username": fields["username"],
                    "timestamp": fields["timestamp"],
                    "likes": fields["likes"],
                    "replies_shown": fields["replies"],
                    "reposts": fields["reposts"],
                    "post_id": post_id,
                }
                collected.append(record)
                added += 1

            if added:
                # Progress line previews the most recently collected post.
                latest = collected[-1]
                print_substep(
                    f"Scroll {scroll_index + 1}: +{added} posts | top: "
                    f"{latest['likes']:,} 💬{latest['replies_shown']} 🔁{latest['reposts']} "
                    f"'{latest['body'][:50]}...'",
                    style="dim",
                )
            elif scroll_index > 5:
                # Nothing new after the first few scrolls: give up.
                break

            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            page.wait_for_timeout(SCROLL_DELAY_MS)
            current_height = page.evaluate("document.body.scrollHeight")
            if current_height == previous_height:
                # The page did not grow, so there is nothing more to load.
                break
            previous_height = current_height
    finally:
        page.close()
    print_substep(f"Scraped {len(collected)} posts from feed.", style="bold green")
    return collected
def _scrape_search_page(context: BrowserContext, query: str, max_scrolls: int = 5) -> list[dict]:
    """Search Threads for *query* and scrape the result cards.

    Uses the same card extraction as the main feed and returns records with
    the same keys (url, text, body, username, timestamp, likes,
    replies_shown, reposts, post_id). The query is percent-encoded before it
    is placed in the URL, so spaces, "&", "#" and similar characters are sent
    as part of the search term rather than altering the URL.

    Scrolling stops after ``max_scrolls`` iterations or as soon as an
    iteration finds no new posts. The page is always closed.
    """
    from urllib.parse import quote

    print_step(f"Scraping Threads search: '{query}'...")
    page = context.new_page()
    posts: list[dict] = []
    seen_ids: set[str] = set()
    # safe="" also encodes "/" so the whole query stays inside the q parameter.
    search_url = f"https://www.threads.net/search?q={quote(query, safe='')}&serp_type=tags"
    try:
        page.goto(search_url, timeout=0)
        page.wait_for_timeout(4000)
        for _ in range(max_scrolls):
            new_found = 0
            for link in page.locator(POST_LINK_SELECTOR).all():
                href = link.get_attribute("href")
                if not href:
                    continue
                post_id = _post_id_from_url(href)
                if post_id in seen_ids:
                    continue
                seen_ids.add(post_id)
                raw_text = _extract_text_from_card(link)
                parsed = _parse_card_text(raw_text)
                posts.append({
                    "url": _to_absolute_url(href),
                    "text": raw_text,
                    "body": parsed["body"],
                    "username": parsed["username"],
                    "timestamp": parsed["timestamp"],
                    "likes": parsed["likes"],
                    "replies_shown": parsed["replies"],
                    "reposts": parsed["reposts"],
                    "post_id": post_id,
                })
                new_found += 1
            if new_found == 0:
                # Nothing new appeared since the previous scroll.
                break
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            page.wait_for_timeout(SCROLL_DELAY_MS)
    finally:
        page.close()
    print_substep(f"Search '{query}': {len(posts)} posts.", style="dim")
    return posts
# --- Candidate filtering ---
def _parse_timestamp_to_hours(ts: str) -> float | None:
"""Convert a Threads timestamp like '14h', '1d', '3d' to hours.
Returns None if the format is unrecognized.
"""
if not ts:
return None
ts = ts.strip().lower()
if ts.endswith("h"):
try:
return float(ts[:-1])
except ValueError:
return None
elif ts.endswith("d"):
try:
return float(ts[:-1]) * 24
except ValueError:
return None
elif ts.endswith("w"):
try:
return float(ts[:-1]) * 24 * 7
except ValueError:
return None
elif ts.endswith("m") and not ts.endswith("min"):
try:
return float(ts[:-1]) * 24 * 30
except ValueError:
return None
return None
def _age_from_config() -> float | None:
    """Return the configured ``max_post_age`` in hours.

    Reads ``settings.config["threads"]["thread"]["max_post_age"]``; a missing
    or empty value means the age filter is disabled and None is returned.
    """
    configured = settings.config["threads"]["thread"].get("max_post_age", "")
    return _parse_timestamp_to_hours(configured) if configured else None
def _contains_blocked(text: str, blocked_raw: str) -> bool:
if not blocked_raw:
return False
blocked = [w.strip().lower() for w in blocked_raw.split(",") if w.strip()]
text_lower = text.lower()
return any(word in text_lower for word in blocked)
def _filter_candidates(posts: list[dict]) -> list[dict]:
    """Select and rank the scraped posts worth trying.

    A post is dropped when ``check_done_by_id`` reports its id, when its body
    contains a blocked word, when its body is empty or shorter than 10
    characters after stripping, when its parsed age exceeds the configured
    maximum, or when likes + reposts is below ``min_engagement``.

    Surviving posts get a ``_total_engagement`` key (likes + reposts) written
    into them and are returned sorted by that score, highest first.
    """
    thread_cfg = settings.config["threads"]["thread"]
    blocked_raw = thread_cfg.get("blocked_words", "")
    min_engagement = int(thread_cfg.get("min_engagement", 0))
    max_age_hours = _age_from_config()

    kept = []
    for post in posts:
        if check_done_by_id(post["post_id"]):
            continue
        body = post["body"]
        if _contains_blocked(body, blocked_raw):
            continue
        if not body or len(body.strip()) < 10:
            continue
        if max_age_hours is not None:
            # Posts whose timestamp cannot be parsed are not rejected by age.
            age = _parse_timestamp_to_hours(post.get("timestamp", ""))
            if age is not None and age > max_age_hours:
                continue
        score = post.get("likes", 0) + post.get("reposts", 0)
        if score < min_engagement:
            continue
        post["_total_engagement"] = score
        kept.append(post)

    # Highest engagement first so the most viral posts are tried first.
    kept.sort(key=lambda entry: entry.get("_total_engagement", 0), reverse=True)

    if min_engagement > 0:
        age_note = f", max age ≤{max_age_hours}h" if max_age_hours else ""
        summary = (
            f"Filtered {len(posts)} posts -> {len(kept)} viral candidates "
            f"(min ♥+🔁 ≥ {min_engagement:,}{age_note})"
        )
    else:
        age_note = f" (max age ≤{max_age_hours}h)" if max_age_hours else ""
        summary = f"Filtered {len(posts)} posts -> {len(kept)} candidates{age_note}"
    print_substep(summary, style="dim")
    return kept
# --- Reply scraping on post pages ---
def _scrape_post_replies(context: BrowserContext, post_url: str, max_replies: int = 100) -> list[dict]:
    """Open a post page, scroll it and collect the replies found there.

    Every post link on the page whose id differs from the main post's id is
    treated as a reply. Its card text is run through ``_parse_card_text`` and
    the parsed body is stored as ``comment_body`` together with
    ``comment_url`` and ``comment_id``. Links with no card text are skipped.

    At most 15 scroll rounds are made; collection stops early once
    ``max_replies`` is reached or three rounds in a row add nothing. The page
    is always closed.
    """
    page = context.new_page()
    collected: list[dict] = []
    main_post_id = _post_id_from_url(post_url)
    # Seeding with the main post's id makes its own links count as "seen".
    seen: set[str] = {main_post_id}
    try:
        page.goto(post_url, timeout=0)
        page.wait_for_timeout(4000)
        idle_rounds = 0
        previous_total = 0
        for _ in range(15):
            for anchor in page.locator(POST_LINK_SELECTOR).all():
                href = anchor.get_attribute("href")
                if not href:
                    continue
                reply_id = _post_id_from_url(href)
                if reply_id in seen:
                    continue
                seen.add(reply_id)
                card_text = _extract_text_from_card(anchor)
                if not card_text:
                    continue
                collected.append({
                    "comment_body": _parse_card_text(card_text)["body"],
                    "comment_url": _to_absolute_url(href),
                    "comment_id": reply_id,
                })
                if len(collected) >= max_replies:
                    break

            total = len(collected)
            if total >= max_replies:
                break
            if total != previous_total:
                idle_rounds = 0
            else:
                idle_rounds += 1
                if idle_rounds >= 3:
                    break
            previous_total = total
            page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
            page.wait_for_timeout(1500)
    finally:
        page.close()
    return collected
def _scrape_main_post_text(context: BrowserContext, post_url: str) -> str:
    """Return the text of the main post shown on a post page.

    Looks for the first post link on the page whose id equals the id of
    *post_url* and whose card has text, then returns the parsed body of that
    card (or the raw card text if the parsed body is empty). Returns an empty
    string when no such link is found. The page is always closed.
    """
    page = context.new_page()
    try:
        page.goto(post_url, timeout=0)
        page.wait_for_timeout(3000)
        target_id = _post_id_from_url(post_url)
        for anchor in page.locator(POST_LINK_SELECTOR).all():
            href = anchor.get_attribute("href")
            if not href or _post_id_from_url(href) != target_id:
                continue
            card_text = _extract_text_from_card(anchor)
            if not card_text:
                continue
            return _parse_card_text(card_text)["body"] or card_text
        return ""
    finally:
        page.close()
# --- Content object builder ---
def _build_content_object(post: dict, replies: list[dict]) -> dict:
    """Assemble the content_object dict from a scraped post and its replies.

    The title is the post's ``body``, falling back to its raw ``text``, and is
    truncated to 280 characters for ``thread_title``. In storymode the full
    title is stored as ``thread_post`` and no comments are added. Otherwise
    each reply is kept only if its stripped body is non-empty, contains no
    blocked word, has a length within the configured min/max reply length,
    and ``sanitize_text`` returns something truthy for it.
    """
    thread_cfg = settings.config["threads"]["thread"]
    longest = int(thread_cfg["max_reply_length"])
    shortest = int(thread_cfg["min_reply_length"])
    blocked_raw = thread_cfg.get("blocked_words", "")
    storymode = settings.config["settings"].get("storymode", False)

    title = post.get("body") or post.get("text") or ""
    content: dict = {
        "thread_id": post["post_id"],
        "thread_title": title[:280],
        "thread_url": post["url"],
        "is_nsfw": False,
        "thread_category": "threads",
        "comments": [],
    }

    if storymode:
        content["thread_post"] = title
        print_substep("Storymode: using post text as thread_post.", style="dim")
        return content

    for reply in replies:
        body = reply.get("comment_body", "").strip()
        rejected = (
            not body
            or _contains_blocked(body, blocked_raw)
            or len(body) < shortest
            or len(body) > longest
            # Sanitised text is only used as a filter; the original body is stored.
            or not sanitize_text(body)
        )
        if rejected:
            continue
        content["comments"].append({
            "comment_body": body,
            "comment_url": reply["comment_url"],
            "comment_id": reply["comment_id"],
        })
    return content
# --- Main entry point ---
def get_trending_threads_content(POST_ID: Optional[str] = None) -> dict:
    """Discover a Threads post via web scraping and return its content_object.

    With *POST_ID* given, only that post is scraped: its own text (used for
    the title and, in storymode, ``thread_post``) and its replies.

    Without it, posts are gathered from the main feed plus each search query
    in the ``search_queries`` setting (comma-separated, default
    "news,politics,trending"), filtered and ranked by ``_filter_candidates``,
    and then tried in order until one has at least ``min_replies`` replies.

    Raises:
        RuntimeError: if the requested post yields neither comments nor a
            storymode text, if nothing is scraped, if no post survives
            filtering, or if no candidate has enough replies.
    """
    print_step("Discovering trending Threads content via web scraping...")
    thread_config = settings.config["threads"]["thread"]
    min_replies = int(thread_config["min_replies"])
    min_engagement = int(thread_config.get("min_engagement", 0))
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = ensure_authenticated_context(browser)
            if POST_ID:
                post_url = f"https://www.threads.net/t/{POST_ID}"
                post = {"url": post_url, "post_id": POST_ID, "text": "", "body": ""}
                # Without the post's own text the title would be empty and
                # storymode would have nothing to narrate.
                try:
                    post["body"] = _scrape_main_post_text(context, post_url)
                except Exception as e:
                    # Best-effort: the replies may still make a usable result.
                    print_substep(
                        f"Could not read text of post {POST_ID}: {e}", style="yellow"
                    )
                replies = _scrape_post_replies(context, post_url)
                content = _build_content_object(post, replies)
                if content["comments"] or content.get("thread_post"):
                    return content
                raise RuntimeError(
                    f"No replies found for post {POST_ID}. "
                    f"Minimum required: {min_replies}."
                )

            # Scrape from multiple sources: main feed + trending search queries
            posts = _scrape_feed_posts(context)
            known_ids = {entry["post_id"] for entry in posts}
            trending_queries = thread_config.get(
                "search_queries", "news,politics,trending"
            )
            for query in trending_queries.split(","):
                query = query.strip()
                if not query:
                    continue
                try:
                    search_posts = _scrape_search_page(context, query)
                except Exception as e:
                    # A failed search must not abort the run, but say so.
                    print_substep(
                        f"Search '{query}' failed: {e}. Skipping.", style="yellow"
                    )
                    continue
                # Merge avoiding duplicates
                for found in search_posts:
                    if found["post_id"] not in known_ids:
                        known_ids.add(found["post_id"])
                        posts.append(found)

            if not posts:
                raise RuntimeError("No posts found in feed. Try again later.")
            candidates = _filter_candidates(posts)
            if not candidates:
                raise RuntimeError(
                    f"No eligible posts in feed after filtering. "
                    f"Try lowering min_engagement (currently {min_engagement:,}) "
                    f"or min_replies (currently {min_replies})."
                )

            for i, candidate in enumerate(candidates):
                print_substep(
                    f"Trying #{i + 1}: ♥{candidate['likes']:,} "
                    f"💬{candidate['replies_shown']} "
                    f"'{candidate['body'][:60]}...'",
                    style="dim",
                )
                try:
                    replies = _scrape_post_replies(context, candidate["url"])
                    if len(replies) >= min_replies:
                        # Feed cards may carry a short body; re-read it from
                        # the post page when it is under 50 characters.
                        if not candidate.get("body") or len(candidate.get("body", "")) < 50:
                            full_text = _scrape_main_post_text(context, candidate["url"])
                            if full_text:
                                candidate["body"] = full_text
                        content = _build_content_object(candidate, replies)
                        title_preview = content["thread_title"][:60]
                        print_substep(
                            f"Selected: '{title_preview}...' "
                            f"{candidate['likes']:,} 💬{len(content['comments'])} replies",
                            style="bold green",
                        )
                        return content
                    print_substep(
                        f" Only {len(replies)} replies (need {min_replies}). Trying next...",
                        style="yellow",
                    )
                except Exception as e:
                    print_substep(f" Failed: {e}. Trying next...", style="yellow")
                    continue

            raise RuntimeError(
                f"No eligible posts with {min_replies}+ replies found "
                f"after trying {len(candidates)} candidates."
            )
        finally:
            browser.close()