"""
|
|
No-OAuth Reddit scraper using public .json endpoints.
|
|
No API keys required - uses Reddit's public JSON interface.
|
|
|
|
Note: This approach is subject to rate limiting and may be blocked by Reddit.
|
|
For production use, consider using the official Reddit API with OAuth.
|
|
"""
|
|
import json
|
|
import time
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
import requests
|
|
|
|
from utils.console import print_substep
|
|
|
|
|
|
# Default User-Agent - customize this to avoid rate limiting
|
|
DEFAULT_USER_AGENT = "python:reddit_video_bot:1.0 (no-oauth scraper)"
|
|
|
|
# Reddit base URLs
|
|
REDDIT_BASES = ["https://www.reddit.com", "https://old.reddit.com"]
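# The methods below build URLs of the form "<base>/r/<subreddit>/<sort>.json",
# "<base>/comments/<post_id>.json" and "<base>/r/<subreddit>/search.json";
# appending ".json" to a public Reddit listing or comments page returns that
# page's data as JSON.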


class RedditScraperError(Exception):
    """Exception raised for Reddit scraper errors."""
    pass


@dataclass
class RedditPost:
    """Represents a Reddit post/submission."""
    id: str
    name: str  # t3_xxx
    title: str
    selftext: str
    author: str
    created_utc: float
    score: int
    upvote_ratio: float
    num_comments: int
    permalink: str
    url: str
    over_18: bool
    stickied: bool
    subreddit: str

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> "RedditPost":
        return cls(
            id=data.get("id", ""),
            name=data.get("name", ""),
            title=data.get("title", ""),
            selftext=data.get("selftext", ""),
            author=data.get("author", "[deleted]"),
            created_utc=float(data.get("created_utc", 0)),
            score=int(data.get("score", 0)),
            upvote_ratio=float(data.get("upvote_ratio", 0)),
            num_comments=int(data.get("num_comments", 0)),
            permalink=data.get("permalink", ""),
            url=data.get("url", ""),
            over_18=bool(data.get("over_18", False)),
            stickied=bool(data.get("stickied", False)),
            subreddit=data.get("subreddit", ""),
        )


@dataclass
class RedditComment:
    """Represents a Reddit comment."""
    id: str
    name: str  # t1_xxx
    body: str
    author: str
    created_utc: float
    score: int
    permalink: str
    parent_id: str
    link_id: str
    depth: int
    stickied: bool

    @classmethod
    def from_json(cls, data: Dict[str, Any], depth: int = 0) -> "RedditComment":
        return cls(
            id=data.get("id", ""),
            name=data.get("name", ""),
            body=data.get("body", ""),
            author=data.get("author", "[deleted]"),
            created_utc=float(data.get("created_utc", 0)),
            score=int(data.get("score", 0)),
            permalink=data.get("permalink", ""),
            parent_id=data.get("parent_id", ""),
            link_id=data.get("link_id", ""),
            depth=depth,
            stickied=bool(data.get("stickied", False)),
        )


class RedditScraper:
    """
    No-OAuth Reddit scraper using public .json endpoints.

    Example usage:
        scraper = RedditScraper()
        posts = scraper.get_subreddit_posts("AskReddit", limit=25, sort="hot")
        post, comments = scraper.get_post_with_comments(posts[0].id)
    """

    def __init__(
        self,
        user_agent: str = DEFAULT_USER_AGENT,
        base_url: str = REDDIT_BASES[0],
        request_delay: float = 2.0,
        timeout: float = 30.0,
        max_retries: int = 5,
    ):
        """
        Initialize the Reddit scraper.

        Args:
            user_agent: User-Agent string for requests
            base_url: Reddit base URL (www.reddit.com or old.reddit.com)
            request_delay: Delay between requests in seconds
            timeout: Request timeout in seconds
            max_retries: Maximum number of retries per request
        """
        self.user_agent = user_agent
        self.base_url = base_url.rstrip("/")
        self.request_delay = request_delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.session = requests.Session()
        self._last_request_time = 0.0

    def _rate_limit(self) -> None:
        """Enforce the minimum delay between consecutive requests."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.request_delay:
            time.sleep(self.request_delay - elapsed)
        self._last_request_time = time.time()

    def _fetch_json(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Fetch JSON from a Reddit endpoint with retries and rate limiting.

        Args:
            url: Full URL to fetch
            params: Query parameters

        Returns:
            Parsed JSON response

        Raises:
            RedditScraperError: If the request fails after all retries
        """
        headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json",
        }

        if params is None:
            params = {}
        params["raw_json"] = 1  # Get unescaped JSON

        last_error: Optional[Exception] = None

        for attempt in range(self.max_retries):
            self._rate_limit()

            try:
                response = self.session.get(
                    url,
                    params=params,
                    headers=headers,
                    timeout=self.timeout,
                )

                # Handle rate limiting
                if response.status_code == 429:
                    retry_after = int(response.headers.get("Retry-After", 60))
                    print_substep(f"Rate limited. Waiting {retry_after}s...", style="yellow")
                    time.sleep(max(self.request_delay, retry_after))
                    last_error = RedditScraperError("Rate limited (429)")
                    continue

                # Handle server errors with linear backoff
                if 500 <= response.status_code < 600:
                    wait_time = self.request_delay * (attempt + 1)
                    print_substep(f"Server error {response.status_code}. Retrying in {wait_time}s...", style="yellow")
                    time.sleep(wait_time)
                    last_error = RedditScraperError(f"Server error: {response.status_code}")
                    continue

                # Handle other errors
                if response.status_code != 200:
                    raise RedditScraperError(
                        f"HTTP {response.status_code}: {response.text[:200]}"
                    )

                return response.json()

            except requests.exceptions.RequestException as e:
                last_error = e
                wait_time = self.request_delay * (attempt + 1)
                if attempt < self.max_retries - 1:
                    print_substep(f"Request failed: {e}. Retrying in {wait_time}s...", style="yellow")
                    time.sleep(wait_time)
                    continue

        raise RedditScraperError(f"Failed after {self.max_retries} attempts: {last_error}")

    def get_subreddit_posts(
        self,
        subreddit: str,
        sort: str = "hot",
        limit: int = 25,
        time_filter: str = "all",
        after: Optional[str] = None,
    ) -> List[RedditPost]:
        """
        Get posts from a subreddit.

        Args:
            subreddit: Subreddit name (without r/ prefix)
            sort: Sort method (hot, new, top, rising, controversial)
            limit: Maximum number of posts to retrieve (max 100 per request)
            time_filter: Time filter for top/controversial (hour, day, week, month, year, all)
            after: Pagination cursor (fullname of last item)

        Returns:
            List of RedditPost objects
        """
        # Clean subreddit name
        subreddit = subreddit.strip()
        if subreddit.lower().startswith("r/"):
            subreddit = subreddit[2:]

        url = f"{self.base_url}/r/{subreddit}/{sort}.json"
        params: Dict[str, Any] = {"limit": min(limit, 100)}

        if sort in ("top", "controversial"):
            params["t"] = time_filter
        if after:
            params["after"] = after

        data = self._fetch_json(url, params)

        posts = []
        children = data.get("data", {}).get("children", [])

        for child in children:
            if child.get("kind") != "t3":
                continue
            post_data = child.get("data", {})
            if post_data:
                posts.append(RedditPost.from_json(post_data))

        return posts
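    # Illustrative pagination sketch (the subreddit name is a placeholder):
    # pass the `name` of the last post from one page as `after` to get the next.
    #   page1 = scraper.get_subreddit_posts("AskReddit", sort="new", limit=100)
    #   page2 = scraper.get_subreddit_posts("AskReddit", sort="new", limit=100,
    #                                       after=page1[-1].name)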

    def get_post_by_id(self, post_id: str) -> Optional[RedditPost]:
        """
        Get a single post by ID.

        Args:
            post_id: Post ID (without t3_ prefix)

        Returns:
            RedditPost object or None if not found
        """
        # Remove t3_ prefix if present
        if post_id.startswith("t3_"):
            post_id = post_id[3:]

        url = f"{self.base_url}/comments/{post_id}.json"
        params = {"limit": 0}  # Don't fetch comments

        try:
            data = self._fetch_json(url, params)
        except RedditScraperError:
            return None

        if not isinstance(data, list) or len(data) < 1:
            return None

        post_listing = data[0]
        children = post_listing.get("data", {}).get("children", [])

        if not children:
            return None

        post_data = children[0].get("data", {})
        return RedditPost.from_json(post_data) if post_data else None

    def get_post_with_comments(
        self,
        post_id: str,
        comment_sort: str = "top",
        comment_limit: int = 500,
        comment_depth: int = 10,
        max_comments: int = 1000,
    ) -> Tuple[Optional[RedditPost], List[RedditComment]]:
        """
        Get a post with its comments.

        Args:
            post_id: Post ID (without t3_ prefix)
            comment_sort: Comment sort (top, new, controversial, best, old, qa)
            comment_limit: Number of comments per request (max ~500)
            comment_depth: Maximum depth of comment tree
            max_comments: Hard cap on total comments to return

        Returns:
            Tuple of (RedditPost, List[RedditComment])
        """
        # Remove t3_ prefix if present
        if post_id.startswith("t3_"):
            post_id = post_id[3:]

        url = f"{self.base_url}/comments/{post_id}.json"
        params = {
            "sort": comment_sort,
            "limit": min(comment_limit, 500),
            "depth": comment_depth,
        }

        data = self._fetch_json(url, params)

        if not isinstance(data, list) or len(data) < 2:
            raise RedditScraperError(f"Unexpected response format for post {post_id}")

        # Parse post
        post_listing = data[0]
        post_children = post_listing.get("data", {}).get("children", [])

        if not post_children:
            return None, []

        post_data = post_children[0].get("data", {})
        post = RedditPost.from_json(post_data) if post_data else None

        # Parse comments
        comment_listing = data[1]
        comment_children = comment_listing.get("data", {}).get("children", [])

        comments: List[RedditComment] = []
        self._flatten_comments(comment_children, depth=0, out=comments, max_comments=max_comments)

        return post, comments

    def _flatten_comments(
        self,
        children: List[Dict[str, Any]],
        depth: int,
        out: List[RedditComment],
        max_comments: int,
    ) -> None:
        """
        Recursively flatten comment tree into a list.

        Ignores "more" placeholders - some comments may be missing in large threads.
        """
        for child in children:
            if len(out) >= max_comments:
                return

            kind = child.get("kind")
            data = child.get("data", {})

            if kind == "t1":
                # This is a comment
                comment = RedditComment.from_json(data, depth=depth)
                out.append(comment)

                # Process replies
                replies = data.get("replies")
                if isinstance(replies, dict):
                    reply_children = replies.get("data", {}).get("children", [])
                    if reply_children:
                        self._flatten_comments(
                            reply_children,
                            depth=depth + 1,
                            out=out,
                            max_comments=max_comments,
                        )

            elif kind == "more":
                # "More comments" placeholder - skip (some comments will be missing)
                continue

    def search_subreddit(
        self,
        subreddit: str,
        query: str,
        sort: str = "relevance",
        time_filter: str = "all",
        limit: int = 25,
    ) -> List[RedditPost]:
        """
        Search posts in a subreddit.

        Args:
            subreddit: Subreddit name
            query: Search query
            sort: Sort method (relevance, hot, top, new, comments)
            time_filter: Time filter (hour, day, week, month, year, all)
            limit: Maximum results

        Returns:
            List of matching posts
        """
        subreddit = subreddit.strip()
        if subreddit.lower().startswith("r/"):
            subreddit = subreddit[2:]

        url = f"{self.base_url}/r/{subreddit}/search.json"
        params = {
            "q": query,
            "sort": sort,
            "t": time_filter,
            "limit": min(limit, 100),
            "restrict_sr": "on",  # Restrict to subreddit
        }

        data = self._fetch_json(url, params)

        posts = []
        children = data.get("data", {}).get("children", [])

        for child in children:
            if child.get("kind") != "t3":
                continue
            post_data = child.get("data", {})
            if post_data:
                posts.append(RedditPost.from_json(post_data))

        return posts

    def get_posts_newer_than(
        self,
        subreddit: str,
        days: int = 30,
        max_posts: int = 1000,
    ) -> List[RedditPost]:
        """
        Get posts from a subreddit newer than a specified number of days.

        Note: Reddit listings are capped at ~1000 posts. If the subreddit has
        more posts than this in the time window, older posts will be missed.

        Args:
            subreddit: Subreddit name
            days: Number of days to look back
            max_posts: Maximum posts to retrieve

        Returns:
            List of posts within the time window
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        cutoff_ts = cutoff.timestamp()

        all_posts: List[RedditPost] = []
        after: Optional[str] = None

        while len(all_posts) < max_posts:
            posts = self.get_subreddit_posts(
                subreddit=subreddit,
                sort="new",
                limit=100,
                after=after,
            )

            if not posts:
                break

            for post in posts:
                # Skip stickied posts (they can be old)
                if post.stickied:
                    continue

                if post.created_utc < cutoff_ts:
                    # Reached posts older than cutoff
                    return all_posts

                all_posts.append(post)

                if len(all_posts) >= max_posts:
                    return all_posts

            # Set pagination cursor
            after = posts[-1].name

        return all_posts


# Global scraper instance
_scraper: Optional[RedditScraper] = None


def get_scraper() -> RedditScraper:
    """Get or create the global Reddit scraper instance."""
    global _scraper
    if _scraper is None:
        _scraper = RedditScraper()
    return _scraper
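

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The subreddit name and limits are
# placeholder values, not anything this module requires; running it performs
# live, rate-limited requests against Reddit's public .json endpoints.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    scraper = get_scraper()

    # Fetch a handful of hot posts from a placeholder subreddit.
    posts = scraper.get_subreddit_posts("AskReddit", sort="hot", limit=5)
    for post in posts:
        print(f"{post.score:>6}  {post.title[:80]}")

    # Fetch the first post together with a flattened list of its comments.
    if posts:
        post, comments = scraper.get_post_with_comments(posts[0].id, max_comments=25)
        if post is not None:
            print(f"\n{post.title} - {len(comments)} comments fetched")
        for comment in comments[:5]:
            indent = "  " * comment.depth
            print(f"{indent}- u/{comment.author}: {comment.body[:60]}")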