RedditVideoMakerBot/reddit/scraper.py

"""
No-OAuth Reddit scraper using public .json endpoints.
No API keys required - uses Reddit's public JSON interface.
Note: This approach is subject to rate limiting and may be blocked by Reddit.
For production use, consider using the official Reddit API with OAuth.
"""
import json
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
import requests
from utils.console import print_substep
# Default User-Agent - customize this to avoid rate limiting
DEFAULT_USER_AGENT = "python:reddit_video_bot:1.0 (no-oauth scraper)"
# Reddit base URLs
REDDIT_BASES = ["https://www.reddit.com", "https://old.reddit.com"]


class RedditScraperError(Exception):
    """Exception raised for Reddit scraper errors."""

    pass


@dataclass
class RedditPost:
    """Represents a Reddit post/submission."""

    id: str
    name: str  # t3_xxx
    title: str
    selftext: str
    author: str
    created_utc: float
    score: int
    upvote_ratio: float
    num_comments: int
    permalink: str
    url: str
    over_18: bool
    stickied: bool
    subreddit: str

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> "RedditPost":
        return cls(
            id=data.get("id", ""),
            name=data.get("name", ""),
            title=data.get("title", ""),
            selftext=data.get("selftext", ""),
            author=data.get("author", "[deleted]"),
            created_utc=float(data.get("created_utc", 0)),
            score=int(data.get("score", 0)),
            upvote_ratio=float(data.get("upvote_ratio", 0)),
            num_comments=int(data.get("num_comments", 0)),
            permalink=data.get("permalink", ""),
            url=data.get("url", ""),
            over_18=bool(data.get("over_18", False)),
            stickied=bool(data.get("stickied", False)),
            subreddit=data.get("subreddit", ""),
        )


@dataclass
class RedditComment:
    """Represents a Reddit comment."""

    id: str
    name: str  # t1_xxx
    body: str
    author: str
    created_utc: float
    score: int
    permalink: str
    parent_id: str
    link_id: str
    depth: int
    stickied: bool

    @classmethod
    def from_json(cls, data: Dict[str, Any], depth: int = 0) -> "RedditComment":
        return cls(
            id=data.get("id", ""),
            name=data.get("name", ""),
            body=data.get("body", ""),
            author=data.get("author", "[deleted]"),
            created_utc=float(data.get("created_utc", 0)),
            score=int(data.get("score", 0)),
            permalink=data.get("permalink", ""),
            parent_id=data.get("parent_id", ""),
            link_id=data.get("link_id", ""),
            depth=depth,
            stickied=bool(data.get("stickied", False)),
        )


class RedditScraper:
    """
    No-OAuth Reddit scraper using public .json endpoints.

    Example usage:
        scraper = RedditScraper()
        posts = scraper.get_subreddit_posts("AskReddit", limit=25, sort="hot")
        post, comments = scraper.get_post_with_comments(posts[0].id)
    """

    def __init__(
        self,
        user_agent: str = DEFAULT_USER_AGENT,
        base_url: str = REDDIT_BASES[0],
        request_delay: float = 2.0,
        timeout: float = 30.0,
        max_retries: int = 5,
    ):
        """
        Initialize the Reddit scraper.

        Args:
            user_agent: User-Agent string for requests
            base_url: Reddit base URL (www.reddit.com or old.reddit.com)
            request_delay: Delay between requests in seconds
            timeout: Request timeout in seconds
            max_retries: Maximum number of retries per request
        """
        self.user_agent = user_agent
        self.base_url = base_url.rstrip("/")
        self.request_delay = request_delay
        self.timeout = timeout
        self.max_retries = max_retries
        self.session = requests.Session()
        self._last_request_time = 0.0

    def _rate_limit(self) -> None:
        """Enforce rate limiting between requests."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.request_delay:
            time.sleep(self.request_delay - elapsed)
        self._last_request_time = time.time()

    def _fetch_json(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Fetch JSON from a Reddit endpoint with retries and rate limiting.

        Args:
            url: Full URL to fetch
            params: Query parameters

        Returns:
            Parsed JSON response

        Raises:
            RedditScraperError: If request fails after retries
        """
        headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json",
        }
        if params is None:
            params = {}
        params["raw_json"] = 1  # Get unescaped JSON
        last_error: Optional[Exception] = None
        for attempt in range(self.max_retries):
            self._rate_limit()
            try:
                response = self.session.get(
                    url,
                    params=params,
                    headers=headers,
                    timeout=self.timeout,
                )
                # Handle rate limiting
                if response.status_code == 429:
                    retry_after = int(response.headers.get("Retry-After", 60))
                    print_substep(f"Rate limited. Waiting {retry_after}s...", style="yellow")
                    time.sleep(max(self.request_delay, retry_after))
                    last_error = RedditScraperError("Rate limited (429)")
                    continue
                # Handle server errors
                if 500 <= response.status_code < 600:
                    wait_time = self.request_delay * (attempt + 1)
                    print_substep(f"Server error {response.status_code}. Retrying in {wait_time}s...", style="yellow")
                    time.sleep(wait_time)
                    last_error = RedditScraperError(f"Server error: {response.status_code}")
                    continue
                # Handle other errors
                if response.status_code != 200:
                    raise RedditScraperError(
                        f"HTTP {response.status_code}: {response.text[:200]}"
                    )
                return response.json()
            except requests.exceptions.RequestException as e:
                last_error = e
                wait_time = self.request_delay * (attempt + 1)
                if attempt < self.max_retries - 1:
                    print_substep(f"Request failed: {e}. Retrying in {wait_time}s...", style="yellow")
                    time.sleep(wait_time)
                    continue
        raise RedditScraperError(f"Failed after {self.max_retries} attempts: {last_error}")

    def get_subreddit_posts(
        self,
        subreddit: str,
        sort: str = "hot",
        limit: int = 25,
        time_filter: str = "all",
        after: Optional[str] = None,
    ) -> List[RedditPost]:
        """
        Get posts from a subreddit.

        Args:
            subreddit: Subreddit name (without r/ prefix)
            sort: Sort method (hot, new, top, rising, controversial)
            limit: Maximum number of posts to retrieve (max 100 per request)
            time_filter: Time filter for top/controversial (hour, day, week, month, year, all)
            after: Pagination cursor (fullname of last item)

        Returns:
            List of RedditPost objects
        """
        # Clean subreddit name
        subreddit = subreddit.strip()
        if subreddit.lower().startswith("r/"):
            subreddit = subreddit[2:]
        url = f"{self.base_url}/r/{subreddit}/{sort}.json"
        params: Dict[str, Any] = {"limit": min(limit, 100)}
        if sort in ("top", "controversial"):
            params["t"] = time_filter
        if after:
            params["after"] = after
        data = self._fetch_json(url, params)
        posts = []
        children = data.get("data", {}).get("children", [])
        for child in children:
            if child.get("kind") != "t3":
                continue
            post_data = child.get("data", {})
            if post_data:
                posts.append(RedditPost.from_json(post_data))
        return posts

    def get_post_by_id(self, post_id: str) -> Optional[RedditPost]:
        """
        Get a single post by ID.

        Args:
            post_id: Post ID (without t3_ prefix)

        Returns:
            RedditPost object or None if not found
        """
        # Remove t3_ prefix if present
        if post_id.startswith("t3_"):
            post_id = post_id[3:]
        url = f"{self.base_url}/comments/{post_id}.json"
        params = {"limit": 0}  # Don't fetch comments
        try:
            data = self._fetch_json(url, params)
        except RedditScraperError:
            return None
        if not isinstance(data, list) or len(data) < 1:
            return None
        post_listing = data[0]
        children = post_listing.get("data", {}).get("children", [])
        if not children:
            return None
        post_data = children[0].get("data", {})
        return RedditPost.from_json(post_data) if post_data else None

    def get_post_with_comments(
        self,
        post_id: str,
        comment_sort: str = "top",
        comment_limit: int = 500,
        comment_depth: int = 10,
        max_comments: int = 1000,
    ) -> Tuple[Optional[RedditPost], List[RedditComment]]:
        """
        Get a post with its comments.

        Args:
            post_id: Post ID (without t3_ prefix)
            comment_sort: Comment sort (top, new, controversial, best, old, qa)
            comment_limit: Number of comments per request (max ~500)
            comment_depth: Maximum depth of comment tree
            max_comments: Hard cap on total comments to return

        Returns:
            Tuple of (RedditPost, List[RedditComment])
        """
        # Remove t3_ prefix if present
        if post_id.startswith("t3_"):
            post_id = post_id[3:]
        url = f"{self.base_url}/comments/{post_id}.json"
        params = {
            "sort": comment_sort,
            "limit": min(comment_limit, 500),
            "depth": comment_depth,
        }
        data = self._fetch_json(url, params)
        if not isinstance(data, list) or len(data) < 2:
            raise RedditScraperError(f"Unexpected response format for post {post_id}")
        # Parse post
        post_listing = data[0]
        post_children = post_listing.get("data", {}).get("children", [])
        if not post_children:
            return None, []
        post_data = post_children[0].get("data", {})
        post = RedditPost.from_json(post_data) if post_data else None
        # Parse comments
        comment_listing = data[1]
        comment_children = comment_listing.get("data", {}).get("children", [])
        comments: List[RedditComment] = []
        self._flatten_comments(comment_children, depth=0, out=comments, max_comments=max_comments)
        return post, comments

    def _flatten_comments(
        self,
        children: List[Dict[str, Any]],
        depth: int,
        out: List[RedditComment],
        max_comments: int,
    ) -> None:
        """
        Recursively flatten comment tree into a list.

        Ignores "more" placeholders - some comments may be missing in large threads.
        """
        for child in children:
            if len(out) >= max_comments:
                return
            kind = child.get("kind")
            data = child.get("data", {})
            if kind == "t1":
                # This is a comment
                comment = RedditComment.from_json(data, depth=depth)
                out.append(comment)
                # Process replies
                replies = data.get("replies")
                if isinstance(replies, dict):
                    reply_children = replies.get("data", {}).get("children", [])
                    if reply_children:
                        self._flatten_comments(
                            reply_children,
                            depth=depth + 1,
                            out=out,
                            max_comments=max_comments,
                        )
            elif kind == "more":
                # "More comments" placeholder - skip (some comments will be missing)
                continue

    def search_subreddit(
        self,
        subreddit: str,
        query: str,
        sort: str = "relevance",
        time_filter: str = "all",
        limit: int = 25,
    ) -> List[RedditPost]:
        """
        Search posts in a subreddit.

        Args:
            subreddit: Subreddit name
            query: Search query
            sort: Sort method (relevance, hot, top, new, comments)
            time_filter: Time filter (hour, day, week, month, year, all)
            limit: Maximum results

        Returns:
            List of matching posts
        """
        subreddit = subreddit.strip()
        if subreddit.lower().startswith("r/"):
            subreddit = subreddit[2:]
        url = f"{self.base_url}/r/{subreddit}/search.json"
        params = {
            "q": query,
            "sort": sort,
            "t": time_filter,
            "limit": min(limit, 100),
            "restrict_sr": "on",  # Restrict to subreddit
        }
        data = self._fetch_json(url, params)
        posts = []
        children = data.get("data", {}).get("children", [])
        for child in children:
            if child.get("kind") != "t3":
                continue
            post_data = child.get("data", {})
            if post_data:
                posts.append(RedditPost.from_json(post_data))
        return posts

    def get_posts_newer_than(
        self,
        subreddit: str,
        days: int = 30,
        max_posts: int = 1000,
    ) -> List[RedditPost]:
        """
        Get posts from a subreddit newer than a specified number of days.

        Note: Reddit listings are capped at ~1000 posts. If the subreddit has
        more posts than this in the time window, older posts will be missed.

        Args:
            subreddit: Subreddit name
            days: Number of days to look back
            max_posts: Maximum posts to retrieve

        Returns:
            List of posts within the time window
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)
        cutoff_ts = cutoff.timestamp()
        all_posts: List[RedditPost] = []
        after: Optional[str] = None
        while len(all_posts) < max_posts:
            posts = self.get_subreddit_posts(
                subreddit=subreddit,
                sort="new",
                limit=100,
                after=after,
            )
            if not posts:
                break
            for post in posts:
                # Skip stickied posts (they can be old)
                if post.stickied:
                    continue
                if post.created_utc < cutoff_ts:
                    # Reached posts older than cutoff
                    return all_posts
                all_posts.append(post)
                if len(all_posts) >= max_posts:
                    return all_posts
            # Set pagination cursor
            after = posts[-1].name
        return all_posts


# Global scraper instance
_scraper: Optional[RedditScraper] = None


def get_scraper() -> RedditScraper:
    """Get or create the global Reddit scraper instance."""
    global _scraper
    if _scraper is None:
        _scraper = RedditScraper()
    return _scraper
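

# Illustrative usage sketch, not part of the scraper's public API. It assumes
# the module is run from the repository root so that `utils.console` resolves,
# and it only exercises the classes and functions defined above; the subreddit
# name "AskReddit" is an arbitrary example.
if __name__ == "__main__":
    scraper = get_scraper()

    # Fetch a handful of hot posts and print a short summary line for each.
    posts = scraper.get_subreddit_posts("AskReddit", sort="hot", limit=5)
    for post in posts:
        print(f"[{post.score:>6}] {post.title} ({post.num_comments} comments)")

    # Pull the top-sorted comments for the first post, if any posts came back.
    if posts:
        post, comments = scraper.get_post_with_comments(posts[0].id, comment_sort="top")
        title = post.title if post else posts[0].title
        print(f"\nTop comments for: {title}")
        for comment in comments[:5]:
            print(f"  u/{comment.author} ({comment.score}): {comment.body[:80]!r}")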