You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
284 lines
10 KiB
284 lines
10 KiB
"""
|
|
Reddit subreddit thread fetcher using no-OAuth scraper.
|
|
No API keys required - uses Reddit's public JSON endpoints.
|
|
"""
|
|
import re
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
|
|
from reddit.scraper import get_scraper, RedditPost, RedditComment, RedditScraperError
|
|
from utils import settings
|
|
from utils.ai_methods import sort_by_similarity
|
|
from utils.console import print_step, print_substep
|
|
from utils.posttextparser import posttextparser
|
|
from utils.videos import check_done
|
|
from utils.voice import sanitize_text
|
|
|
|
|
|
class SubmissionWrapper:
    """Adapter exposing a RedditPost through the plain-attribute surface
    that the project's utility helpers (e.g. check_done, sort_by_similarity)
    expect to find on a submission object."""

    # Attributes copied verbatim (same name) from the wrapped post.
    _MIRRORED_ATTRS = (
        "id",
        "title",
        "selftext",
        "author",
        "score",
        "upvote_ratio",
        "num_comments",
        "permalink",
        "url",
        "over_18",
        "stickied",
    )

    def __init__(self, post: RedditPost):
        for attr_name in self._MIRRORED_ATTRS:
            setattr(self, attr_name, getattr(post, attr_name))
        # The post's subreddit is re-exposed under a different attribute name.
        self.subreddit_name = post.subreddit
        # Keep the original post so callers can recover it via to_post().
        self._post = post

    def to_post(self) -> RedditPost:
        """Return the wrapped RedditPost unchanged."""
        return self._post
|
|
|
|
|
|
def get_subreddit_threads(POST_ID: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetches a Reddit thread and its comments using the no-OAuth scraper.

    No API keys required.

    Post selection priority (first match wins):
      1. Explicit ``POST_ID`` argument (used for queued posts).
      2. ``reddit.thread.post_id`` from the config (single IDs only;
         '+'-joined multi-IDs are not handled here).
      3. AI similarity sorting, when ``ai.ai_similarity_enabled`` is set.
      4. Default: first suitable post from the subreddit's "hot" listing.

    Args:
        POST_ID: Optional specific post ID to fetch

    Returns:
        Dictionary containing thread data and comments. Keys: thread_url,
        thread_title, thread_id, is_nsfw, subreddit, comments, and
        thread_post (story mode only).

    Raises:
        RedditScraperError: re-raised when the scraper cannot fetch the
            post or listing.

    NOTE(review): when no suitable submission is found this function calls
    itself recursively with the same arguments; if the subreddit never
    yields a suitable post this can recurse indefinitely — consider a
    retry cap.
    """
    print_substep("Connecting to Reddit (no-auth mode)...")

    scraper = get_scraper()
    content: Dict[str, Any] = {}
    # Stays 0 unless the AI-similarity branch below overwrites it; used
    # later both as a value and as a truthiness flag for the printout.
    similarity_score = 0

    # Get subreddit from config or user input
    print_step("Getting subreddit threads...")

    subreddit_name = settings.config["reddit"]["thread"].get("subreddit", "")

    if not subreddit_name:
        # No config value: ask interactively, stripping any leading "r/".
        subreddit_name = input("What subreddit would you like to pull from? ")
        subreddit_name = re.sub(r"^r/", "", subreddit_name.strip())
        if not subreddit_name:
            subreddit_name = "AskReddit"
            print_substep("Subreddit not defined. Using AskReddit.")
    else:
        # Clean the subreddit name (config may include an "r/" prefix)
        if str(subreddit_name).lower().startswith("r/"):
            subreddit_name = subreddit_name[2:]
        print_substep(f"Using subreddit: r/{subreddit_name} from config")

    # Get the submission
    submission: Optional[RedditPost] = None

    try:
        if POST_ID:
            # Specific post ID provided (for queued posts)
            submission = scraper.get_post_by_id(POST_ID)
            if not submission:
                raise RedditScraperError(f"Could not find post with ID: {POST_ID}")

        elif settings.config["reddit"]["thread"].get("post_id"):
            # Post ID from config (single post)
            post_id = str(settings.config["reddit"]["thread"]["post_id"])
            if "+" not in post_id:  # Single post, not multiple
                submission = scraper.get_post_by_id(post_id)
                if not submission:
                    raise RedditScraperError(f"Could not find post with ID: {post_id}")
            # NOTE(review): if post_id contains "+", submission stays None
            # and the retry recursion below re-enters this same branch —
            # multi-ID configs are effectively unsupported here.

        elif settings.config["ai"].get("ai_similarity_enabled"):
            # AI sorting based on keyword similarity
            print_substep("Fetching posts for AI similarity sorting...")
            posts = scraper.get_subreddit_posts(subreddit_name, sort="hot", limit=50)

            if not posts:
                raise RedditScraperError(f"No posts found in r/{subreddit_name}")

            keywords = settings.config["ai"].get("ai_similarity_keywords", "").split(",")
            keywords = [keyword.strip() for keyword in keywords if keyword.strip()]

            if keywords:
                keywords_print = ", ".join(keywords)
                print_substep(f"Sorting threads by similarity to: {keywords_print}")

                # Convert posts to format expected by sort_by_similarity
                wrappers = [SubmissionWrapper(post) for post in posts]
                sorted_wrappers, similarity_scores = sort_by_similarity(wrappers, keywords)

                # NOTE(review): this unpack assumes _get_undone_post always
                # returns a 2-tuple when similarity_scores is passed —
                # verify its no-candidate path also yields a tuple.
                submission, similarity_score = _get_undone_post(
                    sorted_wrappers, subreddit_name, similarity_scores=similarity_scores
                )
            else:
                # No usable keywords configured: fall back to plain selection.
                wrappers = [SubmissionWrapper(post) for post in posts]
                submission = _get_undone_post(wrappers, subreddit_name)

        else:
            # Default: get hot posts
            posts = scraper.get_subreddit_posts(subreddit_name, sort="hot", limit=25)

            if not posts:
                raise RedditScraperError(f"No posts found in r/{subreddit_name}")

            wrappers = [SubmissionWrapper(post) for post in posts]
            submission = _get_undone_post(wrappers, subreddit_name)

    except RedditScraperError as e:
        print_substep(f"Error fetching Reddit data: {e}", style="bold red")
        raise

    if submission is None:
        # No candidate survived the filters — retry the whole fetch.
        print_substep("No suitable submission found. Retrying...", style="yellow")
        return get_subreddit_threads(POST_ID)

    # Check if story mode with no comments is okay
    if not submission.num_comments and not settings.config["settings"].get("storymode"):
        print_substep("No comments found. Skipping.", style="bold red")
        # NOTE(review): bare exit() terminates the process; sys.exit() with
        # a non-zero status would be more conventional.
        exit()

    # Double-check if this post was already done
    # (check_done returns None for an already-processed post)
    wrapper = SubmissionWrapper(submission)
    checked = check_done(wrapper)
    if checked is None:
        print_substep("Post already processed. Finding another...", style="yellow")
        return get_subreddit_threads(POST_ID)

    # Display post info
    upvotes = submission.score
    ratio = submission.upvote_ratio * 100
    num_comments = submission.num_comments
    thread_url = f"https://new.reddit.com{submission.permalink}"

    print_substep(f"Video will be: {submission.title}", style="bold green")
    print_substep(f"Thread url is: {thread_url}", style="bold green")
    print_substep(f"Thread has {upvotes} upvotes", style="bold blue")
    print_substep(f"Thread has a upvote ratio of {ratio:.0f}%", style="bold blue")
    print_substep(f"Thread has {num_comments} comments", style="bold blue")

    if similarity_score:
        print_substep(
            f"Thread has a similarity score up to {round(similarity_score * 100)}%",
            style="bold blue",
        )

    # Build content dictionary
    content["thread_url"] = thread_url
    content["thread_title"] = submission.title
    content["thread_id"] = submission.id
    content["is_nsfw"] = submission.over_18
    content["subreddit"] = subreddit_name
    content["comments"] = []

    if settings.config["settings"].get("storymode"):
        # Story mode - use the post's selftext
        if settings.config["settings"].get("storymodemethod") == 1:
            # Method 1: split selftext into sentence-like chunks.
            content["thread_post"] = posttextparser(submission.selftext)
        else:
            content["thread_post"] = submission.selftext
    else:
        # Comment mode - fetch and process comments
        print_substep("Fetching comments...", style="bold blue")

        try:
            _, comments = scraper.get_post_with_comments(
                submission.id,
                comment_sort="top",
                comment_limit=500,
                max_comments=1000,
            )

            # Filter and process comments
            max_len = int(settings.config["reddit"]["thread"].get("max_comment_length", 500))
            min_len = int(settings.config["reddit"]["thread"].get("min_comment_length", 1))

            for comment in comments:
                # Skip non-top-level comments (depth > 0)
                if comment.depth > 0:
                    continue

                # Skip deleted/removed
                if comment.body in ["[removed]", "[deleted]"]:
                    continue

                # Skip stickied comments
                if comment.stickied:
                    continue

                # Sanitize and validate (drop comments that are empty
                # once TTS-unfriendly characters are removed)
                sanitized = sanitize_text(comment.body)
                if not sanitized or sanitized.strip() == "":
                    continue

                # Check length constraints (measured on the raw body,
                # not the sanitized text)
                if len(comment.body) > max_len:
                    continue
                if len(comment.body) < min_len:
                    continue

                # Skip if author is deleted
                if comment.author in ["[deleted]", "[removed]"]:
                    continue

                content["comments"].append({
                    "comment_body": comment.body,
                    "comment_url": comment.permalink,
                    "comment_id": comment.id,
                })

            print_substep(f"Collected {len(content['comments'])} valid comments", style="bold green")

        except RedditScraperError as e:
            print_substep(f"Error fetching comments: {e}", style="yellow")
            # Continue without comments if fetch fails

    print_substep("Received subreddit threads successfully.", style="bold green")
    return content
|
|
|
|
|
|
def _get_undone_post(
    wrappers: List[SubmissionWrapper],
    subreddit_name: str,
    similarity_scores: Optional[List[float]] = None,
) -> Optional[RedditPost] | Tuple[Optional[RedditPost], float]:
    """
    Find a submission that hasn't been processed yet.

    Candidates are rejected when they are NSFW (unless allowed by config),
    stickied, below the configured minimum comment count (non-story mode
    only), or already processed according to check_done (which returns
    None for a done post, per its use at the call sites).

    Args:
        wrappers: List of SubmissionWrapper objects
        subreddit_name: Name of the subreddit
        similarity_scores: Optional similarity scores for each submission

    Returns:
        When similarity_scores is None: the first undone RedditPost, or
        None if no candidate passes the filters.
        When similarity_scores is given: ALWAYS a (post, score) tuple —
        (RedditPost, score) on success, (None, 0.0) otherwise — so the
        caller's tuple-unpack never fails.
    """
    allow_nsfw = settings.config["settings"].get("allow_nsfw", False)
    min_comments = int(settings.config["reddit"]["thread"].get("min_comments", 20))

    for i, wrapper in enumerate(wrappers):
        # Skip NSFW if not allowed
        if wrapper.over_18 and not allow_nsfw:
            continue

        # Skip stickied posts
        if wrapper.stickied:
            continue

        # Check minimum comments (unless story mode)
        if not settings.config["settings"].get("storymode"):
            if wrapper.num_comments < min_comments:
                continue

        # Check if already done (check_done -> None means "already done")
        if check_done(wrapper) is None:
            continue

        post = wrapper.to_post()

        if similarity_scores is not None:
            # BUGFIX: previously a bare post was returned when the score
            # list was shorter than the wrapper list, and bare None when no
            # candidate was found — both crashed the caller's tuple-unpack.
            # Now the scored path always returns a 2-tuple.
            score = similarity_scores[i] if i < len(similarity_scores) else 0.0
            return post, score

        return post

    # No candidate passed the filters: keep the return shape consistent
    # with the scored/unscored contract described above.
    if similarity_scores is not None:
        return None, 0.0
    return None
|