You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
284 lines
10 KiB
284 lines
10 KiB
"""
|
|
Reddit subreddit thread fetcher using no-OAuth scraper.
|
|
No API keys required - uses Reddit's public JSON endpoints.
|
|
"""
|
|
import re
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
|
|
from reddit.scraper import get_scraper, RedditPost, RedditComment, RedditScraperError
|
|
from utils import settings
|
|
from utils.ai_methods import sort_by_similarity
|
|
from utils.console import print_step, print_substep
|
|
from utils.posttextparser import posttextparser
|
|
from utils.videos import check_done
|
|
from utils.voice import sanitize_text
|
|
|
|
|
|
class SubmissionWrapper:
    """Adapter exposing a RedditPost through the plain-attribute surface
    that the project's utility helpers (e.g. check_done, sort_by_similarity)
    expect to find on a submission object."""

    # Attributes copied verbatim (same name) from the wrapped post.
    _MIRRORED_ATTRS = (
        "id",
        "title",
        "selftext",
        "author",
        "score",
        "upvote_ratio",
        "num_comments",
        "permalink",
        "url",
        "over_18",
        "stickied",
    )

    def __init__(self, post: RedditPost):
        for attr_name in self._MIRRORED_ATTRS:
            setattr(self, attr_name, getattr(post, attr_name))
        # The post's subreddit is re-exposed under a different attribute name.
        self.subreddit_name = post.subreddit
        # Keep the original post so callers can recover it via to_post().
        self._post = post

    def to_post(self) -> RedditPost:
        """Return the wrapped RedditPost unchanged."""
        return self._post
|
|
|
|
|
|
def get_subreddit_threads(POST_ID: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetches a Reddit thread and its comments using the no-OAuth scraper.

    No API keys required.

    Post selection priority (first match wins):
      1. Explicit ``POST_ID`` argument (used for queued posts).
      2. ``reddit.thread.post_id`` from the config (single IDs only;
         '+'-joined multi-IDs are not handled here).
      3. AI similarity sorting, when ``ai.ai_similarity_enabled`` is set.
      4. Default: first suitable post from the subreddit's "hot" listing.

    Args:
        POST_ID: Optional specific post ID to fetch

    Returns:
        Dictionary containing thread data and comments. Keys: thread_url,
        thread_title, thread_id, is_nsfw, subreddit, comments, and
        thread_post (story mode only).

    Raises:
        RedditScraperError: re-raised when the scraper cannot fetch the
            post or listing.

    NOTE(review): when no suitable submission is found this function calls
    itself recursively with the same arguments; if the subreddit never
    yields a suitable post this can recurse indefinitely — consider a
    retry cap.
    """
    print_substep("Connecting to Reddit (no-auth mode)...")

    scraper = get_scraper()
    content: Dict[str, Any] = {}
    # Stays 0 unless the AI-similarity branch below overwrites it; used
    # later both as a value and as a truthiness flag for the printout.
    similarity_score = 0

    # Get subreddit from config or user input
    print_step("Getting subreddit threads...")

    subreddit_name = settings.config["reddit"]["thread"].get("subreddit", "")

    if not subreddit_name:
        # No config value: ask interactively, stripping any leading "r/".
        subreddit_name = input("What subreddit would you like to pull from? ")
        subreddit_name = re.sub(r"^r/", "", subreddit_name.strip())
        if not subreddit_name:
            subreddit_name = "AskReddit"
            print_substep("Subreddit not defined. Using AskReddit.")
    else:
        # Clean the subreddit name (config may include an "r/" prefix)
        if str(subreddit_name).lower().startswith("r/"):
            subreddit_name = subreddit_name[2:]
        print_substep(f"Using subreddit: r/{subreddit_name} from config")

    # Get the submission
    submission: Optional[RedditPost] = None

    try:
        if POST_ID:
            # Specific post ID provided (for queued posts)
            submission = scraper.get_post_by_id(POST_ID)
            if not submission:
                raise RedditScraperError(f"Could not find post with ID: {POST_ID}")

        elif settings.config["reddit"]["thread"].get("post_id"):
            # Post ID from config (single post)
            post_id = str(settings.config["reddit"]["thread"]["post_id"])
            if "+" not in post_id:  # Single post, not multiple
                submission = scraper.get_post_by_id(post_id)
                if not submission:
                    raise RedditScraperError(f"Could not find post with ID: {post_id}")
            # NOTE(review): if post_id contains "+", submission stays None
            # and the retry recursion below re-enters this same branch —
            # multi-ID configs are effectively unsupported here.

        elif settings.config["ai"].get("ai_similarity_enabled"):
            # AI sorting based on keyword similarity
            print_substep("Fetching posts for AI similarity sorting...")
            posts = scraper.get_subreddit_posts(subreddit_name, sort="hot", limit=50)

            if not posts:
                raise RedditScraperError(f"No posts found in r/{subreddit_name}")

            keywords = settings.config["ai"].get("ai_similarity_keywords", "").split(",")
            keywords = [keyword.strip() for keyword in keywords if keyword.strip()]

            if keywords:
                keywords_print = ", ".join(keywords)
                print_substep(f"Sorting threads by similarity to: {keywords_print}")

                # Convert posts to format expected by sort_by_similarity
                wrappers = [SubmissionWrapper(post) for post in posts]
                sorted_wrappers, similarity_scores = sort_by_similarity(wrappers, keywords)

                # NOTE(review): this unpack assumes _get_undone_post always
                # returns a 2-tuple when similarity_scores is passed —
                # verify its no-candidate path also yields a tuple.
                submission, similarity_score = _get_undone_post(
                    sorted_wrappers, subreddit_name, similarity_scores=similarity_scores
                )
            else:
                # No usable keywords configured: fall back to plain selection.
                wrappers = [SubmissionWrapper(post) for post in posts]
                submission = _get_undone_post(wrappers, subreddit_name)

        else:
            # Default: get hot posts
            posts = scraper.get_subreddit_posts(subreddit_name, sort="hot", limit=25)

            if not posts:
                raise RedditScraperError(f"No posts found in r/{subreddit_name}")

            wrappers = [SubmissionWrapper(post) for post in posts]
            submission = _get_undone_post(wrappers, subreddit_name)

    except RedditScraperError as e:
        print_substep(f"Error fetching Reddit data: {e}", style="bold red")
        raise

    if submission is None:
        # No candidate survived the filters — retry the whole fetch.
        print_substep("No suitable submission found. Retrying...", style="yellow")
        return get_subreddit_threads(POST_ID)

    # Check if story mode with no comments is okay
    if not submission.num_comments and not settings.config["settings"].get("storymode"):
        print_substep("No comments found. Skipping.", style="bold red")
        # NOTE(review): bare exit() terminates the process; sys.exit() with
        # a non-zero status would be more conventional.
        exit()

    # Double-check if this post was already done
    # (check_done returns None for an already-processed post)
    wrapper = SubmissionWrapper(submission)
    checked = check_done(wrapper)
    if checked is None:
        print_substep("Post already processed. Finding another...", style="yellow")
        return get_subreddit_threads(POST_ID)

    # Display post info
    upvotes = submission.score
    ratio = submission.upvote_ratio * 100
    num_comments = submission.num_comments
    thread_url = f"https://new.reddit.com{submission.permalink}"

    print_substep(f"Video will be: {submission.title}", style="bold green")
    print_substep(f"Thread url is: {thread_url}", style="bold green")
    print_substep(f"Thread has {upvotes} upvotes", style="bold blue")
    print_substep(f"Thread has a upvote ratio of {ratio:.0f}%", style="bold blue")
    print_substep(f"Thread has {num_comments} comments", style="bold blue")

    if similarity_score:
        print_substep(
            f"Thread has a similarity score up to {round(similarity_score * 100)}%",
            style="bold blue",
        )

    # Build content dictionary
    content["thread_url"] = thread_url
    content["thread_title"] = submission.title
    content["thread_id"] = submission.id
    content["is_nsfw"] = submission.over_18
    content["subreddit"] = subreddit_name
    content["comments"] = []

    if settings.config["settings"].get("storymode"):
        # Story mode - use the post's selftext
        if settings.config["settings"].get("storymodemethod") == 1:
            # Method 1: split selftext into sentence-like chunks.
            content["thread_post"] = posttextparser(submission.selftext)
        else:
            content["thread_post"] = submission.selftext
    else:
        # Comment mode - fetch and process comments
        print_substep("Fetching comments...", style="bold blue")

        try:
            _, comments = scraper.get_post_with_comments(
                submission.id,
                comment_sort="top",
                comment_limit=500,
                max_comments=1000,
            )

            # Filter and process comments
            max_len = int(settings.config["reddit"]["thread"].get("max_comment_length", 500))
            min_len = int(settings.config["reddit"]["thread"].get("min_comment_length", 1))

            for comment in comments:
                # Skip non-top-level comments (depth > 0)
                if comment.depth > 0:
                    continue

                # Skip deleted/removed
                if comment.body in ["[removed]", "[deleted]"]:
                    continue

                # Skip stickied comments
                if comment.stickied:
                    continue

                # Sanitize and validate (drop comments that are empty
                # once TTS-unfriendly characters are removed)
                sanitized = sanitize_text(comment.body)
                if not sanitized or sanitized.strip() == "":
                    continue

                # Check length constraints (measured on the raw body,
                # not the sanitized text)
                if len(comment.body) > max_len:
                    continue
                if len(comment.body) < min_len:
                    continue

                # Skip if author is deleted
                if comment.author in ["[deleted]", "[removed]"]:
                    continue

                content["comments"].append({
                    "comment_body": comment.body,
                    "comment_url": comment.permalink,
                    "comment_id": comment.id,
                })

            print_substep(f"Collected {len(content['comments'])} valid comments", style="bold green")

        except RedditScraperError as e:
            print_substep(f"Error fetching comments: {e}", style="yellow")
            # Continue without comments if fetch fails

    print_substep("Received subreddit threads successfully.", style="bold green")
    return content
|
|
|
|
|
|
def _get_undone_post(
    wrappers: List[SubmissionWrapper],
    subreddit_name: str,
    similarity_scores: Optional[List[float]] = None,
) -> Optional[RedditPost] | Tuple[Optional[RedditPost], float]:
    """
    Find a submission that hasn't been processed yet.

    Candidates are rejected when they are NSFW (unless allowed by config),
    stickied, below the configured minimum comment count (non-story mode
    only), or already processed according to check_done (which returns
    None for a done post, per its use at the call sites).

    Args:
        wrappers: List of SubmissionWrapper objects
        subreddit_name: Name of the subreddit
        similarity_scores: Optional similarity scores for each submission

    Returns:
        When similarity_scores is None: the first undone RedditPost, or
        None if no candidate passes the filters.
        When similarity_scores is given: ALWAYS a (post, score) tuple —
        (RedditPost, score) on success, (None, 0.0) otherwise — so the
        caller's tuple-unpack never fails.
    """
    allow_nsfw = settings.config["settings"].get("allow_nsfw", False)
    min_comments = int(settings.config["reddit"]["thread"].get("min_comments", 20))

    for i, wrapper in enumerate(wrappers):
        # Skip NSFW if not allowed
        if wrapper.over_18 and not allow_nsfw:
            continue

        # Skip stickied posts
        if wrapper.stickied:
            continue

        # Check minimum comments (unless story mode)
        if not settings.config["settings"].get("storymode"):
            if wrapper.num_comments < min_comments:
                continue

        # Check if already done (check_done -> None means "already done")
        if check_done(wrapper) is None:
            continue

        post = wrapper.to_post()

        if similarity_scores is not None:
            # BUGFIX: previously a bare post was returned when the score
            # list was shorter than the wrapper list, and bare None when no
            # candidate was found — both crashed the caller's tuple-unpack.
            # Now the scored path always returns a 2-tuple.
            score = similarity_scores[i] if i < len(similarity_scores) else 0.0
            return post, score

        return post

    # No candidate passed the filters: keep the return shape consistent
    # with the scored/unscored contract described above.
    if similarity_scores is not None:
        return None, 0.0
    return None
|