Merge branch 'master' into develop

# Conflicts: # main.py # reddit/subreddit.py # requirements.txt # utils/subreddit.py
3 years ago · 5e7c05977f
parent bfe577643c 3380d69053
commit 5e7c05977f
5 changed files with 96 additions and 5 deletions
--- a/reddit/subreddit.py
+++ b/reddit/subreddit.py
@ -1,15 +1,18 @@
 import re
 from prawcore.exceptions import ResponseException
 from utils import settings
 import praw
 from praw.models import MoreComments
 from prawcore.exceptions import ResponseException
 from utils import settings
 from utils.console import print_step, print_substep
 from utils.subreddit import get_subreddit_undone
 from utils.videos import check_done
 from utils.voice import sanitize_text
 from utils.posttextparser import posttextparser
 from utils.ai_methods import sort_by_similarity
 def get_subreddit_threads(POST_ID: str):
@ -50,6 +53,7 @@ def get_subreddit_threads(POST_ID: str):
    # Ask user for subreddit input
    print_step("Getting subreddit threads...")
    similarity_score = 0
    if not settings.config["reddit"]["thread"][
        "subreddit"
    ]:  # note to user. you can have multiple subreddits via reddit.subreddit("redditdev+learnpython")
@ -76,6 +80,20 @@ def get_subreddit_threads(POST_ID: str):
    if POST_ID:  # would only be called if there are multiple queued posts
        submission = reddit.submission(id=POST_ID)
    elif (
        settings.config["reddit"]["thread"]["post_id"]
        and len(str(settings.config["reddit"]["thread"]["post_id"]).split("+")) == 1
    ):
        submission = reddit.submission(id=settings.config["reddit"]["thread"]["post_id"])
    elif settings.config["ai"]["ai_similarity_enabled"]: # ai sorting based on comparison
        threads = subreddit.hot(limit=50)
        keywords = settings.config["ai"]["ai_similarity_keywords"].split(',')
        keywords = [keyword.strip() for keyword in keywords]
        # Reformat the keywords for printing
        keywords_print = ", ".join(keywords)
        print(f'Sorting threads by similarity to the given keywords: {keywords_print}')
        threads, similarity_scores = sort_by_similarity(threads, keywords)
        submission, similarity_score = get_subreddit_undone(threads, subreddit, similarity_scores=similarity_scores)
    else:
        threads = subreddit.hot(limit=25)
        submission = get_subreddit_undone(threads, subreddit)
@ -99,7 +117,7 @@ def get_subreddit_threads(POST_ID: str):
        exit()
    submission = check_done(submission)  # double-checking
-    
+
    upvotes = submission.score
    ratio = submission.upvote_ratio * 100
    num_comments = submission.num_comments
@ -110,6 +128,8 @@ def get_subreddit_threads(POST_ID: str):
    print_substep(f"Thread has {upvotes} upvotes", style="bold blue")
    print_substep(f"Thread has a upvote ratio of {ratio}%", style="bold blue")
    print_substep(f"Thread has {num_comments} comments", style="bold blue")
    if similarity_score:
        print_substep(f"Thread has a similarity score up to {round(similarity_score * 100)}%", style="bold blue")
    content["thread_url"] = threadurl
    content["thread_title"] = submission.title
--- a/requirements.txt
+++ b/requirements.txt
@ -16,4 +16,6 @@ tomlkit==0.11.4
 Flask==2.2.2
 clean-text==0.6.0
 unidecode==1.3.2
-spacy==3.4.1
+spacy==3.4.1
 torch==1.12.1
 transformers==4.25.1
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -16,6 +16,9 @@ post_lang = { default = "", optional = true, explanation = "The language you wou
 min_comments = { default = 20, optional = false, nmin = 10, type = "int", explanation = "The minimum number of comments a post should have to be included. default is 20", example = 29, oob_error = "the minimum number of comments should be between 15 and 999999" }
 #post_url = { optional = true, default = "", regex = "^https:\\/\\/www\\.reddit\\.com\\/r\\/[a-zA-Z0-9]+\\/comments\\/[a-zA-Z0-9]+\\/[a-zA-Z0-9_]+\\/$", explanation = "Not working currently Use if you want to use a specific post.", example = "https://www.reddit.com/r/buildapc/comments/yzh07p/have_you_switched_to_windows_11/" }
 [ai]
 ai_similarity_enabled = {optional = true, option = [true, false], default = false, type = "bool", explanation = "Threads read from Reddit are sorted based on their similarity to the keywords given below"}
 ai_similarity_keywords = {optional = true, type="str", example= 'Elon Musk, Twitter, Stocks', explanation = "Every keyword or even sentence, seperated with comma, is used to sort the reddit threads based on similarity"}
 [settings]
 allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Whether to allow NSFW content, True or False" }
--- a/utils/ai_methods.py
+++ b/utils/ai_methods.py
@ -0,0 +1,58 @@
 import numpy as np
 from transformers import AutoTokenizer, AutoModel
 import torch
 # Mean Pooling - Take attention mask into account for correct averaging
 def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 # This function sort the given threads based on their total similarity with the given keywords
 def sort_by_similarity(thread_objects, keywords):
    # Initialize tokenizer + model.
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    # Transform the generator to a list of Submission Objects, so we can sort later based on context similarity to
    # keywords
    thread_objects = list(thread_objects)
    threads_sentences = []
    for i, thread in enumerate(thread_objects):
        threads_sentences.append(' '.join([thread.title, thread.selftext]))
    # Threads inference
    encoded_threads = tokenizer(threads_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        threads_embeddings = model(**encoded_threads)
    threads_embeddings = mean_pooling(threads_embeddings, encoded_threads['attention_mask'])
    # Keywords inference
    encoded_keywords = tokenizer(keywords, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        keywords_embeddings = model(**encoded_keywords)
    keywords_embeddings = mean_pooling(keywords_embeddings, encoded_keywords['attention_mask'])
    # Compare every keyword w/ every thread embedding
    threads_embeddings_tensor = torch.tensor(threads_embeddings)
    total_scores = torch.zeros(threads_embeddings_tensor.shape[0])
    cosine_similarity = torch.nn.CosineSimilarity()
    for keyword_embedding in keywords_embeddings:
        keyword_embedding = torch.tensor(keyword_embedding).repeat(threads_embeddings_tensor.shape[0], 1)
        similarity = cosine_similarity(keyword_embedding, threads_embeddings_tensor)
        total_scores += similarity
    similarity_scores, indices = torch.sort(total_scores, descending=True)
    threads_sentences = np.array(threads_sentences)[indices.numpy()]
    thread_objects = np.array(thread_objects)[indices.numpy()].tolist()
    #print('Similarity Thread Ranking')
    #for i, thread in enumerate(thread_objects):
    #    print(f'{i}) {threads_sentences[i]} score {similarity_scores[i]}')
    return thread_objects, similarity_scores
--- a/utils/subreddit.py
+++ b/utils/subreddit.py
@ -3,9 +3,10 @@ from os.path import exists
 from utils import settings
 from utils.console import print_substep
 from utils.ai_methods import sort_by_similarity
-def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
+def get_subreddit_undone(submissions: list, subreddit, times_checked=0, similarity_scores=None):
    """_summary_
    Args:
@ -15,6 +16,11 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
    Returns:
        Any: The submission that has not been done
    """
    # Second try of getting a valid Submission
    if times_checked and settings.config["ai"]["ai_similarity_enabled"]:
        print('Sorting based on similarity for a different date filter and thread limit..')
        submissions = sort_by_similarity(submissions, keywords=settings.config["ai"]["ai_similarity_enabled"])
    # recursively checks if the top submission in the list was already done.
    if not exists("./video_creation/data/videos.json"):
        with open("./video_creation/data/videos.json", "w+") as f:
@ -23,7 +29,7 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
        "./video_creation/data/videos.json", "r", encoding="utf-8"
    ) as done_vids_raw:
        done_videos = json.load(done_vids_raw)
-    for submission in submissions:
+    for i, submission in enumerate(submissions):
        if already_done(done_videos, submission):
            continue
        if submission.over_18:
@ -45,6 +51,8 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
            continue
        if settings.config["settings"]["storymode"] and not submission.is_self:
            continue
        if similarity_scores is not None:
            return submission, similarity_scores[i].item()
        return submission
    print("all submissions have been done going by top submission order")
    VALID_TIME_FILTERS = [