Merge branch 'master' into develop

# Conflicts: # main.py # reddit/subreddit.py # requirements.txt # utils/subreddit.py
2 years ago · 5e7c05977f
parent bfe577643c 3380d69053
commit 5e7c05977f
5 changed files with 96 additions and 5 deletions
--- a/reddit/subreddit.py
+++ b/reddit/subreddit.py
@ -1,15 +1,18 @@
 import re

+from prawcore.exceptions import ResponseException
+
+from utils import settings
 import praw
 from praw.models import MoreComments
 from prawcore.exceptions import ResponseException

-from utils import settings
 from utils.console import print_step, print_substep
 from utils.subreddit import get_subreddit_undone
 from utils.videos import check_done
 from utils.voice import sanitize_text
 from utils.posttextparser import posttextparser
+from utils.ai_methods import sort_by_similarity


 def get_subreddit_threads(POST_ID: str):
@ -50,6 +53,7 @@ def get_subreddit_threads(POST_ID: str):

    # Ask user for subreddit input
    print_step("Getting subreddit threads...")
+    similarity_score = 0
    if not settings.config["reddit"]["thread"][
        "subreddit"
    ]:  # note to user. you can have multiple subreddits via reddit.subreddit("redditdev+learnpython")
@ -76,6 +80,20 @@ def get_subreddit_threads(POST_ID: str):
    if POST_ID:  # would only be called if there are multiple queued posts
        submission = reddit.submission(id=POST_ID)

+    elif (
+        settings.config["reddit"]["thread"]["post_id"]
+        and len(str(settings.config["reddit"]["thread"]["post_id"]).split("+")) == 1
+    ):
+        submission = reddit.submission(id=settings.config["reddit"]["thread"]["post_id"])
+    elif settings.config["ai"]["ai_similarity_enabled"]: # ai sorting based on comparison
+        threads = subreddit.hot(limit=50)
+        keywords = settings.config["ai"]["ai_similarity_keywords"].split(',')
+        keywords = [keyword.strip() for keyword in keywords]
+        # Reformat the keywords for printing
+        keywords_print = ", ".join(keywords)
+        print(f'Sorting threads by similarity to the given keywords: {keywords_print}')
+        threads, similarity_scores = sort_by_similarity(threads, keywords)
+        submission, similarity_score = get_subreddit_undone(threads, subreddit, similarity_scores=similarity_scores)
    else:
        threads = subreddit.hot(limit=25)
        submission = get_subreddit_undone(threads, subreddit)
@ -99,7 +117,7 @@ def get_subreddit_threads(POST_ID: str):
        exit()

    submission = check_done(submission)  # double-checking
-    
+
    upvotes = submission.score
    ratio = submission.upvote_ratio * 100
    num_comments = submission.num_comments
@ -110,6 +128,8 @@ def get_subreddit_threads(POST_ID: str):
    print_substep(f"Thread has {upvotes} upvotes", style="bold blue")
    print_substep(f"Thread has a upvote ratio of {ratio}%", style="bold blue")
    print_substep(f"Thread has {num_comments} comments", style="bold blue")
+    if similarity_score:
+        print_substep(f"Thread has a similarity score up to {round(similarity_score * 100)}%", style="bold blue")

    content["thread_url"] = threadurl
    content["thread_title"] = submission.title
--- a/requirements.txt
+++ b/requirements.txt
@ -16,4 +16,6 @@ tomlkit==0.11.4
 Flask==2.2.2
 clean-text==0.6.0
 unidecode==1.3.2
-spacy==3.4.1
+spacy==3.4.1
+torch==1.12.1
+transformers==4.25.1
--- a/utils/.config.template.toml
+++ b/utils/.config.template.toml
@ -16,6 +16,9 @@ post_lang = { default = "", optional = true, explanation = "The language you wou
 min_comments = { default = 20, optional = false, nmin = 10, type = "int", explanation = "The minimum number of comments a post should have to be included. default is 20", example = 29, oob_error = "the minimum number of comments should be between 15 and 999999" }
 #post_url = { optional = true, default = "", regex = "^https:\\/\\/www\\.reddit\\.com\\/r\\/[a-zA-Z0-9]+\\/comments\\/[a-zA-Z0-9]+\\/[a-zA-Z0-9_]+\\/$", explanation = "Not working currently Use if you want to use a specific post.", example = "https://www.reddit.com/r/buildapc/comments/yzh07p/have_you_switched_to_windows_11/" }

+[ai]
+ai_similarity_enabled = {optional = true, option = [true, false], default = false, type = "bool", explanation = "Threads read from Reddit are sorted based on their similarity to the keywords given below"}
+ai_similarity_keywords = {optional = true, type="str", example= 'Elon Musk, Twitter, Stocks', explanation = "Every keyword or even sentence, separated by a comma, is used to sort the reddit threads based on similarity"}

 [settings]
 allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Whether to allow NSFW content, True or False" }
--- a/utils/ai_methods.py
+++ b/utils/ai_methods.py
@ -0,0 +1,58 @@
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
+import torch
+
+
+# Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    """Average the token embeddings of each input, weighted by its attention mask.
+
+    Args:
+        model_output: Model output whose first element holds the token embeddings.
+        attention_mask: Mask tensor that is expanded to the token-embedding shape and
+            used as per-token weights (presumably 1 for real tokens and 0 for
+            padding - confirm against the tokenizer in use).
+
+    Returns:
+        The mask-weighted sum of the token embeddings over dimension 1, divided by
+        the summed mask along that same dimension.
+    """
+    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+    # Add a trailing axis and expand so the mask lines up element-wise with the embeddings.
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    # Clamp the denominator so an all-zero mask cannot cause a division by zero.
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+# This function sort the given threads based on their total similarity with the given keywords
+def sort_by_similarity(thread_objects, keywords):
+    # Initialize tokenizer + model.
+    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+    # Transform the generator to a list of Submission Objects, so we can sort later based on context similarity to
+    # keywords
+    thread_objects = list(thread_objects)
+
+    threads_sentences = []
+    for i, thread in enumerate(thread_objects):
+        threads_sentences.append(' '.join([thread.title, thread.selftext]))
+
+    # Threads inference
+    encoded_threads = tokenizer(threads_sentences, padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        threads_embeddings = model(**encoded_threads)
+    threads_embeddings = mean_pooling(threads_embeddings, encoded_threads['attention_mask'])
+
+    # Keywords inference
+    encoded_keywords = tokenizer(keywords, padding=True, truncation=True, return_tensors='pt')
+    with torch.no_grad():
+        keywords_embeddings = model(**encoded_keywords)
+    keywords_embeddings = mean_pooling(keywords_embeddings, encoded_keywords['attention_mask'])
+
+    # Compare every keyword w/ every thread embedding
+    threads_embeddings_tensor = torch.tensor(threads_embeddings)
+    total_scores = torch.zeros(threads_embeddings_tensor.shape[0])
+    cosine_similarity = torch.nn.CosineSimilarity()
+    for keyword_embedding in keywords_embeddings:
+        keyword_embedding = torch.tensor(keyword_embedding).repeat(threads_embeddings_tensor.shape[0], 1)
+        similarity = cosine_similarity(keyword_embedding, threads_embeddings_tensor)
+        total_scores += similarity
+
+    similarity_scores, indices = torch.sort(total_scores, descending=True)
+
+    threads_sentences = np.array(threads_sentences)[indices.numpy()]
+
+    thread_objects = np.array(thread_objects)[indices.numpy()].tolist()
+
+    #print('Similarity Thread Ranking')
+    #for i, thread in enumerate(thread_objects):
+    #    print(f'{i}) {threads_sentences[i]} score {similarity_scores[i]}')
+
+    return thread_objects, similarity_scores
--- a/utils/subreddit.py
+++ b/utils/subreddit.py
@ -3,9 +3,10 @@ from os.path import exists

 from utils import settings
 from utils.console import print_substep
+from utils.ai_methods import sort_by_similarity


-def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
+def get_subreddit_undone(submissions: list, subreddit, times_checked=0, similarity_scores=None):
    """_summary_

    Args:
@ -15,6 +16,11 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
    Returns:
        Any: The submission that has not been done
    """
+    # Second try of getting a valid Submission
+    if times_checked and settings.config["ai"]["ai_similarity_enabled"]:
+        print('Sorting based on similarity for a different date filter and thread limit..')
+        submissions = sort_by_similarity(submissions, keywords=settings.config["ai"]["ai_similarity_enabled"])
+
    # recursively checks if the top submission in the list was already done.
    if not exists("./video_creation/data/videos.json"):
        with open("./video_creation/data/videos.json", "w+") as f:
@ -23,7 +29,7 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
        "./video_creation/data/videos.json", "r", encoding="utf-8"
    ) as done_vids_raw:
        done_videos = json.load(done_vids_raw)
-    for submission in submissions:
+    for i, submission in enumerate(submissions):
        if already_done(done_videos, submission):
            continue
        if submission.over_18:
@ -45,6 +51,8 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
            continue
        if settings.config["settings"]["storymode"] and not submission.is_self:
            continue
+        if similarity_scores is not None:
+            return submission, similarity_scores[i].item()
        return submission
    print("all submissions have been done going by top submission order")
    VALID_TIME_FILTERS = [