Merge branch 'master' into develop

# Conflicts:
#	main.py
#	reddit/subreddit.py
#	requirements.txt
#	utils/subreddit.py
pull/1410/head
Simon 3 years ago
commit 5e7c05977f

@ -1,15 +1,18 @@
import re import re
from prawcore.exceptions import ResponseException
from utils import settings
import praw import praw
from praw.models import MoreComments from praw.models import MoreComments
from prawcore.exceptions import ResponseException from prawcore.exceptions import ResponseException
from utils import settings
from utils.console import print_step, print_substep from utils.console import print_step, print_substep
from utils.subreddit import get_subreddit_undone from utils.subreddit import get_subreddit_undone
from utils.videos import check_done from utils.videos import check_done
from utils.voice import sanitize_text from utils.voice import sanitize_text
from utils.posttextparser import posttextparser from utils.posttextparser import posttextparser
from utils.ai_methods import sort_by_similarity
def get_subreddit_threads(POST_ID: str): def get_subreddit_threads(POST_ID: str):
@ -50,6 +53,7 @@ def get_subreddit_threads(POST_ID: str):
# Ask user for subreddit input # Ask user for subreddit input
print_step("Getting subreddit threads...") print_step("Getting subreddit threads...")
similarity_score = 0
if not settings.config["reddit"]["thread"][ if not settings.config["reddit"]["thread"][
"subreddit" "subreddit"
]: # note to user. you can have multiple subreddits via reddit.subreddit("redditdev+learnpython") ]: # note to user. you can have multiple subreddits via reddit.subreddit("redditdev+learnpython")
@ -76,6 +80,20 @@ def get_subreddit_threads(POST_ID: str):
if POST_ID: # would only be called if there are multiple queued posts if POST_ID: # would only be called if there are multiple queued posts
submission = reddit.submission(id=POST_ID) submission = reddit.submission(id=POST_ID)
elif (
settings.config["reddit"]["thread"]["post_id"]
and len(str(settings.config["reddit"]["thread"]["post_id"]).split("+")) == 1
):
submission = reddit.submission(id=settings.config["reddit"]["thread"]["post_id"])
elif settings.config["ai"]["ai_similarity_enabled"]: # ai sorting based on comparison
threads = subreddit.hot(limit=50)
keywords = settings.config["ai"]["ai_similarity_keywords"].split(',')
keywords = [keyword.strip() for keyword in keywords]
# Reformat the keywords for printing
keywords_print = ", ".join(keywords)
print(f'Sorting threads by similarity to the given keywords: {keywords_print}')
threads, similarity_scores = sort_by_similarity(threads, keywords)
submission, similarity_score = get_subreddit_undone(threads, subreddit, similarity_scores=similarity_scores)
else: else:
threads = subreddit.hot(limit=25) threads = subreddit.hot(limit=25)
submission = get_subreddit_undone(threads, subreddit) submission = get_subreddit_undone(threads, subreddit)
@ -99,7 +117,7 @@ def get_subreddit_threads(POST_ID: str):
exit() exit()
submission = check_done(submission) # double-checking submission = check_done(submission) # double-checking
upvotes = submission.score upvotes = submission.score
ratio = submission.upvote_ratio * 100 ratio = submission.upvote_ratio * 100
num_comments = submission.num_comments num_comments = submission.num_comments
@ -110,6 +128,8 @@ def get_subreddit_threads(POST_ID: str):
print_substep(f"Thread has {upvotes} upvotes", style="bold blue") print_substep(f"Thread has {upvotes} upvotes", style="bold blue")
print_substep(f"Thread has a upvote ratio of {ratio}%", style="bold blue") print_substep(f"Thread has a upvote ratio of {ratio}%", style="bold blue")
print_substep(f"Thread has {num_comments} comments", style="bold blue") print_substep(f"Thread has {num_comments} comments", style="bold blue")
if similarity_score:
print_substep(f"Thread has a similarity score up to {round(similarity_score * 100)}%", style="bold blue")
content["thread_url"] = threadurl content["thread_url"] = threadurl
content["thread_title"] = submission.title content["thread_title"] = submission.title

@ -16,4 +16,6 @@ tomlkit==0.11.4
Flask==2.2.2 Flask==2.2.2
clean-text==0.6.0 clean-text==0.6.0
unidecode==1.3.2 unidecode==1.3.2
spacy==3.4.1 spacy==3.4.1
torch==1.12.1
transformers==4.25.1

@ -16,6 +16,9 @@ post_lang = { default = "", optional = true, explanation = "The language you wou
min_comments = { default = 20, optional = false, nmin = 10, type = "int", explanation = "The minimum number of comments a post should have to be included. default is 20", example = 29, oob_error = "the minimum number of comments should be between 15 and 999999" } min_comments = { default = 20, optional = false, nmin = 10, type = "int", explanation = "The minimum number of comments a post should have to be included. default is 20", example = 29, oob_error = "the minimum number of comments should be between 15 and 999999" }
#post_url = { optional = true, default = "", regex = "^https:\\/\\/www\\.reddit\\.com\\/r\\/[a-zA-Z0-9]+\\/comments\\/[a-zA-Z0-9]+\\/[a-zA-Z0-9_]+\\/$", explanation = "Not working currently Use if you want to use a specific post.", example = "https://www.reddit.com/r/buildapc/comments/yzh07p/have_you_switched_to_windows_11/" } #post_url = { optional = true, default = "", regex = "^https:\\/\\/www\\.reddit\\.com\\/r\\/[a-zA-Z0-9]+\\/comments\\/[a-zA-Z0-9]+\\/[a-zA-Z0-9_]+\\/$", explanation = "Not working currently Use if you want to use a specific post.", example = "https://www.reddit.com/r/buildapc/comments/yzh07p/have_you_switched_to_windows_11/" }
[ai]
ai_similarity_enabled = {optional = true, option = [true, false], default = false, type = "bool", explanation = "Threads read from Reddit are sorted based on their similarity to the keywords given below"}
ai_similarity_keywords = {optional = true, type="str", example= 'Elon Musk, Twitter, Stocks', explanation = "Every keyword or even sentence, separated by a comma, is used to sort the reddit threads based on similarity"}
[settings] [settings]
allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Whether to allow NSFW content, True or False" } allow_nsfw = { optional = false, type = "bool", default = false, example = false, options = [true, false, ], explanation = "Whether to allow NSFW content, True or False" }

@ -0,0 +1,58 @@
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
# Mean pooling: average the token embeddings, weighting each token by its attention mask
def mean_pooling(model_output, attention_mask):
    """Collapse per-token embeddings into one embedding per input sequence.

    Args:
        model_output: Model output whose first element holds the token
            embeddings.
        attention_mask: Mask tensor with one entry per token; it is expanded
            over the embedding dimension and used as the averaging weight.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the summed mask (clamped to at least 1e-9 so that a fully
        masked row does not divide by zero).
    """
    token_embeddings = model_output[0]
    # Broadcast the mask to the embedding shape so masked tokens contribute zero.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts
# This function sorts the given threads based on their total similarity with the given keywords
def sort_by_similarity(thread_objects, keywords):
    """Rank threads by their summed cosine similarity to the given keywords.

    Each thread is represented by its title and selftext joined with a space.
    Threads and keywords are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model (mean-pooled token
    embeddings), and every thread's score is the sum of its cosine similarity
    to each keyword, so the score can exceed 1 when several keywords are given.

    Args:
        thread_objects: Iterable of thread objects exposing ``title`` and
            ``selftext`` attributes; it is consumed into a list.
        keywords: Keyword (or sentence) strings to compare against. A single
            string is treated as one keyword.

    Returns:
        tuple: ``(threads, scores)`` where ``threads`` is a list of the thread
        objects ordered from most to least similar and ``scores`` is a 1-D
        tensor of the matching total scores in the same order. If there are no
        threads or no keywords, the threads are returned in their incoming
        order together with all-zero scores.
    """
    # Materialize the (possibly lazy) listing so it can be indexed and reordered.
    thread_objects = list(thread_objects)
    # A bare string would otherwise be split into single characters by list().
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords = list(keywords)

    # Nothing to rank, or nothing to rank against: skip the expensive model
    # load and avoid handing an empty batch to the tokenizer.
    if not thread_objects or not keywords:
        return thread_objects, torch.zeros(len(thread_objects))

    # Initialize tokenizer + model.
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    threads_sentences = [' '.join([thread.title, thread.selftext]) for thread in thread_objects]

    # Threads inference
    encoded_threads = tokenizer(threads_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        threads_output = model(**encoded_threads)
        threads_embeddings = mean_pooling(threads_output, encoded_threads['attention_mask'])

    # Keywords inference
    encoded_keywords = tokenizer(keywords, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        keywords_output = model(**encoded_keywords)
        keywords_embeddings = mean_pooling(keywords_output, encoded_keywords['attention_mask'])

    # Compare every keyword w/ every thread embedding. The embeddings are
    # already tensors, so they are used directly instead of being re-wrapped
    # with torch.tensor(), and broadcasting replaces the explicit repeat().
    total_scores = torch.zeros(threads_embeddings.shape[0])
    for keyword_embedding in keywords_embeddings:
        total_scores += torch.nn.functional.cosine_similarity(
            keyword_embedding.unsqueeze(0), threads_embeddings, dim=1
        )

    similarity_scores, indices = torch.sort(total_scores, descending=True)
    # Reorder with plain list indexing; the thread objects never need to pass
    # through a NumPy array.
    thread_objects = [thread_objects[index] for index in indices.tolist()]

    return thread_objects, similarity_scores

@ -3,9 +3,10 @@ from os.path import exists
from utils import settings from utils import settings
from utils.console import print_substep from utils.console import print_substep
from utils.ai_methods import sort_by_similarity
def get_subreddit_undone(submissions: list, subreddit, times_checked=0): def get_subreddit_undone(submissions: list, subreddit, times_checked=0, similarity_scores=None):
"""_summary_ """_summary_
Args: Args:
@ -15,6 +16,11 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
Returns: Returns:
Any: The submission that has not been done Any: The submission that has not been done
""" """
# Second try of getting a valid Submission
if times_checked and settings.config["ai"]["ai_similarity_enabled"]:
print('Sorting based on similarity for a different date filter and thread limit..')
submissions = sort_by_similarity(submissions, keywords=settings.config["ai"]["ai_similarity_enabled"])
# recursively checks if the top submission in the list was already done. # recursively checks if the top submission in the list was already done.
if not exists("./video_creation/data/videos.json"): if not exists("./video_creation/data/videos.json"):
with open("./video_creation/data/videos.json", "w+") as f: with open("./video_creation/data/videos.json", "w+") as f:
@ -23,7 +29,7 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
"./video_creation/data/videos.json", "r", encoding="utf-8" "./video_creation/data/videos.json", "r", encoding="utf-8"
) as done_vids_raw: ) as done_vids_raw:
done_videos = json.load(done_vids_raw) done_videos = json.load(done_vids_raw)
for submission in submissions: for i, submission in enumerate(submissions):
if already_done(done_videos, submission): if already_done(done_videos, submission):
continue continue
if submission.over_18: if submission.over_18:
@ -45,6 +51,8 @@ def get_subreddit_undone(submissions: list, subreddit, times_checked=0):
continue continue
if settings.config["settings"]["storymode"] and not submission.is_self: if settings.config["settings"]["storymode"] and not submission.is_self:
continue continue
if similarity_scores is not None:
return submission, similarity_scores[i].item()
return submission return submission
print("all submissions have been done going by top submission order") print("all submissions have been done going by top submission order")
VALID_TIME_FILTERS = [ VALID_TIME_FILTERS = [

Loading…
Cancel
Save