import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Sum only the unmasked token vectors and divide by the real token count,
    # clamping the denominator to avoid division by zero for fully masked rows
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

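# Illustrative sketch (not part of the original module): a tiny worked example
# of mean_pooling with made-up values, showing that padded positions are ignored.
def _mean_pooling_demo():
    # One sentence of 3 tokens with hidden size 2; the third token is padding.
    token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
    attention_mask = torch.tensor([[1, 1, 0]])
    # mean_pooling indexes model_output[0], so a 1-tuple stands in for the model output here.
    pooled = mean_pooling((token_embeddings,), attention_mask)
    print(pooled)  # tensor([[2., 3.]]) - the mean of the two real tokens only
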
# This function sorts the given threads based on their total similarity with the given keywords
def sort_by_similarity(thread_objects, keywords):
    # Initialize tokenizer + model.
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Transform the generator into a list of Submission objects, so we can sort
    # later based on context similarity to the keywords
    thread_objects = list(thread_objects)

    # Represent each thread by its title and body joined into one sentence
    threads_sentences = []
    for thread in thread_objects:
        threads_sentences.append(' '.join([thread.title, thread.selftext]))

    # Threads inference: tokenize, run the model without gradients, then mean-pool
    encoded_threads = tokenizer(threads_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        threads_embeddings = model(**encoded_threads)
    threads_embeddings = mean_pooling(threads_embeddings, encoded_threads['attention_mask'])

    # Keywords inference: same pipeline, producing one embedding per keyword
    encoded_keywords = tokenizer(keywords, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        keywords_embeddings = model(**encoded_keywords)
    keywords_embeddings = mean_pooling(keywords_embeddings, encoded_keywords['attention_mask'])

    # Compare every keyword embedding with every thread embedding and
    # accumulate the cosine similarities into a total score per thread
    threads_embeddings_tensor = threads_embeddings.detach().clone()  # already a tensor; avoids the torch.tensor(tensor) copy warning
    total_scores = torch.zeros(threads_embeddings_tensor.shape[0])
    cosine_similarity = torch.nn.CosineSimilarity()
    for keyword_embedding in keywords_embeddings:
        # Tile the keyword embedding so it lines up with every thread row
        keyword_embedding = keyword_embedding.repeat(threads_embeddings_tensor.shape[0], 1)
        similarity = cosine_similarity(keyword_embedding, threads_embeddings_tensor)
        total_scores += similarity

    # Rank threads from most to least similar
    similarity_scores, indices = torch.sort(total_scores, descending=True)

    threads_sentences = np.array(threads_sentences)[indices.numpy()]
    thread_objects = np.array(thread_objects)[indices.numpy()].tolist()

    # print('Similarity Thread Ranking')
    # for i, thread in enumerate(thread_objects):
    #     print(f'{i}) {threads_sentences[i]} score {similarity_scores[i]}')

    return thread_objects, similarity_scores
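
# Illustrative usage sketch (not part of the original file): `SimpleNamespace`
# stands in here for the PRAW Submission objects the function normally receives;
# the only attributes used are `.title` and `.selftext`.
if __name__ == '__main__':
    from types import SimpleNamespace

    threads = [
        SimpleNamespace(title='How do I train a model?', selftext='Looking for PyTorch tips.'),
        SimpleNamespace(title='Best pizza in town', selftext='Any recommendations?'),
    ]
    ranked, scores = sort_by_similarity(threads, ['machine learning'])
    for thread, score in zip(ranked, scores):
        print(f'{score.item():.3f}  {thread.title}')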