import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel


# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    # Sum the embeddings at real-token positions and divide by the number of
    # real tokens (clamped to avoid division by zero).
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


# Sorts the given threads by their total similarity to the given keywords
def sort_by_similarity(thread_objects, keywords):
    # Initialize tokenizer + model.
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    # Materialize the generator into a list of Submission objects, so we can
    # sort later based on contextual similarity to the keywords.
    thread_objects = list(thread_objects)

    # Build one sentence per thread from its title and body text.
    threads_sentences = []
    for thread in thread_objects:
        threads_sentences.append(" ".join([thread.title, thread.selftext]))

    # Threads inference
    encoded_threads = tokenizer(
        threads_sentences, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        threads_embeddings = model(**encoded_threads)
    threads_embeddings = mean_pooling(
        threads_embeddings, encoded_threads["attention_mask"]
    )

    # Keywords inference
    encoded_keywords = tokenizer(
        keywords, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        keywords_embeddings = model(**encoded_keywords)
    keywords_embeddings = mean_pooling(
        keywords_embeddings, encoded_keywords["attention_mask"]
    )

    # Compare every keyword embedding with every thread embedding and
    # accumulate per-thread scores. mean_pooling already returns tensors,
    # so no re-wrapping with torch.tensor() is needed.
    total_scores = torch.zeros(threads_embeddings.shape[0])
    cosine_similarity = torch.nn.CosineSimilarity(dim=1)
    for keyword_embedding in keywords_embeddings:
        # Tile the keyword embedding so it lines up with every thread row.
        repeated = keyword_embedding.repeat(threads_embeddings.shape[0], 1)
        total_scores += cosine_similarity(repeated, threads_embeddings)

    # Reorder threads (and their sentences) from most to least similar.
    similarity_scores, indices = torch.sort(total_scores, descending=True)
    threads_sentences = np.array(threads_sentences)[indices.numpy()]
    thread_objects = np.array(thread_objects, dtype=object)[indices.numpy()].tolist()

    # print("Similarity Thread Ranking")
    # for i, thread in enumerate(thread_objects):
    #     print(f"{i}) {threads_sentences[i]} score {similarity_scores[i]}")

    return thread_objects, similarity_scores
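

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of calling sort_by_similarity. The SimpleNamespace
# objects below are hypothetical stand-ins for PRAW Submission objects:
# anything exposing .title and .selftext attributes will work.
if __name__ == "__main__":
    from types import SimpleNamespace

    threads = [
        SimpleNamespace(
            title="Best budget GPU for 1080p gaming",
            selftext="Looking for a cheap graphics card recommendation.",
        ),
        SimpleNamespace(
            title="Sourdough starter keeps dying",
            selftext="Any tips for keeping a starter alive past a week?",
        ),
        SimpleNamespace(
            title="PyTorch vs TensorFlow",
            selftext="Which framework is better suited for research work?",
        ),
    ]
    keywords = ["deep learning", "graphics card"]

    # Pass an iterator to show that generators are accepted as input.
    ranked_threads, scores = sort_by_similarity(iter(threads), keywords)
    for thread, score in zip(ranked_threads, scores):
        print(f"{score.item():.3f}  {thread.title}")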