from tqdm import tqdm
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once at import time so every call reuses it.
model = SentenceTransformer('all-MiniLM-L6-v2')


def get_documents_from_scores(scores):
    """Extract the document indices from a list of (index, score) pairs.

    Args:
        scores: iterable of (document_index, similarity_score) tuples.

    Returns:
        list of the document indices, in the same order as ``scores``.
    """
    return [idx for idx, _ in scores]


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1, v2: array-likes of equal length.

    Returns:
        Cosine similarity in [-1, 1], or 0 if either vector has zero norm
        (avoids a division by zero).
    """
    v1 = np.array(v1)
    v2 = np.array(v2)
    # Hoist the norms: the original computed each one twice.
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return np.dot(v1, v2) / (norm1 * norm2)


def get_open_source_embeddings(documents):
    """Encode each document with the sentence-transformer model.

    Args:
        documents: iterable of document strings.

    Returns:
        list of embedding vectors, one per document (progress shown via tqdm).
    """
    return [model.encode(document) for document in tqdm(documents)]


def open_source_rankings(query, document_embeddings, k):
    """Rank pre-computed document embeddings against a query by cosine similarity.

    Args:
        query: query string to encode.
        document_embeddings: sequence of document embedding vectors.
        k: number of top results to keep.

    Returns:
        (rankings, scores): ``rankings`` is the list of top-k document
        indices, ``scores`` the corresponding (index, similarity) pairs,
        both sorted by descending similarity.
    """
    query_embedding = model.encode(query)
    scores = [
        (idx, cosine_similarity(query_embedding, embedding))
        for idx, embedding in enumerate(document_embeddings)
    ]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    scores = scores[:k]
    rankings = get_documents_from_scores(scores)
    return rankings, scores


def open_source_pipeline(query,
                         documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl",
                         ids_path="Retrieval/savedModels/ids.pkl",
                         k=100):
    """End-to-end retrieval: load cached embeddings/ids and rank against a query.

    Args:
        query: query string.
        documents_embeddings_path: joblib pickle of document embeddings.
        ids_path: joblib pickle mapping embedding index -> document id.
        k: number of top documents to return.

    Returns:
        list of the top-k document ids, most similar first.
    """
    document_embeddings = joblib.load(documents_embeddings_path)
    ids = joblib.load(ids_path)
    rankings, scores = open_source_rankings(query, document_embeddings, k)
    # Plain index lookups are O(1); a tqdm progress bar here was pure overhead.
    return [ids[ranking] for ranking in rankings]