raghuv-aditya's picture
Upload 18 files
e46379d verified
from tqdm import tqdm
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer, util
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')
def get_documents_from_scores(scores):
rankings = []
for score in scores:
rankings.append(score[0])
return rankings
def cosine_similarity(v1, v2):
v1 = np.array(v1)
v2 = np.array(v2)
if(np.linalg.norm(v1) != 0 and np.linalg.norm(v2) != 0):
sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
else:
sim = 0
return sim
def get_open_source_embeddings(documents):
documents_embeddings = []
for document in tqdm(documents):
documents_embeddings.append(model.encode(document))
return documents_embeddings
def open_source_rankings(query, document_embeddings, k):
query_embedding = model.encode(query)
scores = []
for idx, embedding in enumerate(document_embeddings):
scores.append((idx, cosine_similarity(query_embedding, embedding)))
scores = sorted(scores, key=lambda x: x[1], reverse=True)
scores = scores[:k]
rankings = get_documents_from_scores(scores)
return rankings, scores
def open_source_pipeline(query, documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
document_embeddings = joblib.load(documents_embeddings_path)
ids = joblib.load(ids_path)
rankings, scores = open_source_rankings(query, document_embeddings, k)
rankings2 = []
for ranking in tqdm(rankings):
rankings2.append(ids[ranking])
return rankings2