# raghuv-aditya's picture
# Upload 18 files
# e46379d verified
# raw
# history blame
# 2.5 kB
import numpy as np
from collections import defaultdict
from gensim.utils import simple_preprocess
from tqdm import tqdm
import joblib
def get_tf_query(query):
    """Compute the relative frequency of every token in a tokenized query.

    Args:
        query: Sequence of tokens (must support ``len`` and iteration).

    Returns:
        A ``defaultdict`` whose missing keys read as 0, mapping each distinct
        token to its occurrence count divided by the total number of tokens.
        An empty query yields an empty mapping.
    """
    total = len(query)
    frequencies = defaultdict(lambda: 0)
    for token in query:
        frequencies[token] += 1
    # Normalise in place; only values change, so iterating the keys is safe.
    for token in frequencies:
        frequencies[token] /= total
    return frequencies
def get_tf_idf_query(query, idf_dict):
    """Build the TF-IDF weights of a raw query string.

    The query is tokenized with gensim's ``simple_preprocess``, term
    frequencies are computed with ``get_tf_query`` and each one is multiplied
    by the token's IDF.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to its inverse document frequency.

    Returns:
        A ``defaultdict`` whose missing keys read as 0, mapping each query
        token to ``tf * idf``. Tokens that ``idf_dict`` cannot resolve get a
        weight of 0 instead of raising ``KeyError``.
    """
    tokens = simple_preprocess(query)
    tf_query = get_tf_query(tokens)
    tf_idf_query = defaultdict(lambda: 0)
    for token, term_frequency in tf_query.items():
        try:
            idf = idf_dict[token]
        except KeyError:
            # Out-of-vocabulary token: it carries no weight rather than
            # aborting the whole query.
            idf = 0
        tf_idf_query[token] = term_frequency * idf
    return tf_idf_query
def get_tf_idf_vector(tf_idf_instance, vocab):
    """Lay out TF-IDF weights as a list following the vocabulary's key order.

    Args:
        tf_idf_instance: Mapping from token to weight. It is indexed with
            ``[]`` for every vocabulary key, so it must either contain all of
            them or supply its own default (e.g. a ``defaultdict``).
        vocab: Mapping whose keys define the position of each token.

    Returns:
        A list with one weight per vocabulary key, in ``vocab.keys()`` order.
    """
    return [tf_idf_instance[term] for term in vocab.keys()]
def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
    """Rank the rows of ``document_matrix`` by cosine similarity to a query.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to inverse document frequency.
        tf_idf_dict: Unused; kept so existing callers keep working.
        vocab: Mapping whose key order defines the query vector layout.
        document_matrix: 2-D array with one row per document. Its columns are
            assumed to follow the same order as ``vocab`` -- TODO confirm
            against the code that builds the matrix.
        k: Maximum number of results to return.

    Returns:
        A tuple ``(rankings, scores)``: ``rankings`` is an array of row indices
        sorted by decreasing similarity (at most ``k`` of them) and ``scores``
        is the list of matching cosine similarities.
    """
    query_weights = get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)
    query_vector = np.reshape(np.array(query_weights), (1, -1))

    dot_products = document_matrix @ query_vector.T
    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)
    denominators = doc_norms * query_norm

    # A zero-norm query (no known tokens) or an all-zero document row would
    # give 0/0 = NaN. argsort places NaN last, so after the reversal below
    # those rows would be ranked first. Score them 0 instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        cosine_similarities = dot_products / denominators
    cosine_similarities = np.where(
        denominators > 0, cosine_similarities, 0.0
    ).flatten()

    rankings = np.argsort(cosine_similarities)[::-1][:k]
    scores = [cosine_similarities[rank] for rank in rankings]
    return rankings, scores
def tf_idf_pipeline(query, idf_dict_path="Retrieval/savedModels/idf.pkl", tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl", vocab_path="Retrieval/savedModels/vocab.pkl", document_matrix_path="Retrieval/savedModels/document_matrix.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Load the saved TF-IDF artifacts and return the ids of the top matches.

    Each artifact is read with ``joblib.load`` and a progress message is
    printed after every load. The query is ranked with ``tf_idf_rankings`` and
    the resulting row indices are translated through ``ids``.

    NOTE(review): ``joblib.load`` unpickles its input, so these paths must
    point at trusted files only.

    Args:
        query: Raw query text.
        idf_dict_path: Path of the saved IDF mapping.
        tf_idf_dict_path: Path of the saved per-document TF-IDF mapping.
        vocab_path: Path of the saved vocabulary.
        document_matrix_path: Path of the saved document matrix.
        ids_path: Path of the saved container indexed by matrix row.
        k: Maximum number of results to return.

    Returns:
        A list with the ``ids`` entry of each ranked row, best match first.
    """
    artifact_steps = (
        (idf_dict_path, "idf loaded..."),
        (tf_idf_dict_path, "tf-idf loaded..."),
        (vocab_path, "vocab loaded..."),
        (document_matrix_path, "document_matrix loaded..."),
        (ids_path, "ids loaded"),
    )
    loaded = []
    for path, message in artifact_steps:
        loaded.append(joblib.load(path))
        print(message)
    idf_dict, tf_idf_dict, vocab, document_matrix, ids = loaded

    top_rows, _ = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k)
    return [ids[row] for row in tqdm(top_rows)]