import numpy as np from collections import defaultdict from gensim.utils import simple_preprocess from tqdm import tqdm import joblib def get_tf_query(query): k = len(query) tf_query = defaultdict(lambda: 0) for i in range(k): tf_query[query[i]] += 1 for token in tf_query.keys(): tf_query[token] /= k return tf_query def get_tf_idf_query(query, idf_dict): query = simple_preprocess(query) tf_idf_query = defaultdict(lambda: 0) tf_query = get_tf_query(query) for token in tf_query.keys(): tf_idf_query[token] = tf_query[token] * idf_dict[token] return tf_idf_query def get_tf_idf_vector(tf_idf_instance, vocab): temp = [] for key in vocab.keys(): temp.append(tf_idf_instance[key]) return temp def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k): query_vector = np.reshape(np.array(get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)), (1, -1)) scores = [] dot_products = document_matrix @ query_vector.T query_norm = np.linalg.norm(query_vector) doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True) cosine_similarities = dot_products / (doc_norms * query_norm) cosine_similarities = cosine_similarities.flatten() rankings = np.argsort(cosine_similarities)[::-1] rankings = rankings[:k] scores = [] for rank in rankings: scores.append(cosine_similarities[rank]) # scores = sorted(cosine_similarities, key=lambda x: x[1], reverse=True) # scores = scores[:k] # rankings = get_documents_from_scores(scores) return rankings, scores def tf_idf_pipeline(query, idf_dict_path="Retrieval/savedModels/idf.pkl", tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl", vocab_path="Retrieval/savedModels/vocab.pkl", document_matrix_path="Retrieval/savedModels/document_matrix.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100): idf_dict = joblib.load(idf_dict_path) print("idf loaded...") tf_idf_dict = joblib.load(tf_idf_dict_path) print("tf-idf loaded...") vocab = joblib.load(vocab_path) print("vocab loaded...") document_matrix = joblib.load(document_matrix_path) print("document_matrix loaded...") ids = joblib.load(ids_path) print("ids loaded") rankings, scores = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k) rankings2 = [] for ranking in tqdm(rankings): rankings2.append(ids[ranking]) return rankings2