File size: 2,504 Bytes
e46379d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
from collections import defaultdict
from gensim.utils import simple_preprocess
from tqdm import tqdm
import joblib


def get_tf_query(query):
    """Return the term frequency of every token in ``query``.

    Args:
        query: Sequence of tokens (must support ``len`` and iteration).

    Returns:
        A ``defaultdict`` mapping each distinct token to its number of
        occurrences divided by ``len(query)``. Tokens that are not present
        read back as ``0``.
    """
    total = len(query)

    # First pass: raw occurrence counts, in order of first appearance.
    occurrences = defaultdict(lambda: 0)
    for token in query:
        occurrences[token] += 1

    # Second pass: normalise each count by the query length.
    frequencies = defaultdict(lambda: 0)
    for token, count in occurrences.items():
        frequencies[token] = count / total
    return frequencies

def get_tf_idf_query(query, idf_dict):
    """Compute tf-idf weights for the tokens of a raw query string.

    The query is tokenised with gensim's ``simple_preprocess``; each token's
    term frequency (from ``get_tf_query``) is multiplied by its entry in
    ``idf_dict``.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to inverse document frequency.

    Returns:
        A ``defaultdict`` mapping token to tf-idf weight. Tokens that are not
        present read back as ``0``.
    """
    tokens = simple_preprocess(query)
    tf_query = get_tf_query(tokens)
    tf_idf_query = defaultdict(lambda: 0)
    for token, term_frequency in tf_query.items():
        try:
            idf = idf_dict[token]
        except KeyError:
            # A query word never seen in the corpus has no idf entry. It cannot
            # match any document, so leave its weight at the default 0 instead
            # of aborting the whole query.
            continue
        tf_idf_query[token] = term_frequency * idf
    return tf_idf_query
    
def get_tf_idf_vector(tf_idf_instance, vocab):
    """Lay out tf-idf weights as a list ordered like the keys of ``vocab``.

    Args:
        tf_idf_instance: Mapping from token to weight; it is indexed with every
            key of ``vocab``.
        vocab: Mapping whose key order defines the order of the output.

    Returns:
        A list with one weight per key of ``vocab``.
    """
    return [tf_idf_instance[term] for term in vocab.keys()]


def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
    """Rank documents by cosine similarity between their rows and the query.

    Args:
        query: Raw query text; weighted via ``get_tf_idf_query``.
        idf_dict: Mapping from token to inverse document frequency.
        tf_idf_dict: Not used by this function; kept so existing callers keep
            working.
        vocab: Mapping whose key order defines the components of the query
            vector; assumed to match the column order of ``document_matrix``
            -- TODO confirm against the code that builds the matrix.
        document_matrix: 2-D array with one row per document.
        k: Maximum number of results to return.

    Returns:
        A ``(rankings, scores)`` pair. ``rankings`` is an array of row indices
        into ``document_matrix`` ordered from most to least similar, truncated
        to ``k`` entries; ``scores`` is a list with the cosine similarity of
        each returned row.
    """
    query_weights = get_tf_idf_query(query, idf_dict)
    query_vector = np.reshape(np.array(get_tf_idf_vector(query_weights, vocab)), (1, -1))
    dot_products = document_matrix @ query_vector.T

    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)
    denominators = doc_norms * query_norm

    # A zero-norm query or document row would give 0/0 = NaN. argsort places
    # NaN last, so after reversing those rows would be ranked first. Score
    # them as 0.0 (no similarity) instead.
    zero_norm = denominators == 0
    safe_denominators = np.where(zero_norm, 1.0, denominators)
    cosine_similarities = np.where(zero_norm, 0.0, dot_products / safe_denominators)
    cosine_similarities = cosine_similarities.flatten()

    # Ascending argsort reversed gives best-first order; keep the top k.
    rankings = np.argsort(cosine_similarities)[::-1][:k]
    scores = [cosine_similarities[rank] for rank in rankings]
    return rankings, scores

def tf_idf_pipeline(
    query,
    idf_dict_path="Retrieval/savedModels/idf.pkl",
    tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl",
    vocab_path="Retrieval/savedModels/vocab.pkl",
    document_matrix_path="Retrieval/savedModels/document_matrix.pkl",
    ids_path="Retrieval/savedModels/ids.pkl",
    k=100,
):
    """Load the saved tf-idf artifacts and return the ids of the top ``k`` hits.

    Args:
        query: Raw query text.
        idf_dict_path: Path of the saved idf mapping.
        tf_idf_dict_path: Path of the saved tf-idf mapping.
        vocab_path: Path of the saved vocabulary.
        document_matrix_path: Path of the saved document matrix.
        ids_path: Path of the saved id lookup, indexed by matrix row number.
        k: Maximum number of results to return.

    Returns:
        A list of entries from the loaded ids, ordered from most to least
        similar to the query.
    """
    # NOTE(review): joblib.load unpickles its input; only point these paths at
    # trusted files.
    load_plan = (
        (idf_dict_path, "idf loaded..."),
        (tf_idf_dict_path, "tf-idf loaded..."),
        (vocab_path, "vocab loaded..."),
        (document_matrix_path, "document_matrix loaded..."),
        (ids_path, "ids loaded"),
    )
    loaded = []
    for path, message in load_plan:
        loaded.append(joblib.load(path))
        print(message)
    idf_dict, tf_idf_dict, vocab, document_matrix, ids = loaded

    rankings, _scores = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k)
    # Translate matrix row indices into the stored document ids.
    return [ids[row] for row in tqdm(rankings)]