import pandas as pd
from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import (
    strip_tags, strip_numeric, strip_multiple_whitespaces,
    strip_punctuation, remove_stopwords, preprocess_string,
)

from re import sub
from typing import List
from functools import cache

# Lowercase the whole string.
transform_to_lower = lambda s: s.lower()
# Drop isolated single-character tokens; replace with a single space so
# that the neighbouring words are not glued together.
remove_single_char = lambda s: sub(r'\s+\w\s+', ' ', s)

cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char,
]

def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using the cleaning filters above.

    Args:
        docs (List[str]): A list of raw strings to be tokenized.

    Returns:
        List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.
    """
    return [preprocess_string(doc, cleaning_filters) for doc in docs]
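
# Illustrative sketch (an assumption, not part of the original file) of how
# the Dictionary / TfidfModel / SparseMatrixSimilarity artifacts loaded
# further below can be built from gensim_tokenizer output; `raw_texts` is a
# hypothetical list of raw document strings:
#
#   corpus_tokens = gensim_tokenizer(raw_texts)
#   dictionary = Dictionary(corpus_tokens)
#   bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus_tokens]
#   tfidf_model = TfidfModel(bow_corpus)
#   index = SparseMatrixSimilarity(tfidf_model[bow_corpus],
#                                  num_features=len(dictionary))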


def cleaning_pipe(document: str) -> List[str]:
    """
    Applies the series of cleaning steps to a single document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        List[str]: The processed tokens after applying the cleaning filters.
    """
    # Invoke gensim's preprocess_string with the filter set defined above
    processed_words = preprocess_string(document, cleaning_filters)
    return processed_words
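
# Hand-traced example of the cleaning pipeline (illustrative; the exact
# tokens depend on gensim's built-in stopword list):
#
#   cleaning_pipe("<p>The 3 Big Models of 2023!</p>")
#   -> ['big', 'models']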


def get_closest_n(dictionary: Dictionary, index: SparseMatrixSimilarity,
                  tfidf_model: TfidfModel, query: str, n: int):
    '''
    Retrieves the top matching documents as per cosine similarity
    between the TF-IDF vector of the query and all indexed documents.

    Args:
        dictionary (Dictionary): Gensim dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): Precomputed similarity index over the corpus.
        tfidf_model (TfidfModel): Fitted TF-IDF model.
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: Indices of the top n matching documents, best match first.
    '''
    # Clean and tokenize the query with the same pipeline used for the corpus
    query_document = cleaning_pipe(query)

    # Convert the query tokens to a bag-of-words representation
    query_bow = dictionary.doc2bow(query_document)

    # Cosine similarities between the TF-IDF query vector and every document
    sims = index[tfidf_model[query_bow]]

    # Indices of the n highest-scoring documents, in descending order
    top_idx = sims.argsort()[-n:][::-1]

    return top_idx


def get_recommendations_metadata(query: str, df: pd.DataFrame, n: int,
                                 dictionary: Dictionary, index: SparseMatrixSimilarity,
                                 tfidf_model: TfidfModel) -> pd.DataFrame:
    '''
    Retrieves metadata recommendations for a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing the corpus metadata,
            row-aligned with the similarity index.
        n (int): The number of recommendations to retrieve.
        dictionary (Dictionary): Gensim dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): Precomputed similarity index over the corpus.
        tfidf_model (TfidfModel): Fitted TF-IDF model.

    Returns:
        pd.DataFrame: The recommended metadata rows, reset with a new index.
    '''
    # Get the indices of the closest matching documents for the query
    recommendations_idxs = get_closest_n(dictionary, index, tfidf_model, query, n)

    # Select the matching metadata rows and reset the index
    recommendations_metadata = df.iloc[recommendations_idxs].reset_index(drop=True)

    return recommendations_metadata

# The loaders below are memoized with functools.cache, so each artifact is
# read from disk only once per path (e.g. across repeated app requests).

@cache
def load_arxiv_parquet(path: str) -> pd.DataFrame:
    """Load the arXiv metadata DataFrame from a parquet file."""
    return pd.read_parquet(path)


@cache
def load_dict(path: str) -> Dictionary:
    """Load a serialized gensim Dictionary."""
    return Dictionary.load(path)


@cache
def load_model(path: str) -> TfidfModel:
    """Load a serialized gensim TfidfModel."""
    return TfidfModel.load(path)


@cache
def load_sparse_matrix(path: str) -> SparseMatrixSimilarity:
    """Load a serialized gensim SparseMatrixSimilarity index."""
    return SparseMatrixSimilarity.load(path)
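

# Hedged end-to-end usage sketch. All file paths and the query string are
# hypothetical placeholders, not the project's actual artifact locations.
if __name__ == "__main__":
    df = load_arxiv_parquet("data/arxiv_metadata.parquet")  # hypothetical path
    dictionary = load_dict("models/corpus.dict")            # hypothetical path
    tfidf_model = load_model("models/tfidf.model")          # hypothetical path
    index = load_sparse_matrix("models/similarity.index")   # hypothetical path

    recommendations = get_recommendations_metadata(
        query="transformer architectures for language modelling",
        df=df,
        n=5,
        dictionary=dictionary,
        index=index,
        tfidf_model=tfidf_model,
    )
    print(recommendations)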