import pandas as pd

from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import (
    strip_tags,
    strip_numeric,
    strip_multiple_whitespaces,
    strip_punctuation,
    remove_stopwords,
    preprocess_string,
)

from re import sub
from typing import List
from functools import cache


# Helper filters that gensim does not ship: lowercase the text, and drop
# isolated single characters left behind by punctuation stripping. The
# substitution uses ' ' (not '') so the words on either side of a removed
# character are not glued together.
transform_to_lower = lambda s: s.lower()
remove_single_char = lambda s: sub(r'\s+\w{1}\s+', ' ', s)

# Order matters: markup, numbers and punctuation are stripped first, then the
# text is lowercased before stop words and single characters are removed.
cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char,
]
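
# For instance (a hand-checked example, not from the original source):
#   preprocess_string("The 3 quick <b>foxes</b>!", cleaning_filters)
#   -> ['quick', 'foxes']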


def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using the cleaning filters above.

    Args:
        docs (List[str]): A list of raw document strings.

    Returns:
        List[List[str]]: One list of tokens per input document.
    """
    tokenized_docs = list()
    for doc in docs:
        processed_words = preprocess_string(doc, cleaning_filters)
        tokenized_docs.append(processed_words)

    return tokenized_docs


def cleaning_pipe(document: str) -> List[str]:
    """
    Applies the cleaning filters to a single document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        List[str]: The processed tokens after applying the cleaning filters.
    """
    processed_words = preprocess_string(document, cleaning_filters)
    return processed_words
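

# A minimal sketch of how the artifacts consumed below (dictionary, TF-IDF
# model, similarity index) could be built from raw documents. This helper is
# illustrative, not part of the original pipeline; its name is hypothetical.
def build_search_artifacts(raw_docs: List[str]):
    tokenized = gensim_tokenizer(raw_docs)
    dictionary = Dictionary(tokenized)
    bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    tfidf_model = TfidfModel(bow_corpus)
    index = SparseMatrixSimilarity(tfidf_model[bow_corpus],
                                   num_features=len(dictionary))
    return dictionary, tfidf_model, index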


def get_closest_n(dictionary: Dictionary, index: SparseMatrixSimilarity,
                  tfidf_model: TfidfModel, query: str, n: int):
    '''
    Retrieves the top n matching documents by cosine similarity
    between the TF-IDF vector of the query and all indexed documents.

    Args:
        dictionary (Dictionary): The dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): The precomputed similarity index.
        tfidf_model (TfidfModel): The fitted TF-IDF model.
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: Indices of the top matching documents, best first.
    '''
    # Clean the query with the same pipeline used for the corpus.
    query_document = cleaning_pipe(query)

    # Convert the tokens to a bag-of-words vector over the shared dictionary.
    query_bow = dictionary.doc2bow(query_document)

    # Cosine similarity of the query's TF-IDF vector against every document.
    sims = index[tfidf_model[query_bow]]

    # Indices of the n largest similarities, in descending order.
    top_idx = sims.argsort()[-n:][::-1]

    return top_idx


def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int,
                                dictionary: Dictionary, index: SparseMatrixSimilarity,
                                tfidf_model: TfidfModel) -> pd.DataFrame:
    '''
    Retrieves metadata recommendations for a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing the documents' metadata.
        n (int): The number of recommendations to retrieve.
        dictionary (Dictionary): The dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): The precomputed similarity index.
        tfidf_model (TfidfModel): The fitted TF-IDF model.

    Returns:
        pd.DataFrame: The recommended rows of df, re-indexed from 0.
    '''
    recommendations_idxs = get_closest_n(dictionary, index, tfidf_model, query, n)

    # Positional lookup: the similarity index rows must align with df's rows.
    recommendations_metadata = df.iloc[recommendations_idxs]
    recommendations_metadata = recommendations_metadata.reset_index(drop=True)

    return recommendations_metadata


# The loaders below are memoized with functools.cache, so each artifact is
# read from disk only once per path.
@cache
def load_arxiv_parquet(path: str) -> pd.DataFrame:
    df = pd.read_parquet(path)
    return df


@cache
def load_dict(path: str) -> Dictionary:
    dict_corpus = Dictionary.load(path)
    return dict_corpus


@cache
def load_model(path: str) -> TfidfModel:
    tfidf_model = TfidfModel.load(path)
    return tfidf_model


@cache
def load_sparse_matrix(path: str) -> SparseMatrixSimilarity:
    similarities = SparseMatrixSimilarity.load(path)
    return similarities
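

# Example usage: a sketch only. The file paths below are placeholder
# assumptions, not artifacts shipped with this module.
if __name__ == "__main__":
    df = load_arxiv_parquet("data/arxiv.parquet")           # hypothetical path
    dictionary = load_dict("models/corpus.dict")            # hypothetical path
    tfidf_model = load_model("models/tfidf.model")          # hypothetical path
    index = load_sparse_matrix("models/similarity.index")   # hypothetical path

    recommendations = get_recomendations_metadata(
        query="tf-idf document similarity search",
        df=df,
        n=5,
        dictionary=dictionary,
        index=index,
        tfidf_model=tfidf_model,
    )
    print(recommendations.head(5))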