from functools import cache
from re import sub
from typing import List

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.parsing import (
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_numeric,
    strip_punctuation,
    strip_tags,
)
from gensim.similarities import SparseMatrixSimilarity

# Lowercase the whole string.
transform_to_lower = lambda s: s.lower()

# Drop isolated single-character tokens. Substituting a space (rather than
# the empty string) keeps the neighbouring words from being merged together.
remove_single_char = lambda s: sub(r'\s+\w{1}\s+', ' ', s)

# Filters are applied in order by gensim's preprocess_string.
cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char,
]
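
# For illustration, the pipeline above turns a raw string into clean tokens
# (hypothetical input, shown as a comment-only sketch):
#
#   preprocess_string("<b>The 3 cats sat!</b>", cleaning_filters)
#   # -> ['cats', 'sat']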


def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using the cleaning filters defined above.

    Args:
        docs (List[str]): A list of strings to be tokenized.

    Returns:
        List[List[str]]: A list of tokenized documents, where each document
            is represented as a list of tokens.
    """
    return [preprocess_string(doc, cleaning_filters) for doc in docs]
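
# Example usage (hypothetical corpus):
#
#   corpus = ["<b>The 3 cats sat!</b>", "Dogs bark loudly at night."]
#   gensim_tokenizer(corpus)
#   # -> [['cats', 'sat'], ['dogs', 'bark', 'loudly', 'night']]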


def cleaning_pipe(document: str) -> List[str]:
    """
    Applies the same series of cleaning steps to a single document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        List[str]: A list of processed words after applying the cleaning filters.
    """
    # Invoke gensim's preprocess_string with the set of filters above.
    return preprocess_string(document, cleaning_filters)


def get_closest_n(dictionary: Dictionary, index: SparseMatrixSimilarity,
                  tfidf_model: TfidfModel, query: str, n: int):
    """
    Retrieves the top matching documents by cosine similarity between the
    TF-IDF vector of the query and all indexed documents.

    Args:
        dictionary (Dictionary): The gensim dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): The precomputed similarity index over the corpus.
        tfidf_model (TfidfModel): The fitted TF-IDF model.
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: An array of indices representing the top matching documents.
    """
    # Clean and tokenize the query with the same pipeline used for the corpus.
    query_document = cleaning_pipe(query)
    # Convert the query tokens to a bag-of-words representation.
    query_bow = dictionary.doc2bow(query_document)
    # Similarity scores between the query's TF-IDF vector and every document.
    sims = index[tfidf_model[query_bow]]
    # Indices of the n most similar documents, best match first.
    top_idx = sims.argsort()[-n:][::-1]
    return top_idx
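
# The dictionary, TF-IDF model and similarity index expected above can be
# built from a raw corpus roughly as follows (a minimal sketch, assuming an
# in-memory list of raw documents called raw_docs):
#
#   tokenized = gensim_tokenizer(raw_docs)
#   dictionary = Dictionary(tokenized)
#   bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized]
#   tfidf_model = TfidfModel(bow_corpus)
#   index = SparseMatrixSimilarity(tfidf_model[bow_corpus],
#                                  num_features=len(dictionary))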


def get_recommendations_metadata(query: str, df: pd.DataFrame, n: int,
                                 dictionary: Dictionary, index: SparseMatrixSimilarity,
                                 tfidf_model: TfidfModel) -> pd.DataFrame:
    """
    Retrieves metadata recommendations for a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing the corpus metadata.
        n (int): The number of recommendations to retrieve.
        dictionary (Dictionary): The gensim dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): The precomputed similarity index over the corpus.
        tfidf_model (TfidfModel): The fitted TF-IDF model.

    Returns:
        pd.DataFrame: A DataFrame containing the recommended metadata, reset
            to a fresh 0-based index.
    """
    # Indices of the closest matching documents for the query.
    recommendations_idxs = get_closest_n(dictionary, index, tfidf_model, query, n)
    # Matching metadata rows, in ranking order (best match first).
    recommendations_metadata = df.iloc[recommendations_idxs]
    # Reset the index so callers get a clean DataFrame.
    return recommendations_metadata.reset_index(drop=True)


@cache
def load_arxiv_parquet(path: str) -> pd.DataFrame:
    """Load the arXiv metadata parquet file (memoized per path)."""
    return pd.read_parquet(path)


@cache
def load_dict(path: str) -> Dictionary:
    """Load a persisted gensim Dictionary (memoized per path)."""
    return Dictionary.load(path)


@cache
def load_model(path: str) -> TfidfModel:
    """Load a persisted TfidfModel (memoized per path)."""
    return TfidfModel.load(path)


@cache
def load_sparse_matrix(path: str) -> SparseMatrixSimilarity:
    """Load a persisted SparseMatrixSimilarity index (memoized per path)."""
    return SparseMatrixSimilarity.load(path)