"""TF-IDF based arXiv paper recommender: builds a gensim dictionary, bag-of-words
corpus, and TF-IDF similarity index over paper abstracts, then retrieves the
papers closest to a user query."""
import argparse
import re
from random import choice
from typing import List

import pandas as pd
from gensim import corpora, similarities
from gensim.models import TfidfModel
from gensim.parsing import (
    preprocess_string,
    remove_stopwords,
    stem_text,
    strip_multiple_whitespaces,
    strip_numeric,
    strip_punctuation,
    strip_tags,
)

from utils.constants import TEST_INPUTS

SAMPLES = 3000
CORPUS_DICTIONARY_PATH = "30Ktokens"
ARXIV_DATASET_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip"
SAVE_DICT = False
QUERY = ""

transform_to_lower = lambda s: s.lower()
# Replace isolated single characters with a space so the surrounding words stay separated.
remove_single_char = lambda s: re.sub(r'\s+\w\s+', ' ', s)
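# Quick illustration of the two lambda filters above (hypothetical inputs):
#   transform_to_lower("Graph Neural Networks")  -> "graph neural networks"
#   remove_single_char("solve a PDE on a grid")  -> "solve PDE on grid"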

cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char,
]
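# Illustrative run of the full filter chain (hypothetical input; the exact output
# depends on gensim's built-in stopword list):
#   preprocess_string("Transformers <b>are</b> a powerful architecture for NLP in 2023!", cleaning_filters)
#   -> roughly ['transformers', 'powerful', 'architecture', 'nlp']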


def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenizes a list of strings using a series of cleaning filters.

    Args:
        docs (List[str]): A list of strings to be tokenized.

    Returns:
        List[List[str]]: A list of tokenized documents, where each document is represented as a list of tokens.
    """
    tokenized_docs = list()
    for doc in docs:
        processed_words = preprocess_string(doc, cleaning_filters)
        tokenized_docs.append(processed_words)
    return tokenized_docs
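# Illustrative usage (hypothetical abstracts; exact tokens depend on gensim's stopword list):
#   gensim_tokenizer(["Deep learning for protein folding", "A survey of 5G networks"])
#   -> roughly [['deep', 'learning', 'protein', 'folding'], ['survey', 'networks']]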


def cleaning_pipe(document: str) -> List[str]:
    """
    Applies the series of cleaning filters to a single document.

    Args:
        document (str): The document to be cleaned.

    Returns:
        List[str]: The list of processed tokens after applying the cleaning filters.
    """
    processed_words = preprocess_string(document, cleaning_filters)
    return processed_words


def get_gensim_dictionary(tokenized_docs: List[List[str]], dict_name: str = "corpus", save_dict: bool = False):
    """
    Creates a gensim Dictionary from the words in the preprocessed corpus and optionally saves it to disk.
    """
    dictionary = corpora.Dictionary(tokenized_docs)
    if save_dict:
        parent_folder = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"
        dictionary.save(f'{parent_folder}/{dict_name}.dict')
    return dictionary
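# Illustrative usage: build a dictionary from tokenized docs, then map any new
# document to its bag-of-words (token_id, count) pairs. The ids shown are only
# examples; actual ids depend on the corpus.
#   dictionary = get_gensim_dictionary([['deep', 'learning'], ['deep', 'nets']])
#   dictionary.doc2bow(['deep', 'deep', 'nets'])  -> e.g. [(0, 2), (2, 1)]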


def get_closest_n(query: str, n: int):
    '''
    Retrieves the top n matching documents by cosine similarity between the
    TF-IDF vector of the query and all documents in the corpus.

    Relies on the module-level `dictionary`, `tfidf_model`, and `index` objects
    built in the __main__ block.

    Args:
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: An array of indices representing the top matching documents.
    '''
    query_document = cleaning_pipe(query)
    query_bow = dictionary.doc2bow(query_document)
    sims = index[tfidf_model[query_bow]]
    top_idx = sims.argsort()[-n:][::-1]
    return top_idx
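# Illustrative usage (assumes the module-level `dictionary`, `tfidf_model`, and
# `index` objects built in the __main__ block below already exist):
#   get_closest_n("neural networks for image segmentation", n=5)
#   -> indices of the 5 corpus documents with the highest cosine similarity, best first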


def get_recommendations_metadata(query: str, df: pd.DataFrame, n: int):
    '''
    Retrieves metadata for the documents recommended for a query via TF-IDF
    cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing metadata information.
        n (int): The number of recommendations to retrieve.

    Returns:
        pd.DataFrame: A DataFrame containing the recommended metadata, reset with a new index.
    '''
    recommendations_idxs = get_closest_n(query, n)
    recommendations_metadata = df.iloc[recommendations_idxs]
    recommendations_metadata = recommendations_metadata.reset_index(drop=True)
    return recommendations_metadata
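# Illustrative usage, assuming `df` carries 'title' and 'abstract' columns as in
# the __main__ block below:
#   recs = get_recommendations_metadata("bayesian optimization of hyperparameters", df=df, n=3)
#   recs[['title', 'abstract']]  # top-3 most similar papers, best match first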


if __name__ == "__main__":
    """
    Example:
        python script.py --samples 3000 --corpus_dictionary_path "30Ktokens" --arxiv_dataset_path "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip" --save_dict --query "your query here"
    """
    parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI')
    parser.add_argument('--samples', default=30000, type=int, help='Number of samples to consider')
    parser.add_argument('--corpus_dictionary_path', default=None, type=str, help='Name under which to save the corpus dictionary')
    parser.add_argument('--save_dict', action='store_true', help='Flag to save the dictionary')
    parser.add_argument('--arxiv_dataset_path',
                        default="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
                        type=str, help='Path to the ArXiv parquet source')
    parser.add_argument('--query', default=None, type=str, help='User query')
    args = parser.parse_args()

    num_samples = args.samples
    corpus_dictionary_path = args.corpus_dictionary_path
    arxiv_dataset_path = args.arxiv_dataset_path
    save_dict = args.save_dict
    query = args.query

    print("Parameters:")
    print(f"num_samples: {num_samples}, type: {type(num_samples)}")
    print(f"corpus_dictionary_path: {corpus_dictionary_path}, type: {type(corpus_dictionary_path)}")
    print(f"arxiv_dataset_path: {arxiv_dataset_path}, type: {type(arxiv_dataset_path)}")
    print(f"save_dict: {save_dict}, type: {type(save_dict)}")
    print(f"query: {query}, type: {type(query)}")

    # Load the full dataset, or a random sample of it when a sample size is given.
    if num_samples is None:
        df = pd.read_parquet(arxiv_dataset_path)
    else:
        df = pd.read_parquet(arxiv_dataset_path).sample(num_samples).reset_index(drop=True)

    corpus = df['cleaned_abstracts'].to_list()
    tokenized_corpus = gensim_tokenizer(corpus)

    dictionary = get_gensim_dictionary(
        tokenized_docs=tokenized_corpus,
        dict_name=corpus_dictionary_path,
        save_dict=save_dict
    )

    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]

    tfidf_model = TfidfModel(BoW_corpus)

    index = similarities.SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))
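    # Retrieval pipeline built above: each document is a bag-of-words vector,
    # TfidfModel reweights term counts by inverse document frequency, and
    # SparseMatrixSimilarity indexes the TF-IDF vectors so a query can be scored
    # against the whole corpus by cosine similarity in a single lookup.
    # Illustrative check (scores depend on the sampled corpus):
    #   sims = index[tfidf_model[dictionary.doc2bow(cleaning_pipe("graph neural networks"))]]
    #   sims.shape  -> (len(BoW_corpus),), one similarity score per document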

    if query is None:
        query = choice(TEST_INPUTS)

    results_df = get_recommendations_metadata(query=query, df=df, n=3)

    print(f"User Request ---- : \n {query}\n")
    for abstract, title in zip(results_df['abstract'].to_list(), results_df['title'].to_list()):
        print(f"Title: {title}")
        print(f"Abstract: {abstract}\n")
        print("--------------------------")