GrammarGuru / src /models /train_recommender.py

Initial Setup

fc07932 over 1 year ago

4.04 kB

	import pandas as pd
	from gensim.similarities import SparseMatrixSimilarity
	import argparse
	import logging
	import time

	from utils.utilities import read_yaml_config, validate_and_create_subfolders
	from utils.mlutilities import *

	import logging
	import sys

	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s [%(levelname)s] %(message)s",
	handlers=[
	logging.FileHandler("debug.log"),
	logging.StreamHandler(sys.stdout)
	]
	)


	model_configurations = read_yaml_config("/Users/luis.morales/Desktop/arxiv-paper-recommender/src/models/configs.yaml")


	if __name__ == "__main__":
	"""
	Example:
	python3 ./src/models/train_recommender.py --modelsize Medium

	"""
	# Define and parse command-line arguments
	parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI')
	parser.add_argument('--modelsize',choices=["Large", "SubLarge", "Medium", "Small"], default=None, type=str, help='Model Size')

	args = parser.parse_args()
	model_size = args.modelsize
	start = time.time()


	if model_size is None:
	raise Exception("The `modelsize` flag was no passed to the CLI.")


	model_config = model_configurations["GensimConfig"][model_size]
	model_name = model_configurations["GensimConfig"][model_size]["ModelName"]
	dataset_fraq_split = model_configurations["GensimConfig"][model_size]["DataSetFracSplit"]
	random_seed = model_configurations["GensimConfig"][model_size]["RandomSeedSplit"]
	logging.info(f"Started training of {model_name} Model.")


	validate_and_create_subfolders(
	model_name=model_name
	)
	logging.info(f"Model Folder `{model_name}` was created successfully.")


	if dataset_fraq_split is None:
	df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip")
	logging.info(f"The full text Corpus was readed.")

	else :
	df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip") \
	.sample(frac=dataset_fraq_split, random_state=random_seed) \
	.reset_index(drop=True)
	logging.info(f"A random split of {dataset_fraq_split}% was applied on the Text Corpus ")
	logging.info(f"Dimensions of the dataset: {df.shape}")

	df.to_parquet(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip", compression='gzip')
	logging.info(f"The Dataset used for this training was successfully saved in: `/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip`.")



	corpus = df['cleaned_abstracts'].to_list()
	tokenized_corpus = gensim_tokenizer(corpus)
	logging.info(f"Dictionary Learned on the {model_name} corpus dataset.")


	dictionary = get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=model_name, save_dict=True)
	logging.info("Dictionary Saved Locally.")


	BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]
	tfidf_model = TfidfModel(BoW_corpus)
	logging.info(f"TD-IDF {model_name} Model was successfully trained.")


	tfidf_model.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/{model_name}.model")
	logging.info(f"Model: {model_name} was successfully saved.")


	index = SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary))
	logging.info(f"The Similarities Sparse Matrix was successfully created.")
	index.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/{model_name}")
	logging.info(f"The Similarities Matrix was successfully saved for the model: {model_name}.")

	end = time.time()
	total_time = end - start
	logging.info(f"Full Training of {model_size} model took {total_time} secs.")
	logging.info(f"The {model_name} Model was successfully trained! yei :)")