"""Streamlit app: research paper recommendations with generated explanations.

The script downloads a LanceDB directory and a fine-tuned SentenceTransformer
from Google Drive (only when they are not already on disk), embeds the
abstract entered by the user, runs a cosine similarity search against the
LanceDB table that matches the selected embedding model, and asks a FLAN-T5
text2text pipeline to explain each recommendation.

Streamlit re-executes this file from top to bottom on every widget
interaction, so one-time work (downloads, model loading, reading the filter
values) is guarded or cached below.
"""

import os
import re
import time
import zipfile

import gdown
import lancedb
import nltk
import numpy as np
import pandas as pd
import streamlit as st
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# from google_drive_downloader import GoogleDriveDownloader as gdd


# Download NLTK resources if not already downloaded
@st.cache_resource
def _ensure_nltk_resources():
    """Fetch the NLTK corpora used by ``clean_text`` once per server process."""
    for resource_name in ("stopwords", "wordnet", "omw-1.4"):
        nltk.download(resource_name)


_ensure_nltk_resources()

# --------------------------- Dynamic Download of Large Files --------------------------- #

DB_PATH = "./lancedb_directory_main"
FINETUNED_MODEL_PATH = "./finetuned_all_minilm_l6_v2"

# (Google Drive file ID, directory the archive is expected to produce).
# Replace the IDs with the actual Google Drive file IDs.
_GDRIVE_ASSETS = (
    ("1Qnb8bs_NXWlhDwGoswOgsp2DiLBMbfSY", "lancedb_directory_main"),
    ("1_9VVuN_P3zsTBYzg0lAeh4ghd9zhXS3w", "finetuned_all_minilm_l6_v2"),
)


# Function to download and extract folder
def download_and_extract_gdrive(file_id, destination, extract_to):
    """Download a zip archive from Google Drive and unpack it.

    Args:
        file_id: Google Drive file ID of the zip archive.
        destination: Local path the archive is downloaded to. It is deleted
            again once extraction has been attempted.
        extract_to: Directory the archive contents are extracted into.

    Raises:
        RuntimeError: If ``gdown.download`` reports failure by returning None.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Download the zip file
    downloaded_path = gdown.download(
        f"https://drive.google.com/uc?id={file_id}", destination, quiet=False
    )
    if downloaded_path is None:
        raise RuntimeError(f"Could not download Google Drive file {file_id}")

    try:
        # Extract the zip file
        with zipfile.ZipFile(destination, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    finally:
        # Clean up the downloaded zip file even when extraction fails.
        if os.path.exists(destination):
            os.remove(destination)


# Download and extract LanceDB and fine-tuned model, but only the pieces that
# are not on disk yet: this script is re-run on every user interaction.
_missing_assets = [
    (file_id, target_dir)
    for file_id, target_dir in _GDRIVE_ASSETS
    if not os.path.isdir(target_dir)
]
if _missing_assets:
    st.info("Downloading and setting up necessary data. This might take a while...")
    for _file_id, _target_dir in _missing_assets:
        download_and_extract_gdrive(
            file_id=_file_id,
            # The archive gets a ".zip" name so it cannot collide with the
            # directory of the same base name that the archive presumably
            # contains (layout of the archives is not visible here - verify).
            destination=f"{_target_dir}.zip",
            extract_to="./",
        )

# --------------------------- Load the LanceDB Table and Models --------------------------- #

# Connect to LanceDB
TABLE_NAME_1 = "enhanced_papers_pretrained_1"
TABLE_NAME_2 = "enhanced_papers_pretrained_2"
TABLE_NAME_3 = "enhanced_papers_finetuned"

db = lancedb.connect(DB_PATH)
table1 = db.open_table(TABLE_NAME_1)
table2 = db.open_table(TABLE_NAME_2)
table3 = db.open_table(TABLE_NAME_3)

# Load the tokenizer and summarization model for RAG-based explanations
MODEL_NAME = "google/flan-t5-large"


@st.cache_resource
def _load_embedding_models():
    """Load the SentenceTransformer models once and share them across reruns."""
    return {
        "all-MiniLM-L6-v2": SentenceTransformer('all-MiniLM-L6-v2'),
        "allenai-specter": SentenceTransformer('allenai-specter'),
        "finetuned_all_minilm_l6_v2": SentenceTransformer(FINETUNED_MODEL_PATH),
    }


@st.cache_resource
def _load_rag_components():
    """Load tokenizer, seq2seq model and generation pipeline once.

    Returns:
        Tuple ``(tokenizer, model, pipeline)`` for ``MODEL_NAME``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    generation_pipeline = pipeline(
        "text2text-generation",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return loaded_tokenizer, loaded_model, generation_pipeline


# Load the SentenceTransformer models
embedding_models = _load_embedding_models()

# Each embedding model is paired with the table whose vectors it produced.
model_tables = {
    "all-MiniLM-L6-v2": table1,
    "allenai-specter": table2,
    "finetuned_all_minilm_l6_v2": table3,
}

tokenizer, rag_model, rag_pipeline = _load_rag_components()

# --------------------------- Streamlit UI Components --------------------------- #

st.title("Research Paper Recommendation System with RAG-based Explanations")

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Function to clean text
def clean_text(text):
    """Normalize free text: lowercase, strip punctuation, drop stopwords, lemmatize.

    Args:
        text: Raw text; a null value (per ``pd.isnull``) yields ``""``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)


# Input abstract from the user
user_abstract = st.text_area("Enter the abstract of your paper:", height=200)

# Preprocess the user input abstract
user_abstract = clean_text(user_abstract)

# Number of recommendations slider
k = st.slider("Select the number of recommendations (k):", min_value=1, max_value=20, value=5)

# Model selection dropdown
selected_model_name = st.sidebar.selectbox("Select the embedding model:", list(embedding_models.keys()))


def _unique_sorted(frame, column):
    """Return the sorted distinct non-null values of ``frame[column]``."""
    return sorted(frame[column].dropna().unique())


# Fetch unique metadata values for filters
def get_unique_values(table, column):
    """Return the sorted distinct non-null values of ``column`` in ``table``.

    The whole table is materialized with ``table.to_pandas()``.
    """
    return _unique_sorted(table.to_pandas(), column)


@st.cache_data
def _load_filter_options(model_name):
    """Return ``(categories, authors)`` for the table behind ``model_name``.

    The table is read once for both columns and the result is cached, so the
    sidebar does not re-read the table on every rerun.
    """
    frame = model_tables[model_name].to_pandas()
    return _unique_sorted(frame, 'categories'), _unique_sorted(frame, 'authors')


table = model_tables[selected_model_name]
categories, authors = _load_filter_options(selected_model_name)

# Metadata filters
st.sidebar.header("Filter Recommendations by Metadata")
filter_category = st.sidebar.selectbox("Filter by Category (optional):", [""] + categories)
filter_author = st.sidebar.selectbox("Filter by Author (optional):", [""] + authors)


# --------------------------- Helper Functions --------------------------- #

def generate_explanation(user_abstract, recommended_title, recommended_authors, recommended_abstract,
                         max_input_length=512, max_output_length=200):
    """Generate a short text explaining why a paper matches the user's input.

    Args:
        user_abstract: The (cleaned) abstract supplied by the user.
        recommended_title: Title of the recommended paper.
        recommended_authors: Authors of the recommended paper.
        recommended_abstract: Abstract of the recommended paper.
        max_input_length: Accepted for compatibility; not used by this
            function (input truncation is left to ``truncation=True``).
        max_output_length: ``max_length`` passed to the generation pipeline.

    Returns:
        The generated text, or a string starting with
        ``"Error during generation:"`` if the pipeline raised.
    """
    prompt = (
        f"User's Input:\n{user_abstract}\n\n"
        f"Recommended Paper:\n"
        f"Title: {recommended_title}\n"
        f"Authors: {recommended_authors}\n"
        f"Abstract: {recommended_abstract}\n\n"
        "Explain briefly, how the recommended paper is relevant to the user's input"
    )
    try:
        outputs = rag_pipeline(
            prompt,
            max_length=max_output_length,
            min_length=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            truncation=True,
        )
        return outputs[0]['generated_text']
    except Exception as e:
        # Deliberate best-effort: a failed explanation must not abort the
        # rendering of the remaining recommendations.
        return f"Error during generation: {e}"


def post_process_explanation(text):
    """Drop repeated sentences from ``text``, keeping first occurrences in order."""
    # dict.fromkeys de-duplicates while preserving insertion order.
    sentences = list(dict.fromkeys(text.split('. ')))
    return '. '.join(sentences).strip()


def _sql_quote(value):
    """Escape ``value`` for use inside a single-quoted SQL string literal."""
    return str(value).replace("'", "''")


def _build_where_clause(category, author):
    """Combine the optional metadata filters into one SQL filter expression.

    Returns an empty string when neither filter is set.
    """
    conditions = []
    if category:
        conditions.append(f"categories = '{_sql_quote(category)}'")
    if author:
        conditions.append(f"authors LIKE '%{_sql_quote(author)}%'")
    return " AND ".join(conditions)


def get_recommendations(table, embedding_model, model_name):
    """Embed the user's abstract and return the nearest papers from ``table``.

    Reads the module-level ``user_abstract``, ``k``, ``filter_category`` and
    ``filter_author`` set by the UI widgets.

    Args:
        table: LanceDB table to search.
        embedding_model: SentenceTransformer used to embed the abstract.
        model_name: Display name of the model, used in the spinner text.

    Returns:
        A pandas DataFrame with up to ``k`` rows.
    """
    with st.spinner(f"Generating embedding for your abstract using {model_name}..."):
        user_embedding = embedding_model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()

    # Perform similarity search
    query = table.search(user_embedding).metric("cosine").limit(k)

    # Both filters go into a single where() call: chaining two where() calls
    # would leave only the last filter in effect.
    where_clause = _build_where_clause(filter_category, filter_author)
    if where_clause:
        query = query.where(where_clause)

    return query.to_pandas()


# --------------------------- Main Logic for Recommendations --------------------------- #

if st.button("Get Recommendations"):
    if not user_abstract:
        st.error("Please enter an abstract to proceed.")
    else:
        embedding_model = embedding_models[selected_model_name]
        table = model_tables[selected_model_name]

        st.header(f"Recommendations using {selected_model_name}")
        recommendations = get_recommendations(table, embedding_model, selected_model_name)

        if recommendations.empty:
            st.warning(f"No recommendations found for {selected_model_name} based on the current filters.")
        else:
            st.success(f"Top {len(recommendations)} Recommendations from {selected_model_name}:")
            # Number by position rather than by DataFrame index label.
            for position, (_, row) in enumerate(recommendations.iterrows(), start=1):
                st.write(f"### {position}. {row['title']}")
                st.write(f"**Category:** {row['categories']}")
                st.write(f"**Authors:** {row['authors']}")
                st.write(f"**Abstract:** {row['abstract']}")
                st.write(f"**Last Updated:** {row['update_date']}")
                st.write("---")

                explanation = generate_explanation(user_abstract, row['title'], row['authors'], row['abstract'])
                explanation = post_process_explanation(explanation)
                st.write(f"**Explanation:** {explanation}")
                st.write("---")