arpannookala committed
Commit 15761f9 · 1 Parent(s): 4261620

Adding app

Files changed (2):
  1. app.py +187 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,187 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import torch
+ from sentence_transformers import SentenceTransformer
+ import lancedb
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ import time
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from google_drive_downloader import GoogleDriveDownloader as gdd
+
+ # Download NLTK resources if not already downloaded
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+
+ # --------------------------- Dynamic Download of Large Files --------------------------- #
+
+ # Download the prebuilt LanceDB directory
+ gdd.download_file_from_google_drive(
+     file_id='1N7B1ycXbmEPZXRft8OTjvCxcfz92bfku',  # Replace with your actual file ID
+     dest_path='./lancedb_directory_main.zip',
+     unzip=True
+ )
+
+ # Download the fine-tuned SentenceTransformer model
+ gdd.download_file_from_google_drive(
+     file_id='13ahw_99cSwlW5_U6-MrkIlB9uYy3Zwty',  # Replace with your actual file ID
+     dest_path='./finetuned_all_minilm_l6_v2.zip',
+     unzip=True
+ )
+
+ # --------------------------- Load the LanceDB Tables and Models --------------------------- #
+
+ # Connect to LanceDB. The archive above unzips into the working directory,
+ # so the path must be relative ("/lancedb_directory_main" would point at the
+ # filesystem root and fail).
+ DB_PATH = "./lancedb_directory_main"
+ TABLE_NAME_1 = "enhanced_papers_pretrained_1"
+ TABLE_NAME_2 = "enhanced_papers_pretrained_2"
+ TABLE_NAME_3 = "enhanced_papers_finetuned"
+
+ db = lancedb.connect(DB_PATH)
+ table1 = db.open_table(TABLE_NAME_1)
+ table2 = db.open_table(TABLE_NAME_2)
+ table3 = db.open_table(TABLE_NAME_3)
+
+ # Load the SentenceTransformer models
+ embedding_models = {
+     "all-MiniLM-L6-v2": SentenceTransformer('all-MiniLM-L6-v2'),
+     "allenai-specter": SentenceTransformer('allenai-specter'),
+     "finetuned_all_minilm_l6_v2": SentenceTransformer('./finetuned_all_minilm_l6_v2')
+ }
+
+ # Each model searches the table that was built with that model's vectors
+ model_tables = {
+     "all-MiniLM-L6-v2": table1,
+     "allenai-specter": table2,
+     "finetuned_all_minilm_l6_v2": table3
+ }
+
+ # Load the tokenizer and seq2seq model for RAG-based explanations
+ MODEL_NAME = "google/flan-t5-large"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ rag_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+ rag_pipeline = pipeline(
+     "text2text-generation",
+     model=rag_model,
+     tokenizer=tokenizer,
+     device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ )
+
+ # --------------------------- Streamlit UI Components --------------------------- #
+
+ st.title("Research Paper Recommendation System with RAG-based Explanations")
+
+ # Initialize stopwords and lemmatizer
+ stop_words = set(stopwords.words('english'))
+ lemmatizer = WordNetLemmatizer()
+
+ # Normalize an abstract: lowercase, strip punctuation, collapse whitespace,
+ # remove stopwords, and lemmatize the remaining tokens
+ def clean_text(text):
+     if pd.isnull(text):
+         return ""
+     text = text.lower()
+     text = re.sub(r'[^a-z0-9\s]', '', text)
+     text = re.sub(r'\s+', ' ', text).strip()
+     tokens = text.split()
+     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
+     return ' '.join(tokens)
+
+ # Input abstract from the user
+ user_abstract = st.text_area("Enter the abstract of your paper:", height=200)
+
+ # Preprocess the user input abstract
+ user_abstract = clean_text(user_abstract)
+
+ # Number of recommendations slider
+ k = st.slider("Select the number of recommendations (k):", min_value=1, max_value=20, value=5)
+
+ # Model selection dropdown
+ selected_model_name = st.sidebar.selectbox("Select the embedding model:", list(embedding_models.keys()))
+
+ # Fetch unique metadata values for the filter dropdowns
+ def get_unique_values(table, column):
+     df = table.to_pandas()
+     return sorted(df[column].dropna().unique())
+
+ table = model_tables[selected_model_name]
+ categories = get_unique_values(table, 'categories')
+ authors = get_unique_values(table, 'authors')
+
+ # Metadata filters
+ st.sidebar.header("Filter Recommendations by Metadata")
+ filter_category = st.sidebar.selectbox("Filter by Category (optional):", [""] + categories)
+ filter_author = st.sidebar.selectbox("Filter by Author (optional):", [""] + authors)
+
+ # --------------------------- Helper Functions --------------------------- #
+
+ def generate_explanation(user_abstract, recommended_title, recommended_authors, recommended_abstract, max_input_length=512, max_output_length=200):
+     prompt = (
+         f"User's Input:\n{user_abstract}\n\n"
+         f"Recommended Paper:\n"
+         f"Title: {recommended_title}\n"
+         f"Authors: {recommended_authors}\n"
+         f"Abstract: {recommended_abstract}\n\n"
+         "Explain briefly how the recommended paper is relevant to the user's input."
+     )
+     try:
+         explanation = rag_pipeline(
+             prompt,
+             max_length=max_output_length,
+             min_length=50,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             truncation=True
+         )[0]['generated_text']
+         return explanation
+     except Exception as e:
+         return f"Error during generation: {e}"
+
+ # Deduplicate repeated sentences, which sampling occasionally produces
+ def post_process_explanation(text):
+     sentences = list(dict.fromkeys(text.split('. ')))
+     return '. '.join(sentences).strip()
+
+ def get_recommendations(table, embedding_model, model_name):
+     with st.spinner(f"Generating embedding for your abstract using {model_name}..."):
+         user_embedding = embedding_model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()
+
+     # Perform cosine-similarity search over the table's vectors
+     query = table.search(user_embedding).metric("cosine").limit(k)
+
+     # Escape single quotes so values like "O'Brien" don't break the SQL filter
+     if filter_category:
+         safe_category = filter_category.replace("'", "''")
+         query = query.where(f"categories = '{safe_category}'")
+     if filter_author:
+         safe_author = filter_author.replace("'", "''")
+         query = query.where(f"authors LIKE '%{safe_author}%'")
+
+     return query.to_pandas()
+
+ # --------------------------- Main Logic for Recommendations --------------------------- #
+
+ if st.button("Get Recommendations"):
+     if not user_abstract:
+         st.error("Please enter an abstract to proceed.")
+     else:
+         embedding_model = embedding_models[selected_model_name]
+         table = model_tables[selected_model_name]
+
+         st.header(f"Recommendations using {selected_model_name}")
+         recommendations = get_recommendations(table, embedding_model, selected_model_name)
+
+         if recommendations.empty:
+             st.warning(f"No recommendations found for {selected_model_name} based on the current filters.")
+         else:
+             st.success(f"Top {len(recommendations)} Recommendations from {selected_model_name}:")
+
+             for idx, row in recommendations.iterrows():
+                 st.write(f"### {idx + 1}. {row['title']}")
+                 st.write(f"**Category:** {row['categories']}")
+                 st.write(f"**Authors:** {row['authors']}")
+                 st.write(f"**Abstract:** {row['abstract']}")
+                 st.write(f"**Last Updated:** {row['update_date']}")
+                 st.write("---")
+
+                 explanation = generate_explanation(user_abstract, row['title'], row['authors'], row['abstract'])
+                 explanation = post_process_explanation(explanation)
+                 st.write(f"**Explanation:** {explanation}")
+                 st.write("---")
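Note that app.py reloads everything on every Streamlit rerun: three SentenceTransformer models, flan-t5-large, and a full-table scan (get_unique_values) to build the filter dropdowns. A minimal sketch of how the heavy loads could be cached (the helper names here are illustrative, not from the commit; st.cache_resource and st.cache_data are standard Streamlit APIs, and the leading underscore on _table tells Streamlit not to hash the un-hashable table handle):

import streamlit as st
import lancedb
from sentence_transformers import SentenceTransformer

@st.cache_resource  # created once per process, reused across reruns
def load_db(path: str):
    return lancedb.connect(path)

@st.cache_resource
def load_embedder(name: str):
    return SentenceTransformer(name)

@st.cache_data  # caches the full-table scan behind the filter dropdowns
def unique_values(_table, column: str):
    df = _table.to_pandas()
    return sorted(df[column].dropna().unique())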
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ pandas
+ numpy
+ torch
+ sentence-transformers
+ lancedb
+ nltk
+ googledrivedownloader==0.4  # PyPI name of the package imported as google_drive_downloader; 0.4 matches that import path
+ transformers
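To try the commit locally, a minimal sketch (assuming the two Google Drive files above are still publicly shared):

pip install -r requirements.txt
streamlit run app.py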