pdx97 committed on
Commit a1941ae · verified · 1 Parent(s): 66e4bad

Updated app.py

Files changed (1)
  1. app.py +228 -97
app.py CHANGED
@@ -126,53 +126,214 @@
126
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]
127
 
128
 
129
- """------Applied TF-IDF for better semantic search------"""
130
  import feedparser
131
  import urllib.parse
132
  import yaml
133
- from tools.final_answer import FinalAnswerTool
134
  import numpy as np
135
  from sklearn.feature_extraction.text import TfidfVectorizer
136
  from sklearn.metrics.pairwise import cosine_similarity
137
  import gradio as gr
138
- from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
139
  import nltk
140
 
141
- import datetime
142
- import requests
143
- import pytz
144
- from tools.final_answer import FinalAnswerTool
145
-
146
- from Gradio_UI import GradioUI
147
-
148
  nltk.download("stopwords")
 
149
  from nltk.corpus import stopwords
 
150
 
151
- @tool # Register the function properly as a SmolAgents tool
152
- def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
153
- """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
154
-
155
- Args:
156
- keywords: List of keywords for search.
157
- num_results: Number of results to return.
158
 
159
- Returns:
160
- List of the most relevant papers based on TF-IDF ranking.
161
- """
162
  try:
163
- print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
164
-
165
- # Use a general keyword search
166
  query = "+AND+".join([f"all:{kw}" for kw in keywords])
167
  query_encoded = urllib.parse.quote(query)
168
  url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
169
-
170
- print(f"DEBUG: Query URL - {url}")
171
-
172
  feed = feedparser.parse(url)
173
  papers = []
174
-
175
- # Extract papers from arXiv
176
  for entry in feed.entries:
177
  papers.append({
178
  "title": entry.title,
@@ -181,49 +342,44 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
181
  "abstract": entry.summary,
182
  "link": entry.link
183
  })
184
-
185
  if not papers:
186
  return [{"error": "No results found. Try different keywords."}]
187
 
188
- # Prepare TF-IDF Vectorization
189
  corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
190
- vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
191
  tfidf_matrix = vectorizer.fit_transform(corpus)
192
 
193
- # Transform Query into TF-IDF Vector
194
  query_str = " ".join(keywords)
195
  query_vec = vectorizer.transform([query_str])
196
-
197
- #Compute Cosine Similarity
198
  similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
199
 
200
- #Sort papers based on similarity score
201
  ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
202
-
203
- # Return the most relevant papers
204
- return [paper[0] for paper in ranked_papers[:num_results]]
205
-
 
 
206
  except Exception as e:
207
- print(f"ERROR: {str(e)}")
208
  return [{"error": f"Error fetching research papers: {str(e)}"}]
 
209
  @tool
210
- def get_current_time_in_timezone(timezone: str) -> str:
211
- """A tool that fetches the current local time in a specified timezone.
212
- Args:
213
- timezone: A string representing a valid timezone (e.g., 'America/New_York').
214
- """
215
  try:
216
- # Create timezone object
217
- tz = pytz.timezone(timezone)
218
- # Get current time in that timezone
219
- local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
220
- return f"The current local time in {timezone} is: {local_time}"
221
- except Exception as e:
222
- return f"Error fetching time for timezone '{timezone}': {str(e)}"
223
-
224
-
225
- final_answer = FinalAnswerTool()
226
 
227
 
228
  # AI Model
229
  model = HfApiModel(
@@ -233,10 +389,6 @@ model = HfApiModel(
233
  custom_role_conversions=None,
234
  )
235
 
236
- # Import tool from Hub
237
- image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
238
-
239
-
240
  # Load prompt templates
241
  with open("prompts.yaml", 'r') as stream:
242
  prompt_templates = yaml.safe_load(stream)
@@ -244,61 +396,40 @@ with open("prompts.yaml", 'r') as stream:
244
  # Create the AI Agent
245
  agent = CodeAgent(
246
  model=model,
247
- tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
248
  max_steps=6,
249
  verbosity_level=1,
250
  grammar=None,
251
  planning_interval=None,
252
  name="ScholarAgent",
253
- description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
254
  prompt_templates=prompt_templates
255
  )
256
 
257
 
258
-
259
- #Search Papers
260
- def search_papers(user_input):
261
- keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
262
- print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
263
-
264
- if not keywords:
265
- print("DEBUG: No valid keywords provided.")
266
- return "Error: Please enter at least one valid keyword."
267
-
268
- results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
269
- print(f"DEBUG: Results received - {results}") # Debug function output
270
-
271
- # Check if the API returned an error
272
- if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
273
- return results[0]["error"] # Return the error message directly
274
-
275
- # Format results only if valid papers exist
276
- if isinstance(results, list) and results and isinstance(results[0], dict):
277
- formatted_results = "\n\n".join([
278
  f"---\n\n"
279
  f"📌 **Title:** {paper['title']}\n\n"
280
  f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
281
  f"📅 **Year:** {paper['year']}\n\n"
282
- f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
 
283
  f"[🔗 Read Full Paper]({paper['link']})\n\n"
284
  for paper in results
285
  ])
286
- return formatted_results
287
-
288
- print("DEBUG: No results found.")
289
- return "No results found. Try different keywords."
290
-
291
-
292
-
293
- # Create Gradio UI
294
- with gr.Blocks() as demo:
295
- gr.Markdown("# ScholarAgent")
296
- keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
297
- output_display = gr.Markdown()
298
- search_button = gr.Button("Search")
299
-
300
  search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
301
-
302
  print("DEBUG: Gradio UI is running. Waiting for user input...")
303
 
304
  # Launch Gradio App
 
126
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]
127
 
128
 
129
+ # """------Applied TF-IDF for better semantic search------"""
130
+ # import feedparser
131
+ # import urllib.parse
132
+ # import yaml
133
+ # from tools.final_answer import FinalAnswerTool
134
+ # import numpy as np
135
+ # from sklearn.feature_extraction.text import TfidfVectorizer
136
+ # from sklearn.metrics.pairwise import cosine_similarity
137
+ # import gradio as gr
138
+ # from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
139
+ # import nltk
140
+
141
+ # import datetime
142
+ # import requests
143
+ # import pytz
144
+ # from tools.final_answer import FinalAnswerTool
145
+
146
+ # from Gradio_UI import GradioUI
147
+
148
+ # nltk.download("stopwords")
149
+ # from nltk.corpus import stopwords
150
+
151
+ # @tool # ✅ Register the function properly as a SmolAgents tool
152
+ # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
153
+ # """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
154
+
155
+ # Args:
156
+ # keywords: List of keywords for search.
157
+ # num_results: Number of results to return.
158
+
159
+ # Returns:
160
+ # List of the most relevant papers based on TF-IDF ranking.
161
+ # """
162
+ # try:
163
+ # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
164
+
165
+ # # Use a general keyword search
166
+ # query = "+AND+".join([f"all:{kw}" for kw in keywords])
167
+ # query_encoded = urllib.parse.quote(query)
168
+ # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
169
+
170
+ # print(f"DEBUG: Query URL - {url}")
171
+
172
+ # feed = feedparser.parse(url)
173
+ # papers = []
174
+
175
+ # # Extract papers from arXiv
176
+ # for entry in feed.entries:
177
+ # papers.append({
178
+ # "title": entry.title,
179
+ # "authors": ", ".join(author.name for author in entry.authors),
180
+ # "year": entry.published[:4],
181
+ # "abstract": entry.summary,
182
+ # "link": entry.link
183
+ # })
184
+
185
+ # if not papers:
186
+ # return [{"error": "No results found. Try different keywords."}]
187
+
188
+ # # Prepare TF-IDF Vectorization
189
+ # corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
190
+ # vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
191
+ # tfidf_matrix = vectorizer.fit_transform(corpus)
192
+
193
+ # # Transform Query into TF-IDF Vector
194
+ # query_str = " ".join(keywords)
195
+ # query_vec = vectorizer.transform([query_str])
196
+
197
+ # #Compute Cosine Similarity
198
+ # similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
199
+
200
+ # #Sort papers based on similarity score
201
+ # ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
202
+
203
+ # # Return the most relevant papers
204
+ # return [paper[0] for paper in ranked_papers[:num_results]]
205
+
206
+ # except Exception as e:
207
+ # print(f"ERROR: {str(e)}")
208
+ # return [{"error": f"Error fetching research papers: {str(e)}"}]
209
+ # @tool
210
+ # def get_current_time_in_timezone(timezone: str) -> str:
211
+ # """A tool that fetches the current local time in a specified timezone.
212
+ # Args:
213
+ # timezone: A string representing a valid timezone (e.g., 'America/New_York').
214
+ # """
215
+ # try:
216
+ # # Create timezone object
217
+ # tz = pytz.timezone(timezone)
218
+ # # Get current time in that timezone
219
+ # local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
220
+ # return f"The current local time in {timezone} is: {local_time}"
221
+ # except Exception as e:
222
+ # return f"Error fetching time for timezone '{timezone}': {str(e)}"
223
+
224
+
225
+ # final_answer = FinalAnswerTool()
226
+
227
+
228
+ # # AI Model
229
+ # model = HfApiModel(
230
+ # max_tokens=2096,
231
+ # temperature=0.5,
232
+ # model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
233
+ # custom_role_conversions=None,
234
+ # )
235
+
236
+ # # Import tool from Hub
237
+ # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
238
+
239
+
240
+ # # Load prompt templates
241
+ # with open("prompts.yaml", 'r') as stream:
242
+ # prompt_templates = yaml.safe_load(stream)
243
+
244
+ # # Create the AI Agent
245
+ # agent = CodeAgent(
246
+ # model=model,
247
+ # tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
248
+ # max_steps=6,
249
+ # verbosity_level=1,
250
+ # grammar=None,
251
+ # planning_interval=None,
252
+ # name="ScholarAgent",
253
+ # description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
254
+ # prompt_templates=prompt_templates
255
+ # )
256
+
257
+
258
+
259
+ # #Search Papers
260
+ # def search_papers(user_input):
261
+ # keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
262
+ # print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
263
+
264
+ # if not keywords:
265
+ # print("DEBUG: No valid keywords provided.")
266
+ # return "Error: Please enter at least one valid keyword."
267
+
268
+ # results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
269
+ # print(f"DEBUG: Results received - {results}") # Debug function output
270
+
271
+ # # Check if the API returned an error
272
+ # if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
273
+ # return results[0]["error"] # Return the error message directly
274
+
275
+ # # Format results only if valid papers exist
276
+ # if isinstance(results, list) and results and isinstance(results[0], dict):
277
+ # formatted_results = "\n\n".join([
278
+ # f"---\n\n"
279
+ # f"📌 **Title:** {paper['title']}\n\n"
280
+ # f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
281
+ # f"📅 **Year:** {paper['year']}\n\n"
282
+ # f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
283
+ # f"[🔗 Read Full Paper]({paper['link']})\n\n"
284
+ # for paper in results
285
+ # ])
286
+ # return formatted_results
287
+
288
+ # print("DEBUG: No results found.")
289
+ # return "No results found. Try different keywords."
290
+
291
+
292
+
293
+ # # Create Gradio UI
294
+ # with gr.Blocks() as demo:
295
+ # gr.Markdown("# ScholarAgent")
296
+ # keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
297
+ # output_display = gr.Markdown()
298
+ # search_button = gr.Button("Search")
299
+
300
+ # search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
301
+
302
+ # print("DEBUG: Gradio UI is running. Waiting for user input...")
303
+
304
+ # # Launch Gradio App
305
+ # demo.launch()
306
+ """------New Features-----"""
307
  import feedparser
308
  import urllib.parse
309
  import yaml
310
+ import requests
311
  import numpy as np
312
  from sklearn.feature_extraction.text import TfidfVectorizer
313
  from sklearn.metrics.pairwise import cosine_similarity
314
  import gradio as gr
315
+ from smolagents import CodeAgent, HfApiModel, tool
316
  import nltk
317
 
318
  nltk.download("stopwords")
319
+ nltk.download("punkt")
320
  from nltk.corpus import stopwords
321
+ from transformers import pipeline
322
 
323
+ # GPT Summarization Pipeline
324
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
325
 
326
+ @tool # ✅ Register function as a SmolAgents tool
327
+ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
328
+ """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity."""
329
  try:
330
  query = "+AND+".join([f"all:{kw}" for kw in keywords])
331
  query_encoded = urllib.parse.quote(query)
332
  url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
333
+
 
 
334
  feed = feedparser.parse(url)
335
  papers = []
336
+
 
337
  for entry in feed.entries:
338
  papers.append({
339
  "title": entry.title,
 
342
  "abstract": entry.summary,
343
  "link": entry.link
344
  })
345
+
346
  if not papers:
347
  return [{"error": "No results found. Try different keywords."}]
348
 
 
349
  corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
350
+ vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
351
  tfidf_matrix = vectorizer.fit_transform(corpus)
352
 
 
353
  query_str = " ".join(keywords)
354
  query_vec = vectorizer.transform([query_str])
 
 
355
  similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
356
 
 
357
  ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
358
+
359
+ for paper, _ in ranked_papers:
360
+ paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
361
+
362
+ return [paper for paper, _ in ranked_papers[:num_results]]
363
+
364
  except Exception as e:
 
365
  return [{"error": f"Error fetching research papers: {str(e)}"}]
366
+
367
  @tool
368
+ def get_citation_count(paper_title: str) -> int:
369
+ """Fetches citation count from Semantic Scholar API."""
370
  try:
371
+ url = f"https://api.semanticscholar.org/v1/paper/search?query={urllib.parse.quote(paper_title)}"
372
+ response = requests.get(url).json()
373
+ return response["results"][0].get("citationCount", 0) if "results" in response else 0
374
+ except:
375
+ return 0
376
 
377
+ @tool
378
+ def rank_papers_by_citations(papers: list) -> list:
379
+ """Ranks papers based on citation count and TF-IDF similarity."""
380
+ for paper in papers:
381
+ paper["citations"] = get_citation_count(paper["title"])
382
+ return sorted(papers, key=lambda x: (x["citations"], x["tfidf_score"]), reverse=True)
383
 
384
  # AI Model
385
  model = HfApiModel(
 
389
  custom_role_conversions=None,
390
  )
391
 
392
  # Load prompt templates
393
  with open("prompts.yaml", 'r') as stream:
394
  prompt_templates = yaml.safe_load(stream)
 
396
  # Create the AI Agent
397
  agent = CodeAgent(
398
  model=model,
399
+ tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
400
  max_steps=6,
401
  verbosity_level=1,
402
  grammar=None,
403
  planning_interval=None,
404
  name="ScholarAgent",
405
+ description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
406
  prompt_templates=prompt_templates
407
  )
408
 
409
+ # Gradio UI
410
+ with gr.Blocks() as demo:
411
+ gr.Markdown("# ScholarAgent")
412
+ keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
413
+ output_display = gr.Markdown()
414
+ search_button = gr.Button("Search")
415
 
416
+ def search_papers(user_input):
417
+ keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
418
+ results = fetch_latest_arxiv_papers(keywords, num_results=3)
419
+ if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
420
+ return results[0]["error"]
421
+ return "\n\n".join([
422
  f"---\n\n"
423
  f"📌 **Title:** {paper['title']}\n\n"
424
  f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
425
  f"📅 **Year:** {paper['year']}\n\n"
426
+ f"📖 **Summary:** {paper['summary']}\n\n"
427
+ f"🔢 **Citations:** {paper['citations']}\n\n"
428
  f"[🔗 Read Full Paper]({paper['link']})\n\n"
429
  for paper in results
430
  ])
431
+
432
  search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
 
433
  print("DEBUG: Gradio UI is running. Waiting for user input...")
434
 
435
  # Launch Gradio App
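
Below the diff, two small self-contained sketches may help reviewers exercise the new logic locally. The first reproduces the TF-IDF + cosine-similarity ranking that fetch_latest_arxiv_papers applies, run on a few made-up papers and a made-up query instead of live arXiv results.

# Standalone sketch of the TF-IDF + cosine-similarity ranking used by
# fetch_latest_arxiv_papers, with illustrative (made-up) papers and query.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

papers = [
    {"title": "Deep Reinforcement Learning for Robotics",
     "abstract": "Policy-gradient methods for continuous control."},
    {"title": "Transformers for Protein Structure Prediction",
     "abstract": "Attention-based models applied to folding."},
    {"title": "A Survey of Federated Learning",
     "abstract": "Training models across devices without centralizing data."},
]

corpus = [p["title"] + " " + p["abstract"] for p in papers]
vectorizer = TfidfVectorizer(stop_words="english")  # built-in stop list; app.py uses NLTK stopwords
tfidf_matrix = vectorizer.fit_transform(corpus)

query_vec = vectorizer.transform(["reinforcement learning"])
scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

for paper, score in sorted(zip(papers, scores), key=lambda x: x[1], reverse=True):
    print(f"{score:.3f}  {paper['title']}")

The second is a hedged sketch of a citation lookup against the Semantic Scholar Graph API; the graph/v1 endpoint, the "data" key, and the "citationCount" field are assumptions about that API and differ from the URL the committed get_citation_count calls.

# Hedged sketch of a citation lookup against the Semantic Scholar Graph API.
# The graph/v1 endpoint, "data" key, and "citationCount" field are assumptions
# about that API, not what the committed get_citation_count requests.
import urllib.parse
import requests

def citation_count(title: str) -> int:
    url = ("https://api.semanticscholar.org/graph/v1/paper/search"
           f"?query={urllib.parse.quote(title)}&fields=citationCount&limit=1")
    try:
        data = requests.get(url, timeout=10).json().get("data", [])
        return data[0].get("citationCount", 0) if data else 0
    except (requests.RequestException, ValueError):
        return 0

print(citation_count("Attention Is All You Need"))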