pdx97 committed
Commit bde3c06 · verified · 1 Parent(s): acce4ba

Update app.py

Files changed (1)
  1. app.py +81 -273
app.py CHANGED
@@ -126,209 +126,31 @@
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]


- # """------Applied TF-IDF for better semantic search------"""
- # import feedparser
- # import urllib.parse
- # import yaml
- # from tools.final_answer import FinalAnswerTool
- # import numpy as np
- # from sklearn.feature_extraction.text import TfidfVectorizer
- # from sklearn.metrics.pairwise import cosine_similarity
- # import gradio as gr
- # from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
- # import nltk
-
- # import datetime
- # import requests
- # import pytz
- # from tools.final_answer import FinalAnswerTool
-
- # from Gradio_UI import GradioUI
-
- # nltk.download("stopwords")
- # from nltk.corpus import stopwords
-
- # @tool # ✅ Register the function properly as a SmolAgents tool
- # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
- # """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
-
- # Args:
- # keywords: List of keywords for search.
- # num_results: Number of results to return.
-
- # Returns:
- # List of the most relevant papers based on TF-IDF ranking.
- # """
- # try:
- # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
-
- # # Use a general keyword search
- # query = "+AND+".join([f"all:{kw}" for kw in keywords])
- # query_encoded = urllib.parse.quote(query)
- # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
-
- # print(f"DEBUG: Query URL - {url}")
-
- # feed = feedparser.parse(url)
- # papers = []
-
- # # Extract papers from arXiv
- # for entry in feed.entries:
- # papers.append({
- # "title": entry.title,
- # "authors": ", ".join(author.name for author in entry.authors),
- # "year": entry.published[:4],
- # "abstract": entry.summary,
- # "link": entry.link
- # })
-
- # if not papers:
- # return [{"error": "No results found. Try different keywords."}]
-
- # # Prepare TF-IDF Vectorization
- # corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
- # vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
- # tfidf_matrix = vectorizer.fit_transform(corpus)
-
- # # Transform Query into TF-IDF Vector
- # query_str = " ".join(keywords)
- # query_vec = vectorizer.transform([query_str])
-
- # #Compute Cosine Similarity
- # similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
-
- # #Sort papers based on similarity score
- # ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
-
- # # Return the most relevant papers
- # return [paper[0] for paper in ranked_papers[:num_results]]
-
- # except Exception as e:
- # print(f"ERROR: {str(e)}")
- # return [{"error": f"Error fetching research papers: {str(e)}"}]
- # @tool
- # def get_current_time_in_timezone(timezone: str) -> str:
- # """A tool that fetches the current local time in a specified timezone.
- # Args:
- # timezone: A string representing a valid timezone (e.g., 'America/New_York').
- # """
- # try:
- # # Create timezone object
- # tz = pytz.timezone(timezone)
- # # Get current time in that timezone
- # local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
- # return f"The current local time in {timezone} is: {local_time}"
- # except Exception as e:
- # return f"Error fetching time for timezone '{timezone}': {str(e)}"
-
-
- # final_answer = FinalAnswerTool()
-
-
- # # AI Model
- # model = HfApiModel(
- # max_tokens=2096,
- # temperature=0.5,
- # model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
- # custom_role_conversions=None,
- # )
-
- # # Import tool from Hub
- # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
-
-
- # # Load prompt templates
- # with open("prompts.yaml", 'r') as stream:
- # prompt_templates = yaml.safe_load(stream)
-
- # # Create the AI Agent
- # agent = CodeAgent(
- # model=model,
- # tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
- # max_steps=6,
- # verbosity_level=1,
- # grammar=None,
- # planning_interval=None,
- # name="ScholarAgent",
- # description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
- # prompt_templates=prompt_templates
- # )
-
-
-
- # #Search Papers
- # def search_papers(user_input):
- # keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
- # print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
-
- # if not keywords:
- # print("DEBUG: No valid keywords provided.")
- # return "Error: Please enter at least one valid keyword."
-
- # results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
- # print(f"DEBUG: Results received - {results}") # Debug function output
-
- # # Check if the API returned an error
- # if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
- # return results[0]["error"] # Return the error message directly
-
- # # Format results only if valid papers exist
- # if isinstance(results, list) and results and isinstance(results[0], dict):
- # formatted_results = "\n\n".join([
- # f"---\n\n"
- # f"📌 **Title:** {paper['title']}\n\n"
- # f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
- # f"📅 **Year:** {paper['year']}\n\n"
- # f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
- # f"[🔗 Read Full Paper]({paper['link']})\n\n"
- # for paper in results
- # ])
- # return formatted_results
-
- # print("DEBUG: No results found.")
- # return "No results found. Try different keywords."
-
-
-
- # # Create Gradio UI
- # with gr.Blocks() as demo:
- # gr.Markdown("# ScholarAgent")
- # keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
- # output_display = gr.Markdown()
- # search_button = gr.Button("Search")
-
- # search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
-
- # print("DEBUG: Gradio UI is running. Waiting for user input...")
-
- # # Launch Gradio App
- # demo.launch()
-
- """------Enhanced ScholarAgent with Fixes and Features-----"""
  import feedparser
  import urllib.parse
  import yaml
- import requests
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import gradio as gr
- from smolagents import CodeAgent, HfApiModel, tool
  import nltk
- from transformers import pipeline

- # ✅ Ensure necessary NLTK data is downloaded
  nltk.download("stopwords")
- nltk.download("punkt")
  from nltk.corpus import stopwords

- # ✅ GPT Summarization Pipeline
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
- @tool
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
-     """
-     Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.

      Args:
          keywords: List of keywords for search.
@@ -338,16 +160,19 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
          List of the most relevant papers based on TF-IDF ranking.
      """
      try:
-         # Encode query properly
-         query = "+AND+".join([f"all:{kw}" for kw in keywords])
-         query_encoded = urllib.parse.quote_plus(query)

-         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
-         print(f"DEBUG: Query URL - {url}")

          feed = feedparser.parse(url)
          papers = []

          for entry in feed.entries:
              papers.append({
                  "title": entry.title,
@@ -358,77 +183,49 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
              })

          if not papers:
-             print("DEBUG: No results from ArXiv API")
              return [{"error": "No results found. Try different keywords."}]

-         # TF-IDF Vectorization
          corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
-         vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
          tfidf_matrix = vectorizer.fit_transform(corpus)

-         query_vec = vectorizer.transform([" ".join(keywords)])
          similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

          ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

-         # Apply GPT Summarization with Fallback
-         for paper, _ in ranked_papers:
-             try:
-                 paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-             except:
-                 paper["summary"] = paper["abstract"][:300] + "..." # ✅ Fallback: First 300 characters of abstract
-
          return [paper[0] for paper in ranked_papers[:num_results]]

      except Exception as e:
          print(f"ERROR: {str(e)}")
          return [{"error": f"Error fetching research papers: {str(e)}"}]
-
-
-
-
  @tool
- def get_citation_count(paper_title: str) -> int:
-     """
-     Fetches citation count from Semantic Scholar API.
-
      Args:
-         paper_title (str): Title of the research paper.
-
-     Returns:
-         int: Citation count (default 0 if not found).
      """
      try:
-         base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
-         params = {"query": paper_title, "fields": "citationCount"}
-         response = requests.get(base_url, params=params).json()
-
-         if "data" in response and response["data"]:
-             return response["data"][0].get("citationCount", 0)
-         return 0 # Default to 0 if no data found
-
      except Exception as e:
-         print(f"ERROR fetching citation count: {e}")
-         return 0


- @tool
- def rank_papers_by_citations(papers: list) -> list:
-     """
-     Ranks papers based on citation count and TF-IDF similarity.
-
-     Args:
-         papers (list): List of research papers.

-     Returns:
-         list: Papers sorted by citation count and TF-IDF score.
-     """
-     for paper in papers:
-         paper["citations"] = get_citation_count(paper["title"])
-     return sorted(papers, key=lambda x: (x["citations"], x.get("tfidf_score", 0)), reverse=True)

-
- # ✅ AI Model
  model = HfApiModel(
      max_tokens=2096,
      temperature=0.5,
@@ -436,65 +233,76 @@ model = HfApiModel(
      custom_role_conversions=None,
  )

- # Load prompt templates
  with open("prompts.yaml", 'r') as stream:
      prompt_templates = yaml.safe_load(stream)

- # Create the AI Agent
  agent = CodeAgent(
      model=model,
-     tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
      max_steps=6,
      verbosity_level=1,
      grammar=None,
      planning_interval=None,
      name="ScholarAgent",
-     description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
      prompt_templates=prompt_templates
  )


- # ✅ Gradio UI
- with gr.Blocks() as demo:
-     gr.Markdown("# ScholarAgent")
-     keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
-     output_display = gr.Markdown()
-     search_button = gr.Button("Search")

-     def search_papers(user_input, year_range, min_citations):
-         keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
-         print(f"DEBUG: Received input keywords - {keywords}")

-         if not keywords:
-             print("DEBUG: No valid keywords provided.")
-             return "Error: Please enter at least one valid keyword."
-
-         results = fetch_latest_arxiv_papers(keywords, num_results=5, year_range=year_range, min_citations=int(min_citations))
-         print(f"DEBUG: Results received - {results}")
-
-         # If results are empty or an error occurred, display an error message
-         if not results or isinstance(results, list) and "error" in results[0]:
-             print(f"DEBUG: Error in fetching results - {results[0]['error']}")
-             return results[0]["error"] if results else "No results found. Try different keywords."

-         # Format output
          formatted_results = "\n\n".join([
              f"📌 **Title:** {paper['title']}\n\n"
              f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
              f"📅 **Year:** {paper['year']}\n\n"
-             f"📖 **Summary:** {paper['summary'] if 'summary' in paper else 'No summary available'}\n\n"
-             f"🔢 **Citations:** {paper['citations']}\n\n"
              f"[🔗 Read Full Paper]({paper['link']})\n\n"
              for paper in results
          ])
-         print(f"DEBUG: Formatted Results - {formatted_results}")
          return formatted_results

-
      search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
      print("DEBUG: Gradio UI is running. Waiting for user input...")

- # Launch Gradio App
  demo.launch()


@@ -126,209 +126,31 @@
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]


+ """------Applied TF-IDF for better semantic search------"""
  import feedparser
  import urllib.parse
  import yaml
+ from tools.final_answer import FinalAnswerTool
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import gradio as gr
+ from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
  import nltk

+ import datetime
+ import requests
+ import pytz
+ from tools.final_answer import FinalAnswerTool
+
+ from Gradio_UI import GradioUI
+
  nltk.download("stopwords")
  from nltk.corpus import stopwords

+ @tool # ✅ Register the function properly as a SmolAgents tool
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
+     """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.

      Args:
          keywords: List of keywords for search.
@@ -338,16 +160,19 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
          List of the most relevant papers based on TF-IDF ranking.
      """
      try:
+         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

+         # Use a general keyword search
+         query = "+AND+".join([f"all:{kw}" for kw in keywords])
+         query_encoded = urllib.parse.quote(query)
+         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
+
+         print(f"DEBUG: Query URL - {url}")

          feed = feedparser.parse(url)
          papers = []

+         # Extract papers from arXiv
          for entry in feed.entries:
              papers.append({
                  "title": entry.title,
@@ -358,77 +183,49 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
              })

          if not papers:
              return [{"error": "No results found. Try different keywords."}]

+         # Prepare TF-IDF Vectorization
          corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
+         vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
          tfidf_matrix = vectorizer.fit_transform(corpus)

+         # Transform Query into TF-IDF Vector
+         query_str = " ".join(keywords)
+         query_vec = vectorizer.transform([query_str])
+
+         #Compute Cosine Similarity
          similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

+         #Sort papers based on similarity score
          ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

+         # Return the most relevant papers
          return [paper[0] for paper in ranked_papers[:num_results]]

      except Exception as e:
          print(f"ERROR: {str(e)}")
          return [{"error": f"Error fetching research papers: {str(e)}"}]
  @tool
+ def get_current_time_in_timezone(timezone: str) -> str:
+     """A tool that fetches the current local time in a specified timezone.
      Args:
+         timezone: A string representing a valid timezone (e.g., 'America/New_York').
      """
      try:
+         # Create timezone object
+         tz = pytz.timezone(timezone)
+         # Get current time in that timezone
+         local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
+         return f"The current local time in {timezone} is: {local_time}"
      except Exception as e:
+         return f"Error fetching time for timezone '{timezone}': {str(e)}"


+ final_answer = FinalAnswerTool()


+ # AI Model
  model = HfApiModel(
      max_tokens=2096,
      temperature=0.5,
@@ -436,65 +233,76 @@ model = HfApiModel(
      custom_role_conversions=None,
  )

+ # Import tool from Hub
+ image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
+
+
+ # Load prompt templates
  with open("prompts.yaml", 'r') as stream:
      prompt_templates = yaml.safe_load(stream)

+ # Create the AI Agent
  agent = CodeAgent(
      model=model,
+     tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
      max_steps=6,
      verbosity_level=1,
      grammar=None,
      planning_interval=None,
      name="ScholarAgent",
+     description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
      prompt_templates=prompt_templates
  )



+ #Search Papers
+ def search_papers(user_input):
+     keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
+     print(f"DEBUG: Received input keywords - {keywords}") # Debug user input

+     if not keywords:
+         print("DEBUG: No valid keywords provided.")
+         return "Error: Please enter at least one valid keyword."

+     results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
+     print(f"DEBUG: Results received - {results}") # Debug function output
+
+     # Check if the API returned an error
+     if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
+         return results[0]["error"] # Return the error message directly
+
+     # Format results only if valid papers exist
+     if isinstance(results, list) and results and isinstance(results[0], dict):
          formatted_results = "\n\n".join([
+             f"---\n\n"
              f"📌 **Title:** {paper['title']}\n\n"
              f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
              f"📅 **Year:** {paper['year']}\n\n"
+             f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
              f"[🔗 Read Full Paper]({paper['link']})\n\n"
              for paper in results
          ])
          return formatted_results

+     print("DEBUG: No results found.")
+     return "No results found. Try different keywords."
+
+
+
+ # Create Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# ScholarAgent")
+     keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
+     output_display = gr.Markdown()
+     search_button = gr.Button("Search")
+
      search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
+
      print("DEBUG: Gradio UI is running. Waiting for user input...")

+ # Launch Gradio App
  demo.launch()


+