Lasdw committed on
Commit 5386e26 · 1 Parent(s): 942e3f2

updated system prompt and modularized tools

Files changed (4)
  1. .gitignore +4 -1
  2. agent.py +116 -777
  3. requirements.txt +4 -1
  4. tools.py +139 -0
.gitignore CHANGED
@@ -2,7 +2,10 @@
 .env.*
 image.png
 
-GAIA
+GAIA-repo/
+GAIA-repo/*
+
+GAIA/
 GAIA/*
 
 pycache/*
agent.py CHANGED
@@ -17,745 +17,58 @@ import pandas as pd
17
  from tabulate import tabulate
18
  import base64
19
 
20
- from langchain_community.document_loaders import WikipediaLoader
21
- from langchain_community.document_loaders import ArxivLoader
22
- from langchain_community.tools.tavily_search import TavilySearchResults
23
- from supabase import create_client, Client
24
 
25
  load_dotenv()
26
 
27
- def run_python_code(code: str):
28
- """Execute Python code safely using exec() instead of subprocess."""
29
- # Check for potentially dangerous operations
30
- dangerous_operations = [
31
- "os.system", "os.popen", "os.unlink", "os.remove",
32
- "subprocess.run", "subprocess.call", "subprocess.Popen",
33
- "shutil.rmtree", "shutil.move", "shutil.copy",
34
- "open(", "file(", "eval(", "exec(",
35
- "__import__", "input(", "raw_input(",
36
- "__builtins__", "globals(", "locals(",
37
- "compile(", "execfile(", "reload("
38
- ]
39
-
40
- # Safe imports that should be allowed
41
- safe_imports = {
42
- "import datetime", "import math", "import random",
43
- "import statistics", "import collections", "import itertools",
44
- "import re", "import json", "import csv", "import numpy",
45
- "import pandas", "from math import", "from datetime import",
46
- "from statistics import", "from collections import",
47
- "from itertools import"
48
- }
49
-
50
- # Check for dangerous operations
51
- for dangerous_op in dangerous_operations:
52
- if dangerous_op in code:
53
- return f"Error: Code contains potentially unsafe operations: {dangerous_op}"
54
-
55
- # Check each line for imports
56
- for line in code.splitlines():
57
- line = line.strip()
58
- if line.startswith("import ") or line.startswith("from "):
59
- # Check if it's in our safe list
60
- is_safe = any(line.startswith(safe_import) for safe_import in safe_imports)
61
- # Also allow basic numpy/pandas imports
62
- is_safe = is_safe or line.startswith("import numpy") or line.startswith("import pandas")
63
- if not is_safe:
64
- return f"Error: Code contains potentially unsafe import: {line}"
65
-
66
- try:
67
- # Capture stdout to get print output
68
- import io
69
- import sys
70
- from contextlib import redirect_stdout
71
-
72
- # Create a restricted globals environment
73
- restricted_globals = {
74
- '__builtins__': {
75
- 'abs': abs, 'all': all, 'any': any, 'bin': bin, 'bool': bool,
76
- 'chr': chr, 'dict': dict, 'dir': dir, 'divmod': divmod,
77
- 'enumerate': enumerate, 'filter': filter, 'float': float,
78
- 'format': format, 'hex': hex, 'int': int, 'len': len,
79
- 'list': list, 'map': map, 'max': max, 'min': min, 'oct': oct,
80
- 'ord': ord, 'pow': pow, 'print': print, 'range': range,
81
- 'reversed': reversed, 'round': round, 'set': set, 'slice': slice,
82
- 'sorted': sorted, 'str': str, 'sum': sum, 'tuple': tuple,
83
- 'type': type, 'zip': zip,
84
- }
85
- }
86
-
87
- # Allow safe modules
88
- import math
89
- import datetime
90
- import random
91
- import statistics
92
- import collections
93
- import itertools
94
- import re
95
- import json
96
- import csv
97
-
98
- restricted_globals['math'] = math
99
- restricted_globals['datetime'] = datetime
100
- restricted_globals['random'] = random
101
- restricted_globals['statistics'] = statistics
102
- restricted_globals['collections'] = collections
103
- restricted_globals['itertools'] = itertools
104
- restricted_globals['re'] = re
105
- restricted_globals['json'] = json
106
- restricted_globals['csv'] = csv
107
-
108
- # Try to import numpy and pandas if available
109
- try:
110
- import numpy as np
111
- restricted_globals['numpy'] = np
112
- restricted_globals['np'] = np
113
- except ImportError:
114
- pass
115
-
116
- try:
117
- import pandas as pd
118
- restricted_globals['pandas'] = pd
119
- restricted_globals['pd'] = pd
120
- except ImportError:
121
- pass
122
-
123
- # Create local scope
124
- local_scope = {}
125
-
126
- # Capture stdout
127
- captured_output = io.StringIO()
128
-
129
- # Execute the entire code block at once
130
- with redirect_stdout(captured_output):
131
- # Try to evaluate as expression first (for simple expressions)
132
- lines = code.strip().split('\n')
133
- if len(lines) == 1 and not any(keyword in code for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with']):
134
- try:
135
- result = eval(code, restricted_globals, local_scope)
136
- print(f"Result: {result}")
137
- except:
138
- # If eval fails, use exec
139
- exec(code, restricted_globals, local_scope)
140
- else:
141
- # For multi-line code, execute the entire block
142
- exec(code, restricted_globals, local_scope)
143
-
144
- # Get the captured output
145
- output = captured_output.getvalue()
146
-
147
- if output.strip():
148
- return output.strip()
149
- else:
150
- # If no output, check if there's a result from the last expression
151
- lines = code.strip().split('\n')
152
- last_line = lines[-1].strip() if lines else ""
153
-
154
- # If the last line looks like an expression, try to evaluate it
155
- if last_line and not any(keyword in last_line for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with', 'print']):
156
- try:
157
- result = eval(last_line, restricted_globals, local_scope)
158
- return f"Result: {result}"
159
- except:
160
- pass
161
-
162
- return "Code executed successfully with no output."
163
-
164
- except SyntaxError as e:
165
- return f"Syntax Error: {str(e)}"
166
- except NameError as e:
167
- return f"Name Error: {str(e)}"
168
- except ZeroDivisionError as e:
169
- return f"Zero Division Error: {str(e)}"
170
- except Exception as e:
171
- return f"Error executing code: {str(e)}"
172
 
173
- # Apify-based search function
174
- # def apify_google_search(query: str, limit: int = 10) -> str:
175
- # """
176
- # Use Apify's Google Search Results Scraper to get search results
177
- #
178
- # Args:
179
- # query: The search query string
180
- # limit: Number of results to return (10, 20, 30, 40, 50, 100)
181
- #
182
- # Returns:
183
- # Formatted search results as a string
184
- # """
185
- # # You would need to provide a valid Apify API token
186
- # # You can get one by signing up at https://apify.com/
187
- # # Replace this with your actual Apify API token or set as environment variable
188
- # APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN", "")
189
- #
190
- # if not APIFY_API_TOKEN:
191
- # print("No Apify API token found. Using fallback search method.")
192
- # return fallback_search(query)
193
- #
194
- # try:
195
- # # Initialize the ApifyClient with API token
196
- # client = ApifyClient(APIFY_API_TOKEN)
197
- #
198
- # # Prepare the Actor input - convert limit to string as required by the API
199
- # run_input = {
200
- # "keyword": query,
201
- # "limit": str(limit), # Convert to string as required by the API
202
- # "country": "US"
203
- # }
204
- #
205
- # # The Actor ID for the Google Search Results Scraper
206
- # ACTOR_ID = "563JCPLOqM1kMmbbP"
207
- #
208
- # print(f"Starting Apify search for: '{query}'")
209
- #
210
- # # Run the Actor and wait for it to finish (with timeout)
211
- # run = client.actor(ACTOR_ID).call(run_input=run_input, timeout_secs=60)
212
- #
213
- # if not run or not run.get("defaultDatasetId"):
214
- # print("Failed to get results from Apify actor")
215
- # return fallback_search(query)
216
- #
217
- # # Fetch Actor results from the run's dataset
218
- # results = []
219
- # for item in client.dataset(run["defaultDatasetId"]).iterate_items():
220
- # results.append(item)
221
- #
222
- # # Format and return the results
223
- # return format_search_results(results, query)
224
- #
225
- # except Exception as e:
226
- # print(f"Error using Apify: {str(e)}")
227
- # return fallback_search(query)
228
 
229
- def scrape_webpage(url: str) -> str:
230
- """
231
- Safely scrape content from a specified URL.
232
-
233
- Args:
234
- url: The URL to scrape
235
-
236
- Returns:
237
- Formatted webpage content as text
238
- """
239
- # Check if the URL is valid
240
- try:
241
- # Parse the URL to validate it
242
- parsed_url = urlparse(url)
243
- if not parsed_url.scheme or not parsed_url.netloc:
244
- return f"Error: Invalid URL format: {url}. Please provide a valid URL with http:// or https:// prefix."
245
-
246
- # Block potentially dangerous URLs
247
- blocked_domains = [
248
- "localhost", "127.0.0.1", "0.0.0.0",
249
- "192.168.", "10.0.", "172.16.", "172.17.", "172.18.", "172.19.", "172.20.",
250
- "172.21.", "172.22.", "172.23.", "172.24.", "172.25.", "172.26.", "172.27.",
251
- "172.28.", "172.29.", "172.30.", "172.31."
252
- ]
253
-
254
- if any(domain in parsed_url.netloc for domain in blocked_domains):
255
- return f"Error: Access to internal/local URLs is blocked for security: {url}"
256
-
257
- print(f"Scraping URL: {url}")
258
-
259
- # Set user agent to avoid being blocked
260
- headers = {
261
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
262
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
263
- 'Accept-Language': 'en-US,en;q=0.5',
264
- 'Connection': 'keep-alive',
265
- 'Upgrade-Insecure-Requests': '1',
266
- 'Cache-Control': 'max-age=0',
267
- }
268
-
269
- # Set a reasonable timeout to avoid hanging
270
- timeout = 10
271
-
272
- # Make the request
273
- response = requests.get(url, headers=headers, timeout=timeout)
274
-
275
- # Check if request was successful
276
- if response.status_code != 200:
277
- return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
278
-
279
- # Use BeautifulSoup to parse the HTML
280
- soup = BeautifulSoup(response.text, 'html.parser')
281
-
282
- # Remove script and style elements that are not relevant to content
283
- for script_or_style in soup(["script", "style", "iframe", "footer", "nav"]):
284
- script_or_style.decompose()
285
-
286
- # Get the page title
287
- title = soup.title.string if soup.title else "No title found"
288
-
289
- # Extract the main content
290
- # First try to find main content areas
291
- main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content')
292
-
293
- # If no main content area is found, use the entire body
294
- if not main_content:
295
- main_content = soup.body
296
-
297
- # Convert to plain text
298
- h = html2text.HTML2Text()
299
- h.ignore_links = False
300
- h.ignore_images = True
301
- h.ignore_tables = False
302
- h.unicode_snob = True
303
-
304
- if main_content:
305
- text_content = h.handle(str(main_content))
306
- else:
307
- text_content = h.handle(response.text)
308
-
309
- # Limit content length to avoid overwhelming the model
310
- max_content_length = 99999999999
311
- if len(text_content) > max_content_length:
312
- text_content = text_content[:max_content_length] + "\n\n[Content truncated due to length...]"
313
-
314
- # Format the response
315
- result = f"Title: {title}\nURL: {url}\n\n{text_content}"
316
-
317
- return result
318
-
319
- except requests.exceptions.Timeout:
320
- return f"Error: Request timed out while trying to access {url}"
321
- except requests.exceptions.ConnectionError:
322
- return f"Error: Failed to connect to {url}. The site might be down or the URL might be incorrect."
323
- except requests.exceptions.RequestException as e:
324
- return f"Error requesting {url}: {str(e)}"
325
- except Exception as e:
326
- return f"Error scraping webpage {url}: {str(e)}"
327
-
328
- # Comment out the format_search_results function (around line 180)
329
- # def format_search_results(results: List[Dict], query: str) -> str:
330
- # """Format the search results into a readable string"""
331
- # if not results or len(results) == 0:
332
- # return f"No results found for query: {query}"
333
- #
334
- # print(f"Raw search results: {str(results)[:1000]}...")
335
- #
336
- # # Extract search results from the Apify output
337
- # formatted_results = f"Search results for '{query}':\n\n"
338
- #
339
- # # Check if results is a list of dictionaries or a dictionary with nested results
340
- # if isinstance(results, dict) and "results" in results:
341
- # items = results["results"]
342
- # elif isinstance(results, list):
343
- # items = results
344
- # else:
345
- # return f"Unable to process results for query: {query}"
346
- #
347
- # # Handle different Apify result formats
348
- # if len(items) > 0:
349
- # # Check the structure of the first item to determine format
350
- # first_item = items[0]
351
- #
352
- # # If item has 'organicResults', this is the format from some Apify actors
353
- # if isinstance(first_item, dict) and "organicResults" in first_item:
354
- # organic_results = first_item.get("organicResults", [])
355
- # for i, result in enumerate(organic_results[:10], 1):
356
- # if "title" in result and "url" in result:
357
- # formatted_results += f"{i}. {result['title']}\n"
358
- # formatted_results += f" URL: {result['url']}\n"
359
- # if "snippet" in result:
360
- # formatted_results += f" {result['snippet']}\n"
361
- # formatted_results += "\n"
362
- # else:
363
- # # Standard format with title/url/description
364
- # for i, result in enumerate(items[:10], 1):
365
- # if "title" in result and "url" in result:
366
- # formatted_results += f"{i}. {result['title']}\n"
367
- # formatted_results += f" URL: {result['url']}\n"
368
- # if "description" in result:
369
- # formatted_results += f" {result['description']}\n"
370
- # elif "snippet" in result:
371
- # formatted_results += f" {result['snippet']}\n"
372
- # formatted_results += "\n"
373
- #
374
- # return formatted_results
375
-
376
- # Comment out the fallback_search function (around line 220)
377
- # def fallback_search(query: str) -> str:
378
- # """Fallback search method using DuckDuckGo when Apify is not available"""
379
- # try:
380
- # search_tool = DuckDuckGoSearchRun()
381
- # result = search_tool.invoke(query)
382
- # return "Observation: " + result
383
- # except Exception as e:
384
- # return f"Search error: {str(e)}. Please try a different query or method."
385
-
386
- # Comment out the safe_web_search function (around line 230)
387
- # def safe_web_search(query: str) -> str:
388
- # """Search the web safely with error handling and retry logic."""
389
- # if not query:
390
- # return "Error: No search query provided. Please specify what you want to search for."
391
- #
392
- # # Try using Apify first, if it fails it will use the fallback
393
- # return "Observation: " + apify_google_search(query)
394
- #
395
- # # The code below is kept for reference but won't be executed
396
- # max_retries = 3
397
- # backoff_factor = 1.5
398
- #
399
- # for attempt in range(max_retries):
400
- # try:
401
- # # Use the DuckDuckGoSearchRun tool
402
- # search_tool = DuckDuckGoSearchRun()
403
- # result = search_tool.invoke(query)
404
- #
405
- # # If we get an empty result, provide a helpful message
406
- # if not result or len(result.strip()) < 10:
407
- # return f"The search for '{query}' did not return any useful results. Please try a more specific query or a different search engine."
408
- #
409
- # return "Observation: " + result
410
- #
411
- # except Exception as e:
412
- # # If we're being rate limited
413
- # if "Ratelimit" in str(e) or "429" in str(e):
414
- # if attempt < max_retries - 1:
415
- # wait_time = backoff_factor ** attempt
416
- # print(f"Rate limited, waiting {wait_time:.2f} seconds before retrying...")
417
- # time.sleep(wait_time)
418
- # else:
419
- # # On last attempt, return a helpful error
420
- # error_msg = f"I'm currently unable to search for '{query}' due to service rate limits. "
421
- # return error_msg
422
- # else:
423
- # # For other types of errors
424
- # return f"Error while searching for '{query}': {str(e)}"
425
- #
426
- # return f"Failed to search for '{query}' after multiple attempts due to rate limiting."
427
-
428
- def wikipedia_search(query: str, num_results: int = 3) -> str:
429
- """
430
- Search Wikipedia for information about a specific query.
431
-
432
- Args:
433
- query: Search query
434
- num_results: Number of search results to return (default: 3)
435
-
436
- Returns:
437
- Formatted Wikipedia search results
438
- """
439
- try:
440
- # Validate input
441
- if not query or not isinstance(query, str):
442
- return "Error: Please provide a valid search query."
443
-
444
- # Ensure num_results is valid
445
- try:
446
- num_results = int(num_results)
447
- if num_results <= 0:
448
- num_results = 3 # Default to 3 if invalid
449
- except:
450
- num_results = 3 # Default to 3 if conversion fails
451
-
452
- print(f"Searching Wikipedia for: {query}")
453
-
454
- # Use WikipediaLoader from LangChain
455
- loader = WikipediaLoader(query=query, load_max_docs=num_results)
456
- docs = loader.load()
457
-
458
- if not docs:
459
- return f"No Wikipedia results found for '{query}'. Try refining your search."
460
-
461
- # Format the results
462
- formatted_results = f"Wikipedia search results for '{query}':\n\n"
463
-
464
- for i, doc in enumerate(docs, 1):
465
- title = doc.metadata.get('title', 'Unknown Title')
466
- source = doc.metadata.get('source', 'No URL')
467
- content = doc.page_content
468
-
469
- # Truncate content if too long
470
- if len(content) > 500:
471
- content = content[:500] + "..."
472
-
473
- formatted_results += f"{i}. {title}\n"
474
- formatted_results += f" URL: {source}\n"
475
- formatted_results += f" {content}\n\n"
476
-
477
- return formatted_results
478
-
479
- except Exception as e:
480
- return f"Error searching Wikipedia: {str(e)}"
481
-
482
- def tavily_search(query: str, search_depth: str = "basic") -> str:
483
- """
484
- Search the web using the Tavily Search API.
485
-
486
- Args:
487
- query: Search query
488
- search_depth: Depth of search ('basic' or 'comprehensive')
489
-
490
- Returns:
491
- Formatted search results from Tavily
492
- """
493
- try:
494
- # Check for API key
495
- tavily_api_key = os.environ.get("TAVILY_API_KEY")
496
- if not tavily_api_key:
497
- return "Error: Tavily API key not found. Please set the TAVILY_API_KEY environment variable."
498
-
499
- # Validate input
500
- if not query or not isinstance(query, str):
501
- return "Error: Please provide a valid search query."
502
-
503
- # Validate search_depth
504
- if search_depth not in ["basic", "comprehensive"]:
505
- search_depth = "basic" # Default to basic if invalid
506
-
507
- print(f"Searching Tavily for: {query} (depth: {search_depth})")
508
-
509
- # Initialize the Tavily search tool
510
- search = TavilySearchResults(api_key=tavily_api_key)
511
-
512
- # Execute the search
513
- results = search.invoke({"query": query, "search_depth": search_depth})
514
-
515
- if not results:
516
- return f"No Tavily search results found for '{query}'. Try refining your search."
517
-
518
- # Format the results
519
- formatted_results = f"Tavily search results for '{query}':\n\n"
520
-
521
- for i, result in enumerate(results, 1):
522
- formatted_results += f"{i}. {result.get('title', 'No title')}\n"
523
- formatted_results += f" URL: {result.get('url', 'No URL')}\n"
524
- formatted_results += f" {result.get('content', 'No content')}\n\n"
525
-
526
- return formatted_results
527
-
528
- except Exception as e:
529
- return f"Error searching with Tavily: {str(e)}"
530
-
531
- def arxiv_search(query: str, max_results: int = 5) -> str:
532
- """
533
- Search ArXiv for scientific papers matching the query.
534
-
535
- Args:
536
- query: Search query for ArXiv
537
- max_results: Maximum number of results to return
538
-
539
- Returns:
540
- Formatted ArXiv search results
541
- """
542
- try:
543
- # Validate input
544
- if not query or not isinstance(query, str):
545
- return "Error: Please provide a valid search query."
546
-
547
- # Ensure max_results is valid
548
- try:
549
- max_results = int(max_results)
550
- if max_results <= 0 or max_results > 10:
551
- max_results = 5 # Default to 5 if invalid or too large
552
- except:
553
- max_results = 5 # Default to 5 if conversion fails
554
-
555
- print(f"Searching ArXiv for: {query}")
556
-
557
- # Use ArxivLoader from LangChain
558
- loader = ArxivLoader(
559
- query=query,
560
- load_max_docs=max_results,
561
- load_all_available_meta=True
562
- )
563
-
564
- docs = loader.load()
565
-
566
- if not docs:
567
- return f"No ArXiv papers found for '{query}'. Try refining your search."
568
-
569
- # Format the results
570
- formatted_results = f"ArXiv papers for '{query}':\n\n"
571
-
572
- for i, doc in enumerate(docs, 1):
573
- meta = doc.metadata
574
- title = meta.get('Title', 'Unknown Title')
575
- url = meta.get('Entry ID', 'No URL')
576
- authors = meta.get('Authors', 'Unknown Authors')
577
- published = meta.get('Published', 'Unknown Date')
578
-
579
- formatted_results += f"{i}. {title}\n"
580
- formatted_results += f" URL: {url}\n"
581
- formatted_results += f" Authors: {authors}\n"
582
- formatted_results += f" Published: {published}\n"
583
-
584
- # Add abstract, truncated if too long
585
- abstract = doc.page_content.replace('\n', ' ')
586
- if len(abstract) > 300:
587
- abstract = abstract[:300] + "..."
588
- formatted_results += f" Abstract: {abstract}\n\n"
589
-
590
- return formatted_results
591
-
592
- except Exception as e:
593
- return f"Error searching ArXiv: {str(e)}"
594
-
595
- def supabase_operation(operation_type: str, table: str, data: dict = None, filters: dict = None) -> str:
596
- """
597
- Perform operations on Supabase database.
598
-
599
- Args:
600
- operation_type: Type of operation ('insert', 'select', 'update', 'delete')
601
- table: Name of the table to operate on
602
- data: Data to insert/update (for insert/update operations)
603
- filters: Filters for select/update/delete operations (e.g., {"id": 1})
604
-
605
- Returns:
606
- Result of the operation as a formatted string
607
- """
608
- try:
609
- # Get Supabase credentials from environment variables
610
- supabase_url = os.environ.get("SUPABASE_URL")
611
- supabase_key = os.environ.get("SUPABASE_ANON_KEY")
612
-
613
- if not supabase_url or not supabase_key:
614
- return "Error: Supabase credentials not found. Please set SUPABASE_URL and SUPABASE_ANON_KEY environment variables."
615
-
616
- # Create Supabase client
617
- supabase: Client = create_client(supabase_url, supabase_key)
618
-
619
- # Validate inputs
620
- if not table:
621
- return "Error: Table name is required."
622
-
623
- if operation_type not in ['insert', 'select', 'update', 'delete']:
624
- return "Error: Invalid operation type. Use 'insert', 'select', 'update', or 'delete'."
625
-
626
- # Perform the operation based on type
627
- if operation_type == 'insert':
628
- if not data:
629
- return "Error: Data is required for insert operation."
630
-
631
- result = supabase.table(table).insert(data).execute()
632
- return f"Insert successful: {len(result.data)} row(s) inserted into {table}"
633
-
634
- elif operation_type == 'select':
635
- query = supabase.table(table).select("*")
636
-
637
- # Apply filters if provided
638
- if filters:
639
- for key, value in filters.items():
640
- query = query.eq(key, value)
641
-
642
- result = query.execute()
643
- return f"Select successful: Found {len(result.data)} row(s) in {table}\nData: {json.dumps(result.data, indent=2)}"
644
-
645
- elif operation_type == 'update':
646
- if not data or not filters:
647
- return "Error: Both data and filters are required for update operation."
648
-
649
- query = supabase.table(table).update(data)
650
-
651
- # Apply filters
652
- for key, value in filters.items():
653
- query = query.eq(key, value)
654
-
655
- result = query.execute()
656
- return f"Update successful: {len(result.data)} row(s) updated in {table}"
657
-
658
- elif operation_type == 'delete':
659
- if not filters:
660
- return "Error: Filters are required for delete operation."
661
-
662
- query = supabase.table(table).delete()
663
-
664
- # Apply filters
665
- for key, value in filters.items():
666
- query = query.eq(key, value)
667
-
668
- result = query.execute()
669
- return f"Delete successful: Rows deleted from {table}"
670
-
671
- except Exception as e:
672
- return f"Error performing Supabase operation: {str(e)}"
673
-
674
- def excel_to_text(excel_path: str, sheet_name: Optional[str] = None, file_content: Optional[bytes] = None) -> str:
675
- """
676
- Read an Excel file and return a Markdown table of the requested sheet.
677
-
678
- Args:
679
- excel_path: Path to the Excel file (.xlsx or .xls) or name for the attached file.
680
- sheet_name: Optional name or index of the sheet to read. If None, reads the first sheet.
681
- file_content: Optional binary content of the file if provided as an attachment.
682
-
683
- Returns:
684
- A Markdown table representing the Excel sheet, or an error message if the file is not found or cannot be read.
685
- """
686
- try:
687
- # Handle file attachment case
688
- if file_content:
689
- # Create a temporary file to save the attachment
690
- with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as temp_file:
691
- temp_file.write(file_content)
692
- temp_path = temp_file.name
693
-
694
- print(f"Saved attached Excel file to temporary location: {temp_path}")
695
- file_path = Path(temp_path)
696
- else:
697
- # Regular file path case
698
- file_path = Path(excel_path).expanduser().resolve()
699
- if not file_path.is_file():
700
- return f"Error: Excel file not found at {file_path}"
701
-
702
- # Process the Excel file
703
- sheet: Union[str, int] = (
704
- int(sheet_name)
705
- if sheet_name and sheet_name.isdigit()
706
- else sheet_name or 0
707
- )
708
-
709
- df = pd.read_excel(file_path, sheet_name=sheet)
710
-
711
- # Clean up temporary file if we created one
712
- if file_content and os.path.exists(temp_path):
713
- os.unlink(temp_path)
714
- print(f"Deleted temporary Excel file: {temp_path}")
715
-
716
- if hasattr(df, "to_markdown"):
717
- return df.to_markdown(index=False)
718
-
719
- return tabulate(df, headers="keys", tablefmt="github", showindex=False)
720
-
721
- except Exception as e:
722
- # Clean up temporary file in case of error
723
- if file_content and 'temp_path' in locals() and os.path.exists(temp_path):
724
- os.unlink(temp_path)
725
- print(f"Deleted temporary Excel file due to error: {temp_path}")
726
- return f"Error reading Excel file: {e}"
727
 
728
  # System prompt to guide the model's behavior
729
  #web_search: Search the google search engine when Tavily Search and Wikipedia Search do not return a result. Provide a specific search query.
730
  #webpage_scrape: Scrape content from a specific webpage URL when Tavily Search and Wikipedia Search do not return a result. Provide a valid URL to extract information from a particular web page.
731
  #Give preference to using Tavily Search and Wikipedia Search before using web_search or webpage_scrape. When Web_search does not return a result, use Tavily Search.
732
 
733
- SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely on your internal knowledge unless web searches are rate-limited or you're specifically instructed to. You have access to the following tools:
734
-
735
- python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
736
- wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return.
737
- tavily_search: Search the web using Tavily for more comprehensive results. Optionally specify search_depth as 'basic' or 'comprehensive'.
738
- arxiv_search: Search ArXiv for scientific papers on a specific topic. Optionally specify max_results to control the number of papers returned.
739
- supabase_operation: Perform database operations on Supabase (insert, select, update, delete). Provide operation_type, table name, and optional data/filters.
740
- excel_to_text: Read an Excel file and convert it to a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.
741
 
742
  The way you use the tools is by specifying a json blob.
743
  Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
744
 
745
  The only values that should be in the "action" field are:
746
- python_code: Execute Python code, args: {"code": {"type": "string"}}
747
- wikipedia_search: Search Wikipedia, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
748
- tavily_search: Search with Tavily, args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
749
- arxiv_search: Search ArXiv papers, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
750
  webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
751
  supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
752
- excel_to_text: Convert Excel to Markdown table with file path, args: {"excel_path": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
753
  excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
 
754
 
755
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
756
 
757
- If you do not want to use any tool AND have not yet arrived at a solution, call the python_code tool with an empty string as the code.
758
-
759
  Example use for tools:
760
 
761
  ```json
@@ -767,21 +80,14 @@ Example use for tools:
767
  or
768
  ```json
769
  {
770
- "action": "python_code",
771
- "action_input": {"code": "c = a + b"}
772
- }
773
- ```
774
- or
775
- ```json
776
- {
777
- "action": "excel_to_text",
778
- "action_input": {"excel_path": "data.xlsx", "file_content": "BASE64_ENCODED_CONTENT_HERE", "sheet_name": "Sheet1"}
779
  }
780
  ```
781
 
782
  ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
783
  Question: [the user's question]
784
- Thought: [your reasoning about what to do next]
785
  Action:
786
  ```json
787
  {
@@ -790,7 +96,7 @@ Action:
790
  }
791
  ```
792
  Observation: [the result from the tool will appear here]
793
- Thought: [your reasoning after seeing the observation]
794
  Action:
795
  ```json
796
  {
@@ -863,6 +169,11 @@ tools_config = [
863
  "name": "excel_to_text",
864
  "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
865
  "func": excel_to_text
866
  }
867
  ]
868
 
@@ -984,22 +295,21 @@ def assistant(state: AgentState) -> Dict[str, Any]:
984
  tool_name = action_json["action"]
985
  tool_input = action_json["action_input"]
986
 
987
- # Handle nested JSON issue - if action_input is a string containing JSON
988
- if tool_name == "python_code" and isinstance(tool_input, dict) and "code" in tool_input:
989
- code = tool_input["code"]
990
- if code.startswith("{") and ("action" in code or "action_input" in code):
991
- try:
992
- # Try to see if this is a nested JSON structure
993
- nested_json = json.loads(code)
994
- if isinstance(nested_json, dict) and "action" in nested_json and "action_input" in nested_json:
995
- # Replace with the nested structure
996
- tool_name = nested_json["action"]
997
- tool_input = nested_json["action_input"]
998
- print(f"Unwrapped nested JSON. New tool: {tool_name}")
999
- print(f"New tool input: {tool_input}")
1000
- except:
1001
- # If it fails, keep original values
1002
- pass
1003
 
1004
  print(f"Using tool: {tool_name}")
1005
  print(f"Tool input: {tool_input}")
@@ -1075,7 +385,7 @@ def extract_json_from_text(text: str) -> dict:
1075
  print(f"Found valid JSON object: {parsed}")
1076
  return parsed
1077
  except json.JSONDecodeError:
1078
- continue
1079
 
1080
  # Pattern 4: Look for patterns like 'action': 'tool_name', 'action_input': {...}
1081
  action_pattern = re.search(r"['\"](action)['\"]:\s*['\"](\w+)['\"]", text)
@@ -1505,6 +815,62 @@ def excel_to_text_node(state: AgentState) -> Dict[str, Any]:
1505
  "action_input": None # Clear the action input
1506
  }
1507
 
1508
  # Router function to direct to the correct tool
1509
  def router(state: AgentState) -> str:
1510
  """Route to the appropriate tool based on the current_tool field."""
@@ -1513,8 +879,6 @@ def router(state: AgentState) -> str:
1513
  print(f"Routing to: {tool}")
1514
  print(f"Router received action_input: {action_input}")
1515
 
1516
- # if tool == "web_search":
1517
- # return "web_search"
1518
  if tool == "python_code":
1519
  return "python_code"
1520
  elif tool == "webpage_scrape":
@@ -1529,6 +893,8 @@ def router(state: AgentState) -> str:
1529
  return "supabase_operation"
1530
  elif tool == "excel_to_text":
1531
  return "excel_to_text"
 
 
1532
  else:
1533
  return "end"
1534
 
@@ -1539,7 +905,6 @@ def create_agent_graph() -> StateGraph:
1539
 
1540
  # Define nodes: these do the work
1541
  builder.add_node("assistant", assistant)
1542
- # builder.add_node("web_search", web_search_node)
1543
  builder.add_node("python_code", python_code_node)
1544
  builder.add_node("webpage_scrape", webpage_scrape_node)
1545
  builder.add_node("wikipedia_search", wikipedia_search_node)
@@ -1547,6 +912,7 @@ def create_agent_graph() -> StateGraph:
1547
  builder.add_node("arxiv_search", arxiv_search_node)
1548
  builder.add_node("supabase_operation", supabase_operation_node)
1549
  builder.add_node("excel_to_text", excel_to_text_node)
 
1550
 
1551
  # Define edges: these determine how the control flow moves
1552
  builder.add_edge(START, "assistant")
@@ -1571,7 +937,6 @@ def create_agent_graph() -> StateGraph:
1571
  "debug",
1572
  router,
1573
  {
1574
- # "web_search": "web_search",
1575
  "python_code": "python_code",
1576
  "webpage_scrape": "webpage_scrape",
1577
  "wikipedia_search": "wikipedia_search",
@@ -1579,12 +944,12 @@ def create_agent_graph() -> StateGraph:
1579
  "arxiv_search": "arxiv_search",
1580
  "supabase_operation": "supabase_operation",
1581
  "excel_to_text": "excel_to_text",
 
1582
  "end": END
1583
  }
1584
  )
1585
 
1586
  # Tools always go back to assistant
1587
- # builder.add_edge("web_search", "assistant")
1588
  builder.add_edge("python_code", "assistant")
1589
  builder.add_edge("webpage_scrape", "assistant")
1590
  builder.add_edge("wikipedia_search", "assistant")
@@ -1592,6 +957,7 @@ def create_agent_graph() -> StateGraph:
1592
  builder.add_edge("arxiv_search", "assistant")
1593
  builder.add_edge("supabase_operation", "assistant")
1594
  builder.add_edge("excel_to_text", "assistant")
 
1595
 
1596
  # Compile the graph
1597
  return builder.compile()
@@ -1677,30 +1043,3 @@ I need to make headings for the fruits and vegetables. Could you please create a
1677
  print("\nFinal Response:")
1678
  print(response)
1679
 
1680
- def save_attachment_to_tempfile(file_content_b64: str, file_extension: str = '.xlsx') -> str:
1681
- """
1682
- Decode a base64 file content and save it to a temporary file.
1683
-
1684
- Args:
1685
- file_content_b64: Base64 encoded file content
1686
- file_extension: File extension to use for the temporary file
1687
-
1688
- Returns:
1689
- Path to the saved temporary file
1690
- """
1691
- try:
1692
- # Decode the base64 content
1693
- file_content = base64.b64decode(file_content_b64)
1694
-
1695
- # Create a temporary file with the appropriate extension
1696
- with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
1697
- temp_file.write(file_content)
1698
- temp_path = temp_file.name
1699
-
1700
- print(f"Saved attachment to temporary file: {temp_path}")
1701
- return temp_path
1702
-
1703
- except Exception as e:
1704
- print(f"Error saving attachment: {e}")
1705
- return None
1706
-
 
17
  from tabulate import tabulate
18
  import base64
19
 
20
+ # Import all tool functions from tools.py
21
+ from tools import (
22
+ tools_config,
23
+ run_python_code,
24
+ scrape_webpage,
25
+ wikipedia_search,
26
+ tavily_search,
27
+ arxiv_search,
28
+ supabase_operation,
29
+ excel_to_text,
30
+ save_attachment_to_tempfile,
31
+ process_youtube_video
32
+ )
33
 
34
  load_dotenv()
35
 
36
+ # Remove the following functions from agent.py since they're now imported from tools.py:
37
+ # - run_python_code (lines ~28-175)
38
+ # - scrape_webpage (lines ~177-310)
39
+ # - wikipedia_search (lines ~345-405)
40
+ # - tavily_search (lines ~407-470)
41
+ # - arxiv_search (lines ~472-535)
42
+ # - supabase_operation (lines ~537-620)
43
+ # - excel_to_text (lines ~622-690)
44
+ # - save_attachment_to_tempfile (lines ~1680-1706)
45
 
46
+ # Also remove the tools_config definition (lines ~795-870) since it's imported from tools.py
47
 
48
+ # The rest of the file remains the same...
49
 
50
  # System prompt to guide the model's behavior
51
  #web_search: Search the google search engine when Tavily Search and Wikipedia Search do not return a result. Provide a specific search query.
52
  #webpage_scrape: Scrape content from a specific webpage URL when Tavily Search and Wikipedia Search do not return a result. Provide a valid URL to extract information from a particular web page.
53
  #Give preference to using Tavily Search and Wikipedia Search before using web_search or webpage_scrape. When Web_search does not return a result, use Tavily Search.
54
 
55
+ SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely on your internal knowledge unless the tools fail to provide a result:
56
 
57
  The way you use the tools is by specifying a json blob.
58
  Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
59
 
60
  The only values that should be in the "action" field are:
61
+ python_code: Execute Python code. Use this tool to calculate math problems. args: {"code": {"type": "string"}}
62
+ wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
63
+ tavily_search: Search the web using Tavily for more comprehensive results. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
64
+ arxiv_search: Search ArXiv for scientific papers. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
65
  webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
66
  supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
 
67
  excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
68
+ process_youtube_video: Extract and analyze YouTube video content by providing the video URL. Returns video metadata and transcript, args: {"url": {"type": "string"}, "summarize": {"type": "boolean", "optional": true}}
69
 
70
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
71
 
 
 
72
  Example use for tools:
73
 
74
  ```json
 
80
  or
81
  ```json
82
  {
83
+ "action": "process_youtube_video",
84
+ "action_input": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "summarize": true}
85
  }
86
  ```
87
 
88
  ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
89
  Question: [the user's question]
90
+ Thought: [your reasoning about what to do next, break it down into smaller steps]
91
  Action:
92
  ```json
93
  {
 
96
  }
97
  ```
98
  Observation: [the result from the tool will appear here]
99
+ Thought: [your reasoning after seeing the observation, break it down into smaller steps]
100
  Action:
101
  ```json
102
  {
 
169
  "name": "excel_to_text",
170
  "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
171
  "func": excel_to_text
172
+ },
173
+ {
174
+ "name": "process_youtube_video",
175
+ "description": "Extract and analyze YouTube video content by providing the video URL. Returns video metadata and transcript.",
176
+ "func": process_youtube_video
177
  }
178
  ]
179
 
 
295
  tool_name = action_json["action"]
296
  tool_input = action_json["action_input"]
297
 
298
+ # Handle nested JSON issue - check if any value in action_input is a JSON string
299
+ if isinstance(tool_input, dict):
300
+ for key, value in tool_input.items():
301
+ if isinstance(value, str) and value.strip().startswith("{"):
302
+ try:
303
+ nested_json = json.loads(value)
304
+ if isinstance(nested_json, dict) and "action" in nested_json and "action_input" in nested_json:
305
+ # This is a nested structure, use the inner one
306
+ tool_name = nested_json["action"]
307
+ tool_input = nested_json["action_input"]
308
+ print(f"Unwrapped nested JSON. New tool: {tool_name}")
309
+ print(f"New tool input: {tool_input}")
310
+ break
311
+ except json.JSONDecodeError:
312
+ continue
 
313
 
314
  print(f"Using tool: {tool_name}")
315
  print(f"Tool input: {tool_input}")
 
385
  print(f"Found valid JSON object: {parsed}")
386
  return parsed
387
  except json.JSONDecodeError:
388
+ continue
389
 
390
  # Pattern 4: Look for patterns like 'action': 'tool_name', 'action_input': {...}
391
  action_pattern = re.search(r"['\"](action)['\"]:\s*['\"](\w+)['\"]", text)
 
815
  "action_input": None # Clear the action input
816
  }
817
 
818
+ # Add a new node function for processing YouTube videos
819
+ def process_youtube_video_node(state: AgentState) -> Dict[str, Any]:
820
+ """Node that processes YouTube videos."""
821
+ print("YouTube Video Processing Tool Called...\n\n")
822
+
823
+ # Extract tool arguments
824
+ action_input = state.get("action_input", {})
825
+ print(f"YouTube video processing action_input: {action_input}")
826
+
827
+ # Extract URL and other parameters
828
+ url = ""
829
+ summarize = True # Default
830
+
831
+ if isinstance(action_input, dict):
832
+ url = action_input.get("url", "")
833
+ # Check if summarize parameter exists and is a boolean
834
+ if "summarize" in action_input:
835
+ try:
836
+ summarize = bool(action_input["summarize"])
837
+ except:
838
+ print("Invalid summarize parameter, using default (True)")
839
+ elif isinstance(action_input, str):
840
+ # If action_input is just a string, assume it's the URL
841
+ url = action_input
842
+
843
+ print(f"Processing YouTube video: '{url}' (summarize: {summarize})")
844
+
845
+ # Safety check - don't run with empty URL
846
+ if not url:
847
+ result = "Error: No URL provided. Please provide a valid YouTube URL."
848
+ else:
849
+ # Import the function dynamically to ensure we're using the latest version
850
+ from tools import process_youtube_video
851
+ # Call the YouTube processing function
852
+ result = process_youtube_video(url, summarize)
853
+
854
+ print(f"YouTube processing result length: {len(result)}")
855
+
856
+ # Format the observation to continue the ReAct cycle
857
+ tool_message = AIMessage(
858
+ content=f"Observation: {result.strip()}"
859
+ )
860
+
861
+ # Print the observation that will be sent back to the assistant
862
+ print("\n=== TOOL OBSERVATION ===")
863
+ content_preview = tool_message.content[:500] + "..." if len(tool_message.content) > 500 else tool_message.content
864
+ print(content_preview)
865
+ print("=== END OBSERVATION ===\n")
866
+
867
+ # Return the updated state
868
+ return {
869
+ "messages": state["messages"] + [tool_message],
870
+ "current_tool": None, # Reset the current tool
871
+ "action_input": None # Clear the action input
872
+ }
873
+
874
  # Router function to direct to the correct tool
875
  def router(state: AgentState) -> str:
876
  """Route to the appropriate tool based on the current_tool field."""
 
879
  print(f"Routing to: {tool}")
880
  print(f"Router received action_input: {action_input}")
881
 
 
 
882
  if tool == "python_code":
883
  return "python_code"
884
  elif tool == "webpage_scrape":
 
893
  return "supabase_operation"
894
  elif tool == "excel_to_text":
895
  return "excel_to_text"
896
+ elif tool == "process_youtube_video":
897
+ return "process_youtube_video"
898
  else:
899
  return "end"
900
 
 
905
 
906
  # Define nodes: these do the work
907
  builder.add_node("assistant", assistant)
 
908
  builder.add_node("python_code", python_code_node)
909
  builder.add_node("webpage_scrape", webpage_scrape_node)
910
  builder.add_node("wikipedia_search", wikipedia_search_node)
 
912
  builder.add_node("arxiv_search", arxiv_search_node)
913
  builder.add_node("supabase_operation", supabase_operation_node)
914
  builder.add_node("excel_to_text", excel_to_text_node)
915
+ builder.add_node("process_youtube_video", process_youtube_video_node)
916
 
917
  # Define edges: these determine how the control flow moves
918
  builder.add_edge(START, "assistant")
 
937
  "debug",
938
  router,
939
  {
 
940
  "python_code": "python_code",
941
  "webpage_scrape": "webpage_scrape",
942
  "wikipedia_search": "wikipedia_search",
 
944
  "arxiv_search": "arxiv_search",
945
  "supabase_operation": "supabase_operation",
946
  "excel_to_text": "excel_to_text",
947
+ "process_youtube_video": "process_youtube_video",
948
  "end": END
949
  }
950
  )
951
 
952
  # Tools always go back to assistant
 
953
  builder.add_edge("python_code", "assistant")
954
  builder.add_edge("webpage_scrape", "assistant")
955
  builder.add_edge("wikipedia_search", "assistant")
 
957
  builder.add_edge("arxiv_search", "assistant")
958
  builder.add_edge("supabase_operation", "assistant")
959
  builder.add_edge("excel_to_text", "assistant")
960
+ builder.add_edge("process_youtube_video", "assistant")
961
 
962
  # Compile the graph
963
  return builder.compile()
 
1043
  print("\nFinal Response:")
1044
  print(response)
1045
 
requirements.txt CHANGED
@@ -10,4 +10,7 @@ beautifulsoup4
 html2text
 supabase
 pandas
-tabulate
+tabulate
+pytube
+youtube-transcript-api
+python-dotenv
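
The new pytube and youtube-transcript-api dependencies back the YouTube tool added in tools.py below (python-dotenv was already imported via load_dotenv but had not been listed in requirements). A minimal sketch of how the two YouTube libraries fit together, assuming the installed versions expose the same API calls used in this commit; the URL is purely illustrative:

```python
# Sketch: metadata via pytube, transcript via youtube-transcript-api.
import pytube
from youtube_transcript_api import YouTubeTranscriptApi

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # illustrative URL
yt = pytube.YouTube(url)
print(yt.title, yt.author, yt.length)                # basic metadata, as used in process_youtube_video

video_id = url.split("v=")[-1]
transcript = YouTubeTranscriptApi.get_transcript(video_id)  # list of {'text', 'start', 'duration'} entries
print(transcript[0]["text"])                         # first caption line
```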
tools.py CHANGED
@@ -16,6 +16,11 @@ from langchain_community.document_loaders import ArxivLoader
16
  from langchain_community.tools.tavily_search import TavilySearchResults
17
  from supabase import create_client, Client
18
 
19
  load_dotenv()
20
 
21
  def run_python_code(code: str):
@@ -590,6 +595,135 @@ def save_attachment_to_tempfile(file_content_b64: str, file_extension: str = '.x
590
  print(f"Error saving attachment: {e}")
591
  return None
592
 
593
  # Define the tools configuration
594
  tools_config = [
595
  {
@@ -621,5 +755,10 @@ tools_config = [
621
  "name": "excel_to_text",
622
  "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
623
  "func": excel_to_text
624
  }
625
  ]
 
16
  from langchain_community.tools.tavily_search import TavilySearchResults
17
  from supabase import create_client, Client
18
 
19
+ # Add new imports for YouTube processing
20
+ import re
21
+ import pytube
22
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
23
+
24
  load_dotenv()
25
 
26
  def run_python_code(code: str):
 
595
  print(f"Error saving attachment: {e}")
596
  return None
597
 
598
+ def process_youtube_video(url: str, summarize: bool = True) -> str:
599
+ """
600
+ Process a YouTube video by extracting its transcript/captions and basic metadata.
601
+ Optionally summarize the content.
602
+
603
+ Args:
604
+ url: URL of the YouTube video
605
+ summarize: Whether to include a summary of the video content
606
+
607
+ Returns:
608
+ Formatted video information including title, description, transcript, and optional summary
609
+ """
610
+ try:
611
+ # Validate YouTube URL
612
+ if "youtube.com" not in url and "youtu.be" not in url:
613
+ return f"Error: The URL {url} doesn't appear to be a valid YouTube link"
614
+
615
+ print(f"Processing YouTube video: {url}")
616
+
617
+ # Extract video ID from the URL
618
+ video_id = None
619
+ if "youtube.com/watch" in url:
620
+ # Format: https://www.youtube.com/watch?v=VIDEO_ID
621
+ query_string = urlparse(url).query
622
+ params = {p.split('=')[0]: p.split('=')[1] for p in query_string.split('&') if '=' in p}
623
+ video_id = params.get('v')
624
+ elif "youtu.be" in url:
625
+ # Format: https://youtu.be/VIDEO_ID
626
+ video_id = url.split('/')[-1]
627
+
628
+ if not video_id:
629
+ return f"Error: Could not extract video ID from the URL: {url}"
630
+
631
+ # Get video metadata using pytube
632
+ try:
633
+ youtube = pytube.YouTube(url)
634
+ video_title = youtube.title
635
+ video_author = youtube.author
636
+ video_description = youtube.description
637
+ video_length = youtube.length # in seconds
638
+ video_views = youtube.views
639
+ video_publish_date = youtube.publish_date
640
+ except Exception as e:
641
+ print(f"Error getting video metadata: {e}")
642
+ video_title = "Unknown title"
643
+ video_author = "Unknown author"
644
+ video_description = "No description available"
645
+ video_length = 0
646
+ video_views = 0
647
+ video_publish_date = None
648
+
649
+ # Format video length from seconds to minutes and seconds
650
+ minutes = video_length // 60
651
+ seconds = video_length % 60
652
+ length_formatted = f"{minutes}:{seconds:02d}"
653
+
654
+ # Get video transcript using youtube_transcript_api
655
+ try:
656
+ transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
657
+
658
+ # Format transcript into readable text
659
+ transcript_text = ""
660
+ for entry in transcript_list:
661
+ start_time = int(entry['start'])
662
+ start_minutes = start_time // 60
663
+ start_seconds = start_time % 60
664
+ text = entry['text']
665
+ transcript_text += f"[{start_minutes}:{start_seconds:02d}] {text}\n"
666
+
667
+ except (TranscriptsDisabled, NoTranscriptFound) as e:
668
+ transcript_text = "No transcript available for this video."
669
+ except Exception as e:
670
+ transcript_text = f"Error retrieving transcript: {str(e)}"
671
+
672
+ # Compile all information
673
+ result = f"Video Title: {video_title}\n"
674
+ result += f"Creator: {video_author}\n"
675
+ result += f"Length: {length_formatted}\n"
676
+ result += f"Views: {video_views:,}\n"
677
+ if video_publish_date:
678
+ result += f"Published: {video_publish_date.strftime('%Y-%m-%d')}\n"
679
+ result += f"URL: {url}\n\n"
680
+
681
+ # Add description (truncated if too long)
682
+ if video_description:
683
+ if len(video_description) > 500:
684
+ description_preview = video_description[:500] + "..."
685
+ else:
686
+ description_preview = video_description
687
+ result += f"Description:\n{description_preview}\n\n"
688
+
689
+ # Add transcript
690
+ result += "Transcript:\n"
691
+
692
+ # Check if transcript is too long (over 5000 chars) and truncate if needed
693
+ if len(transcript_text) > 5000:
694
+ result += transcript_text[:5000] + "...\n[Transcript truncated due to length]\n"
695
+ else:
696
+ result += transcript_text + "\n"
697
+
698
+ return result
699
+
700
+ except Exception as e:
701
+ return f"Error processing YouTube video: {str(e)}"
702
+
703
+ def extract_youtube_video_id(url: str) -> Optional[str]:
704
+ """
705
+ Extract the YouTube video ID from various URL formats.
706
+
707
+ Args:
708
+ url: A YouTube URL
709
+
710
+ Returns:
711
+ The video ID or None if it cannot be extracted
712
+ """
713
+ # Various YouTube URL patterns
714
+ patterns = [
715
+ r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/|youtube\.com/e/|youtube\.com/watch\?.*v=|youtube\.com/watch\?.*&v=)([^&?/\s]{11})',
716
+ r'youtube\.com/shorts/([^&?/\s]{11})',
717
+ r'youtube\.com/live/([^&?/\s]{11})'
718
+ ]
719
+
720
+ for pattern in patterns:
721
+ match = re.search(pattern, url)
722
+ if match:
723
+ return match.group(1)
724
+
725
+ return None
726
+
727
  # Define the tools configuration
728
  tools_config = [
729
  {
 
755
  "name": "excel_to_text",
756
  "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
757
  "func": excel_to_text
758
+ },
759
+ {
760
+ "name": "process_youtube_video",
761
+ "description": "Extract and process information from a YouTube video including its transcript, title, author, and other metadata. Provide a URL in the format: {\"url\": \"https://www.youtube.com/watch?v=VIDEO_ID\", \"summarize\": true}",
762
+ "func": process_youtube_video
763
  }
764
  ]
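
For reference, the new tool can be exercised either through the agent (the model emits a `process_youtube_video` action blob, the router dispatches it to `process_youtube_video_node`, which calls `tools.process_youtube_video`) or directly. A hedged direct-call sketch, assuming the tools.py from this commit is importable; the URL is illustrative:

```python
# Sketch: calling the new YouTube helpers from this commit directly.
from tools import process_youtube_video, extract_youtube_video_id

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # illustrative URL
print(extract_youtube_video_id(url))                 # -> "dQw4w9WgXcQ"

report = process_youtube_video(url, summarize=True)  # title, author, length, views, transcript
print(report[:500])                                  # preview of the formatted result
```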