updated system prompt and modularized tools
- .gitignore +4 -1
- agent.py +116 -777
- requirements.txt +4 -1
- tools.py +139 -0
.gitignore
CHANGED
@@ -2,7 +2,10 @@
 .env.*
 image.png
 
-GAIA
+GAIA-repo/
+GAIA-repo/*
+
+GAIA/
 GAIA/*
 
 pycache/*
agent.py
CHANGED
@@ -17,745 +17,58 @@ import pandas as pd
 from tabulate import tabulate
 import base64
 
-from
-from
-
-
 
 load_dotenv()
 
-
-
-
-
-
-
-
-
-
-    "__builtins__", "globals(", "locals(",
-    "compile(", "execfile(", "reload("
-]
-
-# Safe imports that should be allowed
-safe_imports = {
-    "import datetime", "import math", "import random",
-    "import statistics", "import collections", "import itertools",
-    "import re", "import json", "import csv", "import numpy",
-    "import pandas", "from math import", "from datetime import",
-    "from statistics import", "from collections import",
-    "from itertools import"
-}
-
-# Check for dangerous operations
-for dangerous_op in dangerous_operations:
-    if dangerous_op in code:
-        return f"Error: Code contains potentially unsafe operations: {dangerous_op}"
-
-# Check each line for imports
-for line in code.splitlines():
-    line = line.strip()
-    if line.startswith("import ") or line.startswith("from "):
-        # Check if it's in our safe list
-        is_safe = any(line.startswith(safe_import) for safe_import in safe_imports)
-        # Also allow basic numpy/pandas imports
-        is_safe = is_safe or line.startswith("import numpy") or line.startswith("import pandas")
-        if not is_safe:
-            return f"Error: Code contains potentially unsafe import: {line}"
-
-try:
-    # Capture stdout to get print output
-    import io
-    import sys
-    from contextlib import redirect_stdout
-
-    # Create a restricted globals environment
-    restricted_globals = {
-        '__builtins__': {
-            'abs': abs, 'all': all, 'any': any, 'bin': bin, 'bool': bool,
-            'chr': chr, 'dict': dict, 'dir': dir, 'divmod': divmod,
-            'enumerate': enumerate, 'filter': filter, 'float': float,
-            'format': format, 'hex': hex, 'int': int, 'len': len,
-            'list': list, 'map': map, 'max': max, 'min': min, 'oct': oct,
-            'ord': ord, 'pow': pow, 'print': print, 'range': range,
-            'reversed': reversed, 'round': round, 'set': set, 'slice': slice,
-            'sorted': sorted, 'str': str, 'sum': sum, 'tuple': tuple,
-            'type': type, 'zip': zip,
-        }
-    }
-
-    # Allow safe modules
-    import math
-    import datetime
-    import random
-    import statistics
-    import collections
-    import itertools
-    import re
-    import json
-    import csv
-
-    restricted_globals['math'] = math
-    restricted_globals['datetime'] = datetime
-    restricted_globals['random'] = random
-    restricted_globals['statistics'] = statistics
-    restricted_globals['collections'] = collections
-    restricted_globals['itertools'] = itertools
-    restricted_globals['re'] = re
-    restricted_globals['json'] = json
-    restricted_globals['csv'] = csv
-
-    # Try to import numpy and pandas if available
-    try:
-        import numpy as np
-        restricted_globals['numpy'] = np
-        restricted_globals['np'] = np
-    except ImportError:
-        pass
-
-    try:
-        import pandas as pd
-        restricted_globals['pandas'] = pd
-        restricted_globals['pd'] = pd
-    except ImportError:
-        pass
-
-    # Create local scope
-    local_scope = {}
-
-    # Capture stdout
-    captured_output = io.StringIO()
-
-    # Execute the entire code block at once
-    with redirect_stdout(captured_output):
-        # Try to evaluate as expression first (for simple expressions)
-        lines = code.strip().split('\n')
-        if len(lines) == 1 and not any(keyword in code for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with']):
-            try:
-                result = eval(code, restricted_globals, local_scope)
-                print(f"Result: {result}")
-            except:
-                # If eval fails, use exec
-                exec(code, restricted_globals, local_scope)
-        else:
-            # For multi-line code, execute the entire block
-            exec(code, restricted_globals, local_scope)
-
-    # Get the captured output
-    output = captured_output.getvalue()
-
-    if output.strip():
-        return output.strip()
-    else:
-        # If no output, check if there's a result from the last expression
-        lines = code.strip().split('\n')
-        last_line = lines[-1].strip() if lines else ""
-
-        # If the last line looks like an expression, try to evaluate it
-        if last_line and not any(keyword in last_line for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with', 'print']):
-            try:
-                result = eval(last_line, restricted_globals, local_scope)
-                return f"Result: {result}"
-            except:
-                pass
-
-    return "Code executed successfully with no output."
-
-except SyntaxError as e:
-    return f"Syntax Error: {str(e)}"
-except NameError as e:
-    return f"Name Error: {str(e)}"
-except ZeroDivisionError as e:
-    return f"Zero Division Error: {str(e)}"
-except Exception as e:
-    return f"Error executing code: {str(e)}"
 
-
#
|
174 |
-
# def apify_google_search(query: str, limit: int = 10) -> str:
|
175 |
-
# """
|
176 |
-
# Use Apify's Google Search Results Scraper to get search results
|
177 |
-
#
|
178 |
-
# Args:
|
179 |
-
# query: The search query string
|
180 |
-
# limit: Number of results to return (10, 20, 30, 40, 50, 100)
|
181 |
-
#
|
182 |
-
# Returns:
|
183 |
-
# Formatted search results as a string
|
184 |
-
# """
|
185 |
-
# # You would need to provide a valid Apify API token
|
186 |
-
# # You can get one by signing up at https://apify.com/
|
187 |
-
# # Replace this with your actual Apify API token or set as environment variable
|
188 |
-
# APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN", "")
|
189 |
-
#
|
190 |
-
# if not APIFY_API_TOKEN:
|
191 |
-
# print("No Apify API token found. Using fallback search method.")
|
192 |
-
# return fallback_search(query)
|
193 |
-
#
|
194 |
-
# try:
|
195 |
-
# # Initialize the ApifyClient with API token
|
196 |
-
# client = ApifyClient(APIFY_API_TOKEN)
|
197 |
-
#
|
198 |
-
# # Prepare the Actor input - convert limit to string as required by the API
|
199 |
-
# run_input = {
|
200 |
-
# "keyword": query,
|
201 |
-
# "limit": str(limit), # Convert to string as required by the API
|
202 |
-
# "country": "US"
|
203 |
-
# }
|
204 |
-
#
|
205 |
-
# # The Actor ID for the Google Search Results Scraper
|
206 |
-
# ACTOR_ID = "563JCPLOqM1kMmbbP"
|
207 |
-
#
|
208 |
-
# print(f"Starting Apify search for: '{query}'")
|
209 |
-
#
|
210 |
-
# # Run the Actor and wait for it to finish (with timeout)
|
211 |
-
# run = client.actor(ACTOR_ID).call(run_input=run_input, timeout_secs=60)
|
212 |
-
#
|
213 |
-
# if not run or not run.get("defaultDatasetId"):
|
214 |
-
# print("Failed to get results from Apify actor")
|
215 |
-
# return fallback_search(query)
|
216 |
-
#
|
217 |
-
# # Fetch Actor results from the run's dataset
|
218 |
-
# results = []
|
219 |
-
# for item in client.dataset(run["defaultDatasetId"]).iterate_items():
|
220 |
-
# results.append(item)
|
221 |
-
#
|
222 |
-
# # Format and return the results
|
223 |
-
# return format_search_results(results, query)
|
224 |
-
#
|
225 |
-
# except Exception as e:
|
226 |
-
# print(f"Error using Apify: {str(e)}")
|
227 |
-
# return fallback_search(query)
|
228 |
|
229 |
-
|
230 |
-
"""
|
231 |
-
Safely scrape content from a specified URL.
|
232 |
-
|
233 |
-
Args:
|
234 |
-
url: The URL to scrape
|
235 |
-
|
236 |
-
Returns:
|
237 |
-
Formatted webpage content as text
|
238 |
-
"""
|
239 |
-
# Check if the URL is valid
|
240 |
-
try:
|
241 |
-
# Parse the URL to validate it
|
242 |
-
parsed_url = urlparse(url)
|
243 |
-
if not parsed_url.scheme or not parsed_url.netloc:
|
244 |
-
return f"Error: Invalid URL format: {url}. Please provide a valid URL with http:// or https:// prefix."
|
245 |
-
|
246 |
-
# Block potentially dangerous URLs
|
247 |
-
blocked_domains = [
|
248 |
-
"localhost", "127.0.0.1", "0.0.0.0",
|
249 |
-
"192.168.", "10.0.", "172.16.", "172.17.", "172.18.", "172.19.", "172.20.",
|
250 |
-
"172.21.", "172.22.", "172.23.", "172.24.", "172.25.", "172.26.", "172.27.",
|
251 |
-
"172.28.", "172.29.", "172.30.", "172.31."
|
252 |
-
]
|
253 |
-
|
254 |
-
if any(domain in parsed_url.netloc for domain in blocked_domains):
|
255 |
-
return f"Error: Access to internal/local URLs is blocked for security: {url}"
|
256 |
-
|
257 |
-
print(f"Scraping URL: {url}")
|
258 |
-
|
259 |
-
# Set user agent to avoid being blocked
|
260 |
-
headers = {
|
261 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
262 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
263 |
-
'Accept-Language': 'en-US,en;q=0.5',
|
264 |
-
'Connection': 'keep-alive',
|
265 |
-
'Upgrade-Insecure-Requests': '1',
|
266 |
-
'Cache-Control': 'max-age=0',
|
267 |
-
}
|
268 |
-
|
269 |
-
# Set a reasonable timeout to avoid hanging
|
270 |
-
timeout = 10
|
271 |
-
|
272 |
-
# Make the request
|
273 |
-
response = requests.get(url, headers=headers, timeout=timeout)
|
274 |
-
|
275 |
-
# Check if request was successful
|
276 |
-
if response.status_code != 200:
|
277 |
-
return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
|
278 |
-
|
279 |
-
# Use BeautifulSoup to parse the HTML
|
280 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
281 |
-
|
282 |
-
# Remove script and style elements that are not relevant to content
|
283 |
-
for script_or_style in soup(["script", "style", "iframe", "footer", "nav"]):
|
284 |
-
script_or_style.decompose()
|
285 |
-
|
286 |
-
# Get the page title
|
287 |
-
title = soup.title.string if soup.title else "No title found"
|
288 |
-
|
289 |
-
# Extract the main content
|
290 |
-
# First try to find main content areas
|
291 |
-
main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content')
|
292 |
-
|
293 |
-
# If no main content area is found, use the entire body
|
294 |
-
if not main_content:
|
295 |
-
main_content = soup.body
|
296 |
-
|
297 |
-
# Convert to plain text
|
298 |
-
h = html2text.HTML2Text()
|
299 |
-
h.ignore_links = False
|
300 |
-
h.ignore_images = True
|
301 |
-
h.ignore_tables = False
|
302 |
-
h.unicode_snob = True
|
303 |
-
|
304 |
-
if main_content:
|
305 |
-
text_content = h.handle(str(main_content))
|
306 |
-
else:
|
307 |
-
text_content = h.handle(response.text)
|
308 |
-
|
309 |
-
# Limit content length to avoid overwhelming the model
|
310 |
-
max_content_length = 99999999999
|
311 |
-
if len(text_content) > max_content_length:
|
312 |
-
text_content = text_content[:max_content_length] + "\n\n[Content truncated due to length...]"
|
313 |
-
|
314 |
-
# Format the response
|
315 |
-
result = f"Title: {title}\nURL: {url}\n\n{text_content}"
|
316 |
-
|
317 |
-
return result
|
318 |
-
|
319 |
-
except requests.exceptions.Timeout:
|
320 |
-
return f"Error: Request timed out while trying to access {url}"
|
321 |
-
except requests.exceptions.ConnectionError:
|
322 |
-
return f"Error: Failed to connect to {url}. The site might be down or the URL might be incorrect."
|
323 |
-
except requests.exceptions.RequestException as e:
|
324 |
-
return f"Error requesting {url}: {str(e)}"
|
325 |
-
except Exception as e:
|
326 |
-
return f"Error scraping webpage {url}: {str(e)}"
|
327 |
-
|
328 |
-
# Comment out the format_search_results function (around line 180)
|
329 |
-
# def format_search_results(results: List[Dict], query: str) -> str:
|
330 |
-
# """Format the search results into a readable string"""
|
331 |
-
# if not results or len(results) == 0:
|
332 |
-
# return f"No results found for query: {query}"
|
333 |
-
#
|
334 |
-
# print(f"Raw search results: {str(results)[:1000]}...")
|
335 |
-
#
|
336 |
-
# # Extract search results from the Apify output
|
337 |
-
# formatted_results = f"Search results for '{query}':\n\n"
|
338 |
-
#
|
339 |
-
# # Check if results is a list of dictionaries or a dictionary with nested results
|
340 |
-
# if isinstance(results, dict) and "results" in results:
|
341 |
-
# items = results["results"]
|
342 |
-
# elif isinstance(results, list):
|
343 |
-
# items = results
|
344 |
-
# else:
|
345 |
-
# return f"Unable to process results for query: {query}"
|
346 |
-
#
|
347 |
-
# # Handle different Apify result formats
|
348 |
-
# if len(items) > 0:
|
349 |
-
# # Check the structure of the first item to determine format
|
350 |
-
# first_item = items[0]
|
351 |
-
#
|
352 |
-
# # If item has 'organicResults', this is the format from some Apify actors
|
353 |
-
# if isinstance(first_item, dict) and "organicResults" in first_item:
|
354 |
-
# organic_results = first_item.get("organicResults", [])
|
355 |
-
# for i, result in enumerate(organic_results[:10], 1):
|
356 |
-
# if "title" in result and "url" in result:
|
357 |
-
# formatted_results += f"{i}. {result['title']}\n"
|
358 |
-
# formatted_results += f" URL: {result['url']}\n"
|
359 |
-
# if "snippet" in result:
|
360 |
-
# formatted_results += f" {result['snippet']}\n"
|
361 |
-
# formatted_results += "\n"
|
362 |
-
# else:
|
363 |
-
# # Standard format with title/url/description
|
364 |
-
# for i, result in enumerate(items[:10], 1):
|
365 |
-
# if "title" in result and "url" in result:
|
366 |
-
# formatted_results += f"{i}. {result['title']}\n"
|
367 |
-
# formatted_results += f" URL: {result['url']}\n"
|
368 |
-
# if "description" in result:
|
369 |
-
# formatted_results += f" {result['description']}\n"
|
370 |
-
# elif "snippet" in result:
|
371 |
-
# formatted_results += f" {result['snippet']}\n"
|
372 |
-
# formatted_results += "\n"
|
373 |
-
#
|
374 |
-
# return formatted_results
|
375 |
-
|
376 |
-
# Comment out the fallback_search function (around line 220)
|
377 |
-
# def fallback_search(query: str) -> str:
|
378 |
-
# """Fallback search method using DuckDuckGo when Apify is not available"""
|
379 |
-
# try:
|
380 |
-
# search_tool = DuckDuckGoSearchRun()
|
381 |
-
# result = search_tool.invoke(query)
|
382 |
-
# return "Observation: " + result
|
383 |
-
# except Exception as e:
|
384 |
-
# return f"Search error: {str(e)}. Please try a different query or method."
|
385 |
-
|
386 |
-
# Comment out the safe_web_search function (around line 230)
|
387 |
-
# def safe_web_search(query: str) -> str:
|
388 |
-
# """Search the web safely with error handling and retry logic."""
|
389 |
-
# if not query:
|
390 |
-
# return "Error: No search query provided. Please specify what you want to search for."
|
391 |
-
#
|
392 |
-
# # Try using Apify first, if it fails it will use the fallback
|
393 |
-
# return "Observation: " + apify_google_search(query)
|
394 |
-
#
|
395 |
-
# # The code below is kept for reference but won't be executed
|
396 |
-
# max_retries = 3
|
397 |
-
# backoff_factor = 1.5
|
398 |
-
#
|
399 |
-
# for attempt in range(max_retries):
|
400 |
-
# try:
|
401 |
-
# # Use the DuckDuckGoSearchRun tool
|
402 |
-
# search_tool = DuckDuckGoSearchRun()
|
403 |
-
# result = search_tool.invoke(query)
|
404 |
-
#
|
405 |
-
# # If we get an empty result, provide a helpful message
|
406 |
-
# if not result or len(result.strip()) < 10:
|
407 |
-
# return f"The search for '{query}' did not return any useful results. Please try a more specific query or a different search engine."
|
408 |
-
#
|
409 |
-
# return "Observation: " + result
|
410 |
-
#
|
411 |
-
# except Exception as e:
|
412 |
-
# # If we're being rate limited
|
413 |
-
# if "Ratelimit" in str(e) or "429" in str(e):
|
414 |
-
# if attempt < max_retries - 1:
|
415 |
-
# wait_time = backoff_factor ** attempt
|
416 |
-
# print(f"Rate limited, waiting {wait_time:.2f} seconds before retrying...")
|
417 |
-
# time.sleep(wait_time)
|
418 |
-
# else:
|
419 |
-
# # On last attempt, return a helpful error
|
420 |
-
# error_msg = f"I'm currently unable to search for '{query}' due to service rate limits. "
|
421 |
-
# return error_msg
|
422 |
-
# else:
|
423 |
-
# # For other types of errors
|
424 |
-
# return f"Error while searching for '{query}': {str(e)}"
|
425 |
-
#
|
426 |
-
# return f"Failed to search for '{query}' after multiple attempts due to rate limiting."
|
427 |
-
|
428 |
-
def wikipedia_search(query: str, num_results: int = 3) -> str:
|
429 |
-
"""
|
430 |
-
Search Wikipedia for information about a specific query.
|
431 |
-
|
432 |
-
Args:
|
433 |
-
query: Search query
|
434 |
-
num_results: Number of search results to return (default: 3)
|
435 |
-
|
436 |
-
Returns:
|
437 |
-
Formatted Wikipedia search results
|
438 |
-
"""
|
439 |
-
try:
|
440 |
-
# Validate input
|
441 |
-
if not query or not isinstance(query, str):
|
442 |
-
return "Error: Please provide a valid search query."
|
443 |
-
|
444 |
-
# Ensure num_results is valid
|
445 |
-
try:
|
446 |
-
num_results = int(num_results)
|
447 |
-
if num_results <= 0:
|
448 |
-
num_results = 3 # Default to 3 if invalid
|
449 |
-
except:
|
450 |
-
num_results = 3 # Default to 3 if conversion fails
|
451 |
-
|
452 |
-
print(f"Searching Wikipedia for: {query}")
|
453 |
-
|
454 |
-
# Use WikipediaLoader from LangChain
|
455 |
-
loader = WikipediaLoader(query=query, load_max_docs=num_results)
|
456 |
-
docs = loader.load()
|
457 |
-
|
458 |
-
if not docs:
|
459 |
-
return f"No Wikipedia results found for '{query}'. Try refining your search."
|
460 |
-
|
461 |
-
# Format the results
|
462 |
-
formatted_results = f"Wikipedia search results for '{query}':\n\n"
|
463 |
-
|
464 |
-
for i, doc in enumerate(docs, 1):
|
465 |
-
title = doc.metadata.get('title', 'Unknown Title')
|
466 |
-
source = doc.metadata.get('source', 'No URL')
|
467 |
-
content = doc.page_content
|
468 |
-
|
469 |
-
# Truncate content if too long
|
470 |
-
if len(content) > 500:
|
471 |
-
content = content[:500] + "..."
|
472 |
-
|
473 |
-
formatted_results += f"{i}. {title}\n"
|
474 |
-
formatted_results += f" URL: {source}\n"
|
475 |
-
formatted_results += f" {content}\n\n"
|
476 |
-
|
477 |
-
return formatted_results
|
478 |
-
|
479 |
-
except Exception as e:
|
480 |
-
return f"Error searching Wikipedia: {str(e)}"
|
481 |
-
|
482 |
-
def tavily_search(query: str, search_depth: str = "basic") -> str:
|
483 |
-
"""
|
484 |
-
Search the web using the Tavily Search API.
|
485 |
-
|
486 |
-
Args:
|
487 |
-
query: Search query
|
488 |
-
search_depth: Depth of search ('basic' or 'comprehensive')
|
489 |
-
|
490 |
-
Returns:
|
491 |
-
Formatted search results from Tavily
|
492 |
-
"""
|
493 |
-
try:
|
494 |
-
# Check for API key
|
495 |
-
tavily_api_key = os.environ.get("TAVILY_API_KEY")
|
496 |
-
if not tavily_api_key:
|
497 |
-
return "Error: Tavily API key not found. Please set the TAVILY_API_KEY environment variable."
|
498 |
-
|
499 |
-
# Validate input
|
500 |
-
if not query or not isinstance(query, str):
|
501 |
-
return "Error: Please provide a valid search query."
|
502 |
-
|
503 |
-
# Validate search_depth
|
504 |
-
if search_depth not in ["basic", "comprehensive"]:
|
505 |
-
search_depth = "basic" # Default to basic if invalid
|
506 |
-
|
507 |
-
print(f"Searching Tavily for: {query} (depth: {search_depth})")
|
508 |
-
|
509 |
-
# Initialize the Tavily search tool
|
510 |
-
search = TavilySearchResults(api_key=tavily_api_key)
|
511 |
-
|
512 |
-
# Execute the search
|
513 |
-
results = search.invoke({"query": query, "search_depth": search_depth})
|
514 |
-
|
515 |
-
if not results:
|
516 |
-
return f"No Tavily search results found for '{query}'. Try refining your search."
|
517 |
-
|
518 |
-
# Format the results
|
519 |
-
formatted_results = f"Tavily search results for '{query}':\n\n"
|
520 |
-
|
521 |
-
for i, result in enumerate(results, 1):
|
522 |
-
formatted_results += f"{i}. {result.get('title', 'No title')}\n"
|
523 |
-
formatted_results += f" URL: {result.get('url', 'No URL')}\n"
|
524 |
-
formatted_results += f" {result.get('content', 'No content')}\n\n"
|
525 |
-
|
526 |
-
return formatted_results
|
527 |
-
|
528 |
-
except Exception as e:
|
529 |
-
return f"Error searching with Tavily: {str(e)}"
|
530 |
-
|
531 |
-
def arxiv_search(query: str, max_results: int = 5) -> str:
|
532 |
-
"""
|
533 |
-
Search ArXiv for scientific papers matching the query.
|
534 |
-
|
535 |
-
Args:
|
536 |
-
query: Search query for ArXiv
|
537 |
-
max_results: Maximum number of results to return
|
538 |
-
|
539 |
-
Returns:
|
540 |
-
Formatted ArXiv search results
|
541 |
-
"""
|
542 |
-
try:
|
543 |
-
# Validate input
|
544 |
-
if not query or not isinstance(query, str):
|
545 |
-
return "Error: Please provide a valid search query."
|
546 |
-
|
547 |
-
# Ensure max_results is valid
|
548 |
-
try:
|
549 |
-
max_results = int(max_results)
|
550 |
-
if max_results <= 0 or max_results > 10:
|
551 |
-
max_results = 5 # Default to 5 if invalid or too large
|
552 |
-
except:
|
553 |
-
max_results = 5 # Default to 5 if conversion fails
|
554 |
-
|
555 |
-
print(f"Searching ArXiv for: {query}")
|
556 |
-
|
557 |
-
# Use ArxivLoader from LangChain
|
558 |
-
loader = ArxivLoader(
|
559 |
-
query=query,
|
560 |
-
load_max_docs=max_results,
|
561 |
-
load_all_available_meta=True
|
562 |
-
)
|
563 |
-
|
564 |
-
docs = loader.load()
|
565 |
-
|
566 |
-
if not docs:
|
567 |
-
return f"No ArXiv papers found for '{query}'. Try refining your search."
|
568 |
-
|
569 |
-
# Format the results
|
570 |
-
formatted_results = f"ArXiv papers for '{query}':\n\n"
|
571 |
-
|
572 |
-
for i, doc in enumerate(docs, 1):
|
573 |
-
meta = doc.metadata
|
574 |
-
title = meta.get('Title', 'Unknown Title')
|
575 |
-
url = meta.get('Entry ID', 'No URL')
|
576 |
-
authors = meta.get('Authors', 'Unknown Authors')
|
577 |
-
published = meta.get('Published', 'Unknown Date')
|
578 |
-
|
579 |
-
formatted_results += f"{i}. {title}\n"
|
580 |
-
formatted_results += f" URL: {url}\n"
|
581 |
-
formatted_results += f" Authors: {authors}\n"
|
582 |
-
formatted_results += f" Published: {published}\n"
|
583 |
-
|
584 |
-
# Add abstract, truncated if too long
|
585 |
-
abstract = doc.page_content.replace('\n', ' ')
|
586 |
-
if len(abstract) > 300:
|
587 |
-
abstract = abstract[:300] + "..."
|
588 |
-
formatted_results += f" Abstract: {abstract}\n\n"
|
589 |
-
|
590 |
-
return formatted_results
|
591 |
-
|
592 |
-
except Exception as e:
|
593 |
-
return f"Error searching ArXiv: {str(e)}"
|
594 |
-
|
595 |
-
def supabase_operation(operation_type: str, table: str, data: dict = None, filters: dict = None) -> str:
|
596 |
-
"""
|
597 |
-
Perform operations on Supabase database.
|
598 |
-
|
599 |
-
Args:
|
600 |
-
operation_type: Type of operation ('insert', 'select', 'update', 'delete')
|
601 |
-
table: Name of the table to operate on
|
602 |
-
data: Data to insert/update (for insert/update operations)
|
603 |
-
filters: Filters for select/update/delete operations (e.g., {"id": 1})
|
604 |
-
|
605 |
-
Returns:
|
606 |
-
Result of the operation as a formatted string
|
607 |
-
"""
|
608 |
-
try:
|
609 |
-
# Get Supabase credentials from environment variables
|
610 |
-
supabase_url = os.environ.get("SUPABASE_URL")
|
611 |
-
supabase_key = os.environ.get("SUPABASE_ANON_KEY")
|
612 |
-
|
613 |
-
if not supabase_url or not supabase_key:
|
614 |
-
return "Error: Supabase credentials not found. Please set SUPABASE_URL and SUPABASE_ANON_KEY environment variables."
|
615 |
-
|
616 |
-
# Create Supabase client
|
617 |
-
supabase: Client = create_client(supabase_url, supabase_key)
|
618 |
-
|
619 |
-
# Validate inputs
|
620 |
-
if not table:
|
621 |
-
return "Error: Table name is required."
|
622 |
-
|
623 |
-
if operation_type not in ['insert', 'select', 'update', 'delete']:
|
624 |
-
return "Error: Invalid operation type. Use 'insert', 'select', 'update', or 'delete'."
|
625 |
-
|
626 |
-
# Perform the operation based on type
|
627 |
-
if operation_type == 'insert':
|
628 |
-
if not data:
|
629 |
-
return "Error: Data is required for insert operation."
|
630 |
-
|
631 |
-
result = supabase.table(table).insert(data).execute()
|
632 |
-
return f"Insert successful: {len(result.data)} row(s) inserted into {table}"
|
633 |
-
|
634 |
-
elif operation_type == 'select':
|
635 |
-
query = supabase.table(table).select("*")
|
636 |
-
|
637 |
-
# Apply filters if provided
|
638 |
-
if filters:
|
639 |
-
for key, value in filters.items():
|
640 |
-
query = query.eq(key, value)
|
641 |
-
|
642 |
-
result = query.execute()
|
643 |
-
return f"Select successful: Found {len(result.data)} row(s) in {table}\nData: {json.dumps(result.data, indent=2)}"
|
644 |
-
|
645 |
-
elif operation_type == 'update':
|
646 |
-
if not data or not filters:
|
647 |
-
return "Error: Both data and filters are required for update operation."
|
648 |
-
|
649 |
-
query = supabase.table(table).update(data)
|
650 |
-
|
651 |
-
# Apply filters
|
652 |
-
for key, value in filters.items():
|
653 |
-
query = query.eq(key, value)
|
654 |
-
|
655 |
-
result = query.execute()
|
656 |
-
return f"Update successful: {len(result.data)} row(s) updated in {table}"
|
657 |
-
|
658 |
-
elif operation_type == 'delete':
|
659 |
-
if not filters:
|
660 |
-
return "Error: Filters are required for delete operation."
|
661 |
-
|
662 |
-
query = supabase.table(table).delete()
|
663 |
-
|
664 |
-
# Apply filters
|
665 |
-
for key, value in filters.items():
|
666 |
-
query = query.eq(key, value)
|
667 |
-
|
668 |
-
result = query.execute()
|
669 |
-
return f"Delete successful: Rows deleted from {table}"
|
670 |
-
|
671 |
-
except Exception as e:
|
672 |
-
return f"Error performing Supabase operation: {str(e)}"
|
673 |
-
|
674 |
-
def excel_to_text(excel_path: str, sheet_name: Optional[str] = None, file_content: Optional[bytes] = None) -> str:
|
675 |
-
"""
|
676 |
-
Read an Excel file and return a Markdown table of the requested sheet.
|
677 |
-
|
678 |
-
Args:
|
679 |
-
excel_path: Path to the Excel file (.xlsx or .xls) or name for the attached file.
|
680 |
-
sheet_name: Optional name or index of the sheet to read. If None, reads the first sheet.
|
681 |
-
file_content: Optional binary content of the file if provided as an attachment.
|
682 |
-
|
683 |
-
Returns:
|
684 |
-
A Markdown table representing the Excel sheet, or an error message if the file is not found or cannot be read.
|
685 |
-
"""
|
686 |
-
try:
|
687 |
-
# Handle file attachment case
|
688 |
-
if file_content:
|
689 |
-
# Create a temporary file to save the attachment
|
690 |
-
with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as temp_file:
|
691 |
-
temp_file.write(file_content)
|
692 |
-
temp_path = temp_file.name
|
693 |
-
|
694 |
-
print(f"Saved attached Excel file to temporary location: {temp_path}")
|
695 |
-
file_path = Path(temp_path)
|
696 |
-
else:
|
697 |
-
# Regular file path case
|
698 |
-
file_path = Path(excel_path).expanduser().resolve()
|
699 |
-
if not file_path.is_file():
|
700 |
-
return f"Error: Excel file not found at {file_path}"
|
701 |
-
|
702 |
-
# Process the Excel file
|
703 |
-
sheet: Union[str, int] = (
|
704 |
-
int(sheet_name)
|
705 |
-
if sheet_name and sheet_name.isdigit()
|
706 |
-
else sheet_name or 0
|
707 |
-
)
|
708 |
-
|
709 |
-
df = pd.read_excel(file_path, sheet_name=sheet)
|
710 |
-
|
711 |
-
# Clean up temporary file if we created one
|
712 |
-
if file_content and os.path.exists(temp_path):
|
713 |
-
os.unlink(temp_path)
|
714 |
-
print(f"Deleted temporary Excel file: {temp_path}")
|
715 |
-
|
716 |
-
if hasattr(df, "to_markdown"):
|
717 |
-
return df.to_markdown(index=False)
|
718 |
-
|
719 |
-
return tabulate(df, headers="keys", tablefmt="github", showindex=False)
|
720 |
-
|
721 |
-
except Exception as e:
|
722 |
-
# Clean up temporary file in case of error
|
723 |
-
if file_content and 'temp_path' in locals() and os.path.exists(temp_path):
|
724 |
-
os.unlink(temp_path)
|
725 |
-
print(f"Deleted temporary Excel file due to error: {temp_path}")
|
726 |
-
return f"Error reading Excel file: {e}"
|
727 |
|
 # System prompt to guide the model's behavior
 #web_search: Search the google search engine when Tavily Search and Wikipedia Search do not return a result. Provide a specific search query.
 #webpage_scrape: Scrape content from a specific webpage URL when Tavily Search and Wikipedia Search do not return a result. Provide a valid URL to extract information from a particular web page.
 #Give preference to using Tavily Search and Wikipedia Search before using web_search or webpage_scrape. When Web_search does not return a result, use Tavily Search.
 
-SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely on your internal knowledge unless
-
-python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
-wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return.
-tavily_search: Search the web using Tavily for more comprehensive results. Optionally specify search_depth as 'basic' or 'comprehensive'.
-arxiv_search: Search ArXiv for scientific papers on a specific topic. Optionally specify max_results to control the number of papers returned.
-supabase_operation: Perform database operations on Supabase (insert, select, update, delete). Provide operation_type, table name, and optional data/filters.
-excel_to_text: Read an Excel file and convert it to a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.
 
 The way you use the tools is by specifying a json blob.
 Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
 
 The only values that should be in the "action" field are:
-python_code: Execute Python code
-wikipedia_search: Search Wikipedia, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
-tavily_search: Search
-arxiv_search: Search ArXiv papers, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
 webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
 supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
-excel_to_text: Convert Excel to Markdown table with file path, args: {"excel_path": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
 excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
 
 IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
 
-If you do not want to use any tool AND have not yet arrived at a solution, call the python_code tool with an empty string as the code.
-
 Example use for tools:
 
 ```json
@@ -767,21 +80,14 @@ Example use for tools:
 or
 ```json
 {
-    "action": "
-    "action_input": {"
-}
-```
-or
-```json
-{
-    "action": "excel_to_text",
-    "action_input": {"excel_path": "data.xlsx", "file_content": "BASE64_ENCODED_CONTENT_HERE", "sheet_name": "Sheet1"}
 }
 ```
 
 ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
 Question: [the user's question]
-Thought: [your reasoning about what to do next]
 Action:
 ```json
 {
@@ -790,7 +96,7 @@ Action:
 }
 ```
 Observation: [the result from the tool will appear here]
-Thought: [your reasoning after seeing the observation]
 Action:
 ```json
 {
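The unchanged prompt lines above ask the model to wrap each tool call in a ```json fence with `action` and `action_input` keys; agent.py's extract_json_from_text is the forgiving parser for that format. A minimal happy-path sketch of pulling such a blob out of raw model output (the example string is hypothetical):

```python
import json
import re

def parse_action_blob(text: str) -> dict:
    """Extract the first ```json ... ``` block from model output and parse it."""
    match = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL)
    if not match:
        raise ValueError("No JSON action block found in model output")
    blob = json.loads(match.group(1))
    if "action" not in blob or "action_input" not in blob:
        raise ValueError("Action block is missing 'action' or 'action_input'")
    return blob

# Example (hypothetical model output):
# parse_action_blob('Thought: look it up\nAction:\n```json\n'
#                   '{"action": "wikipedia_search", "action_input": {"query": "GAIA benchmark"}}\n```')
# -> {'action': 'wikipedia_search', 'action_input': {'query': 'GAIA benchmark'}}
```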
@@ -863,6 +169,11 @@ tools_config = [
         "name": "excel_to_text",
         "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
         "func": excel_to_text
     }
 ]
 
@@ -984,22 +295,21 @@ def assistant(state: AgentState) -> Dict[str, Any]:
             tool_name = action_json["action"]
             tool_input = action_json["action_input"]
 
-            # Handle nested JSON issue - if action_input is a string
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-                pass
 
             print(f"Using tool: {tool_name}")
             print(f"Tool input: {tool_input}")
@@ -1075,7 +385,7 @@ def extract_json_from_text(text: str) -> dict:
                 print(f"Found valid JSON object: {parsed}")
                 return parsed
             except json.JSONDecodeError:
-
 
     # Pattern 4: Look for patterns like 'action': 'tool_name', 'action_input': {...}
     action_pattern = re.search(r"['\"](action)['\"]:\s*['\"](\w+)['\"]", text)
@@ -1505,6 +815,62 @@ def excel_to_text_node(state: AgentState) -> Dict[str, Any]:
         "action_input": None  # Clear the action input
     }
 
 # Router function to direct to the correct tool
 def router(state: AgentState) -> str:
     """Route to the appropriate tool based on the current_tool field."""
@@ -1513,8 +879,6 @@ def router(state: AgentState) -> str:
     print(f"Routing to: {tool}")
     print(f"Router received action_input: {action_input}")
 
-    # if tool == "web_search":
-    #     return "web_search"
     if tool == "python_code":
         return "python_code"
     elif tool == "webpage_scrape":
@@ -1529,6 +893,8 @@ def router(state: AgentState) -> str:
         return "supabase_operation"
     elif tool == "excel_to_text":
         return "excel_to_text"
     else:
         return "end"
 
@@ -1539,7 +905,6 @@ def create_agent_graph() -> StateGraph:
 
     # Define nodes: these do the work
     builder.add_node("assistant", assistant)
-    # builder.add_node("web_search", web_search_node)
     builder.add_node("python_code", python_code_node)
     builder.add_node("webpage_scrape", webpage_scrape_node)
     builder.add_node("wikipedia_search", wikipedia_search_node)
@@ -1547,6 +912,7 @@ def create_agent_graph() -> StateGraph:
     builder.add_node("arxiv_search", arxiv_search_node)
     builder.add_node("supabase_operation", supabase_operation_node)
     builder.add_node("excel_to_text", excel_to_text_node)
 
     # Define edges: these determine how the control flow moves
     builder.add_edge(START, "assistant")
@@ -1571,7 +937,6 @@
         "debug",
         router,
         {
-            # "web_search": "web_search",
             "python_code": "python_code",
             "webpage_scrape": "webpage_scrape",
             "wikipedia_search": "wikipedia_search",
@@ -1579,12 +944,12 @@ def create_agent_graph() -> StateGraph:
             "arxiv_search": "arxiv_search",
             "supabase_operation": "supabase_operation",
             "excel_to_text": "excel_to_text",
             "end": END
         }
     )
 
     # Tools always go back to assistant
-    # builder.add_edge("web_search", "assistant")
     builder.add_edge("python_code", "assistant")
     builder.add_edge("webpage_scrape", "assistant")
     builder.add_edge("wikipedia_search", "assistant")
@@ -1592,6 +957,7 @@ def create_agent_graph() -> StateGraph:
     builder.add_edge("arxiv_search", "assistant")
     builder.add_edge("supabase_operation", "assistant")
    builder.add_edge("excel_to_text", "assistant")
 
     # Compile the graph
     return builder.compile()
@@ -1677,30 +1043,3 @@ I need to make headings for the fruits and vegetables. Could you please create a
     print("\nFinal Response:")
     print(response)
 
-def save_attachment_to_tempfile(file_content_b64: str, file_extension: str = '.xlsx') -> str:
-    """
-    Decode a base64 file content and save it to a temporary file.
-
-    Args:
-        file_content_b64: Base64 encoded file content
-        file_extension: File extension to use for the temporary file
-
-    Returns:
-        Path to the saved temporary file
-    """
-    try:
-        # Decode the base64 content
-        file_content = base64.b64decode(file_content_b64)
-
-        # Create a temporary file with the appropriate extension
-        with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
-            temp_file.write(file_content)
-            temp_path = temp_file.name
-
-        print(f"Saved attachment to temporary file: {temp_path}")
-        return temp_path
-
-    except Exception as e:
-        print(f"Error saving attachment: {e}")
-        return None
-
 from tabulate import tabulate
 import base64
 
+# Import all tool functions from tools.py
+from tools import (
+    tools_config,
+    run_python_code,
+    scrape_webpage,
+    wikipedia_search,
+    tavily_search,
+    arxiv_search,
+    supabase_operation,
+    excel_to_text,
+    save_attachment_to_tempfile,
+    process_youtube_video
+)
 
 load_dotenv()
 
+# Remove the following functions from agent.py since they're now imported from tools.py:
+# - run_python_code (lines ~28-175)
+# - scrape_webpage (lines ~177-310)
+# - wikipedia_search (lines ~345-405)
+# - tavily_search (lines ~407-470)
+# - arxiv_search (lines ~472-535)
+# - supabase_operation (lines ~537-620)
+# - excel_to_text (lines ~622-690)
+# - save_attachment_to_tempfile (lines ~1680-1706)
 
+# Also remove the tools_config definition (lines ~795-870) since it's imported from tools.py
 
+# The rest of the file remains the same...
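With the tool functions and tools_config now imported from tools.py, the agent only needs a name-to-function mapping to dispatch an action. A small sketch of that dispatch, assuming tools_config keeps the name/description/func entry shape shown in this commit:

```python
# Minimal dispatch sketch (assumes tools_config entries expose "name" and "func")
from tools import tools_config

TOOL_REGISTRY = {entry["name"]: entry["func"] for entry in tools_config}

def dispatch_tool(action: str, action_input):
    """Look up a tool by its action name and invoke it with the parsed input."""
    func = TOOL_REGISTRY.get(action)
    if func is None:
        return f"Error: Unknown tool '{action}'"
    # Dict inputs are spread as keyword arguments; plain strings are passed through.
    if isinstance(action_input, dict):
        return func(**action_input)
    return func(action_input)
```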
 
 # System prompt to guide the model's behavior
 #web_search: Search the google search engine when Tavily Search and Wikipedia Search do not return a result. Provide a specific search query.
 #webpage_scrape: Scrape content from a specific webpage URL when Tavily Search and Wikipedia Search do not return a result. Provide a valid URL to extract information from a particular web page.
 #Give preference to using Tavily Search and Wikipedia Search before using web_search or webpage_scrape. When Web_search does not return a result, use Tavily Search.
 
+SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely on your internal knowledge unless the tools fail to provide a result:
 
 The way you use the tools is by specifying a json blob.
 Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
 
 The only values that should be in the "action" field are:
+python_code: Execute Python code. Use this tool to calculate math problems. args: {"code": {"type": "string"}}
+wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
+tavily_search: Search the web using Tavily for more comprehensive results. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
+arxiv_search: Search ArXiv for scientific papers. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
 webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
 supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
 excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
+process_youtube_video: Extract and analyze YouTube video content by providing the video URL. Returns video metadata and transcript, args: {"url": {"type": "string"}, "summarize": {"type": "boolean", "optional": true}}
 
 IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
 
 Example use for tools:
 
 ```json
 or
 ```json
 {
+    "action": "process_youtube_video",
+    "action_input": {"url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "summarize": true}
 }
 ```
 
 ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
 Question: [the user's question]
+Thought: [your reasoning about what to do next, break it down into smaller steps]
 Action:
 ```json
 {
 }
 ```
 Observation: [the result from the tool will appear here]
+Thought: [your reasoning after seeing the observation, break it down into smaller steps]
 Action:
 ```json
 {
         "name": "excel_to_text",
         "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
         "func": excel_to_text
+    },
+    {
+        "name": "process_youtube_video",
+        "description": "Extract and analyze YouTube video content by providing the video URL. Returns video metadata and transcript.",
+        "func": process_youtube_video
     }
 ]
 
             tool_name = action_json["action"]
             tool_input = action_json["action_input"]
 
+            # Handle nested JSON issue - check if any value in action_input is a JSON string
+            if isinstance(tool_input, dict):
+                for key, value in tool_input.items():
+                    if isinstance(value, str) and value.strip().startswith("{"):
+                        try:
+                            nested_json = json.loads(value)
+                            if isinstance(nested_json, dict) and "action" in nested_json and "action_input" in nested_json:
+                                # This is a nested structure, use the inner one
+                                tool_name = nested_json["action"]
+                                tool_input = nested_json["action_input"]
+                                print(f"Unwrapped nested JSON. New tool: {tool_name}")
+                                print(f"New tool input: {tool_input}")
+                                break
+                        except json.JSONDecodeError:
+                            continue
 
             print(f"Using tool: {tool_name}")
             print(f"Tool input: {tool_input}")
                 print(f"Found valid JSON object: {parsed}")
                 return parsed
             except json.JSONDecodeError:
+                continue
 
     # Pattern 4: Look for patterns like 'action': 'tool_name', 'action_input': {...}
     action_pattern = re.search(r"['\"](action)['\"]:\s*['\"](\w+)['\"]", text)
         "action_input": None  # Clear the action input
     }
 
+# Add a new node function for processing YouTube videos
+def process_youtube_video_node(state: AgentState) -> Dict[str, Any]:
+    """Node that processes YouTube videos."""
+    print("YouTube Video Processing Tool Called...\n\n")
+
+    # Extract tool arguments
+    action_input = state.get("action_input", {})
+    print(f"YouTube video processing action_input: {action_input}")
+
+    # Extract URL and other parameters
+    url = ""
+    summarize = True  # Default
+
+    if isinstance(action_input, dict):
+        url = action_input.get("url", "")
+        # Check if summarize parameter exists and is a boolean
+        if "summarize" in action_input:
+            try:
+                summarize = bool(action_input["summarize"])
+            except:
+                print("Invalid summarize parameter, using default (True)")
+    elif isinstance(action_input, str):
+        # If action_input is just a string, assume it's the URL
+        url = action_input
+
+    print(f"Processing YouTube video: '{url}' (summarize: {summarize})")
+
+    # Safety check - don't run with empty URL
+    if not url:
+        result = "Error: No URL provided. Please provide a valid YouTube URL."
+    else:
+        # Import the function dynamically to ensure we're using the latest version
+        from tools import process_youtube_video
+        # Call the YouTube processing function
+        result = process_youtube_video(url, summarize)
+
+    print(f"YouTube processing result length: {len(result)}")
+
+    # Format the observation to continue the ReAct cycle
+    tool_message = AIMessage(
+        content=f"Observation: {result.strip()}"
+    )
+
+    # Print the observation that will be sent back to the assistant
+    print("\n=== TOOL OBSERVATION ===")
+    content_preview = tool_message.content[:500] + "..." if len(tool_message.content) > 500 else tool_message.content
+    print(content_preview)
+    print("=== END OBSERVATION ===\n")
+
+    # Return the updated state
+    return {
+        "messages": state["messages"] + [tool_message],
+        "current_tool": None,  # Reset the current tool
+        "action_input": None  # Clear the action input
+    }
+
 # Router function to direct to the correct tool
 def router(state: AgentState) -> str:
     """Route to the appropriate tool based on the current_tool field."""
     print(f"Routing to: {tool}")
     print(f"Router received action_input: {action_input}")
 
     if tool == "python_code":
         return "python_code"
     elif tool == "webpage_scrape":
         return "supabase_operation"
     elif tool == "excel_to_text":
         return "excel_to_text"
+    elif tool == "process_youtube_video":
+        return "process_youtube_video"
     else:
         return "end"
 
 
     # Define nodes: these do the work
     builder.add_node("assistant", assistant)
     builder.add_node("python_code", python_code_node)
     builder.add_node("webpage_scrape", webpage_scrape_node)
     builder.add_node("wikipedia_search", wikipedia_search_node)
     builder.add_node("arxiv_search", arxiv_search_node)
     builder.add_node("supabase_operation", supabase_operation_node)
     builder.add_node("excel_to_text", excel_to_text_node)
+    builder.add_node("process_youtube_video", process_youtube_video_node)
 
     # Define edges: these determine how the control flow moves
     builder.add_edge(START, "assistant")
         "debug",
         router,
         {
             "python_code": "python_code",
             "webpage_scrape": "webpage_scrape",
             "wikipedia_search": "wikipedia_search",
             "arxiv_search": "arxiv_search",
             "supabase_operation": "supabase_operation",
             "excel_to_text": "excel_to_text",
+            "process_youtube_video": "process_youtube_video",
             "end": END
         }
     )
 
     # Tools always go back to assistant
     builder.add_edge("python_code", "assistant")
     builder.add_edge("webpage_scrape", "assistant")
     builder.add_edge("wikipedia_search", "assistant")
     builder.add_edge("arxiv_search", "assistant")
     builder.add_edge("supabase_operation", "assistant")
     builder.add_edge("excel_to_text", "assistant")
+    builder.add_edge("process_youtube_video", "assistant")
 
     # Compile the graph
     return builder.compile()
     print("\nFinal Response:")
     print(response)
 
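The nested-JSON handling added to assistant() above guards against the model double-encoding its call, i.e. stringifying a whole action blob inside one of the action_input values. A small illustration of the case it unwraps (hypothetical values, same logic as the added block):

```python
import json

# What the model sometimes emits: the real call is stringified inside action_input.
action_json = {
    "action": "python_code",
    "action_input": {
        "code": json.dumps({
            "action": "wikipedia_search",
            "action_input": {"query": "Mercedes Sosa discography"},
        })
    },
}

tool_name = action_json["action"]
tool_input = action_json["action_input"]
for value in tool_input.values():
    if isinstance(value, str) and value.strip().startswith("{"):
        nested = json.loads(value)
        if isinstance(nested, dict) and "action" in nested and "action_input" in nested:
            tool_name, tool_input = nested["action"], nested["action_input"]
            break

print(tool_name)   # wikipedia_search
print(tool_input)  # {'query': 'Mercedes Sosa discography'}
```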
requirements.txt
CHANGED
@@ -10,4 +10,7 @@ beautifulsoup4
 html2text
 supabase
 pandas
-tabulate
+tabulate
+pytube
+youtube-transcript-api
+python-dotenv
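pytube and youtube-transcript-api back the new process_youtube_video tool in tools.py; python-dotenv is pinned explicitly for load_dotenv. A minimal sketch of how the two YouTube packages are combined (placeholder URL and video ID):

```python
import pytube
from youtube_transcript_api import YouTubeTranscriptApi

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # placeholder URL
yt = pytube.YouTube(url)                              # metadata: title, author, length, views
transcript = YouTubeTranscriptApi.get_transcript("dQw4w9WgXcQ")  # list of {text, start, duration}
print(yt.title, len(transcript))
```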
tools.py
CHANGED
@@ -16,6 +16,11 @@ from langchain_community.document_loaders import ArxivLoader
 from langchain_community.tools.tavily_search import TavilySearchResults
 from supabase import create_client, Client
 
 load_dotenv()
 
 def run_python_code(code: str):
@@ -590,6 +595,135 @@ def save_attachment_to_tempfile(file_content_b64: str, file_extension: str = '.x
         print(f"Error saving attachment: {e}")
         return None
 
 # Define the tools configuration
 tools_config = [
     {
@@ -621,5 +755,10 @@ tools_config = [
         "name": "excel_to_text",
         "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
         "func": excel_to_text
     }
 ]
 from langchain_community.tools.tavily_search import TavilySearchResults
 from supabase import create_client, Client
 
+# Add new imports for YouTube processing
+import re
+import pytube
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+
 load_dotenv()
 
 def run_python_code(code: str):
         print(f"Error saving attachment: {e}")
         return None
 
+def process_youtube_video(url: str, summarize: bool = True) -> str:
+    """
+    Process a YouTube video by extracting its transcript/captions and basic metadata.
+    Optionally summarize the content.
+
+    Args:
+        url: URL of the YouTube video
+        summarize: Whether to include a summary of the video content
+
+    Returns:
+        Formatted video information including title, description, transcript, and optional summary
+    """
+    try:
+        # Validate YouTube URL
+        if "youtube.com" not in url and "youtu.be" not in url:
+            return f"Error: The URL {url} doesn't appear to be a valid YouTube link"
+
+        print(f"Processing YouTube video: {url}")
+
+        # Extract video ID from the URL
+        video_id = None
+        if "youtube.com/watch" in url:
+            # Format: https://www.youtube.com/watch?v=VIDEO_ID
+            query_string = urlparse(url).query
+            params = {p.split('=')[0]: p.split('=')[1] for p in query_string.split('&') if '=' in p}
+            video_id = params.get('v')
+        elif "youtu.be" in url:
+            # Format: https://youtu.be/VIDEO_ID
+            video_id = url.split('/')[-1]
+
+        if not video_id:
+            return f"Error: Could not extract video ID from the URL: {url}"
+
+        # Get video metadata using pytube
+        try:
+            youtube = pytube.YouTube(url)
+            video_title = youtube.title
+            video_author = youtube.author
+            video_description = youtube.description
+            video_length = youtube.length  # in seconds
+            video_views = youtube.views
+            video_publish_date = youtube.publish_date
+        except Exception as e:
+            print(f"Error getting video metadata: {e}")
+            video_title = "Unknown title"
+            video_author = "Unknown author"
+            video_description = "No description available"
+            video_length = 0
+            video_views = 0
+            video_publish_date = None
+
+        # Format video length from seconds to minutes and seconds
+        minutes = video_length // 60
+        seconds = video_length % 60
+        length_formatted = f"{minutes}:{seconds:02d}"
+
+        # Get video transcript using youtube_transcript_api
+        try:
+            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+
+            # Format transcript into readable text
+            transcript_text = ""
+            for entry in transcript_list:
+                start_time = int(entry['start'])
+                start_minutes = start_time // 60
+                start_seconds = start_time % 60
+                text = entry['text']
+                transcript_text += f"[{start_minutes}:{start_seconds:02d}] {text}\n"
+
+        except (TranscriptsDisabled, NoTranscriptFound) as e:
+            transcript_text = "No transcript available for this video."
+        except Exception as e:
+            transcript_text = f"Error retrieving transcript: {str(e)}"
+
+        # Compile all information
+        result = f"Video Title: {video_title}\n"
+        result += f"Creator: {video_author}\n"
+        result += f"Length: {length_formatted}\n"
+        result += f"Views: {video_views:,}\n"
+        if video_publish_date:
+            result += f"Published: {video_publish_date.strftime('%Y-%m-%d')}\n"
+        result += f"URL: {url}\n\n"
+
+        # Add description (truncated if too long)
+        if video_description:
+            if len(video_description) > 500:
+                description_preview = video_description[:500] + "..."
+            else:
+                description_preview = video_description
+            result += f"Description:\n{description_preview}\n\n"
+
+        # Add transcript
+        result += "Transcript:\n"
+
+        # Check if transcript is too long (over 5000 chars) and truncate if needed
+        if len(transcript_text) > 5000:
+            result += transcript_text[:5000] + "...\n[Transcript truncated due to length]\n"
+        else:
+            result += transcript_text + "\n"
+
+        return result
+
+    except Exception as e:
+        return f"Error processing YouTube video: {str(e)}"
+
+def extract_youtube_video_id(url: str) -> Optional[str]:
+    """
+    Extract the YouTube video ID from various URL formats.
+
+    Args:
+        url: A YouTube URL
+
+    Returns:
+        The video ID or None if it cannot be extracted
+    """
+    # Various YouTube URL patterns
+    patterns = [
+        r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/|youtube\.com/e/|youtube\.com/watch\?.*v=|youtube\.com/watch\?.*&v=)([^&?/\s]{11})',
+        r'youtube\.com/shorts/([^&?/\s]{11})',
+        r'youtube\.com/live/([^&?/\s]{11})'
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+
+    return None
+
 # Define the tools configuration
 tools_config = [
     {
         "name": "excel_to_text",
         "description": "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
         "func": excel_to_text
+    },
+    {
+        "name": "process_youtube_video",
+        "description": "Extract and process information from a YouTube video including its transcript, title, author, and other metadata. Provide a URL in the format: {\"url\": \"https://www.youtube.com/watch?v=VIDEO_ID\", \"summarize\": true}",
+        "func": process_youtube_video
+    }
 ]
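A quick, hypothetical check of the URL shapes that extract_youtube_video_id is written to handle (standard watch links, youtu.be short links, Shorts, and live URLs):

```python
from tools import extract_youtube_video_id

# Each of these should yield the 11-character ID; unsupported shapes return None.
for candidate in (
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://youtu.be/dQw4w9WgXcQ",
    "https://www.youtube.com/shorts/dQw4w9WgXcQ",
    "https://www.youtube.com/live/dQw4w9WgXcQ",
):
    print(candidate, "->", extract_youtube_video_id(candidate))
```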