from smolagents import CodeAgent, HfApiModel, tool
import datetime
import pytz
import yaml
import os
import re
import numpy as np
from typing import Dict, List, Optional
import io

from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI


# Text Analyzer Tool
@tool
def text_analyzer(text: str) -> str:
    """Analyzes text and returns statistics about it.

    Args:
        text: The text to analyze.
    """
    try:
        # Simple word count
        words = text.split()
        word_count = len(words)

        # Character count
        char_count = len(text)

        # Unique words (case-insensitive)
        unique_words = len(set(word.lower() for word in words))

        # Average word length (guard against empty input)
        avg_word_length = sum(len(word) for word in words) / max(1, word_count)

        # Most common words (top 5)
        word_freq: Dict[str, int] = {}
        for word in words:
            word_lower = word.lower()
            word_freq[word_lower] = word_freq.get(word_lower, 0) + 1
        common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        common_words_str = ", ".join(f"{word} ({count})" for word, count in common_words)

        return f"""Text Analysis Results:
- Word count: {word_count}
- Character count: {char_count}
- Unique words: {unique_words}
- Average word length: {avg_word_length:.2f}
- Most common words: {common_words_str}
"""
    except Exception as e:
        return f"Error analyzing text: {str(e)}"


# Timezone Tool
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.

    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        # Create timezone object
        tz = pytz.timezone(timezone)
        # Get current time in that timezone
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


def build_vocabulary(texts: List[str]) -> Dict[str, int]:
    """Build one shared word-to-index vocabulary across all texts.

    Bag-of-words vectors are only comparable when they are built from the
    same vocabulary; building a separate vocabulary per text (as an earlier
    version of get_embedding did) yields vectors of different dimensions
    that cannot be compared at all.
    """
    vocabulary: Dict[str, int] = {}
    for text in texts:
        for word in re.findall(r'\b\w+\b', text.lower()):
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    return vocabulary


# Simple vector embedding function using basic word frequency
def get_embedding(text: str, vocabulary: Dict[str, int], normalize: bool = True) -> np.ndarray:
    """Create a simple bag-of-words embedding for the text over a shared vocabulary."""
    # Lowercase and tokenize
    words = re.findall(r'\b\w+\b', text.lower())

    # Count word occurrences at their shared vocabulary positions
    vector = np.zeros(max(1, len(vocabulary)))
    for word in words:
        if word in vocabulary:
            vector[vocabulary[word]] += 1

    # L2-normalize if requested, so cosine similarity reduces to a dot product
    norm = np.linalg.norm(vector)
    if normalize and norm > 0:
        vector = vector / norm

    return vector


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Calculate cosine similarity between two vectors."""
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Handle zero vectors
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract text from PDF bytes."""
    try:
        # PyPDF2 is an optional dependency; fail gracefully if it is missing
        try:
            import PyPDF2
        except ImportError:
            return "PDF processing requires PyPDF2 library which is not available."

        with io.BytesIO(pdf_bytes) as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() may return None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"


def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file on disk."""
    try:
        # PyPDF2 is an optional dependency; fail gracefully if it is missing
        try:
            import PyPDF2
        except ImportError:
            return "PDF processing requires PyPDF2 library which is not available."

        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() may return None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
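
# A quick sanity check of the embedding helpers (an illustrative sketch with
# made-up strings, left commented out so it does not execute at import time).
# With one shared vocabulary, texts that share more words score higher:
#
#     vocab = build_vocabulary(["the cat sat", "dogs bark loudly", "cat"])
#     sim = cosine_similarity(
#         get_embedding("the cat sat", vocab),
#         get_embedding("cat", vocab),
#     )
#     # sim is positive because both texts contain "cat"; two texts with no
#     # words in common would score 0.0.
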
@tool
def semantic_search(corpus: str, query: str, top_k: int = 3, file_path: Optional[str] = None) -> str:
    """Performs semantic search on a corpus of text or uploaded PDF.

    Args:
        corpus: The text corpus to search within (a large text or list of documents).
            If empty and file_path is provided, text is extracted from the file instead.
        query: The search query.
        top_k: Number of top results to return.
        file_path: Optional path to a PDF or text file to extract text from.
    """
    try:
        final_corpus = corpus

        # Fall back to the file if no corpus text was supplied
        if not corpus and file_path:
            if os.path.exists(file_path):
                # Treat .pdf files as PDFs; anything else as plain text
                if file_path.lower().endswith('.pdf'):
                    pdf_text = extract_text_from_pdf(file_path)
                    if pdf_text.startswith("Error") or pdf_text.startswith("PDF processing requires"):
                        return pdf_text
                    final_corpus = pdf_text
                else:
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            final_corpus = f.read()
                    except Exception as e:
                        return f"Error reading file: {str(e)}"
            else:
                return f"File not found: {file_path}"

        if not final_corpus:
            return "Error: No text corpus provided for search."

        # Split the corpus into sentence-like chunks. This is a simple approach;
        # a real system would use a more sophisticated chunking method.
        chunks = re.split(r'(?<=[.!?])\s+', final_corpus)
        chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 10]

        if not chunks:
            return "No valid text chunks found in the corpus."

        # Build one shared vocabulary so the query and chunk vectors live in
        # the same space and have the same dimensionality
        vocabulary = build_vocabulary(chunks + [query])
        query_embedding = get_embedding(query, vocabulary)

        # Embed each chunk and score it against the query
        results = []
        for i, chunk in enumerate(chunks):
            chunk_embedding = get_embedding(chunk, vocabulary)
            similarity = cosine_similarity(query_embedding, chunk_embedding)
            results.append((i, chunk, similarity))

        # Sort by similarity score (descending)
        results.sort(key=lambda x: x[2], reverse=True)

        # Format results, truncating long chunks for display
        output = f"Search results for: '{query}'\n\n"
        for rank, (chunk_idx, chunk, score) in enumerate(results[:top_k]):
            display_chunk = chunk if len(chunk) <= 200 else chunk[:197] + "..."
            output += f"{rank + 1}. [Score: {score:.2f}] {display_chunk}\n\n"

        if not results:
            output += "No matching results found."

        return output
    except Exception as e:
        return f"Error performing semantic search: {str(e)}"
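
# Direct invocation of the search tool, for local testing outside the agent
# loop (a sketch; the sample corpus and query are made up). The @tool
# decorator wraps the function in a callable Tool object, so keyword
# arguments pass straight through:
#
#     print(semantic_search(
#         corpus="Cats purr when content. Dogs bark at strangers. Birds sing at dawn.",
#         query="barking dogs",
#         top_k=2,
#     ))
#     # Expected shape of the output: a ranked list of sentences, each
#     # prefixed with "[Score: ...]", best match first.
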
@tool
def list_available_tools() -> str:
    """Lists all available tools and provides usage examples for each."""
    tools_documentation = """
# Available Tools

This agent has the following tools available:

## 1. Text Analyzer
Analyzes text and provides statistics: word count, character count, unique word count, average word length, and the most common words.

**Example usage:**
- "Analyze this text: The quick brown fox jumps over the lazy dog."
- "Give me statistics about this paragraph: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

## 2. Current Time in Timezone
Fetches the current local time for a specified timezone.

**Example usage:**
- "What time is it in Tokyo?"
- "Get the current time in America/New_York"
- "Tell me the time in UTC"

## 3. Semantic Search
Performs semantic search on a corpus of text or an uploaded PDF document to find the sections most relevant to a query.

**Example usage:**
- "Search for 'climate change' in this text: Global warming is the long-term heating of Earth's surface observed since the pre-industrial period due to human activities, primarily fossil fuel burning, which increases heat-trapping greenhouse gas levels in Earth's atmosphere."
- "If I have uploaded a PDF file called 'research.pdf', search for 'vaccination' in it"
- "Find information about 'neural networks' in this text: [your long text here]"

## How to Use This Agent
1. Type your request in the chat box below
2. The agent will process your request and use the appropriate tool
3. Results will be displayed in this conversation area

For complex tasks, you may need to provide additional context or data. Be as specific as possible in your requests.
"""
    return tools_documentation


# Set up the agent with our tools
final_answer = FinalAnswerTool()

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# Create agent with our tools (including list_available_tools)
agent = CodeAgent(
    model=model,
    tools=[text_analyzer, get_current_time_in_timezone, semantic_search, list_available_tools, final_answer],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates,
)

# Launch the Gradio UI
GradioUI(agent).launch()
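
# To exercise the agent without the web UI, one option (a sketch, assuming
# the standard smolagents run API) is to comment out the launch() call above
# and run a single task instead:
#
#     result = agent.run("What time is it in Asia/Tokyo?")
#     print(result)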