# app.py — updated by Alaaeldin (commit 13b6e34, verified)
from smolagents import CodeAgent, tool
import datetime
import pytz
import yaml
import os
import re
import numpy as np
from typing import List, Optional, Dict, Any
import io
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI
# Text Analyzer Tool
@tool
def text_analyzer(text: str) -> str:
    """Analyzes text and returns statistics about it.
    Args:
        text: The text to analyze.
    """
    try:
        # Counter.most_common replaces the hand-rolled frequency dict + sort;
        # ties keep first-seen order in both, so output is unchanged.
        from collections import Counter

        words = text.split()
        word_count = len(words)
        char_count = len(text)
        lowered = [word.lower() for word in words]
        unique_words = len(set(lowered))
        # max(1, ...) guards against division by zero on empty input.
        avg_word_length = sum(len(word) for word in words) / max(1, word_count)
        common_words = Counter(lowered).most_common(5)
        common_words_str = ", ".join(f"{word} ({count})" for word, count in common_words)
        return f"""Text Analysis Results:
- Word count: {word_count}
- Character count: {char_count}
- Unique words: {unique_words}
- Average word length: {avg_word_length:.2f}
- Most common words: {common_words_str}
"""
    except Exception as e:
        return f"Error analyzing text: {str(e)}"
# Timezone Tool
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        # Resolve the zone and format the current wall-clock time in one pass.
        now = datetime.datetime.now(pytz.timezone(timezone))
        stamp = now.strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {stamp}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
# Simple vector embedding function using basic word frequency
def get_embedding(text: str, normalize: bool = True,
                  vocabulary: Optional[Dict[str, int]] = None) -> np.ndarray:
    """Create a simple bag-of-words embedding for *text*.

    Args:
        text: Input text; lowercased and tokenized on word boundaries.
        normalize: If True, L2-normalize the resulting count vector.
        vocabulary: Optional mapping word -> vector index. When given, the
            returned vector has len(vocabulary) dimensions and out-of-vocabulary
            words are ignored, so embeddings built with the SAME vocabulary are
            directly comparable. When omitted (the original behavior), a
            vocabulary is built from *text* itself — note that vectors built
            from different texts are then NOT comparable (different spaces).

    Returns:
        A 1-D numpy array of (optionally L2-normalized) word counts.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if vocabulary is None:
        # Build a per-text vocabulary in first-seen order.
        vocabulary = {}
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = len(vocabulary)
    # max(1, ...) keeps the vector non-empty for empty input.
    vector = np.zeros(max(1, len(vocabulary)))
    for token in tokens:
        if token in vocabulary:
            vector[vocabulary[token]] += 1
    if normalize:
        norm = np.linalg.norm(vector)
        # For non-negative counts, norm > 0 is equivalent to the original
        # sum(vector) > 0 check, and is also correct for signed inputs.
        if norm > 0:
            vector = vector / norm
    return vector
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between 1-D vectors *a* and *b*.

    Returns 0.0 when either vector has zero magnitude, where the similarity
    is undefined (avoids a division by zero).
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # BUG FIX: the original tested np.sum(x) == 0, which wrongly classifies
    # any vector whose components cancel (e.g. [1, -1]) as a zero vector.
    # The norm is the correct zero-magnitude test.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract the text of every page from an in-memory PDF.

    Returns the concatenated page texts (one trailing newline per page), or a
    human-readable error string if PyPDF2 is unavailable or parsing fails.
    """
    try:
        # PyPDF2 is an optional dependency; degrade to a message if missing.
        try:
            import PyPDF2
        except ImportError:
            return "PDF processing requires PyPDF2 library which is not available."
        with io.BytesIO(pdf_bytes) as stream:
            reader = PyPDF2.PdfReader(stream)
            return "".join(page.extract_text() + "\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page from a PDF file on disk.

    Returns the concatenated page texts (one trailing newline per page), or a
    human-readable error string if PyPDF2 is unavailable or parsing fails.
    """
    try:
        # PyPDF2 is an optional dependency; degrade to a message if missing.
        try:
            import PyPDF2
        except ImportError:
            return "PDF processing requires PyPDF2 library which is not available."
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            return "".join(page.extract_text() + "\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
@tool
def semantic_search(corpus: str, query: str, top_k: int = 3, file_path: Optional[str] = None) -> str:
    """Performs semantic search on a corpus of text or uploaded PDF.
    Args:
        corpus: The text corpus to search within (could be a large text or list of documents).
            If empty and file_path is provided, will extract text from the PDF.
        query: The search query.
        top_k: Number of top results to return.
        file_path: Optional path to a PDF file to extract text from.
    """
    try:
        final_corpus = corpus
        # Fall back to the file only when no inline corpus was given.
        if not corpus and file_path:
            if not os.path.exists(file_path):
                return f"File not found: {file_path}"
            if file_path.lower().endswith('.pdf'):
                pdf_text = extract_text_from_pdf(file_path)
                # Propagate extraction failures verbatim to the caller.
                if pdf_text.startswith("Error") or pdf_text.startswith("PDF processing requires"):
                    return pdf_text
                final_corpus = pdf_text
            else:
                # Not a PDF: try to read it as UTF-8 text.
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        final_corpus = f.read()
                except Exception as e:
                    return f"Error reading file: {str(e)}"
        if not final_corpus:
            return "Error: No text corpus provided for search."
        # Naive sentence chunking; drop fragments of 10 chars or fewer.
        chunks = re.split(r'(?<=[.!?])\s+', final_corpus)
        chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 10]
        if not chunks:
            return "No valid text chunks found in the corpus."
        # BUG FIX: the previous version embedded the query and each chunk with
        # get_embedding(), which builds a fresh per-text vocabulary — the
        # resulting vectors had different dimensions, np.dot raised, and every
        # search returned the generic error string. Build ONE shared
        # vocabulary over query + chunks so all vectors are comparable.
        tokenized_query = re.findall(r'\b\w+\b', query.lower())
        tokenized_chunks = [re.findall(r'\b\w+\b', chunk.lower()) for chunk in chunks]
        vocabulary: Dict[str, int] = {}
        for token in tokenized_query:
            vocabulary.setdefault(token, len(vocabulary))
        for tokens in tokenized_chunks:
            for token in tokens:
                vocabulary.setdefault(token, len(vocabulary))

        def _vectorize(tokens: List[str]) -> np.ndarray:
            # L2-normalized bag-of-words vector in the shared space.
            vec = np.zeros(max(1, len(vocabulary)))
            for token in tokens:
                vec[vocabulary[token]] += 1
            norm = np.linalg.norm(vec)
            return vec / norm if norm > 0 else vec

        query_embedding = _vectorize(tokenized_query)
        results = []
        for i, (chunk, tokens) in enumerate(zip(chunks, tokenized_chunks)):
            similarity = cosine_similarity(query_embedding, _vectorize(tokens))
            results.append((i, chunk, similarity))
        # Best matches first.
        results.sort(key=lambda x: x[2], reverse=True)
        output = f"Search results for: '{query}'\n\n"
        for i, (chunk_idx, chunk, score) in enumerate(results[:top_k]):
            # Truncate long chunks for display.
            display_chunk = chunk if len(chunk) <= 200 else chunk[:197] + "..."
            output += f"{i+1}. [Score: {score:.2f}] {display_chunk}\n\n"
        if not results:
            output += "No matching results found."
        return output
    except Exception as e:
        return f"Error performing semantic search: {str(e)}"
@tool
def list_available_tools() -> str:
    """Lists all available tools and provides usage examples for each."""
    # Fixed help text shown to the user; return the literal directly
    # instead of binding it to a throwaway local first.
    return """
# Available Tools
This agent has the following tools available:
## 1. Text Analyzer
Analyzes text and provides statistics including word count, character count, unique words count, average word length, and most common words.
**Example usage:**
- "Analyze this text: The quick brown fox jumps over the lazy dog."
- "Give me statistics about this paragraph: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
## 2. Current Time in Timezone
Fetches the current local time for a specified timezone.
**Example usage:**
- "What time is it in Tokyo?"
- "Get the current time in America/New_York"
- "Tell me the time in UTC"
## 3. Semantic Search
Performs semantic search on a corpus of text or uploaded PDF document to find the most relevant sections matching a query.
**Example usage:**
- "Search for 'climate change' in this text: Global warming is the long-term heating of Earth's surface observed since the pre-industrial period due to human activities, primarily fossil fuel burning, which increases heat-trapping greenhouse gas levels in Earth's atmosphere."
- "If I have uploaded a PDF file called 'research.pdf', search for 'vaccination' in it"
- "Find information about 'neural networks' in this text: [your long text here]"
## How to Use This Agent
1. Type your request in the chat box below
2. The agent will process your request and use the appropriate tool
3. Results will be displayed in this conversation area
For complex tasks, you may need to provide additional context or data. Be as specific as possible in your requests.
"""
# Set up the agent with our tools
final_answer = FinalAnswerTool()

# Load the per-step prompt templates the agent is configured with.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# NOTE(review): mid-file import — consider moving this up to the
# top-of-file import block for PEP 8 compliance.
from smolagents import HfApiModel

# Hosted inference model that backs the agent.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# Create agent with our tools (including the new list_available_tools)
agent = CodeAgent(
    model=model,
    tools=[text_analyzer, get_current_time_in_timezone, semantic_search, list_available_tools, final_answer],
    max_steps=6,          # cap on reasoning/tool steps per request
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)

# Launch the Gradio UI
GradioUI(agent).launch()