from smolagents import CodeAgent, HfApiModel, tool
import datetime
import pytz
import yaml
import os
import re
import numpy as np
from typing import Dict, List, Optional
import io
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI

# Text Analyzer Tool
@tool
def text_analyzer(text: str) -> str:
    """Analyzes text and returns statistics about it.
    
    Args:
        text: The text to analyze.
    """
    try:
        # Simple word count
        words = text.split()
        word_count = len(words)
        
        # Character count
        char_count = len(text)
        
        # Unique words
        unique_words = len(set(word.lower() for word in words))
        
        # Average word length
        avg_word_length = sum(len(word) for word in words) / max(1, word_count)
        
        # Most common words (top 5)
        word_freq = {}
        for word in words:
            word_lower = word.lower()
            word_freq[word_lower] = word_freq.get(word_lower, 0) + 1
        
        common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        common_words_str = ", ".join(f"{word} ({count})" for word, count in common_words)
        
        return f"""Text Analysis Results:
- Word count: {word_count}
- Character count: {char_count}
- Unique words: {unique_words}
- Average word length: {avg_word_length:.2f}
- Most common words: {common_words_str}
"""
    except Exception as e:
        return f"Error analyzing text: {str(e)}"

# Timezone Tool
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        # Create timezone object
        tz = pytz.timezone(timezone)
        # Get current time in that timezone
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"

# Simple bag-of-words embedding over a shared vocabulary
def build_vocabulary(texts: List[str]) -> Dict[str, int]:
    """Build a word-to-index vocabulary covering all given texts."""
    vocabulary: Dict[str, int] = {}
    for text in texts:
        for word in re.findall(r'\b\w+\b', text.lower()):
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    return vocabulary

def get_embedding(text: str, vocabulary: Dict[str, int], normalize: bool = True) -> np.ndarray:
    """Create a simple bag-of-words embedding for the text.

    Every text must be embedded against the same shared vocabulary;
    otherwise the vectors have different lengths and cannot be compared.
    """
    # Count word occurrences against the shared vocabulary
    vector = np.zeros(max(1, len(vocabulary)))
    for word in re.findall(r'\b\w+\b', text.lower()):
        if word in vocabulary:
            vector[vocabulary[word]] += 1
    
    # Normalize to unit length if requested
    if normalize:
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector = vector / norm
    
    return vector

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Calculate cosine similarity between two vectors"""
    # Guard against zero vectors, whose direction is undefined
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
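
# Worked example of the math above (a hand-checked sketch with a made-up vocabulary):
# with the shared vocabulary {"dogs": 0, "bark": 1, "cats": 2},
#   "dogs bark" -> [1, 1, 0] / sqrt(2)   (after normalization)
#   "cats bark" -> [0, 1, 1] / sqrt(2)
# and their cosine similarity is (0 + 0.5 + 0) = 0.5: the texts share one word.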

def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract text from PDF bytes"""
    try:
        # PyPDF2 is an optional dependency; fail gracefully if it is missing
        try:
            import PyPDF2
        except ImportError:
            return "PDF processing requires PyPDF2 library which is not available."
        
        with io.BytesIO(pdf_bytes) as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for pages with no text layer
                text += (page.extract_text() or "") + "\n"
            return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file on disk (delegates to the bytes helper)."""
    try:
        with open(file_path, 'rb') as pdf_file:
            return extract_text_from_pdf_bytes(pdf_file.read())
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

@tool
def semantic_search(corpus: str, query: str, top_k: int = 3, file_path: Optional[str] = None) -> str:
    """Performs semantic search on a corpus of text or uploaded PDF.
    
    Args:
        corpus: The text corpus to search within (could be a large text or list of documents).
            If empty and file_path is provided, will extract text from the PDF.
        query: The search query.
        top_k: Number of top results to return.
        file_path: Optional path to a PDF file to extract text from.
    """
    try:
        final_corpus = corpus
        
        # Try to handle PDF file if specified
        if not corpus and file_path:
            # Check if file exists
            if os.path.exists(file_path):
                # Check if this is a PDF by extension
                if file_path.lower().endswith('.pdf'):
                    pdf_text = extract_text_from_pdf(file_path)
                    if pdf_text.startswith("Error") or pdf_text.startswith("PDF processing requires"):
                        return pdf_text
                    final_corpus = pdf_text
                else:
                    # If not PDF, try to read as text
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            final_corpus = f.read()
                    except Exception as e:
                        return f"Error reading file: {str(e)}"
            else:
                return f"File not found: {file_path}"
        
        if not final_corpus:
            return "Error: No text corpus provided for search."
        
        # Split corpus into chunks/sentences for searching
        # This is a simple approach - in a real system you would use a more sophisticated chunking method
        chunks = re.split(r'(?<=[.!?])\s+', final_corpus)
        chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 10]
        
        if not chunks:
            return "No valid text chunks found in the corpus."
        
        # Build one shared vocabulary so the query and chunk vectors live in
        # the same space; per-text vocabularies would yield vectors of
        # different lengths that cannot be compared.
        vocabulary = build_vocabulary(chunks + [query])
        
        # Get query embedding
        query_embedding = get_embedding(query, vocabulary)
        
        # Get embeddings for each chunk and calculate similarity
        results = []
        for i, chunk in enumerate(chunks):
            chunk_embedding = get_embedding(chunk, vocabulary)
            similarity = cosine_similarity(query_embedding, chunk_embedding)
            results.append((i, chunk, similarity))
        
        # Sort by similarity score (descending)
        results.sort(key=lambda x: x[2], reverse=True)
        
        # Format results
        output = f"Search results for: '{query}'\n\n"
        
        for i, (_, chunk, score) in enumerate(results[:top_k]):
            # Truncate long chunks for display
            display_chunk = chunk
            if len(display_chunk) > 200:
                display_chunk = display_chunk[:197] + "..."
                
            output += f"{i+1}. [Score: {score:.2f}] {display_chunk}\n\n"
        
        return output
        
    except Exception as e:
        return f"Error performing semantic search: {str(e)}"

@tool
def list_available_tools() -> str:
    """Lists all available tools and provides usage examples for each."""
    tools_documentation = """
# Available Tools

This agent has the following tools available:

## 1. Text Analyzer

Analyzes text and provides statistics including word count, character count, unique word count, average word length, and most common words.

**Example usage:**
- "Analyze this text: The quick brown fox jumps over the lazy dog."
- "Give me statistics about this paragraph: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

## 2. Current Time in Timezone

Fetches the current local time for a specified timezone.

**Example usage:**
- "What time is it in Tokyo?"
- "Get the current time in America/New_York"
- "Tell me the time in UTC"

## 3. Semantic Search

Performs semantic search on a corpus of text or uploaded PDF document to find the most relevant sections matching a query.

**Example usage:**
- "Search for 'climate change' in this text: Global warming is the long-term heating of Earth's surface observed since the pre-industrial period due to human activities, primarily fossil fuel burning, which increases heat-trapping greenhouse gas levels in Earth's atmosphere."
- "If I have uploaded a PDF file called 'research.pdf', search for 'vaccination' in it"
- "Find information about 'neural networks' in this text: [your long text here]"

## How to Use This Agent

1. Type your request in the chat box below
2. The agent will process your request and use the appropriate tool
3. Results will be displayed in this conversation area

For complex tasks, you may need to provide additional context or data. Be as specific as possible in your requests.
"""
    return tools_documentation

# Set up the agent with our tools
final_answer = FinalAnswerTool()

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# Create agent with our tools (including the new list_available_tools)
agent = CodeAgent(
    model=model,
    tools=[text_analyzer, get_current_time_in_timezone, semantic_search, list_available_tools, final_answer],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)
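
# Quick smoke test (a hedged sketch that bypasses the UI; run manually if desired):
#   answer = agent.run("What time is it in Asia/Tokyo?")
#   print(answer)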

# Launch the Gradio UI
GradioUI(agent).launch()