File size: 15,384 Bytes
47fc4b4
 
1821344
1989065
1b17a7b
9f952bc
50efd9f
1989065
1f3f2ad
 
 
 
1b70f99
1f3f2ad
50efd9f
1b17a7b
1989065
 
 
9f952bc
1f3f2ad
 
 
 
1989065
9f952bc
2805605
1f3f2ad
9f952bc
2805605
 
 
1b70f99
 
 
 
 
1f3f2ad
1989065
 
 
 
 
0a9b7ca
9f952bc
1b70f99
 
 
a4e9047
1b70f99
5d51eb7
1989065
1b17a7b
1989065
 
1b17a7b
1989065
1b70f99
 
1989065
0a9b7ca
 
 
1b70f99
 
 
 
 
 
 
 
 
 
1f3f2ad
1b70f99
1f3f2ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b70f99
1f3f2ad
 
 
 
 
422b6c8
1f3f2ad
 
 
 
 
 
 
1989065
 
1b70f99
88430cf
1f3f2ad
1989065
812d914
9f3cddf
 
1b70f99
 
 
 
 
 
 
 
 
1b17a7b
e83a3b2
 
1b70f99
 
 
 
 
 
 
 
47fc4b4
1f3f2ad
1989065
 
 
1b70f99
422b6c8
1f3f2ad
 
a4e9047
1f3f2ad
a4e9047
1f3f2ad
1b70f99
 
 
 
 
 
 
 
 
a4e9047
1b70f99
 
 
1989065
1f3f2ad
50efd9f
 
 
 
 
1b70f99
 
 
50efd9f
1b70f99
50efd9f
 
1b17a7b
1b70f99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f3f2ad
 
 
1b70f99
 
 
 
 
 
 
 
 
 
 
 
 
1f3f2ad
 
 
 
0a9b7ca
1b70f99
1f3f2ad
1b70f99
 
 
 
 
 
 
 
 
 
 
 
 
 
0a9b7ca
1f3f2ad
 
 
1b70f99
 
 
 
 
50efd9f
1b70f99
1b17a7b
1b70f99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b17a7b
1f3f2ad
1b70f99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f3cddf
1f3f2ad
88430cf
 
1b70f99
88430cf
 
1b70f99
 
 
 
 
 
 
 
 
 
9f952bc
1b70f99
75300a2
 
 
1b70f99
75300a2
1b70f99
 
88430cf
 
 
75300a2
1b70f99
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
import gradio as gr
import logging
import requests
import time
from bs4 import BeautifulSoup
from datetime import datetime
from typing import List, Optional, Tuple
from urllib.parse import urljoin, urlparse
import random
import nltk
from nltk.tokenize import sent_tokenize
import PyPDF2
import io
from joblib import dump, load

from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from icalendar import Calendar
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor

# Configure process-wide logging at import time: INFO level, with timestamp,
# logger name and severity on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger shared by the classes and functions below.
logger = logging.getLogger(__name__)

# Fetch the NLTK 'punkt' resource at import time. Any exception is logged as a
# warning and otherwise ignored so that importing this module never fails here.
# NOTE(review): sent_tokenize is imported above but nothing visible in this
# file calls it — confirm the download is still required.
try:
    nltk.download('punkt', quiet=True)
except Exception as e:
    logger.warning(f"Failed to download NLTK data: {e}")

class Config:
    """Static settings shared by the crawler, the indexer and the chatbot."""

    # Model identifier loaded with AutoTokenizer / AutoModelForCausalLM.
    MODEL_NAME = "microsoft/DialoGPT-medium"
    # Model identifier loaded with SentenceTransformer for chunk/query embeddings.
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    # Passed as `max_length` to `model.generate` when answering a question.
    MAX_TOKENS = 1000
    # Default per-request timeout handed to `session.get` (seconds, per requests' API).
    REQUEST_TIMEOUT = 10
    # Crawl depth limit: `crawl_url` returns immediately when depth exceeds this.
    MAX_DEPTH = 1
    # A chunk is only returned as a match when its cosine similarity to the
    # query is strictly greater than this value.
    SIMILARITY_THRESHOLD = 0.5
    # Default chunk length for ResourceItem.create_chunks, counted in
    # whitespace-separated words (not tokens).
    CHUNK_SIZE = 512
    # Size of the thread pool used to crawl the seed URLs.
    MAX_WORKERS = 5
    # Seed URLs crawled when a SchoolChatbot is constructed.
    # NOTE(review): this is a Google Drive "/view" link, which presumably serves
    # an HTML viewer page rather than the file itself — confirm the crawler
    # receives the content type that is actually intended.
    INDEXED_URLS = {
        "https://drive.google.com/file/d/1d5kkqaQkdiA2SwJ0JFrTuKO9zauiUtFz/view?usp=sharing"
    }

class ResourceItem:
    """A fetched document together with the chunks and embeddings derived from it.

    Attributes:
        url: Address the content was fetched from.
        content: Extracted plain text of the resource.
        type: Short label for the kind of resource (the callers in this file
            use 'calendar', 'webpage', 'pdf' and 'unknown').
        embedding: Document-level embedding; ``None`` until a caller assigns it.
        chunks: Word-based text chunks produced by :meth:`create_chunks`.
        chunk_embeddings: Per-chunk embeddings; filled in by the caller.
    """

    def __init__(self, url: str, content: str, resource_type: str):
        self.url = url
        self.content = content
        self.type = resource_type
        self.embedding = None
        self.chunks = []
        self.chunk_embeddings = []

    def __str__(self):
        return f"ResourceItem(type={self.type}, url={self.url}, content_length={len(self.content)})"

    def create_chunks(self, chunk_size=None):
        """Split ``content`` into overlapping word chunks and store them in ``chunks``.

        Consecutive chunks share 25% of ``chunk_size`` words so that text cut at
        a chunk boundary still appears with some surrounding context. The call
        is idempotent: ``chunks`` is rebuilt from scratch every time (existing
        ``chunk_embeddings`` are left untouched and must be recomputed by the
        caller).

        Args:
            chunk_size: Maximum number of whitespace-separated words per chunk.
                ``None`` (the default) means ``Config.CHUNK_SIZE``, looked up
                at call time.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size is None:
            chunk_size = Config.CHUNK_SIZE
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

        words = self.content.split()
        overlap = chunk_size // 4  # 25% overlap
        step = chunk_size - overlap  # always >= 1 because chunk_size >= 1

        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # This chunk already reaches the end of the text; any further window
            # would only repeat words that are fully contained in it.
            if start + chunk_size >= len(words):
                break
        self.chunks = chunks

class RobustCrawler:
    """HTTP fetcher with automatic retries, randomized headers and polite delays.

    Each instance owns a ``requests.Session`` whose HTTP and HTTPS adapters
    retry idempotent requests on transient status codes.
    """

    def __init__(self, max_retries=3, backoff_factor=0.3):
        """Create the user-agent source and the retrying session.

        Args:
            max_retries: Total number of retries allowed per request.
            backoff_factor: Backoff factor handed to urllib3's ``Retry``.
        """
        self.ua = UserAgent()
        self.session = self._create_robust_session(max_retries, backoff_factor)

    def _create_robust_session(self, max_retries, backoff_factor):
        """Return a session that retries HEAD/GET/OPTIONS on 429 and 5xx responses."""
        retry_methods = ["HEAD", "GET", "OPTIONS"]
        retry_options = dict(
            total=max_retries,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=backoff_factor,
            # Hand the last response back instead of raising once retries are
            # exhausted; the caller applies raise_for_status() itself.
            raise_on_status=False,
        )
        try:
            # `allowed_methods` replaced `method_whitelist` in urllib3 1.26, and
            # the old keyword was removed in urllib3 2.0.
            retry_strategy = Retry(allowed_methods=retry_methods, **retry_options)
        except TypeError:
            # urllib3 < 1.26 only understands the legacy keyword.
            retry_strategy = Retry(method_whitelist=retry_methods, **retry_options)

        session = requests.Session()
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def get_headers(self):
        """Return browser-like request headers with a randomly chosen User-Agent."""
        return {
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.google.com/",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        }

    def crawl_with_exponential_backoff(self, url, timeout=Config.REQUEST_TIMEOUT):
        """Fetch ``url`` after a short random delay.

        Retrying with backoff is performed by the session's retry adapter; this
        method adds a 0.5–2.0 second random pause before the request.

        Args:
            url: Address to fetch.
            timeout: Timeout passed to ``session.get``.

        Returns:
            The ``requests.Response`` on success, or ``None`` when the request
            failed or ended with an error status (the error is logged).
        """
        try:
            time.sleep(random.uniform(0.5, 2.0))
            response = self.session.get(
                url,
                headers=self.get_headers(),
                timeout=timeout
            )
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            logger.error(f"Crawling error for {url}: {e}")
            return None

class SchoolChatbot:
    """Retrieval-based chatbot over crawled school resources.

    Constructing an instance loads the language and embedding models, crawls
    every URL in ``Config.INDEXED_URLS`` and stores the extracted text as
    embedded chunks. A question is answered by retrieving the most similar
    chunks and passing them to the causal language model as context.
    """

    # Part of the Config.MAX_TOKENS budget that is kept free for the generated
    # answer when the prompt has to be truncated.
    RESPONSE_TOKEN_RESERVE = 200

    def __init__(self):
        logger.info("Initializing SchoolChatbot...")
        self.setup_models()
        self.resources = []
        self.visited_urls = set()
        self.crawl_and_index_resources()

    def setup_models(self):
        """Load the tokenizer, the causal language model and the embedding model.

        Raises:
            RuntimeError: If any model fails to load; the original exception is
                attached as the cause.
        """
        try:
            logger.info("Setting up models...")
            self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
            self.model = AutoModelForCausalLM.from_pretrained(Config.MODEL_NAME)
            self.embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL)
            logger.info("Models setup completed successfully.")
        except Exception as e:
            logger.error(f"Failed to setup models: {e}")
            raise RuntimeError("Failed to initialize required models") from e

    def crawl_and_index_resources(self):
        """Crawl all seed URLs in a thread pool; per-seed failures are logged."""
        logger.info("Starting to crawl and index resources...")
        with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
            futures = [executor.submit(self.crawl_url, url, 0) for url in Config.INDEXED_URLS]
            for future in futures:
                try:
                    future.result()
                except Exception as e:
                    logger.error(f"Error in crawling thread: {e}")
        logger.info(f"Crawling completed. Indexed {len(self.resources)} resources.")

    def crawl_url(self, url, depth):
        """Fetch ``url`` and index its content according to its content type.

        Args:
            url: Address to fetch.
            depth: Link distance from a seed URL (seeds are depth 0). Nothing is
                fetched when it exceeds ``Config.MAX_DEPTH`` or when the URL was
                already visited.
        """
        # NOTE(review): visited_urls is shared by the worker threads without a
        # lock, so two seeds linking to the same page may both fetch it.
        if depth > Config.MAX_DEPTH or url in self.visited_urls:
            return
        self.visited_urls.add(url)

        crawler = RobustCrawler()
        response = crawler.crawl_with_exponential_backoff(url)
        if response is None:
            logger.error(f"Failed to retrieve content from {url}. Please check the URL and permissions.")
            return

        content_type = response.headers.get("Content-Type", "").lower()
        # Look at the path only, so "calendar.ICS?token=..." is still recognized.
        is_ics_path = urlparse(url).path.lower().endswith(".ics")

        try:
            if "text/calendar" in content_type or is_ics_path:
                self.extract_ics_content(url, response.text)
            elif "text/html" in content_type:
                self.extract_html_content(url, response, depth)
            elif "application/pdf" in content_type:
                self.extract_pdf_content(url, response.content)
            else:
                logger.warning(f"Unknown content type for {url}: {content_type}")
                self.store_resource(url, response.text, 'unknown')
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")

    def extract_ics_content(self, url, ics_text):
        """Parse iCalendar text and store its events, one per line, as one resource."""
        try:
            cal = Calendar.from_ical(ics_text)
            events = []
            for component in cal.walk():
                if component.name == "VEVENT":
                    event = self._format_calendar_event(component)
                    if event:
                        events.append(event)
            if events:
                self.store_resource(url, "\n".join(events), 'calendar')
        except Exception as e:
            logger.error(f"Error parsing ICS from {url}: {e}")

    def _format_calendar_event(self, event):
        """Render one calendar event as a single ``" | "``-separated line.

        Only ``SUMMARY`` is always present in the output (defaulting to
        "No Summary"); start, end, location and description are added when the
        event has them.

        Args:
            event: Mapping-like event whose ``DTSTART``/``DTEND`` values expose
                the date through a ``dt`` attribute.

        Returns:
            The formatted line, or ``None`` if the event could not be read.
        """
        try:
            summary = event.get("SUMMARY", "No Summary")
            # DTSTART/DTEND are optional: a missing property must not discard
            # the whole event, so fall back to None instead of calling .dt on it.
            start = getattr(event.get("DTSTART"), "dt", None)
            end = getattr(event.get("DTEND"), "dt", None)
            description = event.get("DESCRIPTION", "")
            location = event.get("LOCATION", "")

            event_details = [f"Event: {summary}"]
            if start is not None:
                event_details.append(f"Start: {start}")
            if end is not None:
                event_details.append(f"End: {end}")
            if location:
                event_details.append(f"Location: {location}")
            if description:
                event_details.append(f"Description: {description}")

            return " | ".join(event_details)
        except Exception as e:
            logger.warning(f"Skipping calendar event that could not be formatted: {e}")
            return None

    def extract_html_content(self, url, response, depth=0):
        """Extract readable text from an HTML response, store it and follow links.

        Args:
            url: Address the page was fetched from.
            response: HTTP response whose ``content`` holds the HTML.
            depth: Crawl depth of this page; links are followed only while it
                is below ``Config.MAX_DEPTH``.
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted elements
            for element in soup.find_all(['script', 'style', 'nav', 'footer']):
                element.decompose()

            content_sections = []

            # Extract main content
            main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main-content'])
            if main_content:
                content_sections.append(main_content.get_text(strip=True, separator=' '))

            # Extract headings and the run of block elements that follows each
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                section = [heading.get_text(strip=True)]
                next_elem = heading.find_next_sibling()
                while next_elem and next_elem.name in ['p', 'ul', 'ol', 'div']:
                    section.append(next_elem.get_text(strip=True))
                    next_elem = next_elem.find_next_sibling()
                content_sections.append(' '.join(section))

            if content_sections:
                self.store_resource(url, ' '.join(content_sections), 'webpage')

            # Follow links only while this page is above the depth limit. The
            # limit is compared with the page's own depth, not with the number
            # of URLs visited so far.
            if depth < Config.MAX_DEPTH:
                self._process_links(soup, url, depth)

        except Exception as e:
            logger.error(f"Error extracting HTML content from {url}: {e}")

    def _process_links(self, soup, base_url, depth=0):
        """Crawl every unvisited, valid link found in ``soup`` at ``depth + 1``.

        Args:
            soup: Parsed page to take ``<a href>`` links from.
            base_url: Address of the page, used to resolve relative links.
            depth: Crawl depth of the page the links were found on.
        """
        # NOTE(review): links to other hosts are followed as well; restrict
        # them here if the crawl is meant to stay on the seed site.
        try:
            for link in soup.find_all('a', href=True):
                # Drop the fragment so "#section" variants of a page count as
                # the same URL.
                full_url = urljoin(base_url, link['href']).split('#', 1)[0]
                if self.is_valid_url(full_url) and full_url not in self.visited_urls:
                    time.sleep(random.uniform(0.5, 2.0))
                    self.crawl_url(full_url, depth + 1)
        except Exception as e:
            logger.error(f"Error processing links from {base_url}: {e}")

    def extract_pdf_content(self, url, pdf_content):
        """Extract the text of every PDF page and store it as one resource.

        Pages that fail to parse or yield no text are skipped.
        """
        try:
            pdf_file = io.BytesIO(pdf_content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text_content = []

            for page in pdf_reader.pages:
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    logger.error(f"Error extracting text from PDF page: {e}")
                    continue
                # A page without extractable text may yield None or an empty
                # string; neither may reach the join below.
                if page_text:
                    text_content.append(page_text)

            if text_content:
                self.store_resource(url, ' '.join(text_content), 'pdf')
        except Exception as e:
            logger.error(f"Error extracting PDF content from {url}: {e}")

    def store_resource(self, url, text_data, resource_type):
        """Chunk ``text_data``, embed the chunks and add the result to ``resources``.

        Text that produces no chunks is not stored. Errors are logged, not raised.
        """
        try:
            # Create resource item and split into chunks
            item = ResourceItem(url, text_data, resource_type)
            item.create_chunks()
            if not item.chunks:
                return

            # Embed all chunks in one batched call, then keep one vector per chunk
            item.chunk_embeddings = list(self.embedding_model.encode(item.chunks))

            # The document embedding is the mean of its chunk embeddings
            item.embedding = np.mean(item.chunk_embeddings, axis=0)
            self.resources.append(item)
            logger.debug(f"Stored resource: {url} (type={resource_type})")

        except Exception as e:
            logger.error(f"Error storing resource {url}: {e}")

    def is_valid_url(self, url):
        """Return True if ``url`` is an absolute http(s) URL with a host."""
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            return False
        # Only http and https are fetchable by the crawler's session.
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)

    def find_best_matching_chunks(self, query, n_chunks=3):
        """Return the chunks most similar to ``query``.

        Args:
            query: Question text to embed and compare against all chunks.
            n_chunks: Maximum number of matches to return.

        Returns:
            Up to ``n_chunks`` tuples ``(chunk_text, score, source_url)`` whose
            cosine similarity exceeds ``Config.SIMILARITY_THRESHOLD``, best
            first. Empty when nothing is indexed, nothing matches, or an error
            occurs (the error is logged).
        """
        if not self.resources:
            return []

        try:
            chunk_texts, chunk_vectors, chunk_urls = [], [], []
            for resource in self.resources:
                for chunk, embedding in zip(resource.chunks, resource.chunk_embeddings):
                    chunk_texts.append(chunk)
                    chunk_vectors.append(embedding)
                    chunk_urls.append(resource.url)
            if not chunk_texts:
                return []

            query_embedding = self.embedding_model.encode(query)
            # One similarity call for all chunks instead of one call per chunk
            scores = cosine_similarity([query_embedding], chunk_vectors)[0]

            matches = [
                (chunk, score, url)
                for chunk, score, url in zip(chunk_texts, scores, chunk_urls)
                if score > Config.SIMILARITY_THRESHOLD
            ]

            # Sort by similarity score and get top n chunks
            matches.sort(key=lambda match: match[1], reverse=True)
            return matches[:n_chunks]

        except Exception as e:
            logger.error(f"Error finding matching chunks: {e}")
            return []

    def generate_response(self, user_input):
        """Answer ``user_input`` from the indexed resources.

        Args:
            user_input: The user's question.

        Returns:
            The generated answer followed by a "Sources:" list, or a fixed
            message when the input is empty, nothing relevant is indexed, or
            generation fails.
        """
        if not user_input or not user_input.strip():
            return "Please enter a question so I can look for an answer."

        try:
            # Find best matching chunks
            best_chunks = self.find_best_matching_chunks(user_input)

            if not best_chunks:
                return "I apologize, but I couldn't find any relevant information in my knowledge base. Could you please rephrase your question or ask about something else?"

            # Prepare context from best matching chunks
            context = "\n".join(chunk for chunk, _, _ in best_chunks)

            # Prepare conversation history
            conversation = f"Context: {context}\nUser: {user_input}\nAssistant:"

            encoded = self.tokenizer(conversation, return_tensors='pt')

            # max_length below counts prompt + answer. Several retrieved chunks
            # can exceed it on their own, so keep only the tail of the prompt
            # (the end of the context, the question and the "Assistant:" cue)
            # and leave room for the answer.
            # NOTE(review): assumes Config.MAX_TOKENS fits in the model's
            # context window — confirm for Config.MODEL_NAME.
            reserve = min(self.RESPONSE_TOKEN_RESERVE, Config.MAX_TOKENS // 2)
            prompt_budget = Config.MAX_TOKENS - reserve
            input_ids = encoded['input_ids'][:, -prompt_budget:]
            attention_mask = encoded['attention_mask'][:, -prompt_budget:]

            # Generate response
            response_ids = self.model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=Config.MAX_TOKENS,
                pad_token_id=self.tokenizer.eos_token_id,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )

            # Decode only the newly generated tokens, not the echoed prompt
            response = self.tokenizer.decode(
                response_ids[:, input_ids.shape[-1]:][0],
                skip_special_tokens=True
            )

            # List each source once, most relevant first
            source_urls = list(dict.fromkeys(url for _, _, url in best_chunks))
            sources = "\n\nSources:\n" + "\n".join(source_urls)

            return response + sources

        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return "I apologize, but I encountered an error while processing your question. Please try again."

def create_gradio_interface(chatbot):
    """Build the single-question Gradio UI that is backed by ``chatbot``.

    Args:
        chatbot: Object whose ``generate_response(text)`` returns the answer
            string to display.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    question_box = gr.Textbox(
        label="Ask a Question",
        placeholder="Type your question here...",
        lines=2
    )
    answer_box = gr.Textbox(
        label="Answer",
        placeholder="Response will appear here...",
        lines=5
    )
    # Clickable sample questions shown with the form
    sample_questions = [
        ["What events are happening this week?"],
        ["When is the next board meeting?"],
        ["What is the school's attendance policy?"]
    ]

    def respond(user_input):
        # Thin adapter between the UI callback and the chatbot
        return chatbot.generate_response(user_input)

    return gr.Interface(
        fn=respond,
        inputs=question_box,
        outputs=answer_box,
        title="School Information Chatbot",
        description="Ask about school events, policies, or other information. The chatbot will provide answers based on available school documents and resources.",
        examples=sample_questions,
        theme=gr.themes.Soft(),
        flagging_mode="never"
    )

if __name__ == "__main__":
    # Script entry point: build the chatbot (which loads the models and crawls
    # the seed URLs), then serve the UI on all interfaces at port 7860.
    try:
        chatbot = SchoolChatbot()
        interface = create_gradio_interface(chatbot)
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=True
        )
    except Exception as e:
        # Log with traceback and exit non-zero so that a supervisor or shell
        # can tell that startup failed.
        logger.exception(f"Failed to start application: {e}")
        raise SystemExit(1)