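"""Gradio chatbot for InnovativeSkills Bangladesh.

Crawls https://innovativeskillsbd.com/ at startup, builds a text context from
the scraped pages, and answers questions by sending that context plus the
conversation history to the DeepSeek V3 model through the OpenRouter API.
"""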
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Remove script/style tags and page chrome (header, footer, nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()
                
            # Get text content
            text = soup.get_text(separator=' ', strip=True)
            
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            
            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to crawl website and get all links
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}
    
    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)
        
        if current_url in visited_urls:
            continue
            
        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)
        
        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content
                
                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Resolve relative links and drop any #fragment so the same
                    # page is not queued under multiple anchors
                    full_url = urljoin(current_url, href).split('#')[0]
                    
                    # Only follow links on the same website that are not already visited or queued
                    if (is_valid_url(full_url, base_url)
                            and full_url not in visited_urls
                            and full_url not in urls_to_visit):
                        urls_to_visit.append(full_url)
            
            # Add a small delay to be respectful
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
    
    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content

# Function to build a context string from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"
    
    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"
        
        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break
            
        context += page_content
    
    return context

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)
    
    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)
    
    return text

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )
        
        response = completion.choices[0].message.content
        
        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)
        
        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        warning = "Please enter your OpenRouter API key."
        # Append the warning to the history so it actually shows up in the chat
        return warning, history + [(question, warning)]
    
    # Prepare the context from scraped content
    context = create_context(site_content)
    
    # Create system message with context
    system_message = {
        "role": "system", 
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills. 
        Use the following content from the website to answer user questions. If the question is not related to the website or the 
        information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
        
        IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
        For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
        
        {context}"""
    }
    
    # Create user message
    user_message = {"role": "user", "content": question}
    
    # Create message history for the API call
    messages = [system_message]
    
    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    
    # Add current question
    messages.append(user_message)
    
    # Query the model
    response = query_model(api_key, messages)
    
    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Optional scraper entry point with a Gradio progress bar (main() below crawls directly)
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about InnovativeSkills Bangladesh website.")
        
        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key", 
                placeholder="Enter your OpenRouter API key", 
                type="password"
            )
        
        chatbot = gr.Chatbot(height=500, show_copy_button=True)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
        
        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)
        
        # Container for chat history
        chat_history = gr.State([])
        
        # Button to clear the conversation
        clear = gr.Button("Clear conversation")
        
        # Events
        def user_input(api_key, message, site_content, history):
            # Ignore empty submissions and leave the current chat display unchanged
            if not message:
                return "", [list(pair) for pair in history], history
            
            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)
            
            # Format history for chatbot display as [user, assistant] pairs
            chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]
                
            return "", chatbot_display, updated_history
            
        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )
        
        def clear_chat():
            return "", [], []
            
        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )
        
    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")
    
    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")
    
    # Create the Gradio interface with the site content
    app = create_interface(site_content)
    
    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
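
# Rough local-run sketch (package names inferred from the imports above; the
# script filename is only a placeholder):
#   pip install gradio requests beautifulsoup4 openai
#   python chatbot_app.py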