shukdevdatta123 committed
Commit 6bb3d54 · verified · 1 Parent(s): b70fc5c

Upload 2 files

Files changed (2)
  1. app.py +268 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,268 @@
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import re
+ from openai import OpenAI
+ import time
+ import copy
+
+ # Function to check whether a URL belongs to the same website as base_url
+ def is_valid_url(url, base_url):
+     parsed_url = urlparse(url)
+     parsed_base = urlparse(base_url)
+     return parsed_url.netloc == parsed_base.netloc
+
+ # Function to scrape the text content from a single page
+ def scrape_page(url):
+     try:
+         response = requests.get(url, timeout=10)
+         if response.status_code == 200:
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Remove script, style, header, footer and nav elements
+             for element in soup(['script', 'style', 'header', 'footer', 'nav']):
+                 element.decompose()
+
+             # Get text content
+             text = soup.get_text(separator=' ', strip=True)
+
+             # Collapse runs of whitespace
+             text = re.sub(r'\s+', ' ', text).strip()
+
+             return text
+         else:
+             return None
+     except Exception as e:
+         print(f"Error scraping {url}: {e}")
+         return None
+
+ # Function to crawl the website breadth-first and collect page content
+ def crawl_website(base_url, max_pages=30):
+     print(f"Starting to crawl {base_url}")
+     visited_urls = set()
+     urls_to_visit = [base_url]
+     site_content = {}
+
+     while urls_to_visit and len(visited_urls) < max_pages:
+         current_url = urls_to_visit.pop(0)
+
+         if current_url in visited_urls:
+             continue
+
+         print(f"Crawling: {current_url}")
+         visited_urls.add(current_url)
+
+         try:
+             response = requests.get(current_url, timeout=10)
+             if response.status_code == 200:
+                 # Get content of the current page
+                 content = scrape_page(current_url)
+                 if content:
+                     site_content[current_url] = content
+
+                 # Find all links on the page
+                 soup = BeautifulSoup(response.text, 'html.parser')
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     full_url = urljoin(current_url, href)
+
+                     # Only follow links that are part of the same website
+                     if is_valid_url(full_url, base_url) and full_url not in visited_urls:
+                         urls_to_visit.append(full_url)
+
+             # Add a small delay to be respectful
+             time.sleep(0.5)
+
+         except Exception as e:
+             print(f"Error visiting {current_url}: {e}")
+
+     print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
+     return site_content
+
+ # Function that builds a context string from the scraped content
+ def create_context(site_content, max_context_length=8000):
+     context = "Content from https://innovativeskillsbd.com website:\n\n"
+
+     for url, content in site_content.items():
+         # Add the URL and a portion of its content (limited to keep the context manageable)
+         page_content = f"Page: {url}\n{content[:1000]}...\n\n"
+
+         # Stop if adding this page would exceed the maximum context length
+         if len(context) + len(page_content) > max_context_length:
+             break
+
+         context += page_content
+
+     return context
+
+ # Function to fix URLs in text so they point to the correct domain
+ def fix_urls_in_text(text):
+     # Look for URLs in the text
+     url_pattern = r'https?://[^\s/$.?#].[^\s]*'
+     urls = re.findall(url_pattern, text)
+
+     for url in urls:
+         # If the URL uses a wrong domain but appears to be an InnovativeSkills link
+         if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
+             # Build the correct URL by swapping in the right domain, keeping the path
+             path = urlparse(url).path
+             correct_url = f"https://innovativeskillsbd.com{path}"
+             # Replace in the text
+             text = text.replace(url, correct_url)
+
+     return text
+
+ # Function to query the DeepSeek V3 model via OpenRouter
+ def query_model(api_key, messages):
+     try:
+         client = OpenAI(
+             base_url="https://openrouter.ai/api/v1",
+             api_key=api_key,
+         )
+
+         completion = client.chat.completions.create(
+             extra_headers={
+                 "HTTP-Referer": "https://innovativeskillsbd.com",
+                 "X-Title": "InnovativeSkills ChatBot",
+             },
+             model="deepseek/deepseek-chat-v3-0324:free",
+             messages=messages
+         )
+
+         response = completion.choices[0].message.content
+
+         # Fix any incorrect URLs so all links point to the correct domain
+         response = fix_urls_in_text(response)
+
+         return response
+     except Exception as e:
+         return f"Error querying the model: {str(e)}"
+
+ # Function to answer questions based on the website content
+ def answer_question(api_key, question, site_content, history):
+     if not api_key:
+         return "Please enter your OpenRouter API key.", history
+
+     # Prepare the context from the scraped content
+     context = create_context(site_content)
+
+     # Create the system message with the context
+     system_message = {
+         "role": "system",
+         "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
+ Use the following content from the website to answer user questions. If the question is not related to the website or the
+ information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
+
+ IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
+ For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
+
+ {context}"""
+     }
+
+     # Create the user message
+     user_message = {"role": "user", "content": question}
+
+     # Build the message history for the API call
+     messages = [system_message]
+
+     # Add the conversation history
+     for user_msg, assistant_msg in history:
+         messages.append({"role": "user", "content": user_msg})
+         messages.append({"role": "assistant", "content": assistant_msg})
+
+     # Add the current question
+     messages.append(user_message)
+
+     # Query the model
+     response = query_model(api_key, messages)
+
+     # Update the history by adding the new exchange
+     new_history = copy.deepcopy(history)
+     new_history.append((question, response))
+     return response, new_history
+
+ # Scrape the website when the app starts
+ def init_scraper(progress=gr.Progress()):
+     base_url = "https://innovativeskillsbd.com/"
+     progress(0, desc="Starting website crawler...")
+     site_content = crawl_website(base_url)
+     progress(1, desc="Finished crawling website")
+     return site_content
+
+ # Create the Gradio interface
+ def create_interface(site_content):
+     with gr.Blocks() as app:
+         gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
+         gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")
+
+         with gr.Row():
+             api_key_input = gr.Textbox(
+                 label="OpenRouter API Key",
+                 placeholder="Enter your OpenRouter API key",
+                 type="password"
+             )
+
+         chatbot = gr.Chatbot(height=500)
+         msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
+
+         # State holding the scraped site content (hidden from the UI)
+         site_content_state = gr.State(site_content)
+
+         # State holding the chat history
+         chat_history = gr.State([])
+
+         # Button to clear the conversation
+         clear = gr.Button("Clear conversation")
+
+         # Events
+         def user_input(api_key, message, site_content, history):
+             if not message:
+                 # Nothing to send; keep the current display and history unchanged
+                 return "", [[u, b] for u, b in history], history
+
+             # Process the response
+             bot_response, updated_history = answer_question(api_key, message, site_content, history)
+
+             # Format the history for the chatbot display
+             chatbot_display = []
+             for user_msg, bot_msg in updated_history:
+                 chatbot_display.append([user_msg, bot_msg])
+
+             return "", chatbot_display, updated_history
+
+         msg.submit(
+             user_input,
+             inputs=[api_key_input, msg, site_content_state, chat_history],
+             outputs=[msg, chatbot, chat_history]
+         )
+
+         def clear_chat():
+             return "", [], []
+
+         clear.click(
+             clear_chat,
+             outputs=[msg, chatbot, chat_history]
+         )
+
+     return app
+
+ # Initialize and launch the app
+ def main():
+     print("Starting to initialize the InnovativeSkills chatbot...")
+
+     # First, scrape the website content
+     site_content = {}
+     try:
+         site_content = crawl_website("https://innovativeskillsbd.com/")
+     except Exception as e:
+         print(f"Error during initial website crawling: {e}")
+         print("The chatbot will still work, but without initial website content.")
+
+     # Create the Gradio interface with the site content
+     app = create_interface(site_content)
+
+     # Launch the app
+     app.launch()
+
+ if __name__ == "__main__":
+     main()
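
For a quick local sanity check of the crawler outside Gradio, the helper functions above can be exercised directly. This is a hypothetical usage sketch, not part of the commit; the two-page cap and the 300-character preview are arbitrary choices:

    # Hypothetical smoke test (not part of the commit): crawl at most two
    # pages and print the start of the context string the chatbot will see.
    from app import crawl_website, create_context

    pages = crawl_website("https://innovativeskillsbd.com/", max_pages=2)
    print(f"Collected text from {len(pages)} page(s)")
    print(create_context(pages)[:300])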
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ requests
+ beautifulsoup4
+ openai
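
Note that gradio is not listed even though app.py imports it; on a Hugging Face Space using the Gradio SDK the runtime supplies it, but a local run would need it installed. A minimal local launch, assuming Python 3 and pip are available, might look like:

    pip install -r requirements.txt gradio
    python app.py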