Upload 2 files
- app.py +268 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,268 @@
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements plus page chrome (header, footer, nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()

            # Get text content
            text = soup.get_text(separator=' ', strip=True)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to crawl the website and follow its internal links
def crawl_website(base_url, max_pages=30):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content

                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)

                    # Only follow links that are part of the same website
                    if is_valid_url(full_url, base_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)

            # Add a small delay to be respectful
            time.sleep(0.5)

        except Exception as e:
            print(f"Error visiting {current_url}: {e}")

    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content

# Function that creates a context from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"

    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"

        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break

        context += page_content

    return context

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)

    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)

    return text

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )

        response = completion.choices[0].message.content

        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)

        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        # Surface the error in the chat history so the user actually sees it
        error_message = "Please enter your OpenRouter API key."
        new_history = copy.deepcopy(history)
        new_history.append((question, error_message))
        return error_message, new_history

    # Prepare the context from scraped content
    context = create_context(site_content)

    # Create system message with context
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
Use the following content from the website to answer user questions. If the question is not related to the website or the
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.

IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.

{context}"""
    }

    # Create user message
    user_message = {"role": "user", "content": question}

    # Create message history for the API call
    messages = [system_message]

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current question
    messages.append(user_message)

    # Query the model
    response = query_model(api_key, messages)

    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Scrape the website when the app starts
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")

        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )

        chatbot = gr.Chatbot(height=500)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")

        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)

        # Container for chat history
        chat_history = gr.State([])

        # Button to clear the conversation
        clear = gr.Button("Clear conversation")

        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Nothing to send: keep the current conversation unchanged
                return "", [list(pair) for pair in history], history

            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)

            # Format history for chatbot display
            chatbot_display = []
            for user_msg, bot_msg in updated_history:
                chatbot_display.append([user_msg, bot_msg])

            return "", chatbot_display, updated_history

        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )

        def clear_chat():
            return "", [], []

        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )

    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")

    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")

    # Create the Gradio interface with the site content
    app = create_interface(site_content)

    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
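
A quick way to sanity-check the two pure helpers above, without an OpenRouter key or a live crawl, is to import them directly. This is only a sketch: it assumes app.py is on the Python path and that its imports (gradio, requests, beautifulsoup4, openai) are installed.

from app import fix_urls_in_text, create_context

# Links on the wrong domain are rewritten to innovativeskillsbd.com
text = "See https://innovative-skill.com/student-job-success for details."
print(fix_urls_in_text(text))
# -> See https://innovativeskillsbd.com/student-job-success for details.

# create_context keeps roughly the first 1000 characters per page and
# stops adding pages once max_context_length would be exceeded
pages = {"https://innovativeskillsbd.com/": "Example page text " * 200}
print(len(create_context(pages, max_context_length=2000)))
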
requirements.txt
ADDED
@@ -0,0 +1,3 @@
requests
beautifulsoup4
openai
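
Note that requirements.txt does not list gradio; on a Gradio Space the package is normally provided through the Space's SDK setting, so a local run needs it installed separately. A minimal local launch sketch, assuming app.py and its dependencies are available (a small max_pages keeps the startup crawl short):

from app import crawl_website, create_interface

# Crawl only a handful of pages so the UI comes up quickly during testing
site_content = crawl_website("https://innovativeskillsbd.com/", max_pages=5)
create_interface(site_content).launch()
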