import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if a URL belongs to the same website as base_url
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc
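# Illustrative examples (URLs assumed for illustration):
#   is_valid_url("https://innovativeskillsbd.com/courses", "https://innovativeskillsbd.com/")  -> True
#   is_valid_url("https://facebook.com/innovativeskills", "https://innovativeskillsbd.com/")   -> False
# Note that comparing netloc treats a subdomain such as "www.innovativeskillsbd.com"
# as a different site from "innovativeskillsbd.com".
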
# Function to scrape the visible text content from a single page.
# If the page's HTML has already been fetched, pass it via `html` to avoid a second request.
def scrape_page(url, html=None):
    try:
        if html is None:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script/style tags and boilerplate sections (header, footer, nav)
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()
        # Extract the remaining text content
        text = soup.get_text(separator=' ', strip=True)
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
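# Rough usage sketch (the URL below is an assumption for illustration):
#   text = scrape_page("https://innovativeskillsbd.com/about")
#   # -> a single whitespace-normalized string of page text, or None on failure
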
# Function to crawl the website and collect text from up to max_pages pages
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}
    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)
        if current_url in visited_urls:
            continue
        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Extract the text of the current page, reusing the HTML we just fetched
                content = scrape_page(current_url, response.text)
                if content:
                    site_content[current_url] = content
                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Resolve relative links and drop '#fragment' anchors, which would
                    # otherwise be queued as distinct pages
                    full_url = urljoin(current_url, href).split('#')[0]
                    # Only follow same-site links that are not already visited or queued
                    if (is_valid_url(full_url, base_url)
                            and full_url not in visited_urls
                            and full_url not in urls_to_visit):
                        urls_to_visit.append(full_url)
            # Add a small delay between requests to be respectful to the server
            time.sleep(0.5)
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content
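# Because urls_to_visit is consumed from the front (pop(0)), the crawl is breadth-first:
# pages linked directly from the homepage are visited before deeper ones, so the
# max_pages cap tends to keep the site's most prominent pages.
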
# Function that builds a context string for the model from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"
    for url, content in site_content.items():
        # Add the URL and the first 1000 characters of its content to keep the context manageable
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"
        # Stop before exceeding the maximum context length
        if len(context) + len(page_content) > max_context_length:
            break
        context += page_content
    return context
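# Note: max_context_length is measured in characters, not tokens. At a rough estimate of
# ~4 characters per English token, 8000 characters is on the order of 2000 tokens, which
# leaves room in the model's context window for the conversation history.
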
# Function to fix URLs in text so they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)
    for url in urls:
        # If the URL uses a wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Rebuild the URL on the correct domain, keeping the original path
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace it in the text
            text = text.replace(url, correct_url)
    return text
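# Illustrative example: "see https://innovative-skill.com/courses" becomes
# "see https://innovativeskillsbd.com/courses". Only urlparse(url).path is kept,
# so any query string or fragment on the wrong-domain URL is dropped.
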
# Function to query the DeepSeek V3 model through OpenRouter
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )
        response = completion.choices[0].message.content
        # Fix any incorrect URLs so all links point to the correct domain
        response = fix_urls_in_text(response)
        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"
# Function to answer questions based on the website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        return "Please enter your OpenRouter API key.", history
    # Prepare the context from the scraped content
    context = create_context(site_content)
    # Create the system message with the context embedded
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
Use the following content from the website to answer user questions. If the question is not related to the website or the
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
{context}"""
    }
    # Create the user message
    user_message = {"role": "user", "content": question}
    # Build the message list for the API call
    messages = [system_message]
    # Add the conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    # Add the current question
    messages.append(user_message)
    # Query the model
    response = query_model(api_key, messages)
    # Append the new exchange to a copy of the history
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history
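# Shape of the message list sent to the model (contents are illustrative, not real data):
#   [{"role": "system", "content": "...instructions + website context..."},
#    {"role": "user", "content": "previous question"},
#    {"role": "assistant", "content": "previous answer"},
#    {"role": "user", "content": "current question"}]
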
# Scrape the website when the app starts (with a Gradio progress bar)
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content
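# Note: init_scraper mirrors the startup crawl in main() but reports progress through
# gr.Progress(); it is not called anywhere in this file, so it would need to be wired
# to a Gradio event (e.g. a hypothetical "Rescrape" button) to take effect.
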
# Create the Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")
        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )
        chatbot = gr.Chatbot(height=500, show_copy_button=True)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
        # State holding the scraped site content (hidden from the UI)
        site_content_state = gr.State(site_content)
        # State holding the chat history
        chat_history = gr.State([])
        # Button to clear the conversation
        clear = gr.Button("Clear conversation")
        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Empty input: clear the textbox and leave the display and history unchanged
                return "", [list(pair) for pair in history], history
            # Get the model's response and the updated history
            bot_response, updated_history = answer_question(api_key, message, site_content, history)
            # Format the history for the chatbot display
            chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]
            return "", chatbot_display, updated_history
        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )
        def clear_chat():
            return "", [], []
        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )
    return app
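# Note: the [user, bot] pair format fed to the Chatbot is Gradio's classic "tuples" format.
# Newer Gradio releases deprecate it in favor of an OpenAI-style "messages" format, so it
# may help to pin the Gradio version for this Space if the chat display misbehaves.
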
# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")
    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")
    # Create the Gradio interface with the site content
    app = create_interface(site_content)
    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
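# Assumed dependencies for this Space (a requirements.txt sketch, not taken from the repo):
#   gradio
#   requests
#   beautifulsoup4
#   openai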