# INNOVBOT / app.py
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy
# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc
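# Illustrative doctest-style examples (otherdomain.com is a hypothetical external site):
#   >>> is_valid_url("https://innovativeskillsbd.com/courses", "https://innovativeskillsbd.com/")
#   True
#   >>> is_valid_url("https://otherdomain.com/courses", "https://innovativeskillsbd.com/")
#   False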
# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove scripts, styles, and layout chrome (header/footer/nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()
            # Get the visible text content
            text = soup.get_text(separator=' ', strip=True)
            # Collapse runs of whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
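# Optional hardening (a sketch, not wired into scrape_page above): some servers
# reject the default requests User-Agent, so sending a browser-like header can
# reduce spurious non-200 responses. The header string here is an assumption.
def fetch_with_headers(url, timeout=10):
    # Same GET as scrape_page, but with an explicit User-Agent header
    return requests.get(
        url,
        timeout=timeout,
        headers={"User-Agent": "Mozilla/5.0 (compatible; INNOVBOT/1.0)"},
    )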
# Function to crawl website and get all links
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}
    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)
        if current_url in visited_urls:
            continue
        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get the text content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content
                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    full_url = urljoin(current_url, link['href'])
                    # Only queue same-site links that are not already visited or queued
                    if (is_valid_url(full_url, base_url)
                            and full_url not in visited_urls
                            and full_url not in urls_to_visit):
                        urls_to_visit.append(full_url)
            # Small delay between requests to be respectful to the server
            time.sleep(0.5)
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content
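# Minimal usage sketch (network access assumed; small max_pages to keep it quick):
#   pages = crawl_website("https://innovativeskillsbd.com/", max_pages=5)
#   for url, text in pages.items():
#       print(url, "->", text[:80])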
# Function that creates a context from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"
    for url, content in site_content.items():
        # Add the URL and the first 1000 characters of its content to keep the context manageable
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"
        # Stop before the context exceeds max_context_length
        if len(context) + len(page_content) > max_context_length:
            break
        context += page_content
    return context
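# Quick sketch of the truncation behavior (hypothetical input): each page is cut
# to its first 1000 characters, and pages that would push the context past
# max_context_length are dropped.
#   >>> ctx = create_context({"https://innovativeskillsbd.com/a": "hello " * 500})
#   >>> ctx.startswith("Content from")
#   True
#   >>> len(ctx) < 8000
#   True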
# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Find URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)
    for url in urls:
        # If the URL uses a wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Rebuild the URL on the correct domain, preserving the path and query string
            parsed = urlparse(url)
            path_and_query = parsed.path + (f"?{parsed.query}" if parsed.query else "")
            correct_url = f"https://innovativeskillsbd.com{path_and_query}"
            # Replace in the text
            text = text.replace(url, correct_url)
    return text
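# Illustrative example (hypothetical model output):
#   >>> fix_urls_in_text("See https://innovative-skill.com/courses for details")
#   'See https://innovativeskillsbd.com/courses for details'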
# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages,
        )
        response = completion.choices[0].message.content
        # Rewrite any incorrect domains so all links point to innovativeskillsbd.com
        response = fix_urls_in_text(response)
        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"
# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        return "Please enter your OpenRouter API key.", history
    # Build the context from the scraped content
    context = create_context(site_content)
    # System message that grounds the model in the website content
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
Use the following content from the website to answer user questions. If the question is not related to the website or the
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
{context}"""
    }
    # Current user message
    user_message = {"role": "user", "content": question}
    # Assemble the message list for the API call: system prompt, then the conversation so far
    messages = [system_message]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    # Add the current question
    messages.append(user_message)
    # Query the model
    response = query_model(api_key, messages)
    # Append the new exchange to a copy of the history
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history
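# Usage sketch (hypothetical questions): the history round-trips as a list of
# (user, assistant) tuples, so each call feeds the previous history back in.
#   reply, history = answer_question(api_key, "What is InnovativeSkills?", site_content, [])
#   reply, history = answer_question(api_key, "Do you offer Python courses?", site_content, history)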
# Helper to scrape the website with a Gradio progress bar (not called by main(), which crawls directly)
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content
# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")
        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )
        chatbot = gr.Chatbot(height=500, show_copy_button=True)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
        # State holding the scraped site content (hidden from the UI)
        site_content_state = gr.State(site_content)
        # State holding the chat history
        chat_history = gr.State([])
        # Button to clear the conversation
        clear = gr.Button("Clear conversation")
        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Nothing to do: keep the current display and history
                return "", [list(pair) for pair in history], history
            # Get the model's response and the updated history
            bot_response, updated_history = answer_question(api_key, message, site_content, history)
            # Format history as [user, bot] pairs for the Chatbot display
            chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]
            return "", chatbot_display, updated_history
        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )
        def clear_chat():
            return "", [], []
        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )
    return app
# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")
    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")
    # Create the Gradio interface with the site content
    app = create_interface(site_content)
    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
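# Deployment note (a sketch, not part of the app above): Gradio's launch() also
# accepts options for LAN exposure or a temporary public link, e.g.:
#   app.launch(server_name="0.0.0.0", server_port=7860, share=True)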