import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if a URL belongs to the same website as base_url
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc
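# Illustrative examples (URLs assumed for illustration):
#   is_valid_url("https://innovativeskillsbd.com/courses", "https://innovativeskillsbd.com/")  -> True
#   is_valid_url("https://facebook.com/innovativeskills", "https://innovativeskillsbd.com/")   -> False
# Note that comparing netloc treats a subdomain such as "www.innovativeskillsbd.com"
# as a different site from "innovativeskillsbd.com".
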
# Function to scrape the visible text content from a single page.
# If the page's HTML has already been fetched, pass it via `html` to avoid a second request.
def scrape_page(url, html=None):
    try:
        if html is None:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return None
            html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script/style tags and boilerplate sections (header, footer, nav)
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()
        # Extract the remaining text content
        text = soup.get_text(separator=' ', strip=True)
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
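# Rough usage sketch (the URL below is an assumption for illustration):
#   text = scrape_page("https://innovativeskillsbd.com/about")
#   # -> a single whitespace-normalized string of page text, or None on failure
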
# Function to crawl the website and collect text from up to max_pages pages
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}
    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)
        if current_url in visited_urls:
            continue
        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Extract the text of the current page, reusing the HTML we just fetched
                content = scrape_page(current_url, response.text)
                if content:
                    site_content[current_url] = content
                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Resolve relative links and drop '#fragment' anchors, which would
                    # otherwise be queued as distinct pages
                    full_url = urljoin(current_url, href).split('#')[0]
                    # Only follow same-site links that are not already visited or queued
                    if (is_valid_url(full_url, base_url)
                            and full_url not in visited_urls
                            and full_url not in urls_to_visit):
                        urls_to_visit.append(full_url)
            # Add a small delay between requests to be respectful to the server
            time.sleep(0.5)
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content
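# Because urls_to_visit is consumed from the front (pop(0)), the crawl is breadth-first:
# pages linked directly from the homepage are visited before deeper ones, so the
# max_pages cap tends to keep the site's most prominent pages.
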
# Function that builds a context string for the model from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"
    for url, content in site_content.items():
        # Add the URL and the first 1000 characters of its content to keep the context manageable
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"
        # Stop before exceeding the maximum context length
        if len(context) + len(page_content) > max_context_length:
            break
        context += page_content
    return context
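# Note: max_context_length is measured in characters, not tokens. At a rough estimate of
# ~4 characters per English token, 8000 characters is on the order of 2000 tokens, which
# leaves room in the model's context window for the conversation history.
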
# Function to fix URLs in text so they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)
    for url in urls:
        # If the URL uses a wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Rebuild the URL on the correct domain, keeping the original path
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace it in the text
            text = text.replace(url, correct_url)
    return text
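# Illustrative example: "see https://innovative-skill.com/courses" becomes
# "see https://innovativeskillsbd.com/courses". Only urlparse(url).path is kept,
# so any query string or fragment on the wrong-domain URL is dropped.
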
# Function to query the DeepSeek V3 model through OpenRouter
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )
        response = completion.choices[0].message.content
        # Fix any incorrect URLs so all links point to the correct domain
        response = fix_urls_in_text(response)
        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"
# Function to answer questions based on the website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        return "Please enter your OpenRouter API key.", history
    # Prepare the context from the scraped content
    context = create_context(site_content)
    # Create the system message with the context embedded
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
Use the following content from the website to answer user questions. If the question is not related to the website or the
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
{context}"""
    }
    # Create the user message
    user_message = {"role": "user", "content": question}
    # Build the message list for the API call
    messages = [system_message]
    # Add the conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    # Add the current question
    messages.append(user_message)
    # Query the model
    response = query_model(api_key, messages)
    # Append the new exchange to a copy of the history
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history
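# Shape of the message list sent to the model (contents are illustrative, not real data):
#   [{"role": "system", "content": "...instructions + website context..."},
#    {"role": "user", "content": "previous question"},
#    {"role": "assistant", "content": "previous answer"},
#    {"role": "user", "content": "current question"}]
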
# Scrape the website when the app starts (with a Gradio progress bar)
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content
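# Note: init_scraper mirrors the startup crawl in main() but reports progress through
# gr.Progress(); it is not called anywhere in this file, so it would need to be wired
# to a Gradio event (e.g. a hypothetical "Rescrape" button) to take effect.
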
# Create the Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")
        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )
        chatbot = gr.Chatbot(height=500, show_copy_button=True)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
        # State holding the scraped site content (hidden from the UI)
        site_content_state = gr.State(site_content)
        # State holding the chat history
        chat_history = gr.State([])
        # Button to clear the conversation
        clear = gr.Button("Clear conversation")
        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Empty input: clear the textbox and leave the display and history unchanged
                return "", [list(pair) for pair in history], history
            # Get the model's response and the updated history
            bot_response, updated_history = answer_question(api_key, message, site_content, history)
            # Format the history for the chatbot display
            chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]
            return "", chatbot_display, updated_history
        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )
        def clear_chat():
            return "", [], []
        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )
    return app
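# Note: the [user, bot] pair format fed to the Chatbot is Gradio's classic "tuples" format.
# Newer Gradio releases deprecate it in favor of an OpenAI-style "messages" format, so it
# may help to pin the Gradio version for this Space if the chat display misbehaves.
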
# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")
    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")
    # Create the Gradio interface with the site content
    app = create_interface(site_content)
    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
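# Assumed dependencies for this Space (a requirements.txt sketch, not taken from the repo):
#   gradio
#   requests
#   beautifulsoup4
#   openai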