import gradio as gr import requests from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse import re from openai import OpenAI import time import copy # Function to check if URL belongs to the website def is_valid_url(url, base_url): parsed_url = urlparse(url) parsed_base = urlparse(base_url) return parsed_url.netloc == parsed_base.netloc # Function to scrape content from a single page def scrape_page(url): try: response = requests.get(url, timeout=10) if response.status_code == 200: soup = BeautifulSoup(response.text, 'html.parser') # Remove script, style elements and comments for element in soup(['script', 'style', 'header', 'footer', 'nav']): element.decompose() # Get text content text = soup.get_text(separator=' ', strip=True) # Clean up whitespace text = re.sub(r'\s+', ' ', text).strip() return text else: return None except Exception as e: print(f"Error scraping {url}: {e}") return None # Function to crawl website and get all links def crawl_website(base_url, max_pages=30): print(f"Starting to crawl {base_url}") visited_urls = set() urls_to_visit = [base_url] site_content = {} while urls_to_visit and len(visited_urls) < max_pages: current_url = urls_to_visit.pop(0) if current_url in visited_urls: continue print(f"Crawling: {current_url}") visited_urls.add(current_url) try: response = requests.get(current_url, timeout=10) if response.status_code == 200: # Get content of the current page content = scrape_page(current_url) if content: site_content[current_url] = content # Find all links on the page soup = BeautifulSoup(response.text, 'html.parser') for link in soup.find_all('a', href=True): href = link['href'] full_url = urljoin(current_url, href) # Only follow links that are part of the same website if is_valid_url(full_url, base_url) and full_url not in visited_urls: urls_to_visit.append(full_url) # Add a small delay to be respectful time.sleep(0.5) except Exception as e: print(f"Error visiting {current_url}: {e}") print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.") return site_content # Function that creates a context from the scraped content def create_context(site_content, max_context_length=8000): context = "Content from https://innovativeskillsbd.com website:\n\n" for url, content in site_content.items(): # Add URL and a portion of its content (limited to keep context manageable) page_content = f"Page: {url}\n{content[:1000]}...\n\n" # Check if adding this would exceed max context length if len(context) + len(page_content) > max_context_length: break context += page_content return context # Function to fix URLs in text to ensure they point to the correct domain def fix_urls_in_text(text): # Look for URLs in the text url_pattern = r'https?://[^\s/$.?#].[^\s]*' urls = re.findall(url_pattern, text) for url in urls: # If the URL contains the wrong domain but appears to be an InnovativeSkills link if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url: # Create the correct URL by replacing the domain path = urlparse(url).path correct_url = f"https://innovativeskillsbd.com{path}" # Replace in the text text = text.replace(url, correct_url) return text # Function to query the DeepSeek V3 model def query_model(api_key, messages): try: client = OpenAI( base_url="https://openrouter.ai/api/v1", api_key=api_key, ) completion = client.chat.completions.create( extra_headers={ "HTTP-Referer": "https://innovativeskillsbd.com", "X-Title": "InnovativeSkills ChatBot", }, model="deepseek/deepseek-chat-v3-0324:free", messages=messages ) response = completion.choices[0].message.content # Fix any incorrect URLs - ensure all links point to the correct domain response = fix_urls_in_text(response) return response except Exception as e: return f"Error querying the model: {str(e)}" # Function to answer questions based on website content def answer_question(api_key, question, site_content, history): if not api_key: return "Please enter your OpenRouter API key.", history # Prepare the context from scraped content context = create_context(site_content) # Create system message with context system_message = { "role": "system", "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills. Use the following content from the website to answer user questions. If the question is not related to the website or the information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills. IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com'). For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain. {context}""" } # Create user message user_message = {"role": "user", "content": question} # Create message history for the API call messages = [system_message] # Add conversation history for user_msg, assistant_msg in history: messages.append({"role": "user", "content": user_msg}) messages.append({"role": "assistant", "content": assistant_msg}) # Add current question messages.append(user_message) # Query the model response = query_model(api_key, messages) # Update history by adding the new exchange new_history = copy.deepcopy(history) new_history.append((question, response)) return response, new_history # Scrape the website when the app starts def init_scraper(progress=gr.Progress()): base_url = "https://innovativeskillsbd.com/" progress(0, desc="Starting website crawler...") site_content = crawl_website(base_url) progress(1, desc="Finished crawling website") return site_content # Create Gradio interface def create_interface(site_content): with gr.Blocks() as app: gr.Markdown("# InnovativeSkills Bangladesh Chatbot") gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about InnovativeSkills Bangladesh website.") with gr.Row(): api_key_input = gr.Textbox( label="OpenRouter API Key", placeholder="Enter your OpenRouter API key", type="password" ) chatbot = gr.Chatbot(height=500) msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh") # Container for site content (hidden from UI) site_content_state = gr.State(site_content) # Container for chat history chat_history = gr.State([]) # Button to start the conversation clear = gr.Button("Clear conversation") # Events def user_input(api_key, message, site_content, history): if not message: return "", chatbot, history # Process the response bot_response, updated_history = answer_question(api_key, message, site_content, history) # Format history for chatbot display chatbot_display = [] for user_msg, bot_msg in updated_history: chatbot_display.append([user_msg, bot_msg]) return "", chatbot_display, updated_history msg.submit( user_input, inputs=[api_key_input, msg, site_content_state, chat_history], outputs=[msg, chatbot, chat_history] ) def clear_chat(): return "", [], [] clear.click( clear_chat, outputs=[msg, chatbot, chat_history] ) return app # Initialize and launch the app def main(): print("Starting to initialize the InnovativeSkills chatbot...") # First, scrape the website content site_content = {} try: site_content = crawl_website("https://innovativeskillsbd.com/") except Exception as e: print(f"Error during initial website crawling: {e}") print("The chatbot will still work, but without initial website content.") # Create the Gradio interface with the site content app = create_interface(site_content) # Launch the app app.launch() if __name__ == "__main__": main()