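"""Gradio chatbot for InnovativeSkills Bangladesh.

Crawls https://innovativeskillsbd.com/ at startup, builds a text context from
the scraped pages, and answers questions by sending that context plus the
conversation history to the DeepSeek V3 model through the OpenRouter API.
"""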
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Remove script/style tags and page chrome (header, footer, nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()
                
            # Get text content
            text = soup.get_text(separator=' ', strip=True)
            
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            
            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to crawl website and get all links
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}
    
    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)
        
        if current_url in visited_urls:
            continue
            
        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)
        
        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content
                
                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    # Resolve relative links and drop any #fragment so the same
                    # page is not queued under multiple anchors
                    full_url = urljoin(current_url, href).split('#')[0]
                    
                    # Only follow links on the same website that are not already visited or queued
                    if (is_valid_url(full_url, base_url)
                            and full_url not in visited_urls
                            and full_url not in urls_to_visit):
                        urls_to_visit.append(full_url)
            
            # Add a small delay to be respectful
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error visiting {current_url}: {e}")
    
    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content

# Function to build a context string from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"
    
    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"
        
        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break
            
        context += page_content
    
    return context

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)
    
    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)
    
    return text

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        
        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )
        
        response = completion.choices[0].message.content
        
        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)
        
        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        warning = "Please enter your OpenRouter API key."
        # Append the warning to the history so it actually shows up in the chat
        return warning, history + [(question, warning)]
    
    # Prepare the context from scraped content
    context = create_context(site_content)
    
    # Create system message with context
    system_message = {
        "role": "system", 
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills. 
        Use the following content from the website to answer user questions. If the question is not related to the website or the 
        information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
        
        IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
        For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
        
        {context}"""
    }
    
    # Create user message
    user_message = {"role": "user", "content": question}
    
    # Create message history for the API call
    messages = [system_message]
    
    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    
    # Add current question
    messages.append(user_message)
    
    # Query the model
    response = query_model(api_key, messages)
    
    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Optional scraper entry point with a Gradio progress bar (main() below crawls directly)
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about InnovativeSkills Bangladesh website.")
        
        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key", 
                placeholder="Enter your OpenRouter API key", 
                type="password"
            )
        
        chatbot = gr.Chatbot(height=500, show_copy_button=True)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
        
        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)
        
        # Container for chat history
        chat_history = gr.State([])
        
        # Button to clear the conversation
        clear = gr.Button("Clear conversation")
        
        # Events
        def user_input(api_key, message, site_content, history):
            # Ignore empty submissions and leave the current chat display unchanged
            if not message:
                return "", [list(pair) for pair in history], history
            
            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)
            
            # Format history for chatbot display as [user, assistant] pairs
            chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]
                
            return "", chatbot_display, updated_history
            
        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )
        
        def clear_chat():
            return "", [], []
            
        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )
        
    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")
    
    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")
    
    # Create the Gradio interface with the site content
    app = create_interface(site_content)
    
    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
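
# Rough local-run sketch (package names inferred from the imports above; the
# script filename is only a placeholder):
#   pip install gradio requests beautifulsoup4 openai
#   python chatbot_app.py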