Upload 2 files
- app.py +268 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,268 @@
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements plus page chrome (header, footer, nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()

            # Get text content
            text = soup.get_text(separator=' ', strip=True)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to crawl the website and follow its internal links
def crawl_website(base_url, max_pages=30):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content

                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)

                    # Only follow links that are part of the same website
                    if is_valid_url(full_url, base_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)

            # Add a small delay to be respectful
            time.sleep(0.5)

        except Exception as e:
            print(f"Error visiting {current_url}: {e}")

    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content

# Function that creates a context from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"

    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"

        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break

        context += page_content

    return context

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)

    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)

    return text

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )

        response = completion.choices[0].message.content

        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)

        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        # Surface the error in the chat history so the user actually sees it
        error_message = "Please enter your OpenRouter API key."
        new_history = copy.deepcopy(history)
        new_history.append((question, error_message))
        return error_message, new_history

    # Prepare the context from scraped content
    context = create_context(site_content)

    # Create system message with context
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
Use the following content from the website to answer user questions. If the question is not related to the website or the
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.

IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.

{context}"""
    }

    # Create user message
    user_message = {"role": "user", "content": question}

    # Create message history for the API call
    messages = [system_message]

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current question
    messages.append(user_message)

    # Query the model
    response = query_model(api_key, messages)

    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Scrape the website when the app starts
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")

        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )

        chatbot = gr.Chatbot(height=500)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")

        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)

        # Container for chat history
        chat_history = gr.State([])

        # Button to clear the conversation
        clear = gr.Button("Clear conversation")

        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Nothing to send: keep the current conversation unchanged
                return "", [list(pair) for pair in history], history

            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)

            # Format history for chatbot display
            chatbot_display = []
            for user_msg, bot_msg in updated_history:
                chatbot_display.append([user_msg, bot_msg])

            return "", chatbot_display, updated_history

        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )

        def clear_chat():
            return "", [], []

        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )

    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")

    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")

    # Create the Gradio interface with the site content
    app = create_interface(site_content)

    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
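
A quick way to sanity-check the two pure helpers above, without an OpenRouter key or a live crawl, is to import them directly. This is only a sketch: it assumes app.py is on the Python path and that its imports (gradio, requests, beautifulsoup4, openai) are installed.

from app import fix_urls_in_text, create_context

# Links on the wrong domain are rewritten to innovativeskillsbd.com
text = "See https://innovative-skill.com/student-job-success for details."
print(fix_urls_in_text(text))
# -> See https://innovativeskillsbd.com/student-job-success for details.

# create_context keeps roughly the first 1000 characters per page and
# stops adding pages once max_context_length would be exceeded
pages = {"https://innovativeskillsbd.com/": "Example page text " * 200}
print(len(create_context(pages, max_context_length=2000)))
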
requirements.txt
ADDED
@@ -0,0 +1,3 @@
requests
beautifulsoup4
openai
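
Note that requirements.txt does not list gradio; on a Gradio Space the package is normally provided through the Space's SDK setting, so a local run needs it installed separately. A minimal local launch sketch, assuming app.py and its dependencies are available (a small max_pages keeps the startup crawl short):

from app import crawl_website, create_interface

# Crawl only a handful of pages so the UI comes up quickly during testing
site_content = crawl_website("https://innovativeskillsbd.com/", max_pages=5)
create_interface(site_content).launch()
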