Spaces:

Samizie
/

WebGPT-1.0

Running

App Files Files Community

Samizie committed on Mar 19

Commit

3f1ccae

verified ·

1 Parent(s): 544eb2f

Upload 16 files

Browse files

Files changed (16) hide show

README.md +44 -13
app.py +173 -0
conversation/__init__.py +0 -0
conversation/__pycache__/__init__.cpython-313.pyc +0 -0
conversation/__pycache__/talks.cpython-313.pyc +0 -0
conversation/talks.py +21 -0
embedding/__init__.py +0 -0
embedding/__pycache__/__init__.cpython-313.pyc +0 -0
embedding/__pycache__/vector_store.cpython-313.pyc +0 -0
embedding/vector_store.py +26 -0
requirements.txt +8 -0
scraper/__init__.py +0 -0
scraper/__pycache__/__init__.cpython-313.pyc +0 -0
scraper/__pycache__/scraper.cpython-313.pyc +0 -0
scraper/scraper.py +29 -0
small_talks.json +19 -0

README.md CHANGED Viewed

@@ -1,13 +1,44 @@
----
-title: WebGPT 1.0
-emoji: 🐨
-colorFrom: blue
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.43.2
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Chat_RAG
+## Steps to Run the Project
+Follow these steps to set up and run the Chat_RAG project:
+1. **Clone the Repository:**
+    Begin by cloning the repository to your local machine using the following command:
+    ```sh
+    git clone https://github.com/Samilincoln/Chat_RAG.git
+    ```
+2. **Navigate to the Project Directory:**
+    Change your current directory to the project directory:
+    ```sh
+    cd Chat_RAG
+    ```
+3. **Install Required Dependencies:**
+    Install all the necessary dependencies specified in the `requirements.txt` file:
+    ```sh
+    pip install -r requirements.txt
+    ```
+4. **Set Up Environment Variables:**
+    - Create a `.env` file in the root directory of the project.
+    - Add your Groq API key to the `.env` file by including the following line:
+      ```
+      GROQ_API_KEY=your_api_key_here
+      ```
+5. **Navigate to the Client Directory:**
+    Change your directory to the client directory where the Streamlit application is located:
+    ```sh
+    cd client
+    ```
+6. **Run the Streamlit Application:**
+    Launch the Streamlit application using the following command:
+    ```sh
+    streamlit run app.py
+    ```
+By following these steps, you will have the Chat_RAG project up and running on your local machine.

app.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import streamlit as st
+from decouple import config
+import asyncio
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_core.messages import SystemMessage
+from scraper.scraper import process_urls
+from embedding.vector_store import initialize_vector_store, clear_chroma_db
+from conversation.talks import clean_input, small_talks
+#Clearing ChromaDB at startup to clean up any previous data
+clear_chroma_db()
+#Groq API Key
+groq_api = config("GROQ_API_KEY")
+#Initializing LLM with memory
+llm = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)
+#Ensure proper asyncio handling for Windows
+import sys
+if sys.platform.startswith("win"):
+    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+#Async helper function
+def run_asyncio_coroutine(coro):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    return loop.run_until_complete(coro)
+import streamlit as st
+st.title("WebGPT 1.0 🤖")
+# URL inputs
+urls = st.text_area("Enter URLs (one per line)")
+run_scraper = st.button("Run Scraper", disabled=not urls.strip())
+# Sessions & states
+if "messages" not in st.session_state:
+    st.session_state.messages = []  # Chat history
+if "history" not in st.session_state:
+    st.session_state.history = ""  # Stores past Q&A for memory
+if "scraping_done" not in st.session_state:
+    st.session_state.scraping_done = False
+if "vector_store" not in st.session_state:
+    st.session_state.vector_store = None
+# Run scraper
+if run_scraper:
+    st.write("Fetching and processing URLs... This may take a while.")
+    split_docs = run_asyncio_coroutine(process_urls(urls.split("\n")))
+    st.session_state.vector_store = initialize_vector_store(split_docs)
+    st.session_state.scraping_done = True
+    st.success("Scraping and processing completed!")
+# ✅ Clear chat button
+if st.button("Clear Chat"):
+    st.session_state.messages = []  # Reset message history
+    st.session_state.history = ""  # Reset history tracking
+    st.success("Chat cleared!")
+# Ensuring chat only enables after scraping
+if not st.session_state.scraping_done:
+    st.warning("Scrape some data first to enable chat!")
+else:
+    st.write("### Chat With WebGPT 💬")
+    # Display chat history
+    for message in st.session_state.messages:
+        role, text = message["role"], message["text"]
+        with st.chat_message(role):
+            st.write(text)
+    # Takes in user input
+    user_query = st.chat_input("Ask a question...")
+    if user_query:
+        st.session_state.messages.append({"role": "user", "text": user_query})
+        with st.chat_message("user"):
+            st.write(user_query)
+        user_query_cleaned = clean_input(user_query)
+        response = "" # Default value for response
+        source_url = ""  # Default value for source url
+        # Check for small talk responses
+        if user_query_cleaned in small_talks:
+            response = small_talks[user_query_cleaned]
+            source_url = "Knowledge base"  # Small talk comes from the knowledge base
+        else:
+            # ✅ Setup retriever (with a similarity threshold or top-k retrieval)
+            retriever = st.session_state.vector_store.as_retriever(
+                search_kwargs={'k': 5}
+            )
+            # ✅ Retrieve context
+            retrieved_docs = retriever.invoke(user_query_cleaned)
+            retrieved_text = " ".join([doc.page_content for doc in retrieved_docs])
+            # ✅ Define Langchain PromptTemplate properly
+            system_prompt_template = PromptTemplate(
+                input_variables=["context", "query"],
+                template="""
+                You are WebGPT, an AI assistant for question-answering tasks that **only answers questions based on the provided context**.
+                - Understand the context {context} first and provide a relevant answer.
+                - If the answer is **not** found in the Context, reply with: "I can't find your request in the provided context."
+                - If the question is **unrelated** to the Context, reply with: "I can't answer that. do not generate responses."
+                - **Do not** use external knowledge, assumptions, or filler responses. Stick to the context provided.
+                - Keep responses clear, concise, and relevant to the user’s query.
+                Context:
+                {context}
+                Now, answer the user's question:
+                {input}
+                """
+            )
+            # ✅ Generate prompt with retrieved context & user query
+            final_prompt = system_prompt_template.format(
+                context=retrieved_text,
+                input=user_query_cleaned
+            )
+            # ✅ Create chains (ensure the prompt is correct)
+            scraper_chain = create_stuff_documents_chain(llm=llm, prompt=system_prompt_template)
+            llm_chain = create_retrieval_chain(retriever, scraper_chain)
+            # ✅ Process response and source
+            if retrieved_docs:
+                try:
+                    response_data = llm_chain.invoke({"context": retrieved_text, "input": user_query_cleaned})
+                    response = response_data.get("answer", "").strip()
+                    source_url = retrieved_docs[0].metadata.get("source", "Unknown")
+                    # Fallback if response is still empty
+                    if not response:
+                        response = "I can't find your request in the provided context."
+                        source_url = "No source found"
+                except Exception as e:
+                    response = f"Error generating response: {str(e)}"
+                    source_url = "Error"
+            else:
+                response = "I can't find your request in the provided context."
+                source_url = "No source found"
+            # ✅ Track history & update session state
+            history_text = "\n".join(
+                [f"User: {msg['text']}" if msg["role"] == "user" else f"AI: {msg['text']}" for msg in st.session_state.messages]
+            )
+            st.session_state.history = history_text
+        # ✅ Format and display response
+        formatted_response = f"**Answer:** {response}"
+        if response != "I can't find your request in the provided context." and source_url:
+            formatted_response += f"\n\n**Source:** {source_url}"
+        st.session_state.messages.append({"role": "assistant", "text": formatted_response})
+        with st.chat_message("assistant"):
+            st.write(formatted_response)

conversation/__init__.py ADDED Viewed

File without changes

conversation/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (172 Bytes). View file

conversation/__pycache__/talks.cpython-313.pyc ADDED Viewed

Binary file (1.31 kB). View file

conversation/talks.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+import json
+import re
+def load_small_talks():
+    """Loads small talk responses from a JSON file located in the same directory as app.py."""
+    json_path = "small_talks.json"  # Direct relative path
+    if not os.path.exists(json_path):
+        raise FileNotFoundError(f"File not found: {os.path.abspath(json_path)}")
+    with open(json_path, "r", encoding="utf-8") as file:
+        return json.load(file)
+# Loaded once at import time; app.py imports this dict for small-talk lookups.
+small_talks = load_small_talks()
+def clean_input(user_input):
+    """Removes punctuation and converts input to lowercase."""
+    return re.sub(r'[^\w\s]', '', user_input).strip().lower()

embedding/__init__.py ADDED Viewed

File without changes

embedding/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (169 Bytes). View file

embedding/__pycache__/vector_store.cpython-313.pyc ADDED Viewed

Binary file (1.37 kB). View file

embedding/vector_store.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+import shutil
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+# Embedding model shared by every vector store built in this module; created once at import time.
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+#Utilizing the Chroma vector store for embedding and persistence
+def initialize_vector_store(split_docs, persist_directory="./chroma_db"):
+    return Chroma.from_documents(
+        documents=split_docs,
+        embedding=embeddings,
+        persist_directory=persist_directory
+    )
+def clear_chroma_db():
+    persist_directory = "./chroma_db"
+    if os.path.exists(persist_directory):
+        try:
+            shutil.rmtree(persist_directory)
+            print("ChromaDB cleared.")
+        except PermissionError:
+            print("Fetching fromm current ChromaDb session. Restart server to clear ChromaDB.")
+        except KeyError:
+            print("ChromaDB cleared.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+langchain_huggingface
+langchain_community
+langchain
+# Imported by app.py (ChatGroq)
+langchain_groq
+# Backends for Chroma, AsyncChromiumLoader and Html2TextTransformer
+chromadb
+playwright
+html2text
+python-decouple
+# itertools and asyncio ship with the Python standard library and must not be pip-installed

scraper/__init__.py ADDED Viewed

File without changes

scraper/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (167 Bytes). View file

scraper/__pycache__/scraper.cpython-313.pyc ADDED Viewed

Binary file (1.66 kB). View file

scraper/scraper.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_community.document_transformers import Html2TextTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from itertools import chain
+async def process_urls(urls, persist_directory="./chroma_db"):
+    # Clear ChromaDB when new links are added
+    loader = AsyncChromiumLoader(urls)
+    docs = await loader.aload()
+    # ✅ Transform HTML to text
+    text_transformer = Html2TextTransformer()
+    transformed_docs = text_transformer.transform_documents(docs)
+    # ✅ Split text into chunks and retain metadata
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
+    split_docs = list(chain.from_iterable(split_docs_nested))
+    split_docs = []
+    for doc_list, original_doc in zip(split_docs_nested, transformed_docs):
+        for chunk in doc_list:
+            chunk.metadata["source"] = original_doc.metadata.get("source", "Unknown")  # Preserve URL
+            split_docs.append(chunk)
+    return split_docs

small_talks.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+    "hi": "Hello! How can I assist you today? Feel free to ask about the scraped data or anything specific.",
+    "hello": "Hey there! What’s on your mind? You can ask me anything from the retrieved data.",
+    "who are you": "I’m WebGPT, your Scraper Chat AI, here to help with insights from scraped content. What do you need?",
+    "how are you": "I’m doing great! How about you? If you have any questions about the scraped data, let me know!",
+    "what are you": "I’m WebGPT, an AI trained to provide insights from data. What would you like to know?",
+    "howdy": "Hello! I’m here to assist you. Got any questions from the retrieved data?",
+    "fine": "That’s great to hear! If you have any topic in mind, I can fetch relevant insights for you.",
+    "thanks": "You're welcome! If you need more help with the scraped data, just ask.",
+    "thank you": "You're always welcome! Let me know if I can provide any insights from the data.",
+    "good": "Awesome! Do you have any queries about the retrieved information?",
+    "good morning": "Good morning! Hope your day goes well. Need any insights from the scraped content?",
+    "good night": "Good night! Sleep well and take care. Before you go, got any last questions on the data?",
+    "what's up": "Not much, just here to assist you! Got any questions about the retrieved data?",
+    "bye": "Goodbye! Have a great day! If you need insights later, feel free to return.",
+    "okay thank you": "You're welcome! If you have more questions about the scraped data, don’t hesitate to ask.",
+    "okay": "Alright! If you need any insights from the retrieved data, feel free to ask.",
+    "thanks a lot": "You're welcome! If you need more help with the scraped data, just ask."
+}