Samizie committed on
Commit 3f1ccae · verified · 1 Parent(s): 544eb2f

Upload 16 files

README.md CHANGED
@@ -1,13 +1,44 @@
- ---
- title: WebGPT 1.0
- emoji: 🐨
- colorFrom: blue
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.43.2
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Chat_RAG
+
+ ## Steps to Run the Project
+
+ Follow these steps to set up and run the Chat_RAG project:
+
+ 1. **Clone the Repository:**
+    Begin by cloning the repository to your local machine using the following command:
+    ```sh
+    git clone https://github.com/Samilincoln/Chat_RAG.git
+    ```
+
+ 2. **Navigate to the Project Directory:**
+    Change your current directory to the project directory:
+    ```sh
+    cd Chat_RAG
+    ```
+
+ 3. **Install Required Dependencies:**
+    Install the dependencies listed in the `requirements.txt` file:
+    ```sh
+    pip install -r requirements.txt
+    ```
+
+ 4. **Set Up Environment Variables:**
+    - Create a `.env` file in the root directory of the project.
+    - Add your Groq API key to the `.env` file by including the following line:
+    ```
+    GROQ_API_KEY=your_api_key_here
+    ```
+
+ 5. **Navigate to the Client Directory:**
+    Change your directory to the client directory where the Streamlit application is located:
+    ```sh
+    cd client
+    ```
+
+ 6. **Run the Streamlit Application:**
+    Launch the Streamlit application:
+    ```sh
+    streamlit run app.py
+    ```
+
+ By following these steps, you will have the Chat_RAG project up and running on your local machine.
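The app reads this key with python-decouple's `config()`, which raises `UndefinedValueError` when the variable is missing from both the environment and `.env`. A minimal sanity check, as a sketch (the `check_env.py` file name is illustrative, not part of this commit):

```python
# check_env.py: illustrative sketch to confirm the Groq key loads from .env
from decouple import config, UndefinedValueError

try:
    key = config("GROQ_API_KEY")
    print(f"GROQ_API_KEY loaded ({len(key)} characters).")
except UndefinedValueError:
    print("GROQ_API_KEY is missing; add it to your .env file.")
```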
app.py ADDED
@@ -0,0 +1,173 @@
+ import streamlit as st
+ from decouple import config
+ import asyncio
+ import sys
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_groq import ChatGroq
+ from langchain_core.prompts import PromptTemplate
+ from scraper.scraper import process_urls
+ from embedding.vector_store import initialize_vector_store, clear_chroma_db
+ from conversation.talks import clean_input, small_talks
+
+ # Clear ChromaDB at startup to remove any data from a previous session
+ clear_chroma_db()
+
+ # Groq API key
+ groq_api = config("GROQ_API_KEY")
+
+ # Initialize the chat LLM (temperature 0 for deterministic answers)
+ llm = ChatGroq(model="llama-3.2-1b-preview", groq_api_key=groq_api, temperature=0)
+
+ # Ensure proper asyncio handling on Windows
+ if sys.platform.startswith("win"):
+     asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
+ # Async helper: run a coroutine to completion on a fresh event loop
+ def run_asyncio_coroutine(coro):
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     return loop.run_until_complete(coro)
+
+ st.title("WebGPT 1.0 🤖")
+
+ # URL inputs
+ urls = st.text_area("Enter URLs (one per line)")
+ run_scraper = st.button("Run Scraper", disabled=not urls.strip())
+
+ # Session state
+ if "messages" not in st.session_state:
+     st.session_state.messages = []  # Chat history
+ if "history" not in st.session_state:
+     st.session_state.history = ""  # Stores past Q&A (not yet fed back to the LLM)
+ if "scraping_done" not in st.session_state:
+     st.session_state.scraping_done = False
+ if "vector_store" not in st.session_state:
+     st.session_state.vector_store = None
+
+ # Run scraper
+ if run_scraper:
+     st.write("Fetching and processing URLs... This may take a while.")
+     split_docs = run_asyncio_coroutine(process_urls(urls.split("\n")))
+     st.session_state.vector_store = initialize_vector_store(split_docs)
+     st.session_state.scraping_done = True
+     st.success("Scraping and processing completed!")
+
+ # Clear chat button
+ if st.button("Clear Chat"):
+     st.session_state.messages = []  # Reset message history
+     st.session_state.history = ""  # Reset history tracking
+     st.success("Chat cleared!")
+
+ # Enable chat only after scraping
+ if not st.session_state.scraping_done:
+     st.warning("Scrape some data first to enable chat!")
+ else:
+     st.write("### Chat With WebGPT 💬")
+
+     # Display chat history
+     for message in st.session_state.messages:
+         role, text = message["role"], message["text"]
+         with st.chat_message(role):
+             st.write(text)
+
+     # Take in user input
+     user_query = st.chat_input("Ask a question...")
+
+     if user_query:
+         st.session_state.messages.append({"role": "user", "text": user_query})
+         with st.chat_message("user"):
+             st.write(user_query)
+
+         user_query_cleaned = clean_input(user_query)
+         response = ""  # Default value for the response
+         source_url = ""  # Default value for the source URL
+
+         # Check for small-talk responses first
+         if user_query_cleaned in small_talks:
+             response = small_talks[user_query_cleaned]
+             source_url = "Knowledge base"  # Small talk comes from the built-in knowledge base
+
+         else:
+             # Set up the retriever (top-k retrieval)
+             retriever = st.session_state.vector_store.as_retriever(
+                 search_kwargs={"k": 5}
+             )
+
+             # Retrieve context up front (also used below to report the source URL)
+             retrieved_docs = retriever.invoke(user_query_cleaned)
+             retrieved_text = " ".join([doc.page_content for doc in retrieved_docs])
+
+             # Define the LangChain PromptTemplate
+             system_prompt_template = PromptTemplate(
+                 input_variables=["context", "input"],
+                 template="""
+ You are WebGPT, an AI assistant for question-answering tasks that **only answers questions based on the provided context**.
+
+ - Read the provided context first and give a relevant answer.
+ - If the answer is **not** found in the context, reply with: "I can't find your request in the provided context."
+ - If the question is **unrelated** to the context, reply with: "I can't answer that." Do not generate any further response.
+ - **Do not** use external knowledge, assumptions, or filler responses. Stick to the context provided.
+ - Keep responses clear, concise, and relevant to the user's query.
+
+ Context:
+ {context}
+
+ Now, answer the user's question:
+ {input}
+ """
+             )
+
+             # Create the chains; the retrieval chain fills {context} with the retrieved documents
+             scraper_chain = create_stuff_documents_chain(llm=llm, prompt=system_prompt_template)
+             llm_chain = create_retrieval_chain(retriever, scraper_chain)
+
+             # Process response and source
+             if retrieved_docs:
+                 try:
+                     response_data = llm_chain.invoke({"input": user_query_cleaned})
+                     response = response_data.get("answer", "").strip()
+                     source_url = retrieved_docs[0].metadata.get("source", "Unknown")
+
+                     # Fallback if the response is still empty
+                     if not response:
+                         response = "I can't find your request in the provided context."
+                         source_url = "No source found"
+
+                 except Exception as e:
+                     response = f"Error generating response: {str(e)}"
+                     source_url = "Error"
+
+             else:
+                 response = "I can't find your request in the provided context."
+                 source_url = "No source found"
+
+         # Track history & update session state
+         history_text = "\n".join(
+             [f"User: {msg['text']}" if msg["role"] == "user" else f"AI: {msg['text']}" for msg in st.session_state.messages]
+         )
+         st.session_state.history = history_text
+
+         # Format and display the response
+         formatted_response = f"**Answer:** {response}"
+         if response != "I can't find your request in the provided context." and source_url:
+             formatted_response += f"\n\n**Source:** {source_url}"
+
+         st.session_state.messages.append({"role": "assistant", "text": formatted_response})
+         with st.chat_message("assistant"):
+             st.write(formatted_response)
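For reference, `create_retrieval_chain` returns a dict whose `answer` key holds the model output and whose `context` key holds the retrieved documents, which is why app.py reads `response_data.get("answer", ...)`. A sketch of inspecting that output (the query string is made up):

```python
# Sketch: shape of the retrieval chain's output used in app.py
result = llm_chain.invoke({"input": "What does the page say about pricing?"})
print(result["answer"])  # generated answer text
for doc in result["context"]:  # retrieved source chunks
    print(doc.metadata.get("source", "Unknown"))
```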
conversation/__init__.py ADDED
File without changes
conversation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (172 Bytes).
 
conversation/__pycache__/talks.cpython-313.pyc ADDED
Binary file (1.31 kB).
 
conversation/talks.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import json
+ import re
+
+ def load_small_talks():
+     """Load small-talk responses from the small_talks.json file in the working directory."""
+     json_path = "small_talks.json"  # Relative path; resolved against the current working directory
+
+     if not os.path.exists(json_path):
+         raise FileNotFoundError(f"File not found: {os.path.abspath(json_path)}")
+
+     with open(json_path, "r", encoding="utf-8") as file:
+         return json.load(file)
+
+ small_talks = load_small_talks()
+
+ def clean_input(user_input):
+     """Remove punctuation and convert input to lowercase."""
+     return re.sub(r'[^\w\s]', '', user_input).strip().lower()
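Because `clean_input` strips punctuation and lowercases before the dictionary lookup, every key in small_talks.json must be lowercase and punctuation-free to be reachable (hence keys like "whats up" below). A quick illustration:

```python
# Sketch: how user input is normalized before the small-talk lookup
from conversation.talks import clean_input, small_talks

print(clean_input("Hello!!"))           # -> "hello"
print(small_talks[clean_input("Hi.")])  # matches the "hi" entry
```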
embedding/__init__.py ADDED
File without changes
embedding/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (169 Bytes).
 
embedding/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (1.37 kB).
 
embedding/vector_store.py ADDED
@@ -0,0 +1,26 @@
+ import os
+ import shutil
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ # Use the Chroma vector store for embedding and persistence
+ def initialize_vector_store(split_docs, persist_directory="./chroma_db"):
+     return Chroma.from_documents(
+         documents=split_docs,
+         embedding=embeddings,
+         persist_directory=persist_directory
+     )
+
+ def clear_chroma_db():
+     persist_directory = "./chroma_db"
+     if os.path.exists(persist_directory):
+         try:
+             shutil.rmtree(persist_directory)
+             print("ChromaDB cleared.")
+         except PermissionError:
+             print("Fetching from the current ChromaDB session. Restart the server to clear ChromaDB.")
+         except KeyError:
+             print("ChromaDB cleared.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ streamlit
+ langchain
+ langchain_community
+ langchain_huggingface
+ langchain_groq
+ python-decouple
+ chromadb
+ sentence-transformers
+ html2text
+ playwright
+ # itertools and asyncio are in the Python standard library and must not be pip-installed.
+ # AsyncChromiumLoader also needs browser binaries: run `playwright install chromium` once.
scraper/__init__.py ADDED
File without changes
scraper/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (167 Bytes).
 
scraper/__pycache__/scraper.cpython-313.pyc ADDED
Binary file (1.66 kB).
 
scraper/scraper.py ADDED
@@ -0,0 +1,29 @@
+ from langchain_community.document_loaders import AsyncChromiumLoader
+ from langchain_community.document_transformers import Html2TextTransformer
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ async def process_urls(urls):
+     # Load the pages with a headless Chromium browser
+     loader = AsyncChromiumLoader(urls)
+     docs = await loader.aload()
+
+     # Transform HTML to plain text
+     text_transformer = Html2TextTransformer()
+     transformed_docs = text_transformer.transform_documents(docs)
+
+     # Split text into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     split_docs_nested = [text_splitter.split_documents([doc]) for doc in transformed_docs]
+
+     # Flatten the chunks while retaining each chunk's source metadata
+     split_docs = []
+     for doc_list, original_doc in zip(split_docs_nested, transformed_docs):
+         for chunk in doc_list:
+             chunk.metadata["source"] = original_doc.metadata.get("source", "Unknown")  # Preserve URL
+             split_docs.append(chunk)
+
+     return split_docs
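A standalone driver for the scraper, as a sketch (assumes Playwright's Chromium is installed, since `AsyncChromiumLoader` renders pages with it; the URL is a placeholder):

```python
# Sketch: running the scraper on its own, outside Streamlit
import asyncio
from scraper.scraper import process_urls

chunks = asyncio.run(process_urls(["https://example.com"]))
print(len(chunks), "chunks; first source:", chunks[0].metadata.get("source", "Unknown"))
```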
small_talks.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "hi": "Hello! How can I assist you today? Feel free to ask about the scraped data or anything specific.",
+     "hello": "Hey there! What’s on your mind? You can ask me anything from the retrieved data.",
+     "who are you": "I’m WebGPT, your Scraper Chat AI, here to help with insights from scraped content. What do you need?",
+     "how are you": "I’m doing great! How about you? If you have any questions about the scraped data, let me know!",
+     "what are you": "I’m WebGPT, an AI trained to provide insights from data. What would you like to know?",
+     "howdy": "Hello! I’m here to assist you. Got any questions from the retrieved data?",
+     "fine": "That’s great to hear! If you have any topic in mind, I can fetch relevant insights for you.",
+     "thanks": "You're welcome! If you need more help with the scraped data, just ask.",
+     "thank you": "You're always welcome! Let me know if I can provide any insights from the data.",
+     "good": "Awesome! Do you have any queries about the retrieved information?",
+     "good morning": "Good morning! Hope your day goes well. Need any insights from the scraped content?",
+     "good night": "Good night! Sleep well and take care. Before you go, got any last questions on the data?",
+     "whats up": "Not much, just here to assist you! Got any questions about the retrieved data?",
+     "bye": "Goodbye! Have a great day! If you need insights later, feel free to return.",
+     "okay thank you": "You're welcome! If you have more questions about the scraped data, don’t hesitate to ask.",
+     "okay": "Alright! If you need any insights from the retrieved data, feel free to ask.",
+     "thanks a lot": "You're welcome! If you need more help with the scraped data, just ask."
+ }