akshansh36 commited on
Commit
b61abce
·
verified ·
1 Parent(s): 60c9902

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +338 -0
  2. requirements.txt +102 -0
app.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import streamlit_chat
import json
import os
from pymongo import MongoClient
from bson import ObjectId
from dotenv import load_dotenv
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import re

# Page config must run before any other Streamlit call on the page.
st.set_page_config(layout="wide", page_title="ESIC Chatbot", page_icon="📄")
load_dotenv()
import logging
from pytz import timezone, utc
from datetime import datetime

logging.basicConfig(
    level=logging.DEBUG,  # This is for your application logs
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Suppress pymongo debug logs by setting the pymongo logger to a higher level
pymongo_logger = logging.getLogger('pymongo')
pymongo_logger.setLevel(logging.WARNING)

# Pinecone vector index used for retrieval ("esic" index, chunk+url metadata).
PINECONE_API = os.getenv("PINECONE_API_KEY")
pc = pinecone.Pinecone(
    api_key=PINECONE_API
)
index_name = "esic"
index = pc.Index(index_name)

# MongoDB connection setup — chat transcripts live in esic.chat_session.
MONGO_URI = os.getenv("MONGO_URI")
client = MongoClient(MONGO_URI)
db = client["esic"]
chat_sessions = db["chat_session"]

# Set LLM models.
# NOTE(review): FLASH_API is presumably a Google API key — verify env naming.
FLASH_API = os.getenv("FLASH_API")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=FLASH_API)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)

# Initialize session state for the current chat session.
# current_chat_id: MongoDB ObjectId (as str) of the active session, or None.
# chat_history: list of {"question", "answer", "improved_question"} dicts.
# regenerate: True while a regenerate is in flight (hides the button).
if 'current_chat_id' not in st.session_state:
    st.session_state['current_chat_id'] = None
if 'chat_history' not in st.session_state:
    st.session_state['chat_history'] = []
if 'regenerate' not in st.session_state:
    st.session_state['regenerate'] = False  # Track regenerate button state
57
+
58
+ # Function to create a new chat session in MongoDB
59
def create_new_chat_session():
    """Insert an empty chat session document and return its id as a string.

    The creation timestamp is taken in IST and converted to UTC before
    being written, so MongoDB always stores UTC times.
    """
    now_ist = datetime.now(timezone("Asia/Kolkata"))
    created_at_utc = now_ist.astimezone(utc)

    inserted = chat_sessions.insert_one({
        "created_at": created_at_utc,  # stored in UTC
        "messages": [],                # filled in as the user chats
    })
    return str(inserted.inserted_id)
71
+
72
+
73
+ # Function to load a chat session by MongoDB ID
74
+ # Function to load the chat session by MongoDB ID (load full history for display)
75
def load_chat_session(session_id):
    """Load a session's full message history into Streamlit state.

    Looks the session up by its MongoDB ObjectId; does nothing when no
    matching document exists. No truncation happens here — the complete
    history is loaded for display.
    """
    document = chat_sessions.find_one({"_id": ObjectId(session_id)})
    if document is None:
        return
    st.session_state['chat_history'] = document['messages']
80
+
81
+
82
+ # Function to update chat session in MongoDB (store last 15 question-answer pairs)
83
+ # Function to update chat session in MongoDB (store entire chat history)
84
def update_chat_session(session_id, question, answer, improved_question):
    """Append one question/answer record to a session's messages array."""
    record = {
        "question": question,
        'improved_question': improved_question,
        "answer": answer,
    }
    # $push with $each appends the single new record onto the array.
    push_op = {"$push": {"messages": {"$each": [record]}}}
    chat_sessions.update_one({"_id": ObjectId(session_id)}, push_op)
91
+
92
+
93
+ # Function to replace the last response in MongoDB
94
def replace_last_response_in_mongo(session_id, new_answer):
    """Overwrite the answer of the most recent message in MongoDB.

    The index of the last message is derived from the in-memory chat
    history, which is assumed to mirror the stored array.
    """
    idx = len(st.session_state['chat_history']) - 1
    if idx < 0:
        # Nothing stored yet — nothing to replace.
        return
    chat_sessions.update_one(
        {"_id": ObjectId(session_id)},
        {"$set": {f"messages.{idx}.answer": new_answer}},
    )
102
+
103
+
104
+ # Function to regenerate the response
105
def regenerate_response():
    """Regenerate the answer for the most recent question in the chat.

    Re-runs query extraction, Pinecone retrieval and summarization using
    the history *excluding* the last Q&A pair, then replaces the last
    answer both in session state and in MongoDB.

    Fixes over the original:
    - ``st.rerun()`` is called OUTSIDE the ``try`` block: Streamlit's rerun
      works by raising a control-flow exception, which the broad
      ``except Exception`` used to swallow (so the rerun never happened and
      a spurious error was shown).
    - Failures are logged instead of silently swallowed.
    - The ``regenerate`` flag is reset even on failure, so the Regenerate
      button reappears.
    - Falls back to the raw question instead of embedding ``None`` when
      query extraction fails.
    """
    rerun_needed = False
    try:
        if st.session_state['chat_history']:
            last_question = st.session_state['chat_history'][-1]["question"]
            # Exclude the last Q&A pair so the LLM does not see the answer
            # we are about to replace.
            previous_history = st.session_state['chat_history'][:-1]

            with st.spinner("Please wait, regenerating the response!"):
                # Rewrite the question into a self-contained query.
                query = get_context_from_messages(last_question, previous_history)
                if query:
                    logging.info(f"Extracted query is :{query}\n")
                    extracted_query = get_query_from_llm_answer(query)
                    query = extracted_query if extracted_query else last_question
                else:
                    # Extraction failed — use the raw question rather than
                    # passing None into embed_query.
                    query = last_question

                # Retrieve the top matching chunks from Pinecone.
                query_embedding = embeddings.embed_query(query)
                search_results = index.query(vector=query_embedding, top_k=10, include_metadata=True)
                matches = search_results['matches']

                content = ""
                for i, match in enumerate(matches):
                    chunk = match['metadata']['chunk']
                    url = match['metadata']['url']
                    content += f"chunk{i}: {chunk}\n" + f"url{i}: {url}\n"

                new_reply = generate_summary(content, query, previous_history)

                # Replace the last answer in-place in session state.
                st.session_state['chat_history'][-1]["answer"] = new_reply

                # Mirror the replacement into MongoDB.
                if st.session_state['current_chat_id']:
                    replace_last_response_in_mongo(st.session_state['current_chat_id'], new_reply)

                rerun_needed = True
    except Exception:
        # Log the real error; the original swallowed it without a trace.
        logging.exception("Error while regenerating response")
        st.error("Error occurred in Regenerating response, please try again later.")
    finally:
        # Always clear the flag so the Regenerate button reappears.
        st.session_state['regenerate'] = False

    if rerun_needed:
        # Must stay outside the try/except — st.rerun() raises a
        # control-flow exception to restart the script.
        st.rerun()
147
+
148
+
149
def generate_summary(chunks, query, chat_history):
    """Answer *query* from the retrieved *chunks*, citing source URLs.

    Sends the system prompt plus the query, the Pinecone chunks, and the
    last 3 question/answer pairs of *chat_history* to the LLM. Returns the
    generated answer text, or None on failure (an error is shown in the UI).

    Fix over the original: the human message used f-string interpolation
    inside a ChatPromptTemplate, so any literal '{' or '}' in the chunks,
    query, or history was parsed as a template variable and raised at
    invoke time. Real placeholders filled via ``invoke`` are used instead.
    """
    try:
        # Slicing already copes with histories shorter than 3 entries.
        limited_history = chat_history[-3:]

        # Render the recent history for the prompt.
        history_text = "\n".join(
            f"User: {q['improved_question']}\nLLM: {q['answer']}" for q in limited_history
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a chatbot specializing in answering queries related to Employees State Insurance Corporation(ESIC) website. You will be provided with chunks of data from the ESIC website to answer user queries. Each chunk will include associated URLs, You must give the url of the chunks which you are using to answer the query.
Key Guidelines:
1.If the user query is not clear, or you think multiple answers are possbile, you should ask for clarification with proper reasoning.
2.Do not mention chunk name in any of your replies.
3.Detailed and Clear: Provide thorough, clear, and concise responses without omitting relevant information from the data chunks.
4.Natural Summarization: When answering, you must not directly quote chunk names,formats. Instead, summarize or interpret the data naturally and conversationally.
5.Use Conversation History: Refer back to the conversation history to maintain consistency and build on prior responses, if applicable.
6.Ignore Unanswered Queries: If the conversation history contains previous responses like "The answer is not available in the context," disregard them when formulating your current response.
7.Graceful Handling of General Queries: If a user sends greetings, introduction, salutations, or unrelated questions, respond appropriately and conversationally.
8.Include Source URLs: Always include the URLs from the relevant chunks of data that you're using to answer the query.
9.Thoroughly looks for answer to the query in the provided chunks before replying, if you feel the query is irrelevant or answer is not present then you can ask user to clarify or tell that it cannot be answered.
10.Sometimes chunks might contain very less data still use it if its relevant.
11. Its important to give the answer in a well formatted structure, like points or paragraphs
"""),
            # Placeholders ({query}, {chunks}, {history_text}) are filled by
            # invoke(), so braces inside the values are passed through safely.
            ("human", '''
"Query":\n {query}\n
Below are the pinecone chunks that should be used to answer the user query:
"Extracted Data": \n{chunks}\n
Below is the previous conversation history:
"Previous Conversation History": \n{history_text}\n
'''),
        ])

        # Chain the prompt with the LLM for response generation.
        chain = prompt | llm
        result = chain.invoke({"query": query, "chunks": chunks, "history_text": history_text})

        logging.info(f"LLM answer is :{result}")
        return result.content

    except Exception as e:
        st.error(f"Error answering your question: {e}")
        return None
195
+
196
+
197
def get_context_from_messages(query, chat_history):
    """Ask the LLM to rewrite *query* into a self-contained question.

    Uses up to the last 3 question/answer pairs from *chat_history* as
    context. Returns the raw LLM output (expected format: 'query: ...'),
    or None when the call fails.

    Fix over the original: the human message used f-string interpolation
    inside a ChatPromptTemplate, so any literal '{' or '}' in the user's
    query or the history was parsed as a template variable and raised at
    invoke time. Real placeholders filled via ``invoke`` are used instead.
    """
    try:
        logging.info(f"Getting context from original query: {query}")

        # Slicing already copes with histories shorter than 3 entries.
        limited_history = chat_history[-3:]

        # Render the recent history for the prompt.
        history_text = "\n".join(
            f"User: {q['question']}\nLLM: {q['answer']}" for q in limited_history
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", """"I will provide you with a user query and up to the last 3 messages from the chat history which includes both questions and answers.Your task is to understand the user query nicely and restructure it if required such that it makes complete sense and is completely self contained.
The provided queries are related to Employees State Insurance Corporation(ESIC).
1. If the query is a follow-up, use the provided chat history to reconstruct a well-defined, contextually complete query that can stand alone."
2. if the query is self contained, if applicable try to improve it to make is coherent.
3. if the user query is salutations, greetings or not relevant in that case give the query back as it is.
I have provided an output format below, stricly follow it. Do not give anything else other than just the output.
expected_output_format: "query: String or None"
"""),
            # Placeholders are filled by invoke(), so braces inside the
            # values are passed through safely.
            ("human", '''
"Query":\n {query}\n
"Previous Conversation History": \n{history_text}\n
'''),
        ])

        # Chain the prompt with the LLM for response generation.
        chain = prompt | llm
        result = chain.invoke({"query": query, "history_text": history_text})
        logging.info(f"llm answer for query extraction is :{result}")

        return result.content

    except Exception as e:
        logging.error(f"exception occured in getting query from original query :{e}")
        return None
236
+
237
+
238
def get_query_from_llm_answer(llm_output):
    """Extract the rewritten query from the LLM's 'query: ...' output.

    Args:
        llm_output: raw LLM text expected to contain a 'query: <text>' line.

    Returns:
        The extracted query string, or None when the label is missing, the
        value is the literal 'None', or the value is empty.

    Generalized over the original: the label match is case-insensitive, so
    'Query:' / 'QUERY:' from the LLM still parse; an empty extracted value
    now yields None instead of "".
    """
    match = re.search(r'query:\s*(.*)', llm_output, re.IGNORECASE)
    if not match:
        return None
    # Remove leading/trailing spaces and surrounding quotes.
    query = match.group(1).strip().strip('"')
    if not query or query.lower() == "none":
        return None
    return query
244
+
245
+
246
# Sidebar for showing chat sessions and creating new sessions
st.sidebar.header("Chat Sessions")

# Button for creating a new chat: fresh Mongo document, empty history.
if st.sidebar.button("New Chat"):
    new_chat_id = create_new_chat_session()
    st.session_state['current_chat_id'] = new_chat_id
    st.session_state['chat_history'] = []

# List existing chat sessions (newest first) with a delete button each.
existing_sessions = chat_sessions.find().sort("created_at", -1)
for session in existing_sessions:
    session_id = str(session['_id'])

    # Retrieve stored UTC time and convert it to IST for display.
    utc_time = session['created_at']
    ist_time = utc_time.replace(tzinfo=utc).astimezone(timezone("Asia/Kolkata"))
    session_date = ist_time.strftime("%Y-%m-%d %H:%M:%S")  # Format for display

    col1, col2 = st.sidebar.columns([8, 1])
    with col1:
        # Selecting a session makes it current and loads its history.
        if st.button(f"Session {session_date}", key=session_id):
            st.session_state['current_chat_id'] = session_id
            load_chat_session(session_id)

    # Display delete icon (dustbin); deletion is immediate and permanent.
    with col2:
        if st.button("🗑️", key=f"delete_{session_id}"):
            chat_sessions.delete_one({"_id": ObjectId(session_id)})
            st.rerun()  # Refresh the app to remove the deleted session from the sidebar

# Main chat interface
st.markdown('<div class="fixed-header"><h1>Welcome To ESIC Chatbot</h1></div>', unsafe_allow_html=True)
st.markdown("<hr>", unsafe_allow_html=True)

# Input box for the question
user_question = st.chat_input(f"Ask a Question related to ESIC Website")

if user_question:
    # Automatically create a new session if none exists.
    if not st.session_state['current_chat_id']:
        new_chat_id = create_new_chat_session()
        st.session_state['current_chat_id'] = new_chat_id

    with st.spinner("Please wait, I am thinking!!"):
        # Pipeline: rewrite question -> extract query -> embed -> retrieve
        # chunks from Pinecone -> summarize with the LLM.
        query = get_context_from_messages(user_question, st.session_state['chat_history'])
        if query:
            logging.info(f"Extracted query is :{query}\n")
            extracted_query = get_query_from_llm_answer(query)
            if extracted_query:
                query = extracted_query
            else:
                # Fall back to the raw question when no 'query:' line parsed.
                query = user_question

            query_embedding = embeddings.embed_query(query)
            search_results = index.query(vector=query_embedding, top_k=10, include_metadata=True)
            matches = search_results['matches']

            # Concatenate each chunk with its source URL for the prompt.
            content = ""
            for i, match in enumerate(matches):
                chunk = match['metadata']['chunk']
                url = match['metadata']['url']
                content += f"chunk{i}: {chunk}\n" + f"url{i}: {url}\n"

            # NOTE(review): debug print — consider logging.debug instead.
            print(f"content being passed is {content}")
            reply = generate_summary(content, query, st.session_state['chat_history'])

            if reply:
                # Append the new question-answer pair to chat history.
                st.session_state['chat_history'].append(
                    {"question": user_question, "answer": reply, "improved_question": query})

                # Update the current chat session in MongoDB.
                if st.session_state['current_chat_id']:
                    update_chat_session(st.session_state['current_chat_id'], user_question, reply, query)

            else:
                st.error("Error processing your request, Please try again later.")
        else:
            st.error("Error processing your request, Please try again later.")

# Display the full chat history as alternating user/assistant bubbles.
for i, pair in enumerate(st.session_state['chat_history']):
    question = pair["question"]
    answer = pair["answer"]
    streamlit_chat.message(question, is_user=True, key=f"chat_message_user_{i}")
    streamlit_chat.message(answer, is_user=False, key=f"chat_message_assistant_{i}")

# Display regenerate button under the last response; hidden while a
# regeneration is already in flight.
if st.session_state['chat_history'] and not st.session_state['regenerate']:
    if st.button("🔄 Regenerate", key="regenerate_button"):
        st.session_state['regenerate'] = True
        regenerate_response()
requirements.txt ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.0
2
+ aiohttp==3.10.5
3
+ aiosignal==1.3.1
4
+ altair==5.4.1
5
+ annotated-types==0.7.0
6
+ anyio==4.4.0
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ blinker==1.8.2
10
+ boto3==1.35.15
11
+ botocore==1.35.15
12
+ cachetools==5.5.0
13
+ certifi==2024.8.30
14
+ charset-normalizer==3.3.2
15
+ click==8.1.7
16
+ colorama==0.4.6
17
+ dnspython==2.6.1
18
+ exceptiongroup==1.2.2
19
+ frozenlist==1.4.1
20
+ gitdb==4.0.11
21
+ GitPython==3.1.43
22
+ google-ai-generativelanguage==0.6.6
23
+ google-api-core==2.19.2
24
+ google-api-python-client==2.144.0
25
+ google-auth==2.34.0
26
+ google-auth-httplib2==0.2.0
27
+ google-generativeai==0.7.2
28
+ googleapis-common-protos==1.65.0
29
+ greenlet==3.1.0
30
+ grpcio==1.66.1
31
+ grpcio-status==1.62.3
32
+ h11==0.14.0
33
+ httpcore==1.0.5
34
+ httplib2==0.22.0
35
+ httpx==0.27.2
36
+ idna==3.8
37
+ Jinja2==3.1.4
38
+ jmespath==1.0.1
39
+ jsonpatch==1.33
40
+ jsonpointer==3.0.0
41
+ jsonschema==4.23.0
42
+ jsonschema-specifications==2023.12.1
43
+ langchain==0.2.16
44
+ langchain-core==0.2.38
45
+ langchain-google-genai==1.0.10
46
+ langchain-text-splitters==0.2.4
47
+ langsmith==0.1.117
48
+ markdown-it-py==3.0.0
49
+ MarkupSafe==2.1.5
50
+ mdurl==0.1.2
51
+ multidict==6.1.0
52
+ narwhals==1.6.4
53
+ numpy==1.26.4
54
+ orjson==3.10.7
55
+ packaging==24.1
56
+ pandas==2.2.2
57
+ pillow==10.4.0
58
+ pinecone==5.1.0
59
+ pinecone-client==5.0.1
60
+ pinecone-plugin-inference==1.0.3
61
+ pinecone-plugin-interface==0.0.7
62
+ proto-plus==1.24.0
63
+ protobuf==4.25.4
64
+ pyarrow==17.0.0
65
+ pyasn1==0.6.0
66
+ pyasn1_modules==0.4.0
67
+ pydantic==2.9.1
68
+ pydantic_core==2.23.3
69
+ pydeck==0.9.1
70
+ Pygments==2.18.0
71
+ pymongo==4.8.0
72
+ pyparsing==3.1.4
73
+ PyPDF2==3.0.1
74
+ python-dateutil==2.9.0.post0
75
+ python-dotenv==1.0.1
76
+ pytz==2024.1
77
+ PyYAML==6.0.2
78
+ referencing==0.35.1
79
+ requests==2.32.3
80
+ rich==13.8.1
81
+ router==0.1
82
+ rpds-py==0.20.0
83
+ rsa==4.9
84
+ s3transfer==0.10.2
85
+ six==1.16.0
86
+ smmap==5.0.1
87
+ sniffio==1.3.1
88
+ SQLAlchemy==2.0.34
89
+ streamlit==1.38.0
90
+ streamlit-chat==0.1.1
91
+ streamlit-router==0.1.8
92
+ tenacity==8.5.0
93
+ toml==0.10.2
94
+ tornado==6.4.1
95
+ tqdm==4.66.5
96
+ typing_extensions==4.12.2
97
+ tzdata==2024.1
98
+ uritemplate==4.1.1
99
+ urllib3==2.2.2
100
+ watchdog==4.0.2
101
+ Werkzeug==3.0.4
102
+ yarl==1.11.1