Chintan Donda committed on
Commit 3fa36c8 · 1 Parent(s): 77a878e

Using the gpt-3.5-turbo LLM to improve the Custom Query response

Files changed (4):
  1. app.py (+24 -16)
  2. requirements.txt (+2 -5)
  3. src/constants.py (+13 -5)
  4. src/langchain_utils.py (+83 -27)
app.py CHANGED
@@ -55,8 +55,11 @@ class DomState:
             self.relevant_paragraphs = f'Index for {question_category} not found. That means no PDFs, Text files, or URLs have been ingested and indexed so far. Ingest the new data for {question_category} and then query again.'
         else:
             if self.index_type in ['FAISS', 'Chroma']:
-                self.sources_relevant_paragraphs = [doc.metadata for doc in self.relevant_paragraphs]
-                self.relevant_paragraphs = [doc.page_content.replace('\n', '').replace('\t', ' ') for doc in self.relevant_paragraphs]
+                # Extract information on the source of relevant_paragraphs
+                self.sources_relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.get_sources_of_relevant_paragraphs(self.relevant_paragraphs)
+
+                # Clean relevant_paragraphs (remove newline characters, tabs, extra spaces, etc.)
+                self.relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.clean_relevant_paragraphs(self.relevant_paragraphs)
         return self.relevant_paragraphs

@@ -398,16 +401,17 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
     # Widget for Custom Queries
     with gr.Row(visible=True) as rowCustomQuery:
         with gr.Column(scale=1, min_width=600):
+            question_category = gr.Dropdown(
+                constants_utils.INDEX_CATEGORY,
+                label="Select Question Category",
+                value=constants_utils.INDEX_CATEGORY[0]
+            )
+
+            question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
+
             with gr.Tab(label='Relevant paragraphs'):
-                question_category = gr.Dropdown(
-                    constants_utils.INDEX_CATEGORY,
-                    label="Select Question Category",
-                    value=constants_utils.INDEX_CATEGORY[0]
-                )
-
-                question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
                 # Get the Relevant paragraphs for the question asked
-                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
+                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are: [These are the relevant paragraphs in raw format with some preprocessing done on the extracted paragraphs from the data source.]", value=dom.relevant_paragraphs, interactive=False)

                 b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
                 b_relevant_paragraphs.click(

@@ -616,11 +620,15 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
                 # Get the summary of the weather forecast
                 weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", interactive=False)

-                district.change(
-                    dom.click_handler_for_weather_forecast_summary,
-                    district_weather,
-                    weather_forecast_summary
-                )
+                # Disabling the auto-trigger event for Weather Forecast Summary, as it did not give the correct result on the 1st trigger: when the same or a new district is selected, it returns the summary for the previously selected district.
+                # district.change(
+                #     dom.click_handler_for_weather_forecast_summary,
+                #     district_weather,
+                #     weather_forecast_summary
+                # )
+
+                b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
+                b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=[district_weather], outputs=[weather_forecast_summary])


                # Convert the weather forecast summary into an Indian language

@@ -775,4 +783,4 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
     )


-demo.launch(share=False)
+demo.launch(server_name="0.0.0.0", server_port=7860)
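The weather-widget change replaces Gradio's district.change(...) auto-trigger with an explicit button, so the handler only runs on click and always receives the currently selected district. A minimal sketch of the same pattern, with a hypothetical summarize_weather handler standing in for dom.click_handler_for_weather_forecast_summary:

    import gradio as gr

    # Hypothetical stand-in for dom.click_handler_for_weather_forecast_summary
    def summarize_weather(district):
        return f'Weather forecast summary for {district}'

    with gr.Blocks() as demo:
        district_weather = gr.Dropdown(['Pune', 'Nashik'], label='Select District')
        weather_forecast_summary = gr.Textbox(label='Weather Forecast Summary is:', interactive=False)
        # Explicit trigger: runs only on click, with the dropdown's current value,
        # avoiding the stale-summary behaviour seen with the change event
        b = gr.Button('Get Weather Forecast Summary')
        b.click(fn=summarize_weather, inputs=[district_weather], outputs=[weather_forecast_summary])

    demo.launch(server_name='0.0.0.0', server_port=7860)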
requirements.txt CHANGED
@@ -5,16 +5,13 @@ chromadb
 torch
 transformers
 gradio
-scikit-learn
+tiktoken
 scipy
-matplotlib
-openpyxl
+scikit-learn
 mosestokenizer
 indic-nlp-library
 sentence_transformers
-playwright~=1.30
 faiss-cpu
-tiktoken
 googletrans==3.1.0a0
 BeautifulSoup4
 pypdf
src/constants.py CHANGED
@@ -4,7 +4,7 @@ import src.weather as weather_utils
 import src.mandi_price as mandi_utils

 # Whether to load the existing index store or create it from scratch
-LOAD_FROM_EXISTING_INDEX_STORE = True
+LOAD_FROM_EXISTING_INDEX_STORE = False
 INDEX_TYPE = 'FAISS'

 # Path from where to load the data (from the local directory)

@@ -26,6 +26,8 @@ if not os.path.exists(OUTPUT_PATH_ANSWER_FEEDBACK):
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_PREFIX = 'answers_feedback'
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR = '\t'

+
+######## Data Source and Index related constants ########
 # Index categories (There would be an index for each category. On asking the query, the App will search for the relevant docs/information only from the respective index category.)
 INDEX_CATEGORY = [
     'crops',

@@ -50,15 +52,21 @@ DATA_SOURCES = {
     'URLs': 'urls',
 }

-# LangChain related constants
-SIMILARITY_TOP_K = 1
-ANSWER_SIMILARITY_TOP_K = 3
+
+######## LangChain related constants ########
+LLM_RESPONSE_MAX_TOKENS = 1024
+LLM_BASE_MODEL_NAME = 'gpt-3.5-turbo'
+SIMILARITY_TOP_K = 2
+ANSWER_SIMILARITY_TOP_K = 5
 MODE = 'embedding'
 RESPONSE_MODE = 'default'
-TEXT_SPLITTER_CHUNK_SIZE = 1000
+TEXT_SPLITTER_CHUNK_SIZE = 1500
 TEXT_SPLITTER_CHUNK_OVERLAP = 0
 TEXT_SPLITTER_SEPARATOR = '\n\n'

+
+
+######## Widget related utils constants ########
 # State list used in the Mandi Price widget dropdown list
 mandi_utils_obj = mandi_utils.MANDI_PRICE()
 MANDI_PRICE_STATES_IDS = mandi_utils_obj.get_mandi_states()
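The retuned retrieval constants interact with the model's context window: gpt-3.5-turbo accepts 4,096 tokens for prompt plus response, and LLM_RESPONSE_MAX_TOKENS reserves 1,024 of them for the answer. At the common ~4 characters/token heuristic for English, ANSWER_SIMILARITY_TOP_K = 5 chunks of TEXT_SPLITTER_CHUNK_SIZE = 1500 characters come to roughly 1,875 tokens of context, which fits. A rough sanity check with tiktoken (added to requirements.txt above); the helper name and overhead value are illustrative assumptions:

    import tiktoken

    # Mirrors src/constants.py
    TEXT_SPLITTER_CHUNK_SIZE = 1500      # characters per chunk, not tokens
    ANSWER_SIMILARITY_TOP_K = 5          # chunks stuffed into the QA prompt
    LLM_RESPONSE_MAX_TOKENS = 1024
    CONTEXT_WINDOW = 4096                # gpt-3.5-turbo prompt + response limit

    enc = tiktoken.encoding_for_model('gpt-3.5-turbo')

    def fits_in_context(chunks, prompt_overhead_tokens=200):
        """Check that the retrieved chunks leave room for the template and response.

        prompt_overhead_tokens is a guessed allowance for the template + question.
        """
        context_tokens = sum(len(enc.encode(chunk)) for chunk in chunks)
        return context_tokens + prompt_overhead_tokens + LLM_RESPONSE_MAX_TOKENS <= CONTEXT_WINDOW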
src/langchain_utils.py CHANGED
@@ -69,6 +69,12 @@ class LANGCHAIN_UTILS:

         # Initialize embeddings (we can also use other embeddings)
         self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
+        # Initialize the LLM
+        self.llm = OpenAI(
+            temperature=0,
+            max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,
+            model_name=constants_utils.LLM_BASE_MODEL_NAME
+        )

         # Global history for AgGPT widget
         self.global_history = [

@@ -120,24 +126,54 @@ class LANGCHAIN_UTILS:

     def generate_prompt_template(
         self,
-        prompt_type='general'
+        prompt_type,
+        input_variables
     ):
         prompt_template = ''

-        if prompt_type == 'general':
+        if prompt_type == 'summarize':
             prompt_template = """Write a concise summary of the following:

             {text}

             SUMMARIZE IN ENGLISH:"""

+        elif prompt_type == 'qa':
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+            # Works well, but the answer gets truncated
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. Start the answer by giving a short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+            # Final version (overrides the drafts above): ask for a comprehensive answer
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question comprehensively at the end. Start the answer by giving a short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
         elif prompt_type == 'weather':
             prompt_template = """
             What would be the weather based on the below data:
+
             {text}
             """

-        return prompt_template
+        PROMPT = PromptTemplate(template=prompt_template, input_variables=input_variables)
+        return PROMPT


     def get_textual_summary(
@@ -145,18 +181,19 @@ class LANGCHAIN_UTILS:
         text,
         chain_type="stuff",
         custom_prompt=True,
-        prompt_type='general'
+        prompt_type='summarize'
     ):
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-        llm = OpenAI(temperature=0)
         if custom_prompt:
-            prompt_template = self.generate_prompt_template(prompt_type)
-            PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
-            chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["text"]
+            )
+            chain = load_summarize_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            chain = load_summarize_chain(llm, chain_type=chain_type)
+            chain = load_summarize_chain(self.llm, chain_type=chain_type)

         text_summary = chain.run(docs)
         return text_summary

@@ -176,8 +213,7 @@ class LANGCHAIN_UTILS:
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-        llm = OpenAI(temperature=0)
-        chain = load_summarize_chain(llm, chain_type=chain_type)
+        chain = load_summarize_chain(self.llm, chain_type=chain_type)
         text_summary = chain.run(docs)

         return text_summary

@@ -188,7 +224,8 @@ class LANGCHAIN_UTILS:
         para,
         question,
         chain_type="stuff",
-        custom_prompt=True
+        custom_prompt=True,
+        prompt_type='qa'
     ):
         # Prepare data (Split paragraph into chunks of small documents)
         text_splitter = CharacterTextSplitter(

@@ -202,36 +239,29 @@ class LANGCHAIN_UTILS:
             # Find similar docs that are relevant to the question
             docsearch = FAISS.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         elif self.index_type == 'Chroma':
             # Find similar docs that are relevant to the question
             docsearch = Chroma.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         # Search for the similar docs
         docs = docsearch.similarity_search(question, k=constants_utils.ANSWER_SIMILARITY_TOP_K)

-        llm = OpenAI(temperature=0)
         # Create a Chain for question answering
         if custom_prompt:
-            prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
-
-            {context}
-
-            Question: {question}
-            Answer in English:"""
-
-            PROMPT = PromptTemplate(
-                template=prompt_template, input_variables=["context", "question"]
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["context", "question"]
             )
-            chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
+            chain = load_qa_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
-            chain = load_qa_chain(llm, chain_type=chain_type)
+            # chain = load_qa_with_sources_chain(self.llm, chain_type=chain_type)
+            chain = load_qa_chain(self.llm, chain_type=chain_type)
         # chain.run(input_documents=docs, question=question)

         out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)

@@ -952,3 +982,29 @@ class LANGCHAIN_UTILS:

         # Save df into TSV format
         df.to_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR, index=False, header=True)
+
+
+    def get_sources_of_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        sources_relevant_paragraphs = []
+        # Extract information on the source of relevant_paragraphs
+        for indx, doc in enumerate(relevant_paragraphs):
+            if 'source' in doc.metadata and 'page' in doc.metadata and doc.metadata['source'].endswith('.pdf'):
+                # Add 1 because PyPDFLoader numbers pages from 0
+                relevant_paragraphs[indx].metadata['page'] += 1
+        sources_relevant_paragraphs = [doc.metadata for doc in relevant_paragraphs]
+
+        return sources_relevant_paragraphs
+
+
+    def clean_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        cleaned_relevant_paragraphs = []
+        for doc in relevant_paragraphs:
+            cleaned_relevant_paragraphs.append(self.utils_obj.replace_newlines_and_spaces(doc.page_content))
+
+        return cleaned_relevant_paragraphs
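The two new helpers operate on the LangChain Document objects returned by similarity_search. A minimal sketch of the expected behaviour, with hypothetical documents; the exact whitespace normalization depends on utils_obj.replace_newlines_and_spaces:

    from langchain.docstore.document import Document

    relevant_paragraphs = [
        Document(page_content='Crop rotation\nimproves soil\thealth.',
                 metadata={'source': 'crops.pdf', 'page': 0}),
        Document(page_content='Drip irrigation saves water.',
                 metadata={'source': 'https://example.com/irrigation'}),
    ]

    # get_sources_of_relevant_paragraphs shifts PDF pages to 1-based numbering
    # (PyPDFLoader counts from 0) and returns the metadata dicts, e.g.:
    #   [{'source': 'crops.pdf', 'page': 1},
    #    {'source': 'https://example.com/irrigation'}]

    # clean_relevant_paragraphs returns the page_content strings with newlines,
    # tabs, and extra spaces normalized, e.g.:
    #   ['Crop rotation improves soil health.', 'Drip irrigation saves water.']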