Commit: 3fa36c8
Parent(s): 77a878e
Chintan Donda committed

Using gpt-3.5-turbo LLM model to improve the Custom Query response

Files changed:
- app.py (+24 -16)
- requirements.txt (+2 -5)
- src/constants.py (+13 -5)
- src/langchain_utils.py (+83 -27)
app.py
CHANGED
@@ -55,8 +55,11 @@ class DomState:
             self.relevant_paragraphs = f'Index for {question_category} not found. That means no PDFs, Text files, or URLs have been ingested and indexed so far. Ingest the new data for {question_category} and then querying again.'
         else:
             if self.index_type in ['FAISS', 'Chroma']:
-
-                self.
+                # Extract information on Source of relevant_paragraphs
+                self.sources_relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.get_sources_of_relevant_paragraphs(self.relevant_paragraphs)
+
+                # Clean relevant_paragraphs (Remove new line characters, tabs, extra spaces, etc.)
+                self.relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.clean_relevant_paragraphs(self.relevant_paragraphs)
         return self.relevant_paragraphs


@@ -398,16 +401,17 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
     # Widget for Custom Queries
     with gr.Row(visible=True) as rowCustomQuery:
         with gr.Column(scale=1, min_width=600):
+            question_category = gr.Dropdown(
+                constants_utils.INDEX_CATEGORY,
+                label="Select Question Category",
+                value=constants_utils.INDEX_CATEGORY[0]
+            )
+
+            question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
+
             with gr.Tab(label='Relevant paragraphs'):
-                question_category = gr.Dropdown(
-                    constants_utils.INDEX_CATEGORY,
-                    label="Select Question Category",
-                    value=constants_utils.INDEX_CATEGORY[0]
-                )
-
-                question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
                 # Get the Relevant paragraphs for the question asked
-                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
+                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are: [These are the relevant paragraphs in raw format with some preprocessing done on the extracted paragraphs from the data source.]", value=dom.relevant_paragraphs, interactive=False)

                 b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
                 b_relevant_paragraphs.click(
@@ -616,11 +620,15 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
             # Get the summary of the weather forecast
             weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", interactive=False)

-            district.change(
-                dom.click_handler_for_weather_forecast_summary,
-                district_weather,
-                weather_forecast_summary
-            )
+            # Disabling auto-trigger event for Weather Forecast Summary as it was not giving the correct result in the 1st trigger. When we select the district again or new district, it gives the summary for the previously selected district.
+            # district.change(
+            #     dom.click_handler_for_weather_forecast_summary,
+            #     district_weather,
+            #     weather_forecast_summary
+            # )
+
+            b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
+            b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=[district_weather], outputs=[weather_forecast_summary])


             # Covert the weather forcast summary in Indian language
@@ -775,4 +783,4 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
 )


-demo.launch(
+demo.launch(server_name="0.0.0.0", server_port=7860)
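The weather-forecast hunk above swaps an auto-trigger (`district.change(...)`) for an explicit button click. A minimal, self-contained sketch of that wiring pattern, with a placeholder handler and district list standing in for `dom.click_handler_for_weather_forecast_summary` and the app's real dropdown:

import gradio as gr

def get_summary(district):
    # Hypothetical stand-in for dom.click_handler_for_weather_forecast_summary
    return f"Weather forecast summary for {district}"

with gr.Blocks() as demo:
    district = gr.Dropdown(["District A", "District B"], label="Select District")
    summary = gr.Textbox(label="Weather Forecast Summary is:", interactive=False)
    # district.change(get_summary, district, summary)  # auto-trigger, disabled in this commit
    btn = gr.Button("Get Weather Forecast Summary")
    btn.click(fn=get_summary, inputs=[district], outputs=[summary])

demo.launch(server_name="0.0.0.0", server_port=7860)

The explicit Button.click guarantees the handler reads the dropdown's current value at click time, which is the behavior the commit comment says the change event was not delivering.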
requirements.txt
CHANGED
@@ -5,16 +5,13 @@ chromadb
 torch
 transformers
 gradio
-
+tiktoken
 scipy
-
-openpyxl
+scikit-learn
 mosestokenizer
 indic-nlp-library
 sentence_transformers
-playwright~=1.30
 faiss-cpu
-tiktoken
 googletrans==3.1.0a0
 BeautifulSoup4
 pypdf
src/constants.py
CHANGED
@@ -4,7 +4,7 @@ import src.weather as weather_utils
 import src.mandi_price as mandi_utils

 # Wheater to load the existing index store or create from scratch?
-LOAD_FROM_EXISTING_INDEX_STORE = True
+LOAD_FROM_EXISTING_INDEX_STORE = False
 INDEX_TYPE = 'FAISS'

 # Path from where to load the data (from the local directory)
@@ -26,6 +26,8 @@ if not os.path.exists(OUTPUT_PATH_ANSWER_FEEDBACK):
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_PREFIX = 'answers_feedback'
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR = '\t'

+
+######## Add Data Source, Index related constants here ########
 # Index categories (There would be an index for each category. On asking the query, App will search for the relevant docs/information only from the respective index category.)
 INDEX_CATEGORY = [
     'crops',
@@ -50,15 +52,21 @@ DATA_SOURCES = {
     'URLs': 'urls',
 }

-
-
-
+
+######## Add LangChain related constants here ########
+LLM_RESPONSE_MAX_TOKENS = 1024
+LLM_BASE_MODEL_NAME = 'gpt-3.5-turbo'
+SIMILARITY_TOP_K = 2
+ANSWER_SIMILARITY_TOP_K = 5
 MODE = 'embedding'
 RESPONSE_MODE = 'default'
-TEXT_SPLITTER_CHUNK_SIZE =
+TEXT_SPLITTER_CHUNK_SIZE = 1500
 TEXT_SPLITTER_CHUNK_OVERLAP = 0
 TEXT_SPLITTER_SEPARATOR = '\n\n'

+
+
+######## Add Widget related utils constants here ########
 # State list used in the Mandi Price widget dropdown list
 mandi_utils_obj = mandi_utils.MANDI_PRICE()
 MANDI_PRICE_STATES_IDS = mandi_utils_obj.get_mandi_states()
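A short sketch of how the new constants are typically consumed. The CharacterTextSplitter call is an assumption about the splitter configuration (the actual call site lives in src/langchain_utils.py), while the OpenAI(...) initialization mirrors the one added in that file below; the `constants_utils` alias matches how the rest of the repo imports this module:

from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter

import src.constants as constants_utils

# Splitter configured from the tuned chunking constants (assumed call site)
text_splitter = CharacterTextSplitter(
    separator=constants_utils.TEXT_SPLITTER_SEPARATOR,          # '\n\n'
    chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,        # 1500
    chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP   # 0
)

# LLM configured from the new model constants (matches the __init__ added below)
llm = OpenAI(
    temperature=0,
    max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,   # 1024
    model_name=constants_utils.LLM_BASE_MODEL_NAME        # 'gpt-3.5-turbo'
)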
src/langchain_utils.py
CHANGED
@@ -69,6 +69,12 @@ class LANGCHAIN_UTILS:

         # Initialize embeddings (we can also use other embeddings)
         self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
+        # Initialize LLM model
+        self.llm = OpenAI(
+            temperature=0,
+            max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,
+            model_name=constants_utils.LLM_BASE_MODEL_NAME
+        )

         # Global history for AgGPT widget
         self.global_history = [
@@ -120,24 +126,54 @@ class LANGCHAIN_UTILS:

     def generate_prompt_template(
         self,
-        prompt_type
+        prompt_type,
+        input_variables
     ):
         prompt_template = ''

-        if prompt_type == '
+        if prompt_type == 'summarize':
             prompt_template = """Write a concise summary of the following:

             {text}

             SUMMARIZE IN ENGLISH:"""

+        elif prompt_type == 'qa':
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+            # Working good, but truncated answer
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. Start the answer by giving short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question comprehensively at the end. Start the answer by giving short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
         elif prompt_type == 'weather':
             prompt_template = """
             What would be the weather based on the below data:
+
             {text}
             """

-
+        PROMPT = PromptTemplate(template=prompt_template, input_variables=input_variables)
+        return PROMPT


     def get_textual_summary(
@@ -145,18 +181,19 @@ class LANGCHAIN_UTILS:
         text,
         chain_type="stuff",
         custom_prompt=True,
-        prompt_type='
+        prompt_type='summarize'
     ):
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-        llm = OpenAI(temperature=0)
         if custom_prompt:
-
-
-
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["text"]
+            )
+            chain = load_summarize_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            chain = load_summarize_chain(llm, chain_type=chain_type)
+            chain = load_summarize_chain(self.llm, chain_type=chain_type)

         text_summary = chain.run(docs)
         return text_summary
@@ -176,8 +213,7 @@ class LANGCHAIN_UTILS:
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-
-        chain = load_summarize_chain(llm, chain_type=chain_type)
+        chain = load_summarize_chain(self.llm, chain_type=chain_type)
         text_summary = chain.run(docs)

         return text_summary
@@ -188,7 +224,8 @@ class LANGCHAIN_UTILS:
         para,
         question,
         chain_type="stuff",
-        custom_prompt=True
+        custom_prompt=True,
+        prompt_type='qa'
     ):
         # Prepare data (Split paragraph into chunks of small documents)
         text_splitter = CharacterTextSplitter(
@@ -202,36 +239,29 @@ class LANGCHAIN_UTILS:
             # Find similar docs that are relevant to the question
             docsearch = FAISS.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         elif self.index_type == 'Chroma':
             # Find similar docs that are relevant to the question
             docsearch = Chroma.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         # Search for the similar docs
         docs = docsearch.similarity_search(question, k=constants_utils.ANSWER_SIMILARITY_TOP_K)

-        llm = OpenAI(temperature=0)
         # Create a Chain for question answering
         if custom_prompt:
-
-
-
-
-            Question: {question}
-            Answer in English:"""
-
-            PROMPT = PromptTemplate(
-                template=prompt_template, input_variables=["context", "question"]
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["context", "question"]
             )
-            chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
+            chain = load_qa_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
-            chain = load_qa_chain(llm, chain_type=chain_type)
+            # chain = load_qa_with_sources_chain(self.llm, chain_type=chain_type)
+            chain = load_qa_chain(self.llm, chain_type=chain_type)
         # chain.run(input_documents=docs, question=question)

         out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
@@ -952,3 +982,29 @@ class LANGCHAIN_UTILS:

         # Save df into TSV format
         df.to_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR, index=False, header=True)
+
+
+    def get_sources_of_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        sources_relevant_paragraphs = []
+        # Extract information on Source of relevant_paragraphs
+        for indx, doc in enumerate(relevant_paragraphs):
+            if 'source' in doc.metadata and 'page' in doc.metadata and doc.metadata['source'].endswith('.pdf'):
+                # Need to add +1 as PyPDFLoader sets page number from 0th-index
+                relevant_paragraphs[indx].metadata['page'] += 1
+        sources_relevant_paragraphs = [doc.metadata for doc in relevant_paragraphs]
+
+        return sources_relevant_paragraphs
+
+
+    def clean_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        cleaned_relevant_paragraphs = []
+        for doc in relevant_paragraphs:
+            cleaned_relevant_paragraphs.append(self.utils_obj.replace_newlines_and_spaces(doc.page_content))
+
+        return cleaned_relevant_paragraphs
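A hedged usage sketch of the two new helpers, assuming LangChain Document objects as returned by the similarity search; the LANGCHAIN_UTILS constructor arguments and the sample documents are hypothetical, since neither appears in this diff:

from langchain.docstore.document import Document

from src.langchain_utils import LANGCHAIN_UTILS

relevant_paragraphs = [
    Document(page_content="Wheat sowing...\n\n  needs   well-drained soil.",
             metadata={'source': 'data/crops/guide.pdf', 'page': 0}),
    Document(page_content="Irrigation schedule...",
             metadata={'source': 'https://example.com/crops'}),
]

utils_obj = LANGCHAIN_UTILS()  # hypothetical: real constructor args not shown in this diff

# PDF page numbers become 1-based; each entry is the document's metadata dict
sources = utils_obj.get_sources_of_relevant_paragraphs(relevant_paragraphs)
# e.g. [{'source': 'data/crops/guide.pdf', 'page': 1}, {'source': 'https://example.com/crops'}]

# Newlines, tabs, and repeated spaces collapsed via the class's utils_obj.replace_newlines_and_spaces helper
cleaned = utils_obj.clean_relevant_paragraphs(relevant_paragraphs)

This pair is what DomState consumes in the app.py hunk at the top of this commit: the metadata dicts feed self.sources_relevant_paragraphs, and the cleaned strings replace the raw Document list shown to the user.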