Chintan Donda committed on
Commit 3fa36c8 · 1 Parent(s): 77a878e

Using the gpt-3.5-turbo LLM to improve the Custom Query response

Files changed (4):
  1. app.py (+24 -16)
  2. requirements.txt (+2 -5)
  3. src/constants.py (+13 -5)
  4. src/langchain_utils.py (+83 -27)
app.py CHANGED
@@ -55,8 +55,11 @@ class DomState:
             self.relevant_paragraphs = f'Index for {question_category} not found. That means no PDFs, Text files, or URLs have been ingested and indexed so far. Ingest the new data for {question_category} and then query again.'
         else:
             if self.index_type in ['FAISS', 'Chroma']:
-                self.sources_relevant_paragraphs = [doc.metadata for doc in self.relevant_paragraphs]
-                self.relevant_paragraphs = [doc.page_content.replace('\n', '').replace('\t', ' ') for doc in self.relevant_paragraphs]
+                # Extract information on the source of relevant_paragraphs
+                self.sources_relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.get_sources_of_relevant_paragraphs(self.relevant_paragraphs)
+
+                # Clean relevant_paragraphs (remove newline characters, tabs, extra spaces, etc.)
+                self.relevant_paragraphs = self.kkms_kssw_obj.langchain_utils_obj.clean_relevant_paragraphs(self.relevant_paragraphs)
         return self.relevant_paragraphs

@@ -398,16 +401,17 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
     # Widget for Custom Queries
     with gr.Row(visible=True) as rowCustomQuery:
         with gr.Column(scale=1, min_width=600):
+            question_category = gr.Dropdown(
+                constants_utils.INDEX_CATEGORY,
+                label="Select Question Category",
+                value=constants_utils.INDEX_CATEGORY[0]
+            )
+
+            question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
+
             with gr.Tab(label='Relevant paragraphs'):
-                question_category = gr.Dropdown(
-                    constants_utils.INDEX_CATEGORY,
-                    label="Select Question Category",
-                    value=constants_utils.INDEX_CATEGORY[0]
-                )
-
-                question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
                 # Get the Relevant paragraphs for the question asked
-                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
+                relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are: [These are the relevant paragraphs in raw format with some preprocessing done on the extracted paragraphs from the data source.]", value=dom.relevant_paragraphs, interactive=False)

                 b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
                 b_relevant_paragraphs.click(

@@ -616,11 +620,15 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
                 # Get the summary of the weather forecast
                 weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", interactive=False)

-                district.change(
-                    dom.click_handler_for_weather_forecast_summary,
-                    district_weather,
-                    weather_forecast_summary
-                )
+                # Disabling the auto-trigger event for Weather Forecast Summary, as it did not give the correct result on the 1st trigger: when the same or a new district is selected, it returns the summary for the previously selected district.
+                # district.change(
+                #     dom.click_handler_for_weather_forecast_summary,
+                #     district_weather,
+                #     weather_forecast_summary
+                # )
+
+                b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
+                b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=[district_weather], outputs=[weather_forecast_summary])


                # Convert the weather forecast summary into an Indian language

@@ -775,4 +783,4 @@ with gr.Blocks(title='KKMS-Smart-Search-Demo') as demo:
     )


-demo.launch(share=False)
+demo.launch(server_name="0.0.0.0", server_port=7860)
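The weather-widget change replaces Gradio's district.change(...) auto-trigger with an explicit button, so the handler only runs on click and always receives the currently selected district. A minimal sketch of the same pattern, with a hypothetical summarize_weather handler standing in for dom.click_handler_for_weather_forecast_summary:

    import gradio as gr

    # Hypothetical stand-in for dom.click_handler_for_weather_forecast_summary
    def summarize_weather(district):
        return f'Weather forecast summary for {district}'

    with gr.Blocks() as demo:
        district_weather = gr.Dropdown(['Pune', 'Nashik'], label='Select District')
        weather_forecast_summary = gr.Textbox(label='Weather Forecast Summary is:', interactive=False)
        # Explicit trigger: runs only on click, with the dropdown's current value,
        # avoiding the stale-summary behaviour seen with the change event
        b = gr.Button('Get Weather Forecast Summary')
        b.click(fn=summarize_weather, inputs=[district_weather], outputs=[weather_forecast_summary])

    demo.launch(server_name='0.0.0.0', server_port=7860)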
requirements.txt CHANGED
@@ -5,16 +5,13 @@ chromadb
 torch
 transformers
 gradio
-scikit-learn
+tiktoken
 scipy
-matplotlib
-openpyxl
+scikit-learn
 mosestokenizer
 indic-nlp-library
 sentence_transformers
-playwright~=1.30
 faiss-cpu
-tiktoken
 googletrans==3.1.0a0
 BeautifulSoup4
 pypdf
src/constants.py CHANGED
@@ -4,7 +4,7 @@ import src.weather as weather_utils
 import src.mandi_price as mandi_utils

 # Whether to load the existing index store or create it from scratch
-LOAD_FROM_EXISTING_INDEX_STORE = True
+LOAD_FROM_EXISTING_INDEX_STORE = False
 INDEX_TYPE = 'FAISS'

 # Path from where to load the data (from the local directory)

@@ -26,6 +26,8 @@ if not os.path.exists(OUTPUT_PATH_ANSWER_FEEDBACK):
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_PREFIX = 'answers_feedback'
 OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR = '\t'

+
+######## Data Source and Index related constants ########
 # Index categories (There would be an index for each category. On asking the query, the App will search for the relevant docs/information only from the respective index category.)
 INDEX_CATEGORY = [
     'crops',

@@ -50,15 +52,21 @@ DATA_SOURCES = {
     'URLs': 'urls',
 }

-# LangChain related constants
-SIMILARITY_TOP_K = 1
-ANSWER_SIMILARITY_TOP_K = 3
+
+######## LangChain related constants ########
+LLM_RESPONSE_MAX_TOKENS = 1024
+LLM_BASE_MODEL_NAME = 'gpt-3.5-turbo'
+SIMILARITY_TOP_K = 2
+ANSWER_SIMILARITY_TOP_K = 5
 MODE = 'embedding'
 RESPONSE_MODE = 'default'
-TEXT_SPLITTER_CHUNK_SIZE = 1000
+TEXT_SPLITTER_CHUNK_SIZE = 1500
 TEXT_SPLITTER_CHUNK_OVERLAP = 0
 TEXT_SPLITTER_SEPARATOR = '\n\n'

+
+
+######## Widget related utils constants ########
 # State list used in the Mandi Price widget dropdown list
 mandi_utils_obj = mandi_utils.MANDI_PRICE()
 MANDI_PRICE_STATES_IDS = mandi_utils_obj.get_mandi_states()
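The retuned retrieval constants interact with the model's context window: gpt-3.5-turbo accepts 4,096 tokens for prompt plus response, and LLM_RESPONSE_MAX_TOKENS reserves 1,024 of them for the answer. At the common ~4 characters/token heuristic for English, ANSWER_SIMILARITY_TOP_K = 5 chunks of TEXT_SPLITTER_CHUNK_SIZE = 1500 characters come to roughly 1,875 tokens of context, which fits. A rough sanity check with tiktoken (added to requirements.txt above); the helper name and overhead value are illustrative assumptions:

    import tiktoken

    # Mirrors src/constants.py
    TEXT_SPLITTER_CHUNK_SIZE = 1500      # characters per chunk, not tokens
    ANSWER_SIMILARITY_TOP_K = 5          # chunks stuffed into the QA prompt
    LLM_RESPONSE_MAX_TOKENS = 1024
    CONTEXT_WINDOW = 4096                # gpt-3.5-turbo prompt + response limit

    enc = tiktoken.encoding_for_model('gpt-3.5-turbo')

    def fits_in_context(chunks, prompt_overhead_tokens=200):
        """Check that the retrieved chunks leave room for the template and response.

        prompt_overhead_tokens is a guessed allowance for the template + question.
        """
        context_tokens = sum(len(enc.encode(chunk)) for chunk in chunks)
        return context_tokens + prompt_overhead_tokens + LLM_RESPONSE_MAX_TOKENS <= CONTEXT_WINDOW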
src/langchain_utils.py CHANGED
@@ -69,6 +69,12 @@ class LANGCHAIN_UTILS:

         # Initialize embeddings (we can also use other embeddings)
         self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
+        # Initialize the LLM
+        self.llm = OpenAI(
+            temperature=0,
+            max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,
+            model_name=constants_utils.LLM_BASE_MODEL_NAME
+        )

         # Global history for AgGPT widget
         self.global_history = [

@@ -120,24 +126,54 @@ class LANGCHAIN_UTILS:

     def generate_prompt_template(
         self,
-        prompt_type='general'
+        prompt_type,
+        input_variables
     ):
         prompt_template = ''

-        if prompt_type == 'general':
+        if prompt_type == 'summarize':
             prompt_template = """Write a concise summary of the following:

             {text}

             SUMMARIZE IN ENGLISH:"""

+        elif prompt_type == 'qa':
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+            # Works well, but the answer gets truncated
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. Start the answer by giving a short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
+            # Final version (overrides the drafts above): ask for a comprehensive answer
+            prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question comprehensively at the end. Start the answer by giving a short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
+
+            {context}
+
+            Question: {question}
+
+            Answer in English:"""
+
         elif prompt_type == 'weather':
             prompt_template = """
             What would be the weather based on the below data:
+
             {text}
             """

-        return prompt_template
+        PROMPT = PromptTemplate(template=prompt_template, input_variables=input_variables)
+        return PROMPT


     def get_textual_summary(
@@ -145,18 +181,19 @@ class LANGCHAIN_UTILS:
         text,
         chain_type="stuff",
         custom_prompt=True,
-        prompt_type='general'
+        prompt_type='summarize'
     ):
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-        llm = OpenAI(temperature=0)
         if custom_prompt:
-            prompt_template = self.generate_prompt_template(prompt_type)
-            PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
-            chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["text"]
+            )
+            chain = load_summarize_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            chain = load_summarize_chain(llm, chain_type=chain_type)
+            chain = load_summarize_chain(self.llm, chain_type=chain_type)

         text_summary = chain.run(docs)
         return text_summary

@@ -176,8 +213,7 @@ class LANGCHAIN_UTILS:
         texts = [text]
         docs = [Document(page_content=t) for t in texts[:3]]

-        llm = OpenAI(temperature=0)
-        chain = load_summarize_chain(llm, chain_type=chain_type)
+        chain = load_summarize_chain(self.llm, chain_type=chain_type)
         text_summary = chain.run(docs)

         return text_summary

@@ -188,7 +224,8 @@ class LANGCHAIN_UTILS:
         para,
         question,
         chain_type="stuff",
-        custom_prompt=True
+        custom_prompt=True,
+        prompt_type='qa'
     ):
         # Prepare data (Split paragraph into chunks of small documents)
         text_splitter = CharacterTextSplitter(

@@ -202,36 +239,29 @@ class LANGCHAIN_UTILS:
             # Find similar docs that are relevant to the question
             docsearch = FAISS.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         elif self.index_type == 'Chroma':
             # Find similar docs that are relevant to the question
             docsearch = Chroma.from_texts(
                 texts, self.embeddings,
-                metadatas=[{"source": str(i)} for i in range(len(texts))]
+                metadatas=[{"source": str(i+1)} for i in range(len(texts))]
             )

         # Search for the similar docs
         docs = docsearch.similarity_search(question, k=constants_utils.ANSWER_SIMILARITY_TOP_K)

-        llm = OpenAI(temperature=0)
         # Create a Chain for question answering
         if custom_prompt:
-            prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
-
-            {context}
-
-            Question: {question}
-            Answer in English:"""
-
-            PROMPT = PromptTemplate(
-                template=prompt_template, input_variables=["context", "question"]
+            PROMPT = self.generate_prompt_template(
+                prompt_type=prompt_type,
+                input_variables=["context", "question"]
             )
-            chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
+            chain = load_qa_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
         else:
-            # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
-            chain = load_qa_chain(llm, chain_type=chain_type)
+            # chain = load_qa_with_sources_chain(self.llm, chain_type=chain_type)
+            chain = load_qa_chain(self.llm, chain_type=chain_type)
         # chain.run(input_documents=docs, question=question)

         out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)

@@ -952,3 +982,29 @@ class LANGCHAIN_UTILS:

         # Save df into TSV format
         df.to_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR, index=False, header=True)
+
+
+    def get_sources_of_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        sources_relevant_paragraphs = []
+        # Extract information on the source of relevant_paragraphs
+        for indx, doc in enumerate(relevant_paragraphs):
+            if 'source' in doc.metadata and 'page' in doc.metadata and doc.metadata['source'].endswith('.pdf'):
+                # Add 1 because PyPDFLoader numbers pages from 0
+                relevant_paragraphs[indx].metadata['page'] += 1
+        sources_relevant_paragraphs = [doc.metadata for doc in relevant_paragraphs]
+
+        return sources_relevant_paragraphs
+
+
+    def clean_relevant_paragraphs(
+        self,
+        relevant_paragraphs
+    ):
+        cleaned_relevant_paragraphs = []
+        for doc in relevant_paragraphs:
+            cleaned_relevant_paragraphs.append(self.utils_obj.replace_newlines_and_spaces(doc.page_content))
+
+        return cleaned_relevant_paragraphs
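The two new helpers operate on the LangChain Document objects returned by similarity_search. A minimal sketch of the expected behaviour, with hypothetical documents; the exact whitespace normalization depends on utils_obj.replace_newlines_and_spaces:

    from langchain.docstore.document import Document

    relevant_paragraphs = [
        Document(page_content='Crop rotation\nimproves soil\thealth.',
                 metadata={'source': 'crops.pdf', 'page': 0}),
        Document(page_content='Drip irrigation saves water.',
                 metadata={'source': 'https://example.com/irrigation'}),
    ]

    # get_sources_of_relevant_paragraphs shifts PDF pages to 1-based numbering
    # (PyPDFLoader counts from 0) and returns the metadata dicts, e.g.:
    #   [{'source': 'crops.pdf', 'page': 1},
    #    {'source': 'https://example.com/irrigation'}]

    # clean_relevant_paragraphs returns the page_content strings with newlines,
    # tabs, and extra spaces normalized, e.g.:
    #   ['Crop rotation improves soil health.', 'Drip irrigation saves water.']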