Spaces:
Paused
Paused
Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import requests
|
|
7 |
import random
|
8 |
import urllib.parse
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
-
from typing import List
|
11 |
from bs4 import BeautifulSoup
|
12 |
from langchain.prompts import PromptTemplate
|
13 |
from langchain.chains import LLMChain
|
@@ -17,10 +17,72 @@ from langchain_community.document_loaders import PyPDFLoader
|
|
17 |
from langchain_core.output_parsers import StrOutputParser
|
18 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
19 |
from langchain_community.llms import HuggingFaceHub
|
20 |
-
from langchain_core.documents import Document
|
21 |
|
22 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def load_document(file: NamedTemporaryFile) -> List[Document]:
|
25 |
"""Loads and splits the document into pages."""
|
26 |
loader = PyPDFLoader(file.name)
|
@@ -207,6 +269,8 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
207 |
|
208 |
model = get_model(temperature, top_p, repetition_penalty)
|
209 |
embed = get_embeddings()
|
|
|
|
|
210 |
|
211 |
if os.path.exists("faiss_database"):
|
212 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
@@ -219,16 +283,10 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
219 |
for attempt in range(max_attempts):
|
220 |
try:
|
221 |
if web_search:
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
if rephrased_query == original_query:
|
228 |
-
print("Warning: Query was not rephrased. Using original query for search.")
|
229 |
-
|
230 |
-
search_results = google_search(rephrased_query)
|
231 |
-
web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
|
232 |
|
233 |
if database is None:
|
234 |
database = FAISS.from_documents(web_docs, embed)
|
@@ -237,20 +295,17 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
237 |
|
238 |
database.save_local("faiss_database")
|
239 |
|
240 |
-
context_str = "\n".join([f"
|
241 |
|
242 |
prompt_template = """
|
243 |
Answer the question based on the following web search results:
|
244 |
Web Search Results:
|
245 |
{context}
|
246 |
-
Original Question: {
|
247 |
-
Rephrased Search Query: {rephrased_query}
|
248 |
If the web search results don't contain relevant information, state that the information is not available in the search results.
|
249 |
Provide a concise and direct answer to the original question without mentioning the web search or these instructions.
|
250 |
Do not include any source information in your answer.
|
251 |
"""
|
252 |
-
prompt_val = ChatPromptTemplate.from_template(prompt_template)
|
253 |
-
formatted_prompt = prompt_val.format(context=context_str, original_question=question, rephrased_query=rephrased_query)
|
254 |
else:
|
255 |
if database is None:
|
256 |
return "No documents available. Please upload documents or enable web search to answer questions."
|
@@ -259,7 +314,6 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
259 |
relevant_docs = retriever.get_relevant_documents(question)
|
260 |
context_str = "\n".join([doc.page_content for doc in relevant_docs])
|
261 |
|
262 |
-
# Reduce context if we're not on the first attempt
|
263 |
if attempt > 0:
|
264 |
words = context_str.split()
|
265 |
context_str = " ".join(words[:int(len(words) * context_reduction_factor)])
|
@@ -273,8 +327,9 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
273 |
Provide a concise and direct answer to the question.
|
274 |
Do not include any source information in your answer.
|
275 |
"""
|
276 |
-
|
277 |
-
|
|
|
278 |
|
279 |
full_response = generate_chunked_response(model, formatted_prompt)
|
280 |
|
@@ -294,7 +349,16 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
294 |
else:
|
295 |
answer = full_response.strip()
|
296 |
|
297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
if web_search:
|
299 |
sources = set(doc.metadata['source'] for doc in web_docs)
|
300 |
sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
|
|
|
7 |
import random
|
8 |
import urllib.parse
|
9 |
from tempfile import NamedTemporaryFile
|
10 |
+
from typing import List, Dict
|
11 |
from bs4 import BeautifulSoup
|
12 |
from langchain.prompts import PromptTemplate
|
13 |
from langchain.chains import LLMChain
|
|
|
17 |
from langchain_core.output_parsers import StrOutputParser
|
18 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
19 |
from langchain_community.llms import HuggingFaceHub
|
20 |
+
from langchain_core.documents import Document
|
21 |
|
22 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
23 |
|
24 |
class Agent1:
    """Query-rephrasing search agent.

    Turns a free-form user question into one or more concise,
    search-engine-friendly queries and runs each of them through
    ``google_search`` (defined elsewhere in this module).
    """

    def __init__(self, model):
        # Any LangChain-compatible LLM; used only for the rephrasing prompt.
        self.model = model

    def rephrase_and_split(self, user_input: str) -> List[str]:
        """Rephrase *user_input* into search-ready queries.

        The model is instructed to emit one query per line; splitting a
        multi-part question into several queries is allowed. Returns the
        non-empty, stripped lines of the model's response (possibly empty
        if the model returned nothing usable).
        """
        rephrase_prompt = PromptTemplate(
            input_variables=["query"],
            template="""
Your task is to rephrase the given query into one or more concise, search-engine-friendly formats.
If the query contains multiple distinct questions, split them.
Provide ONLY the rephrased queries without any additional text or explanations, one per line.

Query: {query}

Rephrased queries:""",
        )

        chain = LLMChain(llm=self.model, prompt=rephrase_prompt)
        response = chain.run(query=user_input).strip()

        return [q.strip() for q in response.split('\n') if q.strip()]

    def process(self, user_input: str) -> Dict[str, List[Dict[str, str]]]:
        """Rephrase the input and run a web search for every resulting query.

        Returns a mapping of rephrased query -> list of search-result dicts
        (each with at least ``"text"`` and ``"link"`` keys, per the
        ``google_search`` contract assumed by callers — TODO confirm).
        """
        queries = self.rephrase_and_split(user_input)
        if not queries:
            # Fix: an empty rephrase response previously yielded an empty
            # result dict, silently producing zero search results. Fall
            # back to searching the user's original wording verbatim.
            queries = [user_input]
        return {query: google_search(query) for query in queries}
|
52 |
+
|
53 |
+
class Agent2:
    """Answer-validation agent.

    Judges whether a generated answer fully addresses the user's query
    and, when it does not, produces a follow-up search query to gather
    the missing information.
    """

    def __init__(self, model):
        # Any LangChain-compatible LLM; used for validation and follow-up prompts.
        self.model = model

    def validate_response(self, user_query: str, response: str) -> bool:
        """Return True if the model judges *response* to fully answer *user_query*."""
        validation_prompt = PromptTemplate(
            input_variables=["query", "response"],
            template="""
Evaluate if the following response fully answers the user's query.
User query: {query}
Response: {response}

Does the response fully answer the query? Answer with Yes or No:""",
        )

        chain = LLMChain(llm=self.model, prompt=validation_prompt)
        result = chain.run(query=user_query, response=response).strip().lower()
        # Fix: strict equality (result == 'yes') false-negatived on common
        # model outputs such as "Yes." or "Yes, it does". Accept any answer
        # that begins with "yes" after lowercasing.
        return result.startswith('yes')

    def generate_follow_up_query(self, user_query: str, response: str) -> str:
        """Ask the model for a follow-up query covering what *response* missed."""
        follow_up_prompt = PromptTemplate(
            input_variables=["query", "response"],
            template="""
The following response did not fully answer the user's query.
User query: {query}
Response: {response}

Generate a follow-up query to get more relevant information:""",
        )

        chain = LLMChain(llm=self.model, prompt=follow_up_prompt)
        return chain.run(query=user_query, response=response).strip()
|
85 |
+
|
86 |
def load_document(file: NamedTemporaryFile) -> List[Document]:
|
87 |
"""Loads and splits the document into pages."""
|
88 |
loader = PyPDFLoader(file.name)
|
|
|
269 |
|
270 |
model = get_model(temperature, top_p, repetition_penalty)
|
271 |
embed = get_embeddings()
|
272 |
+
agent1 = Agent1(model)
|
273 |
+
agent2 = Agent2(model)
|
274 |
|
275 |
if os.path.exists("faiss_database"):
|
276 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
|
|
283 |
for attempt in range(max_attempts):
|
284 |
try:
|
285 |
if web_search:
|
286 |
+
search_results = agent1.process(question)
|
287 |
+
web_docs = []
|
288 |
+
for query, results in search_results.items():
|
289 |
+
web_docs.extend([Document(page_content=result["text"], metadata={"source": result["link"], "query": query}) for result in results if result["text"]])
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
if database is None:
|
292 |
database = FAISS.from_documents(web_docs, embed)
|
|
|
295 |
|
296 |
database.save_local("faiss_database")
|
297 |
|
298 |
+
context_str = "\n".join([f"Query: {doc.metadata['query']}\nSource: {doc.metadata['source']}\nContent: {doc.page_content}" for doc in web_docs])
|
299 |
|
300 |
prompt_template = """
|
301 |
Answer the question based on the following web search results:
|
302 |
Web Search Results:
|
303 |
{context}
|
304 |
+
Original Question: {question}
|
|
|
305 |
If the web search results don't contain relevant information, state that the information is not available in the search results.
|
306 |
Provide a concise and direct answer to the original question without mentioning the web search or these instructions.
|
307 |
Do not include any source information in your answer.
|
308 |
"""
|
|
|
|
|
309 |
else:
|
310 |
if database is None:
|
311 |
return "No documents available. Please upload documents or enable web search to answer questions."
|
|
|
314 |
relevant_docs = retriever.get_relevant_documents(question)
|
315 |
context_str = "\n".join([doc.page_content for doc in relevant_docs])
|
316 |
|
|
|
317 |
if attempt > 0:
|
318 |
words = context_str.split()
|
319 |
context_str = " ".join(words[:int(len(words) * context_reduction_factor)])
|
|
|
327 |
Provide a concise and direct answer to the question.
|
328 |
Do not include any source information in your answer.
|
329 |
"""
|
330 |
+
|
331 |
+
prompt_val = ChatPromptTemplate.from_template(prompt_template)
|
332 |
+
formatted_prompt = prompt_val.format(context=context_str, question=question)
|
333 |
|
334 |
full_response = generate_chunked_response(model, formatted_prompt)
|
335 |
|
|
|
349 |
else:
|
350 |
answer = full_response.strip()
|
351 |
|
352 |
+
if not agent2.validate_response(question, answer):
|
353 |
+
follow_up_query = agent2.generate_follow_up_query(question, answer)
|
354 |
+
follow_up_results = agent1.process(follow_up_query)
|
355 |
+
follow_up_docs = [Document(page_content=result["text"], metadata={"source": result["link"], "query": follow_up_query}) for results in follow_up_results.values() for result in results if result["text"]]
|
356 |
+
database.add_documents(follow_up_docs)
|
357 |
+
context_str += "\n" + "\n".join([f"Follow-up Query: {doc.metadata['query']}\nSource: {doc.metadata['source']}\nContent: {doc.page_content}" for doc in follow_up_docs])
|
358 |
+
formatted_prompt = prompt_val.format(context=context_str, question=question)
|
359 |
+
full_response = generate_chunked_response(model, formatted_prompt)
|
360 |
+
answer = full_response.strip()
|
361 |
+
|
362 |
if web_search:
|
363 |
sources = set(doc.metadata['source'] for doc in web_docs)
|
364 |
sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
|