SearchGPT

Running

App Files Community

Shreyas094 committed on Jul 21, 2024

Commit

041d8cf

verified ·

1 Parent(s): 63fcaee

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -188

app.py CHANGED Viewed

@@ -29,132 +29,66 @@ huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 # Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
-class Agent1:
-    def __init__(self):
-        self.question_words = set(["what", "when", "where", "who", "whom", "which", "whose", "why", "how"])
-        self.conjunctions = set(["and", "or"])
-        self.pronouns = set(["it", "its", "they", "their", "them", "he", "his", "him", "she", "her", "hers"])
-        self.context = {}
-    def is_question(self, text: str) -> bool:
-        words = word_tokenize(text.lower())
-        return (words[0] in self.question_words or
-                text.strip().endswith('?') or
-                any(word in self.question_words for word in words))
-    def find_subject(self, sentence):
-        tokens = nltk.pos_tag(word_tokenize(sentence))
-        subject = None
-        for word, tag in tokens:
-            if tag.startswith('NN'):
-                subject = word
-                break
-            if tag == 'IN':  # Stop at preposition
-                break
-        return subject
-    def replace_pronoun(self, questions: List[str]) -> List[str]:
-        if len(questions) < 2:
-            return questions
-        subject = self.find_subject(questions[0])
-        if not subject:
-            return questions
-        for i in range(1, len(questions)):
-            words = word_tokenize(questions[i])
-            for j, word in enumerate(words):
-                if word.lower() in self.pronouns:
-                    words[j] = subject
-            questions[i] = ' '.join(words)
-        return questions
-    def rephrase_and_split(self, user_input: str) -> List[str]:
-        words = word_tokenize(user_input)
-        questions = []
-        current_question = []
-        for word in words:
-            if word.lower() in self.conjunctions and current_question:
-                if self.is_question(' '.join(current_question)):
-                    questions.append(' '.join(current_question))
-                current_question = []
-            else:
-                current_question.append(word)
-        if current_question:
-            if self.is_question(' '.join(current_question)):
-                questions.append(' '.join(current_question))
-        if not questions:
-            return [user_input]
-        questions = self.replace_pronoun(questions)
-        return questions
-    def update_context(self, query: str):
-       tokens = nltk.pos_tag(word_tokenize(query))
-       noun_phrases = []
-       current_phrase = []
-       for word, tag in tokens:
-           if tag.startswith('NN') or tag.startswith('JJ'):
-               current_phrase.append(word)
-           else:
-               if current_phrase:
-                   noun_phrases.append(' '.join(current_phrase))
-                   current_phrase = []
-       if current_phrase:
-           noun_phrases.append(' '.join(current_phrase))
-       if noun_phrases:
-           self.context['main_topic'] = noun_phrases[0]
-           self.context['related_topics'] = noun_phrases[1:]
-           self.context['last_query'] = query
-    def apply_context(self, query: str) -> str:
-       words = word_tokenize(query.lower())
-       if (len(words) <= 5 or
-           any(word in self.pronouns for word in words) or
-           (self.context.get('main_topic') and self.context['main_topic'].lower() not in query.lower())):
-           new_query_parts = []
-           main_topic_added = False
-           for word in words:
-               if word in self.pronouns and self.context.get('main_topic'):
-                   new_query_parts.append(self.context['main_topic'])
-                   main_topic_added = True
-               else:
-                   new_query_parts.append(word)
-           if not main_topic_added and self.context.get('main_topic'):
-               new_query_parts.append(f"in the context of {self.context['main_topic']}")
-           query = ' '.join(new_query_parts)
-       if self.context.get('last_query'):
-           query = f"{self.context['last_query']} and now {query}"
-       return query
-    def process(self, user_input: str) -> tuple[List[str], Dict[str, List[Dict[str, str]]]]:
-        self.update_context(user_input)
-        contextualized_input = self.apply_context(user_input)
-        queries = self.rephrase_and_split(contextualized_input)
-        print("Identified queries:", queries)
-        results = {}
-        for query in queries:
-            results[query] = google_search(query)
-        return queries, results
 def load_document(file: NamedTemporaryFile) -> List[Document]:
     """Loads and splits the document into pages."""
@@ -310,13 +244,10 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
     return all_results
-def ask_question(question, temperature, top_p, repetition_penalty, web_search, agent1=None):
     if not question:
         return "Please enter a question."
-    if agent1 is None:
-        agent1 = Agent1()
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()
@@ -328,70 +259,75 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, a
     max_attempts = 3
     context_reduction_factor = 0.7
-    agent1.update_context(question)
-    contextualized_question = agent1.apply_context(question)
     if web_search:
-        queries, search_results = agent1.process(contextualized_question)
         all_answers = []
-        for query in queries:
-            for attempt in range(max_attempts):
-                try:
-                    web_docs = [Document(page_content=result["text"], metadata={"source": result["link"], "query": query}) for result in search_results[query] if result["text"]]
-                    if database is None:
-                        database = FAISS.from_documents(web_docs, embed)
-                    else:
-                        database.add_documents(web_docs)
-                    database.save_local("faiss_database")
-                    context_str = "\n".join([f"Query: {doc.metadata['query']}\nSource: {doc.metadata['source']}\nContent: {doc.page_content}" for doc in web_docs])
-                    prompt_template = """
-                    Answer the question based on the following web search results:
-                    Web Search Results:
-                    {context}
-                    Original Question: {question}
-                    If the web search results don't contain relevant information, state that the information is not available in the search results.
-                    Provide a summarized and direct answer to the original question without mentioning the web search or these instructions.
-                    Do not include any source information in your answer.
-                    """
-                    prompt_val = ChatPromptTemplate.from_template(prompt_template)
-                    formatted_prompt = prompt_val.format(context=context_str, question=query)
-                    full_response = generate_chunked_response(model, formatted_prompt)
-                    answer_patterns = [
-                        r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
-                        r"Provide a concise and direct answer to the question:",
-                        r"Answer:",
-                        r"Provide a summarized and direct answer to the original question without mentioning the web search or these instructions:",
-                        r"Do not include any source information in your answer."
-                    ]
-                    for pattern in answer_patterns:
-                        match = re.split(pattern, full_response, flags=re.IGNORECASE)
-                        if len(match) > 1:
-                            answer = match[-1].strip()
-                            break
-                    else:
-                        answer = full_response.strip()
-                    all_answers.append(answer)
-                    break
-                except Exception as e:
-                    print(f"Error in ask_question for query '{query}' (attempt {attempt + 1}): {e}")
-                    if "Input validation error" in str(e) and attempt < max_attempts - 1:
-                        print(f"Reducing context length for next attempt")
-                    elif attempt == max_attempts - 1:
-                        all_answers.append(f"I apologize, but I'm having trouble processing the query '{query}' due to its length or complexity.")
         answer = "\n\n".join(all_answers)
-        sources = set(doc.metadata['source'] for docs in search_results.values() for doc in [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in docs if result["text"]])
         sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
         answer += sources_section
@@ -453,9 +389,10 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, a
     return "An unexpected error occurred. Please try again later."
 # Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Chat with your PDF documents and Web Search")
     with gr.Row():
         file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
@@ -467,7 +404,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(label="Conversation")
-            question_input = gr.Textbox(label="Perplexity AI lite, enable web search to retrieve any web search results. Feel free to provide any feedbacks.")
             submit_button = gr.Button("Submit")
         with gr.Column(scale=1):
             temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
@@ -475,10 +412,10 @@ with gr.Blocks() as demo:
             repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
             web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
-    agent1 = Agent1()
     def chat(question, history, temperature, top_p, repetition_penalty, web_search):
-        answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, agent1)
         history.append((question, answer))
         return "", history

 # Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
+class ContextDrivenChatbot:
+    def __init__(self, history_size=5):
+        self.history = []
+        self.history_size = history_size
+        self.vectorizer = TfidfVectorizer()
+        nltk.download('punkt', quiet=True)
+        nltk.download('averaged_perceptron_tagger', quiet=True)
+    def add_to_history(self, text):
+        self.history.append(text)
+        if len(self.history) > self.history_size:
+            self.history.pop(0)
+    def get_context(self):
+        return " ".join(self.history)
+    def is_follow_up_question(self, question):
+        tokens = word_tokenize(question.lower())
+        follow_up_indicators = set(['it', 'this', 'that', 'these', 'those', 'he', 'she', 'they', 'them'])
+        return any(token in follow_up_indicators for token in tokens)
+    def extract_topics(self, text):
+        tokens = nltk.pos_tag(word_tokenize(text))
+        return [word for word, pos in tokens if pos.startswith('NN')]
+    def get_most_relevant_context(self, question):
+        if not self.history:
+            return question
+        # Create a combined context from history
+        combined_context = self.get_context()
+        # Vectorize the context and the question
+        vectors = self.vectorizer.fit_transform([combined_context, question])
+        # Calculate similarity
+        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
+        # If similarity is low, it might be a new topic
+        if similarity < 0.3:  # This threshold can be adjusted
+            return question
+        # Otherwise, prepend the context
+        return f"{combined_context} {question}"
+    def process_question(self, question):
+        contextualized_question = self.get_most_relevant_context(question)
+        # Extract topics from the question
+        topics = self.extract_topics(question)
+        # Check if it's a follow-up question
+        if self.is_follow_up_question(question):
+            # If it's a follow-up, make sure to include previous context
+            contextualized_question = f"{self.get_context()} {question}"
+        # Add the new question to history
+        self.add_to_history(question)
+        return contextualized_question, topics
 def load_document(file: NamedTemporaryFile) -> List[Document]:
     """Loads and splits the document into pages."""
     return all_results
+def ask_question(question, temperature, top_p, repetition_penalty, web_search, chatbot):
     if not question:
         return "Please enter a question."
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()
     max_attempts = 3
     context_reduction_factor = 0.7
+    contextualized_question, topics = chatbot.process_question(question)
     if web_search:
+        search_results = google_search(contextualized_question)
         all_answers = []
+        for attempt in range(max_attempts):
+            try:
+                web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
+                if database is None:
+                    database = FAISS.from_documents(web_docs, embed)
+                else:
+                    database.add_documents(web_docs)
+                database.save_local("faiss_database")
+                context_str = "\n".join([f"Source: {doc.metadata['source']}\nContent: {doc.page_content}" for doc in web_docs])
+                prompt_template = """
+                Answer the question based on the following web search results and conversation context:
+                Web Search Results:
+                {context}
+                Conversation Context: {conv_context}
+                Current Question: {question}
+                Topics: {topics}
+                If the web search results don't contain relevant information, state that the information is not available in the search results.
+                Provide a summarized and direct answer to the question without mentioning the web search or these instructions.
+                Do not include any source information in your answer.
+                """
+                prompt_val = ChatPromptTemplate.from_template(prompt_template)
+                formatted_prompt = prompt_val.format(
+                    context=context_str,
+                    conv_context=chatbot.get_context(),
+                    question=question,
+                    topics=", ".join(topics)
+                )
+                full_response = generate_chunked_response(model, formatted_prompt)
+                answer_patterns = [
+                    r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
+                    r"Provide a concise and direct answer to the question:",
+                    r"Answer:",
+                    r"Provide a summarized and direct answer to the original question without mentioning the web search or these instructions:",
+                    r"Do not include any source information in your answer."
+                ]
+                for pattern in answer_patterns:
+                    match = re.split(pattern, full_response, flags=re.IGNORECASE)
+                    if len(match) > 1:
+                        answer = match[-1].strip()
+                        break
+                else:
+                    answer = full_response.strip()
+                all_answers.append(answer)
+                break
+            except Exception as e:
+                print(f"Error in ask_question (attempt {attempt + 1}): {e}")
+                if "Input validation error" in str(e) and attempt < max_attempts - 1:
+                    print(f"Reducing context length for next attempt")
+                elif attempt == max_attempts - 1:
+                    all_answers.append(f"I apologize, but I'm having trouble processing the query due to its length or complexity.")
         answer = "\n\n".join(all_answers)
+        sources = set(doc.metadata['source'] for doc in web_docs)
         sources_section = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)
         answer += sources_section
     return "An unexpected error occurred. Please try again later."
+# Gradio interface
 # Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Context-Driven Conversational Chatbot")
     with gr.Row():
         file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
     with gr.Row():
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(label="Conversation")
+            question_input = gr.Textbox(label="Ask a question")
             submit_button = gr.Button("Submit")
         with gr.Column(scale=1):
             temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
             repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
             web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
+    context_driven_chatbot = ContextDrivenChatbot()
     def chat(question, history, temperature, top_p, repetition_penalty, web_search):
+        answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, context_driven_chatbot)
         history.append((question, answer))
         return "", history