langchain-chat-with-pdf-openai-MU

Paused

App Files Community

Pavan178 committed on Dec 9, 2024

Commit

58bf31d

verified ·

1 Parent(s): 75fd4bb

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -133

app.py CHANGED Viewed

@@ -11,147 +11,113 @@ from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
 from PyPDF2 import PdfReader
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-class ResponseStructureSelector:
     def __init__(self, llm):
         self.llm = llm
-        self.structure_prompt = PromptTemplate(
-            input_variables=['context', 'query'],
-            template="""Analyze the context and query to determine the most appropriate response structure:
 Context: {context}
 Query: {query}
-Select the optimal response format:
-1. Markdown with bullet points and headlines
-2. Concise paragraph with key insights
-3. Numbered list with detailed explanations
-4. Technical breakdown with subheadings
-5. Quick summary with critical points
-Choose the number (1-5) of the most suitable format:"""
         )
-        self.structure_chain = LLMChain(llm=self.llm, prompt=self.structure_prompt)
-    def select_structure(self, context, query):
         try:
-            structure_choice = self.structure_chain.run({'context': context, 'query': query})
-            return int(structure_choice.strip())
-        except:
-            return 1  # Default to Markdown structure
-class QueryRefiner:
-    def __init__(self, llm):
-        self.refinement_llm = llm
-        self.refinement_prompt = PromptTemplate(
-            input_variables=['query', 'context'],
-            template="""Refine query for clarity and precision:
-Original Query: {query}
-Document Context: {context}
-Refined, Focused Query:"""
-        )
-        self.refinement_chain = LLMChain(llm=self.refinement_llm, prompt=self.refinement_prompt)
-    def refine_query(self, original_query, context_hints=''):
-        try:
-            return self.refinement_chain.run({
-                'query': original_query,
-                'context': context_hints or "General document"
-            }).strip()
         except Exception as e:
-            logger.error(f"Query refinement error: {e}")
-            return original_query
 class AdvancedPdfChatbot:
     def __init__(self, openai_api_key):
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o', max_tokens=1000)
         self.embeddings = OpenAIEmbeddings()
-        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-        self.query_refiner = QueryRefiner(self.llm)
-        self.response_selector = ResponseStructureSelector(self.llm)
         self.db = None
-        self.chain = None
-        self.document_metadata = {}
-    def _create_response_prompt(self, structure_choice):
-        structure_templates = {
-            1: """Markdown Response with Structured Insights:
-## {title}
-### Key Highlights
-{content}
-### Conclusion
-{conclusion}""",
-            2: """{title}: {content}. Key Takeaway: {conclusion}""",
-            3: """Structured Breakdown:
-1. {title}
-   - Main Point: {content}
-2. Implications
-   - {conclusion}""",
-            4: """Technical Analysis
-## {title}
-### Core Concept
-{content}
-### Technical Implications
-{conclusion}""",
-            5: """Concise Summary: {title}. Key Points: {content}. Conclusion: {conclusion}."""
-        }
-        return PromptTemplate(
-            template=structure_templates.get(structure_choice, structure_templates[1]),
-            input_variables=["title", "content", "conclusion"]
-        )
     def load_and_process_pdf(self, pdf_path):
         try:
-            # Extract PDF metadata
             reader = PdfReader(pdf_path)
-            self.document_metadata = {
-                "title": reader.metadata.get("/Title", "Untitled Document"),
                 "author": reader.metadata.get("/Author", "Unknown")
             }
-            # Load and process PDF
             loader = PyPDFLoader(pdf_path)
             documents = loader.load()
             texts = self.text_splitter.split_documents(documents)
-            # Create vector store with fewer documents to improve performance
-            self.db = FAISS.from_documents(texts[:30], self.embeddings)
-            # Setup conversational chain
-            self.chain = ConversationalRetrievalChain.from_llm(
-                llm=self.llm,
-                retriever=self.db.as_retriever(search_kwargs={"k": 3}),
-                memory=self.memory
-            )
             return True
         except Exception as e:
-            logger.error(f"PDF processing error: {e}")
             return False
     def chat(self, query):
-        if not self.chain:
-            return "Upload a PDF first."
-        # Refine query
-        context = f"Document: {self.document_metadata.get('title', 'Unknown')}"
-        refined_query = self.query_refiner.refine_query(query, context)
-        # Select response structure
-        structure_choice = self.response_selector.select_structure(context, refined_query)
-        # Perform retrieval and answer generation
-        result = self.chain({"question": refined_query})
-        return result['answer']
-# Gradio Interface (remains mostly the same)
 pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
 def upload_pdf(pdf_file):
@@ -168,36 +134,6 @@ def respond(message, history):
     except Exception as e:
         return f"Error: {e}", history
-# Gradio Interface
-pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
-def upload_pdf(pdf_file):
-    if pdf_file is None:
-        return "Please upload a PDF file."
-    file_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
-    try:
-        pdf_chatbot.load_and_process_pdf(file_path)
-        return f"PDF processed successfully: {file_path}"
-    except Exception as e:
-        logger.error(f"PDF processing error: {e}")
-        return f"Error processing PDF: {str(e)}"
-def respond(message, history):
-    if not message:
-        return "", history
-    try:
-        bot_message = pdf_chatbot.chat(message)
-        history.append((message, bot_message))
-        return "", history
-    except Exception as e:
-        logger.error(f"Chat response error: {e}")
-        return f"Error: {str(e)}", history
-def clear_chatbot():
-    pdf_chatbot.clear_memory()
-    return []
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# Advanced PDF Chatbot")
@@ -207,11 +143,10 @@ with gr.Blocks() as demo:
     upload_status = gr.Textbox(label="Upload Status")
     upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
     chatbot_interface = gr.Chatbot()
     msg = gr.Textbox(placeholder="Enter your query...")
     msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
-    clear_button = gr.Button("Clear Conversation")
-    clear_button.click(clear_chatbot, outputs=[chatbot_interface])
 if __name__ == "__main__":
-    demo.launch()

 from langchain.prompts import PromptTemplate
 from PyPDF2 import PdfReader
class ContextAwareResponseGenerator:
    """Ask the LLM for an answer and wrap it in a presentation header.

    The prompt offers the model five response structures (numbered 1-5).
    If the reply begins with one of those numbers, the matching header is
    prepended to the remaining text; otherwise the whole reply is kept and
    the default header (structure 1) is used.
    """

    # Header prepended to the answer for each structure number in the prompt.
    _STRUCTURE_HEADERS = {
        1: "## Technical Breakdown",
        2: "📍 Key Insights:",
        3: "### Structured Insights",
        4: "🔍 Narrative Explanation:",
        5: "🔬 Comparative Analysis:",
    }
    _DEFAULT_STRUCTURE = 1
    # Punctuation accepted directly after the number ("2.", "2)", "2:", "2-").
    _CHOICE_SEPARATORS = ".):-"

    def __init__(self, llm):
        """Build the prompt and the LLM chain used by ``generate_response``.

        Args:
            llm: The language model handed to ``LLMChain``.
        """
        self.llm = llm
        self.response_prompt = PromptTemplate(
            input_variables=['context', 'query', 'chat_history'],
            template="""Analyze the context, query, and chat history to generate an optimal response:
Context: {context}
Query: {query}
Chat History: {chat_history}
Response Structure Selection Criteria:
1. Technical academic breakdown
2. Concise summary with key points
3. Markdown with hierarchical insights
4. Narrative explanation
5. Comparative analysis
Choose the most appropriate response structure (1-5) and generate the response accordingly:"""
        )
        self.response_chain = LLMChain(llm=self.llm, prompt=self.response_prompt)

    def generate_response(self, context, query, chat_history=''):
        """Run the chain and return the formatted answer.

        Args:
            context: Text describing the document (placed in the prompt).
            query: The user's question.
            chat_history: Prior conversation as text; an empty value is
                replaced by "No previous context".

        Returns:
            The formatted answer, or the ``_default_response`` message when
            the chain raises or returns no usable text. This method does not
            propagate exceptions from the chain.
        """
        try:
            response = self.response_chain.run({
                'context': context,
                'query': query,
                'chat_history': chat_history or "No previous context"
            })
            structure_choice, response_content = self._split_structure_choice(response)
            if not response_content:
                # Nothing to show (empty reply, or a bare number with no body).
                return self._default_response(query)
            return self._format_response(structure_choice, response_content)
        except Exception as e:
            # Boundary around the LLM call: log with traceback, degrade gracefully.
            logging.exception("Response generation error: %s", e)
            return self._default_response(query)

    @classmethod
    def _split_structure_choice(cls, response):
        """Split a raw reply into ``(structure_number, content)``.

        A leading digit counts as a structure choice only when it is one of
        the offered numbers AND is followed by whitespace, a separator, or
        the end of the text. This keeps replies such as "2024 revenue ..."
        or "The answer ..." intact instead of chopping their first character.
        """
        text = response.strip()
        head, rest = text[:1], text[1:]
        is_choice = (
            head.isascii()
            and head.isdigit()
            and int(head) in cls._STRUCTURE_HEADERS
            and (not rest or rest[0].isspace() or rest[0] in cls._CHOICE_SEPARATORS)
        )
        if not is_choice:
            return cls._DEFAULT_STRUCTURE, text
        if rest and rest[0] in cls._CHOICE_SEPARATORS:
            # Drop exactly one separator so markdown bullets in the body survive.
            rest = rest[1:]
        return int(head), rest.strip()

    def _format_response(self, structure_choice, content):
        """Prefix ``content`` with the header for ``structure_choice``.

        Unknown structure numbers fall back to the default header.
        """
        header = self._STRUCTURE_HEADERS.get(
            structure_choice, self._STRUCTURE_HEADERS[self._DEFAULT_STRUCTURE]
        )
        return f"{header}\n{content}"

    def _default_response(self, query):
        """Return the fallback message shown when no answer could be produced."""
        return f"I couldn't generate a structured response for: {query}"
 class AdvancedPdfChatbot:
     def __init__(self, openai_api_key):
         os.environ["OPENAI_API_KEY"] = openai_api_key
+        self.llm = ChatOpenAI(temperature=0.2, model_name='gpt-4o')
         self.embeddings = OpenAIEmbeddings()
+        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+        self.response_generator = ContextAwareResponseGenerator(self.llm)
         self.db = None
+        self.document_context = ""
     def load_and_process_pdf(self, pdf_path):
         try:
             reader = PdfReader(pdf_path)
+            metadata = {
+                "title": reader.metadata.get("/Title", "Untitled"),
                 "author": reader.metadata.get("/Author", "Unknown")
             }
             loader = PyPDFLoader(pdf_path)
             documents = loader.load()
             texts = self.text_splitter.split_documents(documents)
+            self.db = FAISS.from_documents(texts[:50], self.embeddings)
+            self.document_context = f"Document: {metadata['title']} by {metadata['author']}"
             return True
         except Exception as e:
+            logging.error(f"PDF processing error: {e}")
             return False
     def chat(self, query):
+        if not self.db:
+            return "Please upload a PDF first."
+        # Retrieve chat history
+        chat_history = self.memory.load_memory_variables({}).get('chat_history', [])
+        # Generate context-aware response
+        response = self.response_generator.generate_response(
+            context=self.document_context,
+            query=query,
+            chat_history=str(chat_history)
+        )
+        # Store conversation in memory
+        self.memory.save_context({"input": query}, {"output": response})
+        return response
+# Gradio Interface
 pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))
 def upload_pdf(pdf_file):
     except Exception as e:
         return f"Error: {e}", history
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# Advanced PDF Chatbot")
     upload_status = gr.Textbox(label="Upload Status")
     upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
     chatbot_interface = gr.Chatbot()
     msg = gr.Textbox(placeholder="Enter your query...")
     msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
 if __name__ == "__main__":
+    demo.launch()